diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,36018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.375, + "eval_steps": 250, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000125, + "grad_norm": 2.537714958190918, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.5468, + "loss/crossentropy": 2.2066214084625244, + "loss/hidden": 0.248046875, + "loss/logits": 0.03443578630685806, + "loss/reg": 0.026429571211338043, + "step": 1 + }, + { + "epoch": 0.00025, + "grad_norm": 2.4728448390960693, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6642, + "loss/crossentropy": 2.132329225540161, + "loss/hidden": 0.345703125, + "loss/logits": 0.05424630641937256, + "loss/reg": 0.026429571211338043, + "step": 2 + }, + { + "epoch": 0.000375, + "grad_norm": 2.773984670639038, + "learning_rate": 3e-06, + "loss": 0.5822, + "loss/crossentropy": 2.3457791805267334, + "loss/hidden": 0.2734375, + "loss/logits": 0.044443465769290924, + "loss/reg": 0.02642953023314476, + "step": 3 + }, + { + "epoch": 0.0005, + "grad_norm": 4.14040470123291, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7192, + "loss/crossentropy": 2.7209200859069824, + "loss/hidden": 0.35546875, + "loss/logits": 0.09940779209136963, + "loss/reg": 0.026429466903209686, + "step": 4 + }, + { + "epoch": 0.000625, + "grad_norm": 1.9164764881134033, + "learning_rate": 5e-06, + "loss": 0.5467, + "loss/crossentropy": 2.4304752349853516, + "loss/hidden": 0.244140625, + "loss/logits": 0.03826362267136574, + "loss/reg": 0.02642936445772648, + "step": 5 + }, + { + "epoch": 0.00075, + "grad_norm": 1.9878246784210205, + "learning_rate": 6e-06, + "loss": 0.517, + "loss/crossentropy": 2.472181797027588, + "loss/hidden": 0.2255859375, + "loss/logits": 0.027161670848727226, + "loss/reg": 0.02642924338579178, + "step": 6 + }, + { + "epoch": 0.000875, + "grad_norm": 2.1939733028411865, + "learning_rate": 7.000000000000001e-06, + "loss": 0.6043, + "loss/crossentropy": 2.241501808166504, + "loss/hidden": 0.298828125, + "loss/logits": 0.04118040204048157, + "loss/reg": 0.02642909064888954, + "step": 7 + }, + { + "epoch": 0.001, + "grad_norm": 3.516223907470703, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5199, + "loss/crossentropy": 2.409766912460327, + "loss/hidden": 0.2236328125, + "loss/logits": 0.032000549137592316, + "loss/reg": 0.02642889879643917, + "step": 8 + }, + { + "epoch": 0.001125, + "grad_norm": 1.9335486888885498, + "learning_rate": 9e-06, + "loss": 0.5575, + "loss/crossentropy": 2.6256861686706543, + "loss/hidden": 0.255859375, + "loss/logits": 0.037392452359199524, + "loss/reg": 0.02642873302102089, + "step": 9 + }, + { + "epoch": 0.00125, + "grad_norm": 1.6782876253128052, + "learning_rate": 1e-05, + "loss": 0.5162, + "loss/crossentropy": 2.1947107315063477, + "loss/hidden": 0.2255859375, + "loss/logits": 0.026354767382144928, + "loss/reg": 0.026428483426570892, + "step": 10 + }, + { + "epoch": 0.001375, + "grad_norm": 10.848552703857422, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0046, + "loss/crossentropy": 3.0539069175720215, + "loss/hidden": 0.640625, + "loss/logits": 0.09970991313457489, + "loss/reg": 0.026428230106830597, + "step": 11 + }, + { + "epoch": 0.0015, + "grad_norm": 2.237061023712158, + "learning_rate": 1.2e-05, + "loss": 0.563, + "loss/crossentropy": 2.5601325035095215, + "loss/hidden": 0.248046875, + "loss/logits": 0.050660137087106705, + "loss/reg": 0.02642793208360672, + "step": 12 + }, + { + "epoch": 0.001625, + "grad_norm": 1.4406346082687378, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.507, + "loss/crossentropy": 1.965380311012268, + "loss/hidden": 0.2197265625, + "loss/logits": 0.02295786701142788, + "loss/reg": 0.02642756886780262, + "step": 13 + }, + { + "epoch": 0.00175, + "grad_norm": 3.0757036209106445, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.7761, + "loss/crossentropy": 2.15138840675354, + "loss/hidden": 0.4375, + "loss/logits": 0.07431840896606445, + "loss/reg": 0.026427194476127625, + "step": 14 + }, + { + "epoch": 0.001875, + "grad_norm": 2.8731143474578857, + "learning_rate": 1.5e-05, + "loss": 0.4684, + "loss/crossentropy": 2.5530812740325928, + "loss/hidden": 0.1845703125, + "loss/logits": 0.019558344036340714, + "loss/reg": 0.026426764205098152, + "step": 15 + }, + { + "epoch": 0.002, + "grad_norm": 2.5288755893707275, + "grad_norm_var": 4.846526347105633, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.5781, + "loss/crossentropy": 2.5096747875213623, + "loss/hidden": 0.275390625, + "loss/logits": 0.0384209081530571, + "loss/reg": 0.026426298543810844, + "step": 16 + }, + { + "epoch": 0.002125, + "grad_norm": 2.0281474590301514, + "grad_norm_var": 4.89482291121508, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.5318, + "loss/crossentropy": 2.396097421646118, + "loss/hidden": 0.232421875, + "loss/logits": 0.03516196087002754, + "loss/reg": 0.026425909250974655, + "step": 17 + }, + { + "epoch": 0.00225, + "grad_norm": 2.4487411975860596, + "grad_norm_var": 4.896482229626747, + "learning_rate": 1.8e-05, + "loss": 0.5984, + "loss/crossentropy": 2.3916616439819336, + "loss/hidden": 0.28125, + "loss/logits": 0.052923329174518585, + "loss/reg": 0.026425503194332123, + "step": 18 + }, + { + "epoch": 0.002375, + "grad_norm": 1.986022710800171, + "grad_norm_var": 4.956548008938709, + "learning_rate": 1.9e-05, + "loss": 0.5504, + "loss/crossentropy": 2.4791200160980225, + "loss/hidden": 0.2421875, + "loss/logits": 0.04391499236226082, + "loss/reg": 0.026425078511238098, + "step": 19 + }, + { + "epoch": 0.0025, + "grad_norm": 2.0934784412384033, + "grad_norm_var": 4.887277710992484, + "learning_rate": 2e-05, + "loss": 0.5369, + "loss/crossentropy": 2.1741297245025635, + "loss/hidden": 0.2392578125, + "loss/logits": 0.03335873782634735, + "loss/reg": 0.026424556970596313, + "step": 20 + }, + { + "epoch": 0.002625, + "grad_norm": 1.9445254802703857, + "grad_norm_var": 4.884025740026245, + "learning_rate": 2.1e-05, + "loss": 0.4865, + "loss/crossentropy": 2.45112943649292, + "loss/hidden": 0.1962890625, + "loss/logits": 0.025960583239793777, + "loss/reg": 0.02642405778169632, + "step": 21 + }, + { + "epoch": 0.00275, + "grad_norm": 3.070704221725464, + "grad_norm_var": 4.8399171328504895, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.5887, + "loss/crossentropy": 2.1550512313842773, + "loss/hidden": 0.275390625, + "loss/logits": 0.049097511917352676, + "loss/reg": 0.02642347477376461, + "step": 22 + }, + { + "epoch": 0.002875, + "grad_norm": 2.2452821731567383, + "grad_norm_var": 4.835466428034186, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.5482, + "loss/crossentropy": 2.1640255451202393, + "loss/hidden": 0.2470703125, + "loss/logits": 0.036868080496788025, + "loss/reg": 0.026422815397381783, + "step": 23 + }, + { + "epoch": 0.003, + "grad_norm": 2.032148838043213, + "grad_norm_var": 4.84560617678243, + "learning_rate": 2.4e-05, + "loss": 0.5756, + "loss/crossentropy": 2.323482036590576, + "loss/hidden": 0.271484375, + "loss/logits": 0.03993295133113861, + "loss/reg": 0.02642211876809597, + "step": 24 + }, + { + "epoch": 0.003125, + "grad_norm": 1.763465404510498, + "grad_norm_var": 4.8665883230545335, + "learning_rate": 2.5e-05, + "loss": 0.5139, + "loss/crossentropy": 2.33661150932312, + "loss/hidden": 0.22265625, + "loss/logits": 0.02701444923877716, + "loss/reg": 0.026421383023262024, + "step": 25 + }, + { + "epoch": 0.00325, + "grad_norm": 1.7001625299453735, + "grad_norm_var": 4.863438686484135, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.5696, + "loss/crossentropy": 2.2305383682250977, + "loss/hidden": 0.275390625, + "loss/logits": 0.02997000887989998, + "loss/reg": 0.026420695707201958, + "step": 26 + }, + { + "epoch": 0.003375, + "grad_norm": 3.474130392074585, + "grad_norm_var": 0.318824614624526, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.5456, + "loss/crossentropy": 2.1680750846862793, + "loss/hidden": 0.2451171875, + "loss/logits": 0.036261945962905884, + "loss/reg": 0.0264199897646904, + "step": 27 + }, + { + "epoch": 0.0035, + "grad_norm": 3.8201987743377686, + "grad_norm_var": 0.46030846745106163, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.5181, + "loss/crossentropy": 2.418672800064087, + "loss/hidden": 0.224609375, + "loss/logits": 0.029263213276863098, + "loss/reg": 0.02641921117901802, + "step": 28 + }, + { + "epoch": 0.003625, + "grad_norm": 1.9781090021133423, + "grad_norm_var": 0.40905077024170067, + "learning_rate": 2.9e-05, + "loss": 0.5379, + "loss/crossentropy": 2.390868663787842, + "loss/hidden": 0.2392578125, + "loss/logits": 0.034465983510017395, + "loss/reg": 0.026418352499604225, + "step": 29 + }, + { + "epoch": 0.00375, + "grad_norm": 1.6551319360733032, + "grad_norm_var": 0.41503895204729413, + "learning_rate": 3e-05, + "loss": 0.496, + "loss/crossentropy": 2.5960400104522705, + "loss/hidden": 0.205078125, + "loss/logits": 0.02678578905761242, + "loss/reg": 0.026417305693030357, + "step": 30 + }, + { + "epoch": 0.003875, + "grad_norm": 1.7136921882629395, + "grad_norm_var": 0.4185952392532807, + "learning_rate": 3.1e-05, + "loss": 0.5235, + "loss/crossentropy": 2.349839687347412, + "loss/hidden": 0.2275390625, + "loss/logits": 0.031792763620615005, + "loss/reg": 0.026416433975100517, + "step": 31 + }, + { + "epoch": 0.004, + "grad_norm": 1.9992157220840454, + "grad_norm_var": 0.41856547198587274, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.4928, + "loss/crossentropy": 2.3164803981781006, + "loss/hidden": 0.2041015625, + "loss/logits": 0.024579893797636032, + "loss/reg": 0.02641524001955986, + "step": 32 + }, + { + "epoch": 0.004125, + "grad_norm": 2.705052614212036, + "grad_norm_var": 0.42744416353313663, + "learning_rate": 3.3e-05, + "loss": 0.5732, + "loss/crossentropy": 2.42107892036438, + "loss/hidden": 0.275390625, + "loss/logits": 0.03370767831802368, + "loss/reg": 0.02641397900879383, + "step": 33 + }, + { + "epoch": 0.00425, + "grad_norm": 1.8898464441299438, + "grad_norm_var": 0.4350913020843951, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.5531, + "loss/crossentropy": 2.4147770404815674, + "loss/hidden": 0.25390625, + "loss/logits": 0.03504405915737152, + "loss/reg": 0.026412710547447205, + "step": 34 + }, + { + "epoch": 0.004375, + "grad_norm": 4.9570159912109375, + "grad_norm_var": 0.8804344329355491, + "learning_rate": 3.5e-05, + "loss": 0.6763, + "loss/crossentropy": 1.6753497123718262, + "loss/hidden": 0.376953125, + "loss/logits": 0.03519564867019653, + "loss/reg": 0.0264116358011961, + "step": 35 + }, + { + "epoch": 0.0045, + "grad_norm": 4.928956508636475, + "grad_norm_var": 1.2518721453244992, + "learning_rate": 3.6e-05, + "loss": 0.7329, + "loss/crossentropy": 2.6104867458343506, + "loss/hidden": 0.400390625, + "loss/logits": 0.06845290958881378, + "loss/reg": 0.02641039527952671, + "step": 36 + }, + { + "epoch": 0.004625, + "grad_norm": 7.503647327423096, + "grad_norm_var": 2.684651641752033, + "learning_rate": 3.7e-05, + "loss": 0.6258, + "loss/crossentropy": 2.2158656120300293, + "loss/hidden": 0.318359375, + "loss/logits": 0.043342188000679016, + "loss/reg": 0.026409219950437546, + "step": 37 + }, + { + "epoch": 0.00475, + "grad_norm": 2.6838622093200684, + "grad_norm_var": 2.6885420074665602, + "learning_rate": 3.8e-05, + "loss": 0.5939, + "loss/crossentropy": 2.344879627227783, + "loss/hidden": 0.28515625, + "loss/logits": 0.04461552947759628, + "loss/reg": 0.026408080011606216, + "step": 38 + }, + { + "epoch": 0.004875, + "grad_norm": 3.357893705368042, + "grad_norm_var": 2.662758933855309, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.5729, + "loss/crossentropy": 2.6759543418884277, + "loss/hidden": 0.275390625, + "loss/logits": 0.033413954079151154, + "loss/reg": 0.026406895369291306, + "step": 39 + }, + { + "epoch": 0.005, + "grad_norm": 3.0177316665649414, + "grad_norm_var": 2.5949485604856193, + "learning_rate": 4e-05, + "loss": 0.7498, + "loss/crossentropy": 2.2261273860931396, + "loss/hidden": 0.408203125, + "loss/logits": 0.07758316397666931, + "loss/reg": 0.026405224576592445, + "step": 40 + }, + { + "epoch": 0.005125, + "grad_norm": 2.1196699142456055, + "grad_norm_var": 2.54074274703229, + "learning_rate": 4.1e-05, + "loss": 0.6396, + "loss/crossentropy": 2.193378448486328, + "loss/hidden": 0.30859375, + "loss/logits": 0.06692355871200562, + "loss/reg": 0.026403924450278282, + "step": 41 + }, + { + "epoch": 0.00525, + "grad_norm": 2.456051826477051, + "grad_norm_var": 2.435973046683167, + "learning_rate": 4.2e-05, + "loss": 0.5571, + "loss/crossentropy": 1.9526888132095337, + "loss/hidden": 0.26171875, + "loss/logits": 0.03133418411016464, + "loss/reg": 0.026402529329061508, + "step": 42 + }, + { + "epoch": 0.005375, + "grad_norm": 2.257375717163086, + "grad_norm_var": 2.474501380785125, + "learning_rate": 4.3e-05, + "loss": 0.5544, + "loss/crossentropy": 2.3284847736358643, + "loss/hidden": 0.25390625, + "loss/logits": 0.03650724142789841, + "loss/reg": 0.026400938630104065, + "step": 43 + }, + { + "epoch": 0.0055, + "grad_norm": 2.9145264625549316, + "grad_norm_var": 2.4345975605903694, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.5175, + "loss/crossentropy": 2.295241594314575, + "loss/hidden": 0.220703125, + "loss/logits": 0.03284794092178345, + "loss/reg": 0.026399515569210052, + "step": 44 + }, + { + "epoch": 0.005625, + "grad_norm": 3.1294264793395996, + "grad_norm_var": 2.3592519473156615, + "learning_rate": 4.5e-05, + "loss": 0.5567, + "loss/crossentropy": 2.660597085952759, + "loss/hidden": 0.255859375, + "loss/logits": 0.03686758130788803, + "loss/reg": 0.026398126035928726, + "step": 45 + }, + { + "epoch": 0.00575, + "grad_norm": 2.197265863418579, + "grad_norm_var": 2.2745842657817748, + "learning_rate": 4.600000000000001e-05, + "loss": 0.5512, + "loss/crossentropy": 2.3832643032073975, + "loss/hidden": 0.2490234375, + "loss/logits": 0.038256023079156876, + "loss/reg": 0.02639671601355076, + "step": 46 + }, + { + "epoch": 0.005875, + "grad_norm": 2.883378744125366, + "grad_norm_var": 2.141634704665381, + "learning_rate": 4.7e-05, + "loss": 0.5298, + "loss/crossentropy": 2.6035244464874268, + "loss/hidden": 0.2333984375, + "loss/logits": 0.03240815922617912, + "loss/reg": 0.026395246386528015, + "step": 47 + }, + { + "epoch": 0.006, + "grad_norm": 3.1519744396209717, + "grad_norm_var": 2.0420385103816727, + "learning_rate": 4.8e-05, + "loss": 0.5385, + "loss/crossentropy": 2.250037908554077, + "loss/hidden": 0.244140625, + "loss/logits": 0.03043752908706665, + "loss/reg": 0.026393571868538857, + "step": 48 + }, + { + "epoch": 0.006125, + "grad_norm": 3.187680244445801, + "grad_norm_var": 2.0209109756516943, + "learning_rate": 4.9e-05, + "loss": 0.5614, + "loss/crossentropy": 2.366483688354492, + "loss/hidden": 0.263671875, + "loss/logits": 0.033859170973300934, + "loss/reg": 0.02639181725680828, + "step": 49 + }, + { + "epoch": 0.00625, + "grad_norm": 2.3717658519744873, + "grad_norm_var": 1.9454730589909708, + "learning_rate": 5e-05, + "loss": 0.5865, + "loss/crossentropy": 2.007732391357422, + "loss/hidden": 0.2890625, + "loss/logits": 0.0335388109087944, + "loss/reg": 0.02638987824320793, + "step": 50 + }, + { + "epoch": 0.006375, + "grad_norm": 3.658735990524292, + "grad_norm_var": 1.767425501826429, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.5028, + "loss/crossentropy": 2.511072874069214, + "loss/hidden": 0.2099609375, + "loss/logits": 0.02893088385462761, + "loss/reg": 0.026387827470898628, + "step": 51 + }, + { + "epoch": 0.0065, + "grad_norm": 2.5912654399871826, + "grad_norm_var": 1.582150273328572, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.5619, + "loss/crossentropy": 2.3280093669891357, + "loss/hidden": 0.263671875, + "loss/logits": 0.03436018154025078, + "loss/reg": 0.026385735720396042, + "step": 52 + }, + { + "epoch": 0.006625, + "grad_norm": 2.0419421195983887, + "grad_norm_var": 0.23432357463535497, + "learning_rate": 5.300000000000001e-05, + "loss": 0.5674, + "loss/crossentropy": 2.3851194381713867, + "loss/hidden": 0.263671875, + "loss/logits": 0.039869021624326706, + "loss/reg": 0.026383817195892334, + "step": 53 + }, + { + "epoch": 0.00675, + "grad_norm": 2.4164810180664062, + "grad_norm_var": 0.24119551692934707, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.6087, + "loss/crossentropy": 2.6006996631622314, + "loss/hidden": 0.296875, + "loss/logits": 0.04797635227441788, + "loss/reg": 0.026381801813840866, + "step": 54 + }, + { + "epoch": 0.006875, + "grad_norm": 2.697831153869629, + "grad_norm_var": 0.2135682431387058, + "learning_rate": 5.500000000000001e-05, + "loss": 0.523, + "loss/crossentropy": 2.472208261489868, + "loss/hidden": 0.2275390625, + "loss/logits": 0.031705208122730255, + "loss/reg": 0.026379752904176712, + "step": 55 + }, + { + "epoch": 0.007, + "grad_norm": 4.182509422302246, + "grad_norm_var": 0.34874494246430365, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.6766, + "loss/crossentropy": 2.693652868270874, + "loss/hidden": 0.3671875, + "loss/logits": 0.04566050320863724, + "loss/reg": 0.02637762948870659, + "step": 56 + }, + { + "epoch": 0.007125, + "grad_norm": 2.231238842010498, + "grad_norm_var": 0.33990645656106155, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.5811, + "loss/crossentropy": 2.4935543537139893, + "loss/hidden": 0.28125, + "loss/logits": 0.03613065183162689, + "loss/reg": 0.026375366374850273, + "step": 57 + }, + { + "epoch": 0.00725, + "grad_norm": 2.0192184448242188, + "grad_norm_var": 0.37029866859904437, + "learning_rate": 5.8e-05, + "loss": 0.5285, + "loss/crossentropy": 2.192227840423584, + "loss/hidden": 0.2294921875, + "loss/logits": 0.03531934320926666, + "loss/reg": 0.02637314423918724, + "step": 58 + }, + { + "epoch": 0.007375, + "grad_norm": 2.3108532428741455, + "grad_norm_var": 0.36699486123436575, + "learning_rate": 5.9e-05, + "loss": 0.507, + "loss/crossentropy": 2.39101243019104, + "loss/hidden": 0.2177734375, + "loss/logits": 0.025539016351103783, + "loss/reg": 0.02637065388262272, + "step": 59 + }, + { + "epoch": 0.0075, + "grad_norm": 2.049551486968994, + "grad_norm_var": 0.3946811437011318, + "learning_rate": 6e-05, + "loss": 0.5351, + "loss/crossentropy": 2.7062017917633057, + "loss/hidden": 0.2353515625, + "loss/logits": 0.03605186939239502, + "loss/reg": 0.026368385180830956, + "step": 60 + }, + { + "epoch": 0.007625, + "grad_norm": 2.6327223777770996, + "grad_norm_var": 0.38133460463904284, + "learning_rate": 6.1e-05, + "loss": 0.5344, + "loss/crossentropy": 2.0810956954956055, + "loss/hidden": 0.23828125, + "loss/logits": 0.03246723860502243, + "loss/reg": 0.02636607363820076, + "step": 61 + }, + { + "epoch": 0.00775, + "grad_norm": 2.06585955619812, + "grad_norm_var": 0.39059185941555535, + "learning_rate": 6.2e-05, + "loss": 0.5202, + "loss/crossentropy": 2.6240835189819336, + "loss/hidden": 0.224609375, + "loss/logits": 0.03200242295861244, + "loss/reg": 0.026363445445895195, + "step": 62 + }, + { + "epoch": 0.007875, + "grad_norm": 2.109790563583374, + "grad_norm_var": 0.40452198957435875, + "learning_rate": 6.3e-05, + "loss": 0.5249, + "loss/crossentropy": 2.587536573410034, + "loss/hidden": 0.2294921875, + "loss/logits": 0.03178905323147774, + "loss/reg": 0.02636083774268627, + "step": 63 + }, + { + "epoch": 0.008, + "grad_norm": 3.818783760070801, + "grad_norm_var": 0.48072296241430573, + "learning_rate": 6.400000000000001e-05, + "loss": 0.6632, + "loss/crossentropy": 2.011171817779541, + "loss/hidden": 0.353515625, + "loss/logits": 0.04613731801509857, + "loss/reg": 0.02635800838470459, + "step": 64 + }, + { + "epoch": 0.008125, + "grad_norm": 2.4136369228363037, + "grad_norm_var": 0.46258887231494605, + "learning_rate": 6.500000000000001e-05, + "loss": 0.522, + "loss/crossentropy": 2.600787401199341, + "loss/hidden": 0.224609375, + "loss/logits": 0.03384025767445564, + "loss/reg": 0.026355121284723282, + "step": 65 + }, + { + "epoch": 0.00825, + "grad_norm": 2.3908252716064453, + "grad_norm_var": 0.46202963925557394, + "learning_rate": 6.6e-05, + "loss": 0.5859, + "loss/crossentropy": 2.1056201457977295, + "loss/hidden": 0.279296875, + "loss/logits": 0.04309317469596863, + "loss/reg": 0.02635251171886921, + "step": 66 + }, + { + "epoch": 0.008375, + "grad_norm": 2.5190653800964355, + "grad_norm_var": 0.38262308323354144, + "learning_rate": 6.7e-05, + "loss": 0.5391, + "loss/crossentropy": 2.5527184009552, + "loss/hidden": 0.2421875, + "loss/logits": 0.0334152951836586, + "loss/reg": 0.02634957991540432, + "step": 67 + }, + { + "epoch": 0.0085, + "grad_norm": 2.500368595123291, + "grad_norm_var": 0.3824057294099087, + "learning_rate": 6.800000000000001e-05, + "loss": 0.5958, + "loss/crossentropy": 2.30499267578125, + "loss/hidden": 0.28125, + "loss/logits": 0.05108712613582611, + "loss/reg": 0.026346800848841667, + "step": 68 + }, + { + "epoch": 0.008625, + "grad_norm": 2.7905988693237305, + "grad_norm_var": 0.3692126592154902, + "learning_rate": 6.9e-05, + "loss": 0.6049, + "loss/crossentropy": 2.3807146549224854, + "loss/hidden": 0.294921875, + "loss/logits": 0.04648623988032341, + "loss/reg": 0.026344334706664085, + "step": 69 + }, + { + "epoch": 0.00875, + "grad_norm": 2.147470235824585, + "grad_norm_var": 0.3793077808516782, + "learning_rate": 7e-05, + "loss": 0.5345, + "loss/crossentropy": 2.627505302429199, + "loss/hidden": 0.236328125, + "loss/logits": 0.034769318997859955, + "loss/reg": 0.026341637596488, + "step": 70 + }, + { + "epoch": 0.008875, + "grad_norm": 2.6987268924713135, + "grad_norm_var": 0.3793248871627156, + "learning_rate": 7.1e-05, + "loss": 0.5722, + "loss/crossentropy": 2.382685899734497, + "loss/hidden": 0.271484375, + "loss/logits": 0.03735022246837616, + "loss/reg": 0.02633870206773281, + "step": 71 + }, + { + "epoch": 0.009, + "grad_norm": 3.085085153579712, + "grad_norm_var": 0.2164648496489896, + "learning_rate": 7.2e-05, + "loss": 0.5882, + "loss/crossentropy": 2.371429681777954, + "loss/hidden": 0.27734375, + "loss/logits": 0.04748620092868805, + "loss/reg": 0.02633603662252426, + "step": 72 + }, + { + "epoch": 0.009125, + "grad_norm": 4.158353328704834, + "grad_norm_var": 0.3829897758196862, + "learning_rate": 7.3e-05, + "loss": 0.8663, + "loss/crossentropy": 2.29622745513916, + "loss/hidden": 0.5234375, + "loss/logits": 0.07955377548933029, + "loss/reg": 0.026333071291446686, + "step": 73 + }, + { + "epoch": 0.00925, + "grad_norm": 2.111111879348755, + "grad_norm_var": 0.37631661688178514, + "learning_rate": 7.4e-05, + "loss": 0.5468, + "loss/crossentropy": 2.29744815826416, + "loss/hidden": 0.24609375, + "loss/logits": 0.037446070462465286, + "loss/reg": 0.026330096647143364, + "step": 74 + }, + { + "epoch": 0.009375, + "grad_norm": 2.545919179916382, + "grad_norm_var": 0.37031037444480336, + "learning_rate": 7.500000000000001e-05, + "loss": 0.625, + "loss/crossentropy": 2.4376375675201416, + "loss/hidden": 0.306640625, + "loss/logits": 0.05505819618701935, + "loss/reg": 0.026326792314648628, + "step": 75 + }, + { + "epoch": 0.0095, + "grad_norm": 2.362215042114258, + "grad_norm_var": 0.35233204024674003, + "learning_rate": 7.6e-05, + "loss": 0.5747, + "loss/crossentropy": 2.677924156188965, + "loss/hidden": 0.267578125, + "loss/logits": 0.04392882436513901, + "loss/reg": 0.026323769241571426, + "step": 76 + }, + { + "epoch": 0.009625, + "grad_norm": 3.135709762573242, + "grad_norm_var": 0.3671929300455114, + "learning_rate": 7.7e-05, + "loss": 0.7113, + "loss/crossentropy": 1.972798466682434, + "loss/hidden": 0.384765625, + "loss/logits": 0.06329117715358734, + "loss/reg": 0.026320943608880043, + "step": 77 + }, + { + "epoch": 0.00975, + "grad_norm": 4.418634414672852, + "grad_norm_var": 0.5210260544686395, + "learning_rate": 7.800000000000001e-05, + "loss": 0.6851, + "loss/crossentropy": 2.558809518814087, + "loss/hidden": 0.357421875, + "loss/logits": 0.06447892636060715, + "loss/reg": 0.02631756290793419, + "step": 78 + }, + { + "epoch": 0.009875, + "grad_norm": 3.9261293411254883, + "grad_norm_var": 0.55391532710314, + "learning_rate": 7.900000000000001e-05, + "loss": 0.5968, + "loss/crossentropy": 2.6102137565612793, + "loss/hidden": 0.28515625, + "loss/logits": 0.04846350848674774, + "loss/reg": 0.026314500719308853, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 3.1532020568847656, + "grad_norm_var": 0.5035194586586452, + "learning_rate": 8e-05, + "loss": 0.7066, + "loss/crossentropy": 2.36220121383667, + "loss/hidden": 0.38671875, + "loss/logits": 0.05672474205493927, + "loss/reg": 0.026311254128813744, + "step": 80 + }, + { + "epoch": 0.010125, + "grad_norm": 3.557161808013916, + "grad_norm_var": 0.5115010248662256, + "learning_rate": 8.1e-05, + "loss": 0.6647, + "loss/crossentropy": 2.412325859069824, + "loss/hidden": 0.345703125, + "loss/logits": 0.05596970394253731, + "loss/reg": 0.026307715103030205, + "step": 81 + }, + { + "epoch": 0.01025, + "grad_norm": 2.2158408164978027, + "grad_norm_var": 0.5268993015208875, + "learning_rate": 8.2e-05, + "loss": 0.5575, + "loss/crossentropy": 2.3789050579071045, + "loss/hidden": 0.255859375, + "loss/logits": 0.03855578228831291, + "loss/reg": 0.0263040903955698, + "step": 82 + }, + { + "epoch": 0.010375, + "grad_norm": 3.2140979766845703, + "grad_norm_var": 0.5164286227091848, + "learning_rate": 8.3e-05, + "loss": 0.5548, + "loss/crossentropy": 2.401925802230835, + "loss/hidden": 0.2578125, + "loss/logits": 0.03401009738445282, + "loss/reg": 0.026300577446818352, + "step": 83 + }, + { + "epoch": 0.0105, + "grad_norm": 2.4155867099761963, + "grad_norm_var": 0.5225404018326155, + "learning_rate": 8.4e-05, + "loss": 0.5432, + "loss/crossentropy": 2.6546974182128906, + "loss/hidden": 0.244140625, + "loss/logits": 0.03612750768661499, + "loss/reg": 0.02629682794213295, + "step": 84 + }, + { + "epoch": 0.010625, + "grad_norm": 4.232295036315918, + "grad_norm_var": 0.6129643025970267, + "learning_rate": 8.5e-05, + "loss": 0.7199, + "loss/crossentropy": 2.2300524711608887, + "loss/hidden": 0.412109375, + "loss/logits": 0.044823646545410156, + "loss/reg": 0.0262930728495121, + "step": 85 + }, + { + "epoch": 0.01075, + "grad_norm": 2.7160282135009766, + "grad_norm_var": 0.5620128324129702, + "learning_rate": 8.6e-05, + "loss": 0.6973, + "loss/crossentropy": 2.2953288555145264, + "loss/hidden": 0.37109375, + "loss/logits": 0.06333646178245544, + "loss/reg": 0.02628917805850506, + "step": 86 + }, + { + "epoch": 0.010875, + "grad_norm": 3.0872819423675537, + "grad_norm_var": 0.5495392294868544, + "learning_rate": 8.7e-05, + "loss": 0.636, + "loss/crossentropy": 2.449223756790161, + "loss/hidden": 0.328125, + "loss/logits": 0.045011188834905624, + "loss/reg": 0.02628495544195175, + "step": 87 + }, + { + "epoch": 0.011, + "grad_norm": 2.6966607570648193, + "grad_norm_var": 0.5621192378124493, + "learning_rate": 8.800000000000001e-05, + "loss": 0.6272, + "loss/crossentropy": 2.5449047088623047, + "loss/hidden": 0.31640625, + "loss/logits": 0.04802623763680458, + "loss/reg": 0.026280568912625313, + "step": 88 + }, + { + "epoch": 0.011125, + "grad_norm": 2.9160921573638916, + "grad_norm_var": 0.48685408890983506, + "learning_rate": 8.900000000000001e-05, + "loss": 0.6278, + "loss/crossentropy": 2.1442108154296875, + "loss/hidden": 0.310546875, + "loss/logits": 0.054462507367134094, + "loss/reg": 0.02627684734761715, + "step": 89 + }, + { + "epoch": 0.01125, + "grad_norm": 3.3378944396972656, + "grad_norm_var": 0.4283231906687052, + "learning_rate": 9e-05, + "loss": 0.6536, + "loss/crossentropy": 2.4361062049865723, + "loss/hidden": 0.349609375, + "loss/logits": 0.04130454361438751, + "loss/reg": 0.026273205876350403, + "step": 90 + }, + { + "epoch": 0.011375, + "grad_norm": 2.597607374191284, + "grad_norm_var": 0.42452911296148627, + "learning_rate": 9.1e-05, + "loss": 0.6391, + "loss/crossentropy": 2.0079762935638428, + "loss/hidden": 0.3203125, + "loss/logits": 0.056107863783836365, + "loss/reg": 0.02626909501850605, + "step": 91 + }, + { + "epoch": 0.0115, + "grad_norm": 4.960971355438232, + "grad_norm_var": 0.5826997127177641, + "learning_rate": 9.200000000000001e-05, + "loss": 0.6735, + "loss/crossentropy": 2.708275079727173, + "loss/hidden": 0.349609375, + "loss/logits": 0.06119866296648979, + "loss/reg": 0.026265164837241173, + "step": 92 + }, + { + "epoch": 0.011625, + "grad_norm": 3.8193323612213135, + "grad_norm_var": 0.5981799563928756, + "learning_rate": 9.300000000000001e-05, + "loss": 0.8429, + "loss/crossentropy": 2.4117016792297363, + "loss/hidden": 0.498046875, + "loss/logits": 0.08221981674432755, + "loss/reg": 0.026260720565915108, + "step": 93 + }, + { + "epoch": 0.01175, + "grad_norm": 3.434213638305664, + "grad_norm_var": 0.5157332557296352, + "learning_rate": 9.4e-05, + "loss": 0.6738, + "loss/crossentropy": 2.62178111076355, + "loss/hidden": 0.36328125, + "loss/logits": 0.04794853553175926, + "loss/reg": 0.02625615894794464, + "step": 94 + }, + { + "epoch": 0.011875, + "grad_norm": 3.0944480895996094, + "grad_norm_var": 0.4859309000509171, + "learning_rate": 9.5e-05, + "loss": 0.6829, + "loss/crossentropy": 2.582462787628174, + "loss/hidden": 0.353515625, + "loss/logits": 0.0669020414352417, + "loss/reg": 0.026251958683133125, + "step": 95 + }, + { + "epoch": 0.012, + "grad_norm": 3.9548256397247314, + "grad_norm_var": 0.5194300484800329, + "learning_rate": 9.6e-05, + "loss": 0.8508, + "loss/crossentropy": 2.3243165016174316, + "loss/hidden": 0.49609375, + "loss/logits": 0.09221720695495605, + "loss/reg": 0.026247689500451088, + "step": 96 + }, + { + "epoch": 0.012125, + "grad_norm": 8.949115753173828, + "grad_norm_var": 2.5460815450669125, + "learning_rate": 9.7e-05, + "loss": 0.9668, + "loss/crossentropy": 2.3593697547912598, + "loss/hidden": 0.62109375, + "loss/logits": 0.08330727368593216, + "loss/reg": 0.026243869215250015, + "step": 97 + }, + { + "epoch": 0.01225, + "grad_norm": 3.874511957168579, + "grad_norm_var": 2.411331023611861, + "learning_rate": 9.8e-05, + "loss": 0.7272, + "loss/crossentropy": 1.9933784008026123, + "loss/hidden": 0.41015625, + "loss/logits": 0.05464401841163635, + "loss/reg": 0.026239972561597824, + "step": 98 + }, + { + "epoch": 0.012375, + "grad_norm": 5.088143825531006, + "grad_norm_var": 2.507843574169987, + "learning_rate": 9.900000000000001e-05, + "loss": 0.6761, + "loss/crossentropy": 2.578767776489258, + "loss/hidden": 0.3515625, + "loss/logits": 0.0621890164911747, + "loss/reg": 0.026235179975628853, + "step": 99 + }, + { + "epoch": 0.0125, + "grad_norm": 3.9010627269744873, + "grad_norm_var": 2.366914585761602, + "learning_rate": 0.0001, + "loss": 0.7051, + "loss/crossentropy": 2.4717133045196533, + "loss/hidden": 0.375, + "loss/logits": 0.0677795261144638, + "loss/reg": 0.026230769231915474, + "step": 100 + }, + { + "epoch": 0.012625, + "grad_norm": 5.50706148147583, + "grad_norm_var": 2.522191588171327, + "learning_rate": 0.0001, + "loss": 0.7765, + "loss/crossentropy": 2.3764612674713135, + "loss/hidden": 0.44921875, + "loss/logits": 0.06503438949584961, + "loss/reg": 0.02622627653181553, + "step": 101 + }, + { + "epoch": 0.01275, + "grad_norm": 5.103200435638428, + "grad_norm_var": 2.470966679212188, + "learning_rate": 0.0001, + "loss": 0.7008, + "loss/crossentropy": 2.5796542167663574, + "loss/hidden": 0.38671875, + "loss/logits": 0.05191829800605774, + "loss/reg": 0.026221245527267456, + "step": 102 + }, + { + "epoch": 0.012875, + "grad_norm": 18.05303192138672, + "grad_norm_var": 14.358413039824521, + "learning_rate": 0.0001, + "loss": 1.0008, + "loss/crossentropy": 1.927337646484375, + "loss/hidden": 0.6796875, + "loss/logits": 0.05896752327680588, + "loss/reg": 0.02621658518910408, + "step": 103 + }, + { + "epoch": 0.013, + "grad_norm": 3.410438299179077, + "grad_norm_var": 14.16338361533652, + "learning_rate": 0.0001, + "loss": 0.735, + "loss/crossentropy": 2.290928363800049, + "loss/hidden": 0.40625, + "loss/logits": 0.06666909158229828, + "loss/reg": 0.026211561635136604, + "step": 104 + }, + { + "epoch": 0.013125, + "grad_norm": 3.117622137069702, + "grad_norm_var": 14.10656391346422, + "learning_rate": 0.0001, + "loss": 0.6665, + "loss/crossentropy": 2.6549246311187744, + "loss/hidden": 0.353515625, + "loss/logits": 0.05095440149307251, + "loss/reg": 0.026206739246845245, + "step": 105 + }, + { + "epoch": 0.01325, + "grad_norm": 3.9999241828918457, + "grad_norm_var": 13.97508509706009, + "learning_rate": 0.0001, + "loss": 0.8082, + "loss/crossentropy": 2.460174798965454, + "loss/hidden": 0.46484375, + "loss/logits": 0.08136071264743805, + "loss/reg": 0.02620157040655613, + "step": 106 + }, + { + "epoch": 0.013375, + "grad_norm": 3.405712842941284, + "grad_norm_var": 13.73775124044489, + "learning_rate": 0.0001, + "loss": 0.6518, + "loss/crossentropy": 2.521803855895996, + "loss/hidden": 0.3359375, + "loss/logits": 0.053909383714199066, + "loss/reg": 0.02619684301316738, + "step": 107 + }, + { + "epoch": 0.0135, + "grad_norm": 3.615098237991333, + "grad_norm_var": 13.899167673014775, + "learning_rate": 0.0001, + "loss": 0.7068, + "loss/crossentropy": 2.510159969329834, + "loss/hidden": 0.37890625, + "loss/logits": 0.06601101160049438, + "loss/reg": 0.02619197592139244, + "step": 108 + }, + { + "epoch": 0.013625, + "grad_norm": 4.2520599365234375, + "grad_norm_var": 13.834356012442765, + "learning_rate": 0.0001, + "loss": 0.735, + "loss/crossentropy": 2.508683681488037, + "loss/hidden": 0.41015625, + "loss/logits": 0.0629870742559433, + "loss/reg": 0.0261868704110384, + "step": 109 + }, + { + "epoch": 0.01375, + "grad_norm": 3.215749979019165, + "grad_norm_var": 13.887973421518442, + "learning_rate": 0.0001, + "loss": 0.8268, + "loss/crossentropy": 2.3564252853393555, + "loss/hidden": 0.48046875, + "loss/logits": 0.08449074625968933, + "loss/reg": 0.026181429624557495, + "step": 110 + }, + { + "epoch": 0.013875, + "grad_norm": 4.598328590393066, + "grad_norm_var": 13.615373346458785, + "learning_rate": 0.0001, + "loss": 0.7398, + "loss/crossentropy": 2.366943359375, + "loss/hidden": 0.41796875, + "loss/logits": 0.060083672404289246, + "loss/reg": 0.026176555082201958, + "step": 111 + }, + { + "epoch": 0.014, + "grad_norm": 2.758070707321167, + "grad_norm_var": 13.912012390229316, + "learning_rate": 0.0001, + "loss": 0.6839, + "loss/crossentropy": 2.3351521492004395, + "loss/hidden": 0.365234375, + "loss/logits": 0.056987129151821136, + "loss/reg": 0.0261719711124897, + "step": 112 + }, + { + "epoch": 0.014125, + "grad_norm": 2.9389584064483643, + "grad_norm_var": 13.147693721903025, + "learning_rate": 0.0001, + "loss": 0.8964, + "loss/crossentropy": 2.188626766204834, + "loss/hidden": 0.54296875, + "loss/logits": 0.09171397984027863, + "loss/reg": 0.026167072355747223, + "step": 113 + }, + { + "epoch": 0.01425, + "grad_norm": 2.8545026779174805, + "grad_norm_var": 13.338918720074266, + "learning_rate": 0.0001, + "loss": 0.6652, + "loss/crossentropy": 2.488462448120117, + "loss/hidden": 0.341796875, + "loss/logits": 0.06177069991827011, + "loss/reg": 0.026162149384617805, + "step": 114 + }, + { + "epoch": 0.014375, + "grad_norm": 3.343590497970581, + "grad_norm_var": 13.447848849906688, + "learning_rate": 0.0001, + "loss": 0.7317, + "loss/crossentropy": 2.4826672077178955, + "loss/hidden": 0.396484375, + "loss/logits": 0.07369040697813034, + "loss/reg": 0.026156950742006302, + "step": 115 + }, + { + "epoch": 0.0145, + "grad_norm": 5.309541702270508, + "grad_norm_var": 13.435010363164546, + "learning_rate": 0.0001, + "loss": 0.6918, + "loss/crossentropy": 2.715517282485962, + "loss/hidden": 0.376953125, + "loss/logits": 0.05337735265493393, + "loss/reg": 0.026151426136493683, + "step": 116 + }, + { + "epoch": 0.014625, + "grad_norm": 3.413027763366699, + "grad_norm_var": 13.488672790501717, + "learning_rate": 0.0001, + "loss": 0.7942, + "loss/crossentropy": 2.3932089805603027, + "loss/hidden": 0.451171875, + "loss/logits": 0.08154396712779999, + "loss/reg": 0.026146216318011284, + "step": 117 + }, + { + "epoch": 0.01475, + "grad_norm": 2.735275983810425, + "grad_norm_var": 13.676075950246783, + "learning_rate": 0.0001, + "loss": 0.7606, + "loss/crossentropy": 2.2933082580566406, + "loss/hidden": 0.4296875, + "loss/logits": 0.06949938833713531, + "loss/reg": 0.02614082768559456, + "step": 118 + }, + { + "epoch": 0.014875, + "grad_norm": 3.1346964836120605, + "grad_norm_var": 0.5056645529599865, + "learning_rate": 0.0001, + "loss": 0.7759, + "loss/crossentropy": 2.331713914871216, + "loss/hidden": 0.4375, + "loss/logits": 0.07704727351665497, + "loss/reg": 0.026135168969631195, + "step": 119 + }, + { + "epoch": 0.015, + "grad_norm": 3.8077635765075684, + "grad_norm_var": 0.5104468723684629, + "learning_rate": 0.0001, + "loss": 0.6882, + "loss/crossentropy": 2.331220865249634, + "loss/hidden": 0.373046875, + "loss/logits": 0.05386776477098465, + "loss/reg": 0.02612963318824768, + "step": 120 + }, + { + "epoch": 0.015125, + "grad_norm": 3.209914445877075, + "grad_norm_var": 0.5058893418769751, + "learning_rate": 0.0001, + "loss": 0.7536, + "loss/crossentropy": 2.352771759033203, + "loss/hidden": 0.4140625, + "loss/logits": 0.07825946807861328, + "loss/reg": 0.026124266907572746, + "step": 121 + }, + { + "epoch": 0.01525, + "grad_norm": 3.3548500537872314, + "grad_norm_var": 0.49208198737674874, + "learning_rate": 0.0001, + "loss": 0.7088, + "loss/crossentropy": 2.483644723892212, + "loss/hidden": 0.384765625, + "loss/logits": 0.06287863850593567, + "loss/reg": 0.0261182002723217, + "step": 122 + }, + { + "epoch": 0.015375, + "grad_norm": 3.9953765869140625, + "grad_norm_var": 0.5066601541023895, + "learning_rate": 0.0001, + "loss": 0.7792, + "loss/crossentropy": 2.6117637157440186, + "loss/hidden": 0.435546875, + "loss/logits": 0.08250629901885986, + "loss/reg": 0.026112213730812073, + "step": 123 + }, + { + "epoch": 0.0155, + "grad_norm": 3.1783852577209473, + "grad_norm_var": 0.5138316405800327, + "learning_rate": 0.0001, + "loss": 0.7559, + "loss/crossentropy": 2.401679754257202, + "loss/hidden": 0.423828125, + "loss/logits": 0.07104581594467163, + "loss/reg": 0.02610679157078266, + "step": 124 + }, + { + "epoch": 0.015625, + "grad_norm": 3.2759885787963867, + "grad_norm_var": 0.47631527116517774, + "learning_rate": 0.0001, + "loss": 0.8262, + "loss/crossentropy": 2.3979361057281494, + "loss/hidden": 0.486328125, + "loss/logits": 0.0788530558347702, + "loss/reg": 0.026101654395461082, + "step": 125 + }, + { + "epoch": 0.01575, + "grad_norm": 4.0768632888793945, + "grad_norm_var": 0.4963098069624029, + "learning_rate": 0.0001, + "loss": 0.6983, + "loss/crossentropy": 2.5287113189697266, + "loss/hidden": 0.375, + "loss/logits": 0.062308911234140396, + "loss/reg": 0.026096193119883537, + "step": 126 + }, + { + "epoch": 0.015875, + "grad_norm": 4.300101280212402, + "grad_norm_var": 0.45815803943682926, + "learning_rate": 0.0001, + "loss": 0.858, + "loss/crossentropy": 2.255234956741333, + "loss/hidden": 0.50390625, + "loss/logits": 0.0932290330529213, + "loss/reg": 0.026090849190950394, + "step": 127 + }, + { + "epoch": 0.016, + "grad_norm": 3.303663492202759, + "grad_norm_var": 0.42421384752866137, + "learning_rate": 0.0001, + "loss": 0.7284, + "loss/crossentropy": 2.528862476348877, + "loss/hidden": 0.40234375, + "loss/logits": 0.06523742526769638, + "loss/reg": 0.026085302233695984, + "step": 128 + }, + { + "epoch": 0.016125, + "grad_norm": 6.868241310119629, + "grad_norm_var": 1.0876227157335427, + "learning_rate": 0.0001, + "loss": 0.8999, + "loss/crossentropy": 2.6554996967315674, + "loss/hidden": 0.5390625, + "loss/logits": 0.1000661626458168, + "loss/reg": 0.02607985958456993, + "step": 129 + }, + { + "epoch": 0.01625, + "grad_norm": 3.3035075664520264, + "grad_norm_var": 1.046006684658701, + "learning_rate": 0.0001, + "loss": 0.7367, + "loss/crossentropy": 2.293642282485962, + "loss/hidden": 0.40625, + "loss/logits": 0.06973426043987274, + "loss/reg": 0.026074659079313278, + "step": 130 + }, + { + "epoch": 0.016375, + "grad_norm": 4.2563276290893555, + "grad_norm_var": 1.0439696727840135, + "learning_rate": 0.0001, + "loss": 0.7754, + "loss/crossentropy": 2.2192564010620117, + "loss/hidden": 0.447265625, + "loss/logits": 0.06743350625038147, + "loss/reg": 0.026069074869155884, + "step": 131 + }, + { + "epoch": 0.0165, + "grad_norm": 4.646778583526611, + "grad_norm_var": 0.9420233457711596, + "learning_rate": 0.0001, + "loss": 0.7496, + "loss/crossentropy": 2.430368423461914, + "loss/hidden": 0.4140625, + "loss/logits": 0.07494455575942993, + "loss/reg": 0.026063458994030952, + "step": 132 + }, + { + "epoch": 0.016625, + "grad_norm": 7.465832233428955, + "grad_norm_var": 1.7574380087301391, + "learning_rate": 0.0001, + "loss": 0.9607, + "loss/crossentropy": 2.6182897090911865, + "loss/hidden": 0.57421875, + "loss/logits": 0.1258610337972641, + "loss/reg": 0.026057813316583633, + "step": 133 + }, + { + "epoch": 0.01675, + "grad_norm": 4.5479936599731445, + "grad_norm_var": 1.6433309350178509, + "learning_rate": 0.0001, + "loss": 0.9529, + "loss/crossentropy": 2.238551378250122, + "loss/hidden": 0.58984375, + "loss/logits": 0.10252824425697327, + "loss/reg": 0.02605200558900833, + "step": 134 + }, + { + "epoch": 0.016875, + "grad_norm": 3.4055774211883545, + "grad_norm_var": 1.6105102483452751, + "learning_rate": 0.0001, + "loss": 0.8407, + "loss/crossentropy": 2.4216599464416504, + "loss/hidden": 0.498046875, + "loss/logits": 0.08216647803783417, + "loss/reg": 0.026046328246593475, + "step": 135 + }, + { + "epoch": 0.017, + "grad_norm": 8.839641571044922, + "grad_norm_var": 2.938344740358995, + "learning_rate": 0.0001, + "loss": 1.1023, + "loss/crossentropy": 2.534442901611328, + "loss/hidden": 0.7109375, + "loss/logits": 0.1310025304555893, + "loss/reg": 0.026040658354759216, + "step": 136 + }, + { + "epoch": 0.017125, + "grad_norm": 4.6421589851379395, + "grad_norm_var": 2.819843479450094, + "learning_rate": 0.0001, + "loss": 0.7418, + "loss/crossentropy": 2.605559825897217, + "loss/hidden": 0.416015625, + "loss/logits": 0.06548085808753967, + "loss/reg": 0.02603481523692608, + "step": 137 + }, + { + "epoch": 0.01725, + "grad_norm": 3.1547701358795166, + "grad_norm_var": 2.8553314644504555, + "learning_rate": 0.0001, + "loss": 0.731, + "loss/crossentropy": 2.5905325412750244, + "loss/hidden": 0.40234375, + "loss/logits": 0.06835847347974777, + "loss/reg": 0.02602926455438137, + "step": 138 + }, + { + "epoch": 0.017375, + "grad_norm": 4.074351787567139, + "grad_norm_var": 2.849577549707145, + "learning_rate": 0.0001, + "loss": 0.9582, + "loss/crossentropy": 2.1483733654022217, + "loss/hidden": 0.60546875, + "loss/logits": 0.0924658477306366, + "loss/reg": 0.026023706421256065, + "step": 139 + }, + { + "epoch": 0.0175, + "grad_norm": 3.758636713027954, + "grad_norm_var": 2.7618912420839155, + "learning_rate": 0.0001, + "loss": 0.9996, + "loss/crossentropy": 2.335742473602295, + "loss/hidden": 0.6328125, + "loss/logits": 0.10663188993930817, + "loss/reg": 0.026017924770712852, + "step": 140 + }, + { + "epoch": 0.017625, + "grad_norm": 4.186927795410156, + "grad_norm_var": 2.6505093919275544, + "learning_rate": 0.0001, + "loss": 0.8414, + "loss/crossentropy": 2.281843423843384, + "loss/hidden": 0.498046875, + "loss/logits": 0.08321215212345123, + "loss/reg": 0.026011699810624123, + "step": 141 + }, + { + "epoch": 0.01775, + "grad_norm": 3.1666276454925537, + "grad_norm_var": 2.775123140671519, + "learning_rate": 0.0001, + "loss": 0.7768, + "loss/crossentropy": 2.4267935752868652, + "loss/hidden": 0.44140625, + "loss/logits": 0.07538889348506927, + "loss/reg": 0.02600528486073017, + "step": 142 + }, + { + "epoch": 0.017875, + "grad_norm": 6.386529445648193, + "grad_norm_var": 2.9581845034072245, + "learning_rate": 0.0001, + "loss": 0.8401, + "loss/crossentropy": 2.7848174571990967, + "loss/hidden": 0.50390625, + "loss/logits": 0.07621172070503235, + "loss/reg": 0.025999369099736214, + "step": 143 + }, + { + "epoch": 0.018, + "grad_norm": 3.8083512783050537, + "grad_norm_var": 2.876745593692829, + "learning_rate": 0.0001, + "loss": 0.9046, + "loss/crossentropy": 2.341048002243042, + "loss/hidden": 0.56640625, + "loss/logits": 0.0782276839017868, + "loss/reg": 0.0259928647428751, + "step": 144 + }, + { + "epoch": 0.018125, + "grad_norm": 4.083465576171875, + "grad_norm_var": 2.5868089188751657, + "learning_rate": 0.0001, + "loss": 0.9449, + "loss/crossentropy": 2.3942995071411133, + "loss/hidden": 0.59375, + "loss/logits": 0.09130540490150452, + "loss/reg": 0.025986921042203903, + "step": 145 + }, + { + "epoch": 0.01825, + "grad_norm": 3.654815673828125, + "grad_norm_var": 2.533420197907486, + "learning_rate": 0.0001, + "loss": 0.9459, + "loss/crossentropy": 2.2414467334747314, + "loss/hidden": 0.5859375, + "loss/logits": 0.10016702860593796, + "loss/reg": 0.02598092146217823, + "step": 146 + }, + { + "epoch": 0.018375, + "grad_norm": 5.4976935386657715, + "grad_norm_var": 2.567896035243579, + "learning_rate": 0.0001, + "loss": 0.9079, + "loss/crossentropy": 2.1185100078582764, + "loss/hidden": 0.58203125, + "loss/logits": 0.06617112457752228, + "loss/reg": 0.025974513962864876, + "step": 147 + }, + { + "epoch": 0.0185, + "grad_norm": 3.4107933044433594, + "grad_norm_var": 2.673383097164577, + "learning_rate": 0.0001, + "loss": 0.7982, + "loss/crossentropy": 2.417313575744629, + "loss/hidden": 0.462890625, + "loss/logits": 0.07567355036735535, + "loss/reg": 0.025968506932258606, + "step": 148 + }, + { + "epoch": 0.018625, + "grad_norm": 3.589749574661255, + "grad_norm_var": 2.1469293827286418, + "learning_rate": 0.0001, + "loss": 0.8088, + "loss/crossentropy": 2.2554194927215576, + "loss/hidden": 0.478515625, + "loss/logits": 0.07071521133184433, + "loss/reg": 0.025961775332689285, + "step": 149 + }, + { + "epoch": 0.01875, + "grad_norm": 4.003805160522461, + "grad_norm_var": 2.1538296896943887, + "learning_rate": 0.0001, + "loss": 0.7856, + "loss/crossentropy": 2.7128918170928955, + "loss/hidden": 0.453125, + "loss/logits": 0.07290500402450562, + "loss/reg": 0.025955306366086006, + "step": 150 + }, + { + "epoch": 0.018875, + "grad_norm": 3.536449432373047, + "grad_norm_var": 2.1383506752067736, + "learning_rate": 0.0001, + "loss": 0.7436, + "loss/crossentropy": 2.509995460510254, + "loss/hidden": 0.421875, + "loss/logits": 0.062191903591156006, + "loss/reg": 0.02594931609928608, + "step": 151 + }, + { + "epoch": 0.019, + "grad_norm": 3.434654951095581, + "grad_norm_var": 0.7374638182579057, + "learning_rate": 0.0001, + "loss": 0.9118, + "loss/crossentropy": 2.3447437286376953, + "loss/hidden": 0.55859375, + "loss/logits": 0.0937500149011612, + "loss/reg": 0.025943227112293243, + "step": 152 + }, + { + "epoch": 0.019125, + "grad_norm": 8.066261291503906, + "grad_norm_var": 1.7522972641868202, + "learning_rate": 0.0001, + "loss": 0.8648, + "loss/crossentropy": 2.4481232166290283, + "loss/hidden": 0.52734375, + "loss/logits": 0.07811163365840912, + "loss/reg": 0.025936946272850037, + "step": 153 + }, + { + "epoch": 0.01925, + "grad_norm": 3.2214348316192627, + "grad_norm_var": 1.7429433318934864, + "learning_rate": 0.0001, + "loss": 0.7673, + "loss/crossentropy": 2.3142549991607666, + "loss/hidden": 0.43359375, + "loss/logits": 0.07435894012451172, + "loss/reg": 0.02593095973134041, + "step": 154 + }, + { + "epoch": 0.019375, + "grad_norm": 2.854038715362549, + "grad_norm_var": 1.8633807825236384, + "learning_rate": 0.0001, + "loss": 0.7601, + "loss/crossentropy": 2.7022647857666016, + "loss/hidden": 0.43359375, + "loss/logits": 0.06724615395069122, + "loss/reg": 0.025924943387508392, + "step": 155 + }, + { + "epoch": 0.0195, + "grad_norm": 3.1763484477996826, + "grad_norm_var": 1.9162196068123232, + "learning_rate": 0.0001, + "loss": 0.9, + "loss/crossentropy": 2.574676036834717, + "loss/hidden": 0.54296875, + "loss/logits": 0.09785018861293793, + "loss/reg": 0.025919148698449135, + "step": 156 + }, + { + "epoch": 0.019625, + "grad_norm": 3.999523162841797, + "grad_norm_var": 1.9169889601133074, + "learning_rate": 0.0001, + "loss": 0.858, + "loss/crossentropy": 2.480532646179199, + "loss/hidden": 0.515625, + "loss/logits": 0.08323468267917633, + "loss/reg": 0.025913061574101448, + "step": 157 + }, + { + "epoch": 0.01975, + "grad_norm": 2.9662230014801025, + "grad_norm_var": 1.944924590139983, + "learning_rate": 0.0001, + "loss": 0.818, + "loss/crossentropy": 2.459967613220215, + "loss/hidden": 0.48046875, + "loss/logits": 0.07847169041633606, + "loss/reg": 0.025906959548592567, + "step": 158 + }, + { + "epoch": 0.019875, + "grad_norm": 7.074191093444824, + "grad_norm_var": 2.1836107796529065, + "learning_rate": 0.0001, + "loss": 0.9801, + "loss/crossentropy": 2.2858352661132812, + "loss/hidden": 0.6484375, + "loss/logits": 0.07268328964710236, + "loss/reg": 0.025901462882757187, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 3.6333370208740234, + "grad_norm_var": 2.193465227977892, + "learning_rate": 0.0001, + "loss": 0.9048, + "loss/crossentropy": 2.4626388549804688, + "loss/hidden": 0.5546875, + "loss/logits": 0.09115847945213318, + "loss/reg": 0.02589540183544159, + "step": 160 + }, + { + "epoch": 0.020125, + "grad_norm": 4.283749103546143, + "grad_norm_var": 2.1945247126451437, + "learning_rate": 0.0001, + "loss": 0.9251, + "loss/crossentropy": 2.5746121406555176, + "loss/hidden": 0.58203125, + "loss/logits": 0.0841851755976677, + "loss/reg": 0.025888830423355103, + "step": 161 + }, + { + "epoch": 0.02025, + "grad_norm": 3.628138542175293, + "grad_norm_var": 2.196331220420876, + "learning_rate": 0.0001, + "loss": 0.9408, + "loss/crossentropy": 2.398010730743408, + "loss/hidden": 0.58203125, + "loss/logits": 0.09991887211799622, + "loss/reg": 0.025882074609398842, + "step": 162 + }, + { + "epoch": 0.020375, + "grad_norm": 3.5412559509277344, + "grad_norm_var": 2.0836172065033507, + "learning_rate": 0.0001, + "loss": 0.857, + "loss/crossentropy": 2.5064220428466797, + "loss/hidden": 0.50390625, + "loss/logits": 0.0943569540977478, + "loss/reg": 0.025876009836792946, + "step": 163 + }, + { + "epoch": 0.0205, + "grad_norm": 3.498668670654297, + "grad_norm_var": 2.0768887394910185, + "learning_rate": 0.0001, + "loss": 0.8917, + "loss/crossentropy": 2.535529375076294, + "loss/hidden": 0.52734375, + "loss/logits": 0.10565976053476334, + "loss/reg": 0.02587023191154003, + "step": 164 + }, + { + "epoch": 0.020625, + "grad_norm": 3.14025616645813, + "grad_norm_var": 2.116006039378421, + "learning_rate": 0.0001, + "loss": 0.7816, + "loss/crossentropy": 2.7250640392303467, + "loss/hidden": 0.453125, + "loss/logits": 0.06980661302804947, + "loss/reg": 0.025863803923130035, + "step": 165 + }, + { + "epoch": 0.02075, + "grad_norm": 2.6821706295013428, + "grad_norm_var": 2.2251478520018773, + "learning_rate": 0.0001, + "loss": 0.7558, + "loss/crossentropy": 2.230567693710327, + "loss/hidden": 0.43359375, + "loss/logits": 0.06358660757541656, + "loss/reg": 0.02585742622613907, + "step": 166 + }, + { + "epoch": 0.020875, + "grad_norm": 3.8048856258392334, + "grad_norm_var": 2.2158862694911607, + "learning_rate": 0.0001, + "loss": 0.7965, + "loss/crossentropy": 2.69014310836792, + "loss/hidden": 0.453125, + "loss/logits": 0.08488957583904266, + "loss/reg": 0.025850988924503326, + "step": 167 + }, + { + "epoch": 0.021, + "grad_norm": 4.305826187133789, + "grad_norm_var": 2.2048741298976515, + "learning_rate": 0.0001, + "loss": 1.1009, + "loss/crossentropy": 2.2925851345062256, + "loss/hidden": 0.73828125, + "loss/logits": 0.10412566363811493, + "loss/reg": 0.025845136493444443, + "step": 168 + }, + { + "epoch": 0.021125, + "grad_norm": 4.051706790924072, + "grad_norm_var": 1.0314628897999674, + "learning_rate": 0.0001, + "loss": 0.7756, + "loss/crossentropy": 2.4515578746795654, + "loss/hidden": 0.4453125, + "loss/logits": 0.07186460494995117, + "loss/reg": 0.02583896555006504, + "step": 169 + }, + { + "epoch": 0.02125, + "grad_norm": 3.3800575733184814, + "grad_norm_var": 1.022039210437893, + "learning_rate": 0.0001, + "loss": 0.798, + "loss/crossentropy": 2.3387436866760254, + "loss/hidden": 0.455078125, + "loss/logits": 0.08458675444126129, + "loss/reg": 0.025832952931523323, + "step": 170 + }, + { + "epoch": 0.021375, + "grad_norm": 3.069735527038574, + "grad_norm_var": 0.9991429378891439, + "learning_rate": 0.0001, + "loss": 0.7416, + "loss/crossentropy": 2.660727024078369, + "loss/hidden": 0.412109375, + "loss/logits": 0.07125158607959747, + "loss/reg": 0.025827286764979362, + "step": 171 + }, + { + "epoch": 0.0215, + "grad_norm": 3.5827748775482178, + "grad_norm_var": 0.9775809993657352, + "learning_rate": 0.0001, + "loss": 0.9301, + "loss/crossentropy": 2.160230875015259, + "loss/hidden": 0.5703125, + "loss/logits": 0.10158500075340271, + "loss/reg": 0.025821613147854805, + "step": 172 + }, + { + "epoch": 0.021625, + "grad_norm": 3.0348143577575684, + "grad_norm_var": 1.008817027257092, + "learning_rate": 0.0001, + "loss": 0.9059, + "loss/crossentropy": 2.5519795417785645, + "loss/hidden": 0.5390625, + "loss/logits": 0.10867513716220856, + "loss/reg": 0.025815250352025032, + "step": 173 + }, + { + "epoch": 0.02175, + "grad_norm": 3.325514316558838, + "grad_norm_var": 0.980302655794393, + "learning_rate": 0.0001, + "loss": 0.7651, + "loss/crossentropy": 2.3060855865478516, + "loss/hidden": 0.43359375, + "loss/logits": 0.0733788013458252, + "loss/reg": 0.02580902725458145, + "step": 174 + }, + { + "epoch": 0.021875, + "grad_norm": 2.9402523040771484, + "grad_norm_var": 0.21740374576502078, + "learning_rate": 0.0001, + "loss": 0.859, + "loss/crossentropy": 2.4109151363372803, + "loss/hidden": 0.515625, + "loss/logits": 0.0853327289223671, + "loss/reg": 0.025802936404943466, + "step": 175 + }, + { + "epoch": 0.022, + "grad_norm": 4.970669746398926, + "grad_norm_var": 0.354037293260262, + "learning_rate": 0.0001, + "loss": 0.9835, + "loss/crossentropy": 2.9366648197174072, + "loss/hidden": 0.6171875, + "loss/logits": 0.10840301960706711, + "loss/reg": 0.02579565905034542, + "step": 176 + }, + { + "epoch": 0.022125, + "grad_norm": 4.438536643981934, + "grad_norm_var": 0.37010993593279384, + "learning_rate": 0.0001, + "loss": 0.8574, + "loss/crossentropy": 2.4177615642547607, + "loss/hidden": 0.51171875, + "loss/logits": 0.08775197714567184, + "loss/reg": 0.025788920000195503, + "step": 177 + }, + { + "epoch": 0.02225, + "grad_norm": 4.3905863761901855, + "grad_norm_var": 0.41060424896311526, + "learning_rate": 0.0001, + "loss": 0.8796, + "loss/crossentropy": 2.320418119430542, + "loss/hidden": 0.53125, + "loss/logits": 0.09051363915205002, + "loss/reg": 0.02578234300017357, + "step": 178 + }, + { + "epoch": 0.022375, + "grad_norm": 3.2044453620910645, + "grad_norm_var": 0.42189777730298467, + "learning_rate": 0.0001, + "loss": 0.785, + "loss/crossentropy": 2.327108383178711, + "loss/hidden": 0.447265625, + "loss/logits": 0.08003242313861847, + "loss/reg": 0.025775177404284477, + "step": 179 + }, + { + "epoch": 0.0225, + "grad_norm": 3.2016260623931885, + "grad_norm_var": 0.4319725268587102, + "learning_rate": 0.0001, + "loss": 0.8076, + "loss/crossentropy": 2.6913022994995117, + "loss/hidden": 0.466796875, + "loss/logits": 0.08316424489021301, + "loss/reg": 0.02576799876987934, + "step": 180 + }, + { + "epoch": 0.022625, + "grad_norm": 3.216141939163208, + "grad_norm_var": 0.42772885748243633, + "learning_rate": 0.0001, + "loss": 0.7782, + "loss/crossentropy": 2.4444668292999268, + "loss/hidden": 0.447265625, + "loss/logits": 0.07332297414541245, + "loss/reg": 0.025760415941476822, + "step": 181 + }, + { + "epoch": 0.02275, + "grad_norm": 3.6005637645721436, + "grad_norm_var": 0.3680557604441513, + "learning_rate": 0.0001, + "loss": 0.8578, + "loss/crossentropy": 2.2084226608276367, + "loss/hidden": 0.51171875, + "loss/logits": 0.0885147675871849, + "loss/reg": 0.025753989815711975, + "step": 182 + }, + { + "epoch": 0.022875, + "grad_norm": 4.19577693939209, + "grad_norm_var": 0.3852931468556484, + "learning_rate": 0.0001, + "loss": 0.9394, + "loss/crossentropy": 2.557783842086792, + "loss/hidden": 0.56640625, + "loss/logits": 0.11552520841360092, + "loss/reg": 0.025747526437044144, + "step": 183 + }, + { + "epoch": 0.023, + "grad_norm": 3.024552822113037, + "grad_norm_var": 0.38129301153876227, + "learning_rate": 0.0001, + "loss": 0.8858, + "loss/crossentropy": 2.994615316390991, + "loss/hidden": 0.52734375, + "loss/logits": 0.10108112543821335, + "loss/reg": 0.02574075385928154, + "step": 184 + }, + { + "epoch": 0.023125, + "grad_norm": 3.3255319595336914, + "grad_norm_var": 0.3706833429951111, + "learning_rate": 0.0001, + "loss": 0.8407, + "loss/crossentropy": 2.677245855331421, + "loss/hidden": 0.4765625, + "loss/logits": 0.10678394883871078, + "loss/reg": 0.02573317475616932, + "step": 185 + }, + { + "epoch": 0.02325, + "grad_norm": 3.341599464416504, + "grad_norm_var": 0.3716797590150757, + "learning_rate": 0.0001, + "loss": 0.9127, + "loss/crossentropy": 2.156444549560547, + "loss/hidden": 0.56640625, + "loss/logits": 0.0890708938241005, + "loss/reg": 0.02572541870176792, + "step": 186 + }, + { + "epoch": 0.023375, + "grad_norm": 2.925915241241455, + "grad_norm_var": 0.3822577484351124, + "learning_rate": 0.0001, + "loss": 0.8815, + "loss/crossentropy": 2.2716755867004395, + "loss/hidden": 0.53125, + "loss/logits": 0.09304732084274292, + "loss/reg": 0.02571748197078705, + "step": 187 + }, + { + "epoch": 0.0235, + "grad_norm": 4.192226886749268, + "grad_norm_var": 0.40854537365233884, + "learning_rate": 0.0001, + "loss": 0.8547, + "loss/crossentropy": 2.378901720046997, + "loss/hidden": 0.5078125, + "loss/logits": 0.08977752178907394, + "loss/reg": 0.025709524750709534, + "step": 188 + }, + { + "epoch": 0.023625, + "grad_norm": 5.648179054260254, + "grad_norm_var": 0.6443691048121629, + "learning_rate": 0.0001, + "loss": 1.0032, + "loss/crossentropy": 2.4307687282562256, + "loss/hidden": 0.63671875, + "loss/logits": 0.10948194563388824, + "loss/reg": 0.025701580569148064, + "step": 189 + }, + { + "epoch": 0.02375, + "grad_norm": 6.345841884613037, + "grad_norm_var": 1.045029826307897, + "learning_rate": 0.0001, + "loss": 1.0995, + "loss/crossentropy": 2.279203414916992, + "loss/hidden": 0.74609375, + "loss/logits": 0.09642117470502853, + "loss/reg": 0.025694590061903, + "step": 190 + }, + { + "epoch": 0.023875, + "grad_norm": 4.242865085601807, + "grad_norm_var": 0.9782837984014707, + "learning_rate": 0.0001, + "loss": 1.155, + "loss/crossentropy": 2.235325574874878, + "loss/hidden": 0.7890625, + "loss/logits": 0.10906486213207245, + "loss/reg": 0.025686509907245636, + "step": 191 + }, + { + "epoch": 0.024, + "grad_norm": 6.347895622253418, + "grad_norm_var": 1.272032888242258, + "learning_rate": 0.0001, + "loss": 1.0789, + "loss/crossentropy": 2.5386898517608643, + "loss/hidden": 0.67578125, + "loss/logits": 0.1463102549314499, + "loss/reg": 0.025679145008325577, + "step": 192 + }, + { + "epoch": 0.024125, + "grad_norm": 3.077846050262451, + "grad_norm_var": 1.3268106432816444, + "learning_rate": 0.0001, + "loss": 0.855, + "loss/crossentropy": 2.5889694690704346, + "loss/hidden": 0.515625, + "loss/logits": 0.08266487717628479, + "loss/reg": 0.025671878829598427, + "step": 193 + }, + { + "epoch": 0.02425, + "grad_norm": 3.672849416732788, + "grad_norm_var": 1.323313109234428, + "learning_rate": 0.0001, + "loss": 0.8213, + "loss/crossentropy": 2.269009590148926, + "loss/hidden": 0.4921875, + "loss/logits": 0.07244250178337097, + "loss/reg": 0.02566472254693508, + "step": 194 + }, + { + "epoch": 0.024375, + "grad_norm": 3.13712215423584, + "grad_norm_var": 1.330492936258482, + "learning_rate": 0.0001, + "loss": 0.9044, + "loss/crossentropy": 2.3420536518096924, + "loss/hidden": 0.5390625, + "loss/logits": 0.10874692350625992, + "loss/reg": 0.02565707452595234, + "step": 195 + }, + { + "epoch": 0.0245, + "grad_norm": 5.941372871398926, + "grad_norm_var": 1.5194802994125645, + "learning_rate": 0.0001, + "loss": 1.0268, + "loss/crossentropy": 2.245668649673462, + "loss/hidden": 0.63671875, + "loss/logits": 0.13362175226211548, + "loss/reg": 0.02564912661910057, + "step": 196 + }, + { + "epoch": 0.024625, + "grad_norm": 2.8778631687164307, + "grad_norm_var": 1.5682913914576866, + "learning_rate": 0.0001, + "loss": 0.933, + "loss/crossentropy": 2.4744086265563965, + "loss/hidden": 0.57421875, + "loss/logits": 0.10236240178346634, + "loss/reg": 0.025642510503530502, + "step": 197 + }, + { + "epoch": 0.02475, + "grad_norm": 5.235295295715332, + "grad_norm_var": 1.6223942527523605, + "learning_rate": 0.0001, + "loss": 0.993, + "loss/crossentropy": 2.3120462894439697, + "loss/hidden": 0.61328125, + "loss/logits": 0.12334179133176804, + "loss/reg": 0.025634463876485825, + "step": 198 + }, + { + "epoch": 0.024875, + "grad_norm": 3.2772397994995117, + "grad_norm_var": 1.6781902664948347, + "learning_rate": 0.0001, + "loss": 0.9134, + "loss/crossentropy": 2.2896904945373535, + "loss/hidden": 0.5625, + "loss/logits": 0.0946369469165802, + "loss/reg": 0.025627706199884415, + "step": 199 + }, + { + "epoch": 0.025, + "grad_norm": 4.021130084991455, + "grad_norm_var": 1.5889382838258257, + "learning_rate": 0.0001, + "loss": 0.9467, + "loss/crossentropy": 2.3281702995300293, + "loss/hidden": 0.5859375, + "loss/logits": 0.10459813475608826, + "loss/reg": 0.025620225816965103, + "step": 200 + }, + { + "epoch": 0.025125, + "grad_norm": 3.3705508708953857, + "grad_norm_var": 1.5836618344967157, + "learning_rate": 0.0001, + "loss": 1.0247, + "loss/crossentropy": 2.3704278469085693, + "loss/hidden": 0.640625, + "loss/logits": 0.12795141339302063, + "loss/reg": 0.025613589212298393, + "step": 201 + }, + { + "epoch": 0.02525, + "grad_norm": 5.586423397064209, + "grad_norm_var": 1.6331597901730046, + "learning_rate": 0.0001, + "loss": 1.2703, + "loss/crossentropy": 2.3346736431121826, + "loss/hidden": 0.83984375, + "loss/logits": 0.17438159883022308, + "loss/reg": 0.025606893002986908, + "step": 202 + }, + { + "epoch": 0.025375, + "grad_norm": 6.558523654937744, + "grad_norm_var": 1.7590475344041898, + "learning_rate": 0.0001, + "loss": 1.0711, + "loss/crossentropy": 2.264883518218994, + "loss/hidden": 0.71484375, + "loss/logits": 0.10028564184904099, + "loss/reg": 0.025599893182516098, + "step": 203 + }, + { + "epoch": 0.0255, + "grad_norm": 3.024080991744995, + "grad_norm_var": 1.9071946132324447, + "learning_rate": 0.0001, + "loss": 0.9349, + "loss/crossentropy": 2.505457878112793, + "loss/hidden": 0.58203125, + "loss/logits": 0.09690214693546295, + "loss/reg": 0.025592036545276642, + "step": 204 + }, + { + "epoch": 0.025625, + "grad_norm": 3.268216133117676, + "grad_norm_var": 1.9040994009139534, + "learning_rate": 0.0001, + "loss": 0.8433, + "loss/crossentropy": 2.629786968231201, + "loss/hidden": 0.5, + "loss/logits": 0.08743932843208313, + "loss/reg": 0.025583887472748756, + "step": 205 + }, + { + "epoch": 0.02575, + "grad_norm": 5.203437328338623, + "grad_norm_var": 1.6853258867355625, + "learning_rate": 0.0001, + "loss": 0.9278, + "loss/crossentropy": 2.368603229522705, + "loss/hidden": 0.5625, + "loss/logits": 0.10955986380577087, + "loss/reg": 0.025574835017323494, + "step": 206 + }, + { + "epoch": 0.025875, + "grad_norm": 5.106112480163574, + "grad_norm_var": 1.725017173963382, + "learning_rate": 0.0001, + "loss": 0.9881, + "loss/crossentropy": 2.554746389389038, + "loss/hidden": 0.6171875, + "loss/logits": 0.11524944007396698, + "loss/reg": 0.02556804195046425, + "step": 207 + }, + { + "epoch": 0.026, + "grad_norm": 8.258187294006348, + "grad_norm_var": 2.4602814049521387, + "learning_rate": 0.0001, + "loss": 1.3675, + "loss/crossentropy": 2.4391205310821533, + "loss/hidden": 0.94140625, + "loss/logits": 0.17047560214996338, + "loss/reg": 0.025560656562447548, + "step": 208 + }, + { + "epoch": 0.026125, + "grad_norm": 3.8223764896392822, + "grad_norm_var": 2.3561294395388566, + "learning_rate": 0.0001, + "loss": 1.02, + "loss/crossentropy": 2.5300426483154297, + "loss/hidden": 0.62890625, + "loss/logits": 0.13556598126888275, + "loss/reg": 0.025551345199346542, + "step": 209 + }, + { + "epoch": 0.02625, + "grad_norm": 3.6237666606903076, + "grad_norm_var": 2.36184075461094, + "learning_rate": 0.0001, + "loss": 1.0218, + "loss/crossentropy": 2.3368213176727295, + "loss/hidden": 0.6640625, + "loss/logits": 0.10230866074562073, + "loss/reg": 0.025544527918100357, + "step": 210 + }, + { + "epoch": 0.026375, + "grad_norm": 3.1394050121307373, + "grad_norm_var": 2.3614203164344407, + "learning_rate": 0.0001, + "loss": 0.964, + "loss/crossentropy": 2.338050603866577, + "loss/hidden": 0.609375, + "loss/logits": 0.09931059181690216, + "loss/reg": 0.025535617023706436, + "step": 211 + }, + { + "epoch": 0.0265, + "grad_norm": 3.1498186588287354, + "grad_norm_var": 2.319283484830568, + "learning_rate": 0.0001, + "loss": 0.8039, + "loss/crossentropy": 2.5108940601348877, + "loss/hidden": 0.46875, + "loss/logits": 0.07982419431209564, + "loss/reg": 0.025528721511363983, + "step": 212 + }, + { + "epoch": 0.026625, + "grad_norm": 3.334510326385498, + "grad_norm_var": 2.2429786468962374, + "learning_rate": 0.0001, + "loss": 1.1105, + "loss/crossentropy": 2.458519697189331, + "loss/hidden": 0.71875, + "loss/logits": 0.1365831047296524, + "loss/reg": 0.02552017569541931, + "step": 213 + }, + { + "epoch": 0.02675, + "grad_norm": 3.3243789672851562, + "grad_norm_var": 2.2516768547287977, + "learning_rate": 0.0001, + "loss": 1.0016, + "loss/crossentropy": 2.2530109882354736, + "loss/hidden": 0.62890625, + "loss/logits": 0.11759582161903381, + "loss/reg": 0.025511734187602997, + "step": 214 + }, + { + "epoch": 0.026875, + "grad_norm": 3.3768937587738037, + "grad_norm_var": 2.2393156807375245, + "learning_rate": 0.0001, + "loss": 1.0452, + "loss/crossentropy": 2.3267643451690674, + "loss/hidden": 0.65625, + "loss/logits": 0.13390058279037476, + "loss/reg": 0.0255054272711277, + "step": 215 + }, + { + "epoch": 0.027, + "grad_norm": 7.391561031341553, + "grad_norm_var": 2.841738119885866, + "learning_rate": 0.0001, + "loss": 1.2263, + "loss/crossentropy": 2.285508394241333, + "loss/hidden": 0.8203125, + "loss/logits": 0.1510239541530609, + "loss/reg": 0.025499247014522552, + "step": 216 + }, + { + "epoch": 0.027125, + "grad_norm": 3.143969774246216, + "grad_norm_var": 2.8781965049841705, + "learning_rate": 0.0001, + "loss": 0.9421, + "loss/crossentropy": 2.6111316680908203, + "loss/hidden": 0.5703125, + "loss/logits": 0.11691074818372726, + "loss/reg": 0.025491848587989807, + "step": 217 + }, + { + "epoch": 0.02725, + "grad_norm": 4.989267826080322, + "grad_norm_var": 2.8105564664802234, + "learning_rate": 0.0001, + "loss": 0.9104, + "loss/crossentropy": 2.7856109142303467, + "loss/hidden": 0.546875, + "loss/logits": 0.10870229452848434, + "loss/reg": 0.025485411286354065, + "step": 218 + }, + { + "epoch": 0.027375, + "grad_norm": 8.867380142211914, + "grad_norm_var": 3.802177537112387, + "learning_rate": 0.0001, + "loss": 1.2976, + "loss/crossentropy": 2.414778470993042, + "loss/hidden": 0.84375, + "loss/logits": 0.199102982878685, + "loss/reg": 0.025478005409240723, + "step": 219 + }, + { + "epoch": 0.0275, + "grad_norm": 3.949193239212036, + "grad_norm_var": 3.665725599495321, + "learning_rate": 0.0001, + "loss": 1.0781, + "loss/crossentropy": 2.3898277282714844, + "loss/hidden": 0.6796875, + "loss/logits": 0.14368270337581635, + "loss/reg": 0.02547168917953968, + "step": 220 + }, + { + "epoch": 0.027625, + "grad_norm": 7.980153560638428, + "grad_norm_var": 4.202985170085262, + "learning_rate": 0.0001, + "loss": 1.4005, + "loss/crossentropy": 2.0598959922790527, + "loss/hidden": 0.9921875, + "loss/logits": 0.15361803770065308, + "loss/reg": 0.02546495571732521, + "step": 221 + }, + { + "epoch": 0.02775, + "grad_norm": 4.118736743927002, + "grad_norm_var": 4.234989890674561, + "learning_rate": 0.0001, + "loss": 0.9084, + "loss/crossentropy": 2.2568750381469727, + "loss/hidden": 0.55859375, + "loss/logits": 0.09525588899850845, + "loss/reg": 0.02545757219195366, + "step": 222 + }, + { + "epoch": 0.027875, + "grad_norm": 3.730299711227417, + "grad_norm_var": 4.306033514824207, + "learning_rate": 0.0001, + "loss": 0.9394, + "loss/crossentropy": 2.7180914878845215, + "loss/hidden": 0.57421875, + "loss/logits": 0.11064038425683975, + "loss/reg": 0.025450890883803368, + "step": 223 + }, + { + "epoch": 0.028, + "grad_norm": 3.138925552368164, + "grad_norm_var": 3.557911666554559, + "learning_rate": 0.0001, + "loss": 0.8406, + "loss/crossentropy": 2.3719072341918945, + "loss/hidden": 0.5078125, + "loss/logits": 0.07834647595882416, + "loss/reg": 0.02544352412223816, + "step": 224 + }, + { + "epoch": 0.028125, + "grad_norm": 3.478994846343994, + "grad_norm_var": 3.593674795871404, + "learning_rate": 0.0001, + "loss": 0.9072, + "loss/crossentropy": 2.4101219177246094, + "loss/hidden": 0.56640625, + "loss/logits": 0.0863800048828125, + "loss/reg": 0.025437019765377045, + "step": 225 + }, + { + "epoch": 0.02825, + "grad_norm": 5.091615676879883, + "grad_norm_var": 3.572291640880083, + "learning_rate": 0.0001, + "loss": 0.9864, + "loss/crossentropy": 2.2258856296539307, + "loss/hidden": 0.640625, + "loss/logits": 0.09143185615539551, + "loss/reg": 0.025430168956518173, + "step": 226 + }, + { + "epoch": 0.028375, + "grad_norm": 3.616190195083618, + "grad_norm_var": 3.4991896025782796, + "learning_rate": 0.0001, + "loss": 0.9246, + "loss/crossentropy": 2.765105724334717, + "loss/hidden": 0.5703125, + "loss/logits": 0.10003923624753952, + "loss/reg": 0.025423482060432434, + "step": 227 + }, + { + "epoch": 0.0285, + "grad_norm": 4.406581878662109, + "grad_norm_var": 3.364516245493509, + "learning_rate": 0.0001, + "loss": 0.98, + "loss/crossentropy": 2.41001033782959, + "loss/hidden": 0.6171875, + "loss/logits": 0.10860306769609451, + "loss/reg": 0.025416266173124313, + "step": 228 + }, + { + "epoch": 0.028625, + "grad_norm": 3.003995418548584, + "grad_norm_var": 3.4280449285692023, + "learning_rate": 0.0001, + "loss": 0.9759, + "loss/crossentropy": 2.518749952316284, + "loss/hidden": 0.62109375, + "loss/logits": 0.10075733810663223, + "loss/reg": 0.025409165769815445, + "step": 229 + }, + { + "epoch": 0.02875, + "grad_norm": 4.073727130889893, + "grad_norm_var": 3.3356380380428576, + "learning_rate": 0.0001, + "loss": 0.8881, + "loss/crossentropy": 2.6824846267700195, + "loss/hidden": 0.5390625, + "loss/logits": 0.09498724341392517, + "loss/reg": 0.025402268394827843, + "step": 230 + }, + { + "epoch": 0.028875, + "grad_norm": 3.8958635330200195, + "grad_norm_var": 3.2645611787952435, + "learning_rate": 0.0001, + "loss": 0.8794, + "loss/crossentropy": 2.684971332550049, + "loss/hidden": 0.53515625, + "loss/logits": 0.09024453163146973, + "loss/reg": 0.025395380333065987, + "step": 231 + }, + { + "epoch": 0.029, + "grad_norm": 3.5619406700134277, + "grad_norm_var": 2.796506014438906, + "learning_rate": 0.0001, + "loss": 0.957, + "loss/crossentropy": 2.7883284091949463, + "loss/hidden": 0.6015625, + "loss/logits": 0.10159540176391602, + "loss/reg": 0.02538810484111309, + "step": 232 + }, + { + "epoch": 0.029125, + "grad_norm": 12.658771514892578, + "grad_norm_var": 6.809983669727001, + "learning_rate": 0.0001, + "loss": 1.1843, + "loss/crossentropy": 2.537827253341675, + "loss/hidden": 0.7890625, + "loss/logits": 0.14144758880138397, + "loss/reg": 0.02538110502064228, + "step": 233 + }, + { + "epoch": 0.02925, + "grad_norm": 8.465475082397461, + "grad_norm_var": 7.543990683504188, + "learning_rate": 0.0001, + "loss": 1.2958, + "loss/crossentropy": 2.6715972423553467, + "loss/hidden": 0.875, + "loss/logits": 0.16701380908489227, + "loss/reg": 0.02537420578300953, + "step": 234 + }, + { + "epoch": 0.029375, + "grad_norm": 6.49767541885376, + "grad_norm_var": 6.75275709893707, + "learning_rate": 0.0001, + "loss": 1.0826, + "loss/crossentropy": 2.316210985183716, + "loss/hidden": 0.7109375, + "loss/logits": 0.11794352531433105, + "loss/reg": 0.025367144495248795, + "step": 235 + }, + { + "epoch": 0.0295, + "grad_norm": 4.668674945831299, + "grad_norm_var": 6.674304000957216, + "learning_rate": 0.0001, + "loss": 1.0642, + "loss/crossentropy": 1.956210732460022, + "loss/hidden": 0.69921875, + "loss/logits": 0.11135473847389221, + "loss/reg": 0.02535996399819851, + "step": 236 + }, + { + "epoch": 0.029625, + "grad_norm": 6.132915019989014, + "grad_norm_var": 6.19031909782113, + "learning_rate": 0.0001, + "loss": 1.0462, + "loss/crossentropy": 2.6552460193634033, + "loss/hidden": 0.6640625, + "loss/logits": 0.12862679362297058, + "loss/reg": 0.025352442637085915, + "step": 237 + }, + { + "epoch": 0.02975, + "grad_norm": 4.94125509262085, + "grad_norm_var": 6.132251305092321, + "learning_rate": 0.0001, + "loss": 1.2795, + "loss/crossentropy": 2.204523801803589, + "loss/hidden": 0.84765625, + "loss/logits": 0.17835499346256256, + "loss/reg": 0.025344664230942726, + "step": 238 + }, + { + "epoch": 0.029875, + "grad_norm": 3.7683677673339844, + "grad_norm_var": 6.125464850588167, + "learning_rate": 0.0001, + "loss": 0.985, + "loss/crossentropy": 2.652397632598877, + "loss/hidden": 0.625, + "loss/logits": 0.10665792226791382, + "loss/reg": 0.0253366157412529, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 4.732015609741211, + "grad_norm_var": 5.870172361717241, + "learning_rate": 0.0001, + "loss": 1.1018, + "loss/crossentropy": 2.544055938720703, + "loss/hidden": 0.7109375, + "loss/logits": 0.13754525780677795, + "loss/reg": 0.02532930299639702, + "step": 240 + }, + { + "epoch": 0.030125, + "grad_norm": 4.046056270599365, + "grad_norm_var": 5.761120866273571, + "learning_rate": 0.0001, + "loss": 0.9396, + "loss/crossentropy": 2.6015427112579346, + "loss/hidden": 0.5859375, + "loss/logits": 0.10041546076536179, + "loss/reg": 0.025322062894701958, + "step": 241 + }, + { + "epoch": 0.03025, + "grad_norm": 3.066027879714966, + "grad_norm_var": 6.052926687728684, + "learning_rate": 0.0001, + "loss": 1.0668, + "loss/crossentropy": 2.4402151107788086, + "loss/hidden": 0.6875, + "loss/logits": 0.12614867091178894, + "loss/reg": 0.0253145694732666, + "step": 242 + }, + { + "epoch": 0.030375, + "grad_norm": 3.168888807296753, + "grad_norm_var": 6.1536859873832555, + "learning_rate": 0.0001, + "loss": 0.9413, + "loss/crossentropy": 2.5689940452575684, + "loss/hidden": 0.58984375, + "loss/logits": 0.09841729700565338, + "loss/reg": 0.02530776336789131, + "step": 243 + }, + { + "epoch": 0.0305, + "grad_norm": 3.806450366973877, + "grad_norm_var": 6.229122059899357, + "learning_rate": 0.0001, + "loss": 1.021, + "loss/crossentropy": 2.2438809871673584, + "loss/hidden": 0.6328125, + "loss/logits": 0.13515594601631165, + "loss/reg": 0.02530042454600334, + "step": 244 + }, + { + "epoch": 0.030625, + "grad_norm": 4.313736915588379, + "grad_norm_var": 5.982441934425083, + "learning_rate": 0.0001, + "loss": 1.0307, + "loss/crossentropy": 2.5067784786224365, + "loss/hidden": 0.65234375, + "loss/logits": 0.1253766119480133, + "loss/reg": 0.025293124839663506, + "step": 245 + }, + { + "epoch": 0.03075, + "grad_norm": 4.419394016265869, + "grad_norm_var": 5.942040082684442, + "learning_rate": 0.0001, + "loss": 0.8957, + "loss/crossentropy": 2.4725239276885986, + "loss/hidden": 0.55078125, + "loss/logits": 0.09201550483703613, + "loss/reg": 0.02528616413474083, + "step": 246 + }, + { + "epoch": 0.030875, + "grad_norm": 3.709151268005371, + "grad_norm_var": 5.975041529003943, + "learning_rate": 0.0001, + "loss": 0.9152, + "loss/crossentropy": 2.413250207901001, + "loss/hidden": 0.56640625, + "loss/logits": 0.09596529603004456, + "loss/reg": 0.025278838351368904, + "step": 247 + }, + { + "epoch": 0.031, + "grad_norm": 3.4139935970306396, + "grad_norm_var": 6.007189625317282, + "learning_rate": 0.0001, + "loss": 1.0268, + "loss/crossentropy": 2.1580560207366943, + "loss/hidden": 0.65625, + "loss/logits": 0.11786328256130219, + "loss/reg": 0.025271562859416008, + "step": 248 + }, + { + "epoch": 0.031125, + "grad_norm": 6.089378833770752, + "grad_norm_var": 2.0950588257899443, + "learning_rate": 0.0001, + "loss": 1.0999, + "loss/crossentropy": 2.5466785430908203, + "loss/hidden": 0.703125, + "loss/logits": 0.1441519856452942, + "loss/reg": 0.025264522060751915, + "step": 249 + }, + { + "epoch": 0.03125, + "grad_norm": 4.001245498657227, + "grad_norm_var": 1.1007847740596406, + "learning_rate": 0.0001, + "loss": 0.9263, + "loss/crossentropy": 2.433518171310425, + "loss/hidden": 0.578125, + "loss/logits": 0.09559185057878494, + "loss/reg": 0.025257611647248268, + "step": 250 + }, + { + "epoch": 0.031375, + "grad_norm": 4.982790946960449, + "grad_norm_var": 0.8252532202355399, + "learning_rate": 0.0001, + "loss": 1.1439, + "loss/crossentropy": 2.3542721271514893, + "loss/hidden": 0.73046875, + "loss/logits": 0.16091328859329224, + "loss/reg": 0.025250321254134178, + "step": 251 + }, + { + "epoch": 0.0315, + "grad_norm": 3.6227378845214844, + "grad_norm_var": 0.8462248829388517, + "learning_rate": 0.0001, + "loss": 1.1789, + "loss/crossentropy": 2.1333043575286865, + "loss/hidden": 0.7890625, + "loss/logits": 0.1374010145664215, + "loss/reg": 0.025242896750569344, + "step": 252 + }, + { + "epoch": 0.031625, + "grad_norm": 3.415194511413574, + "grad_norm_var": 0.6304077366129337, + "learning_rate": 0.0001, + "loss": 1.0916, + "loss/crossentropy": 2.23525071144104, + "loss/hidden": 0.71484375, + "loss/logits": 0.1243971735239029, + "loss/reg": 0.025235962122678757, + "step": 253 + }, + { + "epoch": 0.03175, + "grad_norm": 3.7671289443969727, + "grad_norm_var": 0.5838590152321442, + "learning_rate": 0.0001, + "loss": 0.908, + "loss/crossentropy": 2.254054546356201, + "loss/hidden": 0.56640625, + "loss/logits": 0.0893106684088707, + "loss/reg": 0.025228681042790413, + "step": 254 + }, + { + "epoch": 0.031875, + "grad_norm": 4.164376735687256, + "grad_norm_var": 0.5803655311074387, + "learning_rate": 0.0001, + "loss": 0.9548, + "loss/crossentropy": 2.4128520488739014, + "loss/hidden": 0.5859375, + "loss/logits": 0.11668873578310013, + "loss/reg": 0.025221774354577065, + "step": 255 + }, + { + "epoch": 0.032, + "grad_norm": 3.6412672996520996, + "grad_norm_var": 0.5547959425019464, + "learning_rate": 0.0001, + "loss": 1.0098, + "loss/crossentropy": 2.7745320796966553, + "loss/hidden": 0.62890625, + "loss/logits": 0.12875014543533325, + "loss/reg": 0.025214577093720436, + "step": 256 + }, + { + "epoch": 0.032125, + "grad_norm": 2.900871515274048, + "grad_norm_var": 0.6261772657264061, + "learning_rate": 0.0001, + "loss": 0.9348, + "loss/crossentropy": 2.286968469619751, + "loss/hidden": 0.578125, + "loss/logits": 0.10455667227506638, + "loss/reg": 0.02520710788667202, + "step": 257 + }, + { + "epoch": 0.03225, + "grad_norm": 3.6853647232055664, + "grad_norm_var": 0.5808564529014478, + "learning_rate": 0.0001, + "loss": 0.9013, + "loss/crossentropy": 2.4515562057495117, + "loss/hidden": 0.55078125, + "loss/logits": 0.09847953915596008, + "loss/reg": 0.025199349969625473, + "step": 258 + }, + { + "epoch": 0.032375, + "grad_norm": 2.9091956615448, + "grad_norm_var": 0.6119059054418109, + "learning_rate": 0.0001, + "loss": 0.8784, + "loss/crossentropy": 2.163628339767456, + "loss/hidden": 0.5390625, + "loss/logits": 0.08743810653686523, + "loss/reg": 0.025191258639097214, + "step": 259 + }, + { + "epoch": 0.0325, + "grad_norm": 2.9488673210144043, + "grad_norm_var": 0.6717290813098125, + "learning_rate": 0.0001, + "loss": 0.9195, + "loss/crossentropy": 2.3084585666656494, + "loss/hidden": 0.5546875, + "loss/logits": 0.11298206448554993, + "loss/reg": 0.025183765217661858, + "step": 260 + }, + { + "epoch": 0.032625, + "grad_norm": 3.4345953464508057, + "grad_norm_var": 0.6684943296663647, + "learning_rate": 0.0001, + "loss": 0.8851, + "loss/crossentropy": 2.19558048248291, + "loss/hidden": 0.546875, + "loss/logits": 0.08649411797523499, + "loss/reg": 0.02517561800777912, + "step": 261 + }, + { + "epoch": 0.03275, + "grad_norm": 28.159074783325195, + "grad_norm_var": 37.79188620028727, + "learning_rate": 0.0001, + "loss": 1.1781, + "loss/crossentropy": 2.839057207107544, + "loss/hidden": 0.7890625, + "loss/logits": 0.13732999563217163, + "loss/reg": 0.025168145075440407, + "step": 262 + }, + { + "epoch": 0.032875, + "grad_norm": 2.734104871749878, + "grad_norm_var": 38.05849364469687, + "learning_rate": 0.0001, + "loss": 0.8892, + "loss/crossentropy": 2.4754340648651123, + "loss/hidden": 0.54296875, + "loss/logits": 0.09462890028953552, + "loss/reg": 0.025160137563943863, + "step": 263 + }, + { + "epoch": 0.033, + "grad_norm": 4.089654922485352, + "grad_norm_var": 37.922354469790704, + "learning_rate": 0.0001, + "loss": 0.9326, + "loss/crossentropy": 2.476450204849243, + "loss/hidden": 0.57421875, + "loss/logits": 0.1068512499332428, + "loss/reg": 0.025151856243610382, + "step": 264 + }, + { + "epoch": 0.033125, + "grad_norm": 3.838066577911377, + "grad_norm_var": 37.99741003814723, + "learning_rate": 0.0001, + "loss": 1.0909, + "loss/crossentropy": 2.114243268966675, + "loss/hidden": 0.734375, + "loss/logits": 0.10510236769914627, + "loss/reg": 0.025143325328826904, + "step": 265 + }, + { + "epoch": 0.03325, + "grad_norm": 3.0674874782562256, + "grad_norm_var": 38.19410456778619, + "learning_rate": 0.0001, + "loss": 0.8898, + "loss/crossentropy": 2.6426563262939453, + "loss/hidden": 0.54296875, + "loss/logits": 0.09550425410270691, + "loss/reg": 0.025135278701782227, + "step": 266 + }, + { + "epoch": 0.033375, + "grad_norm": 7.777696132659912, + "grad_norm_var": 38.64421623432597, + "learning_rate": 0.0001, + "loss": 1.0482, + "loss/crossentropy": 2.640554666519165, + "loss/hidden": 0.6796875, + "loss/logits": 0.11728060245513916, + "loss/reg": 0.025126684457063675, + "step": 267 + }, + { + "epoch": 0.0335, + "grad_norm": 3.6375653743743896, + "grad_norm_var": 38.64099364344996, + "learning_rate": 0.0001, + "loss": 0.9888, + "loss/crossentropy": 2.227538824081421, + "loss/hidden": 0.63671875, + "loss/logits": 0.1008879542350769, + "loss/reg": 0.0251180287450552, + "step": 268 + }, + { + "epoch": 0.033625, + "grad_norm": 4.305724620819092, + "grad_norm_var": 38.4714335626231, + "learning_rate": 0.0001, + "loss": 1.1268, + "loss/crossentropy": 2.556248903274536, + "loss/hidden": 0.73828125, + "loss/logits": 0.1374487727880478, + "loss/reg": 0.02511041797697544, + "step": 269 + }, + { + "epoch": 0.03375, + "grad_norm": 3.658052921295166, + "grad_norm_var": 38.4947077039297, + "learning_rate": 0.0001, + "loss": 0.9616, + "loss/crossentropy": 2.437307596206665, + "loss/hidden": 0.60546875, + "loss/logits": 0.1050904244184494, + "loss/reg": 0.025102900341153145, + "step": 270 + }, + { + "epoch": 0.033875, + "grad_norm": 6.534849643707275, + "grad_norm_var": 38.483973576312195, + "learning_rate": 0.0001, + "loss": 1.2119, + "loss/crossentropy": 2.36796236038208, + "loss/hidden": 0.81640625, + "loss/logits": 0.1445278525352478, + "loss/reg": 0.025095317512750626, + "step": 271 + }, + { + "epoch": 0.034, + "grad_norm": 5.486751079559326, + "grad_norm_var": 38.24988881420663, + "learning_rate": 0.0001, + "loss": 1.0337, + "loss/crossentropy": 2.517333984375, + "loss/hidden": 0.66015625, + "loss/logits": 0.12263435125350952, + "loss/reg": 0.025087665766477585, + "step": 272 + }, + { + "epoch": 0.034125, + "grad_norm": 3.231940746307373, + "grad_norm_var": 38.13878485092763, + "learning_rate": 0.0001, + "loss": 1.0586, + "loss/crossentropy": 2.6812305450439453, + "loss/hidden": 0.67578125, + "loss/logits": 0.13204006850719452, + "loss/reg": 0.025080092251300812, + "step": 273 + }, + { + "epoch": 0.03425, + "grad_norm": 4.71685791015625, + "grad_norm_var": 37.942827296069424, + "learning_rate": 0.0001, + "loss": 1.0409, + "loss/crossentropy": 2.093151807785034, + "loss/hidden": 0.6796875, + "loss/logits": 0.11049959808588028, + "loss/reg": 0.02507280558347702, + "step": 274 + }, + { + "epoch": 0.034375, + "grad_norm": 3.7086901664733887, + "grad_norm_var": 37.68973967522894, + "learning_rate": 0.0001, + "loss": 0.9144, + "loss/crossentropy": 2.2392518520355225, + "loss/hidden": 0.57421875, + "loss/logits": 0.08954055607318878, + "loss/reg": 0.02506582997739315, + "step": 275 + }, + { + "epoch": 0.0345, + "grad_norm": 5.537543773651123, + "grad_norm_var": 37.1561912525544, + "learning_rate": 0.0001, + "loss": 1.3586, + "loss/crossentropy": 1.8622019290924072, + "loss/hidden": 0.96484375, + "loss/logits": 0.14318415522575378, + "loss/reg": 0.025058824568986893, + "step": 276 + }, + { + "epoch": 0.034625, + "grad_norm": 2.898383617401123, + "grad_norm_var": 37.34827444255352, + "learning_rate": 0.0001, + "loss": 0.9413, + "loss/crossentropy": 2.4155869483947754, + "loss/hidden": 0.578125, + "loss/logits": 0.11263684928417206, + "loss/reg": 0.025052132084965706, + "step": 277 + }, + { + "epoch": 0.03475, + "grad_norm": 2.803711175918579, + "grad_norm_var": 2.0625830733920933, + "learning_rate": 0.0001, + "loss": 0.9404, + "loss/crossentropy": 2.2058048248291016, + "loss/hidden": 0.59765625, + "loss/logits": 0.09232598543167114, + "loss/reg": 0.02504453808069229, + "step": 278 + }, + { + "epoch": 0.034875, + "grad_norm": 3.180114507675171, + "grad_norm_var": 1.9847680294286107, + "learning_rate": 0.0001, + "loss": 0.9311, + "loss/crossentropy": 2.6069722175598145, + "loss/hidden": 0.57421875, + "loss/logits": 0.10655493289232254, + "loss/reg": 0.02503693662583828, + "step": 279 + }, + { + "epoch": 0.035, + "grad_norm": 5.609209060668945, + "grad_norm_var": 2.0906055341906256, + "learning_rate": 0.0001, + "loss": 1.0675, + "loss/crossentropy": 2.5023670196533203, + "loss/hidden": 0.68359375, + "loss/logits": 0.13361594080924988, + "loss/reg": 0.025029515847563744, + "step": 280 + }, + { + "epoch": 0.035125, + "grad_norm": 5.165370464324951, + "grad_norm_var": 2.105772188928517, + "learning_rate": 0.0001, + "loss": 1.1103, + "loss/crossentropy": 2.5107007026672363, + "loss/hidden": 0.73828125, + "loss/logits": 0.12183347344398499, + "loss/reg": 0.025022249668836594, + "step": 281 + }, + { + "epoch": 0.03525, + "grad_norm": 4.322115421295166, + "grad_norm_var": 1.9716269568170388, + "learning_rate": 0.0001, + "loss": 0.95, + "loss/crossentropy": 2.483142137527466, + "loss/hidden": 0.5859375, + "loss/logits": 0.11390361189842224, + "loss/reg": 0.025014575570821762, + "step": 282 + }, + { + "epoch": 0.035375, + "grad_norm": 4.432065486907959, + "grad_norm_var": 1.2250959918754403, + "learning_rate": 0.0001, + "loss": 0.9886, + "loss/crossentropy": 2.1206424236297607, + "loss/hidden": 0.6328125, + "loss/logits": 0.10568062961101532, + "loss/reg": 0.025007015094161034, + "step": 283 + }, + { + "epoch": 0.0355, + "grad_norm": 4.0429558753967285, + "grad_norm_var": 1.198112283867575, + "learning_rate": 0.0001, + "loss": 1.0004, + "loss/crossentropy": 2.6465060710906982, + "loss/hidden": 0.6328125, + "loss/logits": 0.11765305697917938, + "loss/reg": 0.024997249245643616, + "step": 284 + }, + { + "epoch": 0.035625, + "grad_norm": 3.551006555557251, + "grad_norm_var": 1.23838358717468, + "learning_rate": 0.0001, + "loss": 0.9764, + "loss/crossentropy": 2.4274182319641113, + "loss/hidden": 0.61328125, + "loss/logits": 0.11320526152849197, + "loss/reg": 0.02498740889132023, + "step": 285 + }, + { + "epoch": 0.03575, + "grad_norm": 2.874697685241699, + "grad_norm_var": 1.344305852802292, + "learning_rate": 0.0001, + "loss": 1.0133, + "loss/crossentropy": 2.512948751449585, + "loss/hidden": 0.6484375, + "loss/logits": 0.11510799080133438, + "loss/reg": 0.02497740648686886, + "step": 286 + }, + { + "epoch": 0.035875, + "grad_norm": 3.4613194465637207, + "grad_norm_var": 1.0008425760722102, + "learning_rate": 0.0001, + "loss": 0.9274, + "loss/crossentropy": 2.4450602531433105, + "loss/hidden": 0.5703125, + "loss/logits": 0.10743874311447144, + "loss/reg": 0.024968616664409637, + "step": 287 + }, + { + "epoch": 0.036, + "grad_norm": 3.452150344848633, + "grad_norm_var": 0.8735820507410946, + "learning_rate": 0.0001, + "loss": 0.952, + "loss/crossentropy": 2.304729700088501, + "loss/hidden": 0.578125, + "loss/logits": 0.1242954432964325, + "loss/reg": 0.024959923699498177, + "step": 288 + }, + { + "epoch": 0.036125, + "grad_norm": 3.3603436946868896, + "grad_norm_var": 0.8625457550688983, + "learning_rate": 0.0001, + "loss": 0.9447, + "loss/crossentropy": 2.425712823867798, + "loss/hidden": 0.58984375, + "loss/logits": 0.1053181067109108, + "loss/reg": 0.024950530380010605, + "step": 289 + }, + { + "epoch": 0.03625, + "grad_norm": 3.3667662143707275, + "grad_norm_var": 0.8374846368179986, + "learning_rate": 0.0001, + "loss": 0.9067, + "loss/crossentropy": 2.2623162269592285, + "loss/hidden": 0.55859375, + "loss/logits": 0.098650723695755, + "loss/reg": 0.02494119666516781, + "step": 290 + }, + { + "epoch": 0.036375, + "grad_norm": 2.7996857166290283, + "grad_norm_var": 0.9075153562133816, + "learning_rate": 0.0001, + "loss": 0.9594, + "loss/crossentropy": 2.370417356491089, + "loss/hidden": 0.59765625, + "loss/logits": 0.11243726313114166, + "loss/reg": 0.024933209642767906, + "step": 291 + }, + { + "epoch": 0.0365, + "grad_norm": 2.9999380111694336, + "grad_norm_var": 0.7233017120786666, + "learning_rate": 0.0001, + "loss": 1.0396, + "loss/crossentropy": 2.286776542663574, + "loss/hidden": 0.6640625, + "loss/logits": 0.12626898288726807, + "loss/reg": 0.02492516115307808, + "step": 292 + }, + { + "epoch": 0.036625, + "grad_norm": 3.880777597427368, + "grad_norm_var": 0.685825505757979, + "learning_rate": 0.0001, + "loss": 0.9764, + "loss/crossentropy": 2.613823652267456, + "loss/hidden": 0.61328125, + "loss/logits": 0.11395551264286041, + "loss/reg": 0.02491726726293564, + "step": 293 + }, + { + "epoch": 0.03675, + "grad_norm": 4.2950568199157715, + "grad_norm_var": 0.6453385025094112, + "learning_rate": 0.0001, + "loss": 1.0535, + "loss/crossentropy": 2.6882171630859375, + "loss/hidden": 0.66796875, + "loss/logits": 0.13644324243068695, + "loss/reg": 0.02490835078060627, + "step": 294 + }, + { + "epoch": 0.036875, + "grad_norm": 3.61423659324646, + "grad_norm_var": 0.6212598300915251, + "learning_rate": 0.0001, + "loss": 1.155, + "loss/crossentropy": 2.0867068767547607, + "loss/hidden": 0.76953125, + "loss/logits": 0.13649392127990723, + "loss/reg": 0.024899456650018692, + "step": 295 + }, + { + "epoch": 0.037, + "grad_norm": 3.9245386123657227, + "grad_norm_var": 0.3982568915415859, + "learning_rate": 0.0001, + "loss": 0.951, + "loss/crossentropy": 2.703791618347168, + "loss/hidden": 0.5859375, + "loss/logits": 0.11614765971899033, + "loss/reg": 0.02489159069955349, + "step": 296 + }, + { + "epoch": 0.037125, + "grad_norm": 4.020047187805176, + "grad_norm_var": 0.2597397925732442, + "learning_rate": 0.0001, + "loss": 0.9707, + "loss/crossentropy": 2.429636001586914, + "loss/hidden": 0.61328125, + "loss/logits": 0.1086028665304184, + "loss/reg": 0.024883201345801353, + "step": 297 + }, + { + "epoch": 0.03725, + "grad_norm": 4.086516857147217, + "grad_norm_var": 0.24209119003572066, + "learning_rate": 0.0001, + "loss": 1.0769, + "loss/crossentropy": 2.8581056594848633, + "loss/hidden": 0.6796875, + "loss/logits": 0.14844708144664764, + "loss/reg": 0.024874594062566757, + "step": 298 + }, + { + "epoch": 0.037375, + "grad_norm": 2.956085443496704, + "grad_norm_var": 0.22141400399237127, + "learning_rate": 0.0001, + "loss": 0.9903, + "loss/crossentropy": 2.4802041053771973, + "loss/hidden": 0.609375, + "loss/logits": 0.13230839371681213, + "loss/reg": 0.024866636842489243, + "step": 299 + }, + { + "epoch": 0.0375, + "grad_norm": 3.165804386138916, + "grad_norm_var": 0.2110158468875736, + "learning_rate": 0.0001, + "loss": 1.0347, + "loss/crossentropy": 2.4370639324188232, + "loss/hidden": 0.66015625, + "loss/logits": 0.1259341686964035, + "loss/reg": 0.024857714772224426, + "step": 300 + }, + { + "epoch": 0.037625, + "grad_norm": 3.0942695140838623, + "grad_norm_var": 0.22022059823099174, + "learning_rate": 0.0001, + "loss": 0.9747, + "loss/crossentropy": 2.674190044403076, + "loss/hidden": 0.6015625, + "loss/logits": 0.12467385828495026, + "loss/reg": 0.024848785251379013, + "step": 301 + }, + { + "epoch": 0.03775, + "grad_norm": 2.949205160140991, + "grad_norm_var": 0.21475779393049, + "learning_rate": 0.0001, + "loss": 0.8898, + "loss/crossentropy": 2.491746664047241, + "loss/hidden": 0.55078125, + "loss/logits": 0.09065801650285721, + "loss/reg": 0.02483983524143696, + "step": 302 + }, + { + "epoch": 0.037875, + "grad_norm": 3.549431324005127, + "grad_norm_var": 0.21520952048913009, + "learning_rate": 0.0001, + "loss": 0.9197, + "loss/crossentropy": 2.7578208446502686, + "loss/hidden": 0.57421875, + "loss/logits": 0.09712585806846619, + "loss/reg": 0.024831857532262802, + "step": 303 + }, + { + "epoch": 0.038, + "grad_norm": 3.0621931552886963, + "grad_norm_var": 0.22562503941355938, + "learning_rate": 0.0001, + "loss": 0.894, + "loss/crossentropy": 2.440351724624634, + "loss/hidden": 0.546875, + "loss/logits": 0.09888219833374023, + "loss/reg": 0.02482294850051403, + "step": 304 + }, + { + "epoch": 0.038125, + "grad_norm": 3.334942579269409, + "grad_norm_var": 0.22595311715915212, + "learning_rate": 0.0001, + "loss": 1.1509, + "loss/crossentropy": 2.378168821334839, + "loss/hidden": 0.74609375, + "loss/logits": 0.15670377016067505, + "loss/reg": 0.024814244359731674, + "step": 305 + }, + { + "epoch": 0.03825, + "grad_norm": 3.3253893852233887, + "grad_norm_var": 0.22648465837488155, + "learning_rate": 0.0001, + "loss": 0.9744, + "loss/crossentropy": 2.4409470558166504, + "loss/hidden": 0.6015625, + "loss/logits": 0.12483348697423935, + "loss/reg": 0.024805361405014992, + "step": 306 + }, + { + "epoch": 0.038375, + "grad_norm": 3.1687541007995605, + "grad_norm_var": 0.20343285009947346, + "learning_rate": 0.0001, + "loss": 0.8998, + "loss/crossentropy": 2.3339829444885254, + "loss/hidden": 0.55078125, + "loss/logits": 0.10110392421483994, + "loss/reg": 0.02479635737836361, + "step": 307 + }, + { + "epoch": 0.0385, + "grad_norm": 4.096661567687988, + "grad_norm_var": 0.21071919009248533, + "learning_rate": 0.0001, + "loss": 1.0154, + "loss/crossentropy": 2.468813180923462, + "loss/hidden": 0.64453125, + "loss/logits": 0.12295258045196533, + "loss/reg": 0.024787236005067825, + "step": 308 + }, + { + "epoch": 0.038625, + "grad_norm": 3.097642660140991, + "grad_norm_var": 0.2127095324618587, + "learning_rate": 0.0001, + "loss": 0.8772, + "loss/crossentropy": 2.5380513668060303, + "loss/hidden": 0.53125, + "loss/logits": 0.09821398556232452, + "loss/reg": 0.024777989834547043, + "step": 309 + }, + { + "epoch": 0.03875, + "grad_norm": 4.647727012634277, + "grad_norm_var": 0.25863060133758775, + "learning_rate": 0.0001, + "loss": 0.9162, + "loss/crossentropy": 2.0996336936950684, + "loss/hidden": 0.5625, + "loss/logits": 0.10599061101675034, + "loss/reg": 0.024768849834799767, + "step": 310 + }, + { + "epoch": 0.038875, + "grad_norm": 3.627246141433716, + "grad_norm_var": 0.25882920418562944, + "learning_rate": 0.0001, + "loss": 1.0161, + "loss/crossentropy": 2.2958993911743164, + "loss/hidden": 0.65234375, + "loss/logits": 0.11611323803663254, + "loss/reg": 0.024759870022535324, + "step": 311 + }, + { + "epoch": 0.039, + "grad_norm": 3.7236764430999756, + "grad_norm_var": 0.2501591619921593, + "learning_rate": 0.0001, + "loss": 1.1308, + "loss/crossentropy": 2.4322752952575684, + "loss/hidden": 0.7421875, + "loss/logits": 0.1410805881023407, + "loss/reg": 0.024751078337430954, + "step": 312 + }, + { + "epoch": 0.039125, + "grad_norm": 3.998807430267334, + "grad_norm_var": 0.24869789076210413, + "learning_rate": 0.0001, + "loss": 1.0749, + "loss/crossentropy": 2.374758720397949, + "loss/hidden": 0.703125, + "loss/logits": 0.1243831142783165, + "loss/reg": 0.024742012843489647, + "step": 313 + }, + { + "epoch": 0.03925, + "grad_norm": 4.315618515014648, + "grad_norm_var": 0.2701154191311919, + "learning_rate": 0.0001, + "loss": 0.9542, + "loss/crossentropy": 2.290766477584839, + "loss/hidden": 0.60546875, + "loss/logits": 0.10138154029846191, + "loss/reg": 0.02473386563360691, + "step": 314 + }, + { + "epoch": 0.039375, + "grad_norm": 11.789870262145996, + "grad_norm_var": 4.498354875640615, + "learning_rate": 0.0001, + "loss": 1.6276, + "loss/crossentropy": 2.391585350036621, + "loss/hidden": 1.25, + "loss/logits": 0.13034726679325104, + "loss/reg": 0.02472575195133686, + "step": 315 + }, + { + "epoch": 0.0395, + "grad_norm": 3.499288320541382, + "grad_norm_var": 4.465581075155145, + "learning_rate": 0.0001, + "loss": 0.9478, + "loss/crossentropy": 2.4527926445007324, + "loss/hidden": 0.5703125, + "loss/logits": 0.13031822443008423, + "loss/reg": 0.024717645719647408, + "step": 316 + }, + { + "epoch": 0.039625, + "grad_norm": 3.0139851570129395, + "grad_norm_var": 4.476536239649594, + "learning_rate": 0.0001, + "loss": 0.9555, + "loss/crossentropy": 2.506457805633545, + "loss/hidden": 0.58203125, + "loss/logits": 0.12640972435474396, + "loss/reg": 0.02470862865447998, + "step": 317 + }, + { + "epoch": 0.03975, + "grad_norm": 3.671523094177246, + "grad_norm_var": 4.4007183053584145, + "learning_rate": 0.0001, + "loss": 1.0902, + "loss/crossentropy": 2.1730313301086426, + "loss/hidden": 0.7109375, + "loss/logits": 0.13228294253349304, + "loss/reg": 0.024700626730918884, + "step": 318 + }, + { + "epoch": 0.039875, + "grad_norm": 3.453159809112549, + "grad_norm_var": 4.408623714873802, + "learning_rate": 0.0001, + "loss": 0.9518, + "loss/crossentropy": 2.7452075481414795, + "loss/hidden": 0.58984375, + "loss/logits": 0.11500866711139679, + "loss/reg": 0.02469259686768055, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 8.18558406829834, + "grad_norm_var": 5.330579476502794, + "learning_rate": 0.0001, + "loss": 1.2593, + "loss/crossentropy": 2.295231819152832, + "loss/hidden": 0.78125, + "loss/logits": 0.2312176525592804, + "loss/reg": 0.024684064090251923, + "step": 320 + }, + { + "epoch": 0.040125, + "grad_norm": 4.58513879776001, + "grad_norm_var": 5.245000173569268, + "learning_rate": 0.0001, + "loss": 1.0428, + "loss/crossentropy": 2.3456668853759766, + "loss/hidden": 0.67578125, + "loss/logits": 0.12027865648269653, + "loss/reg": 0.024676108732819557, + "step": 321 + }, + { + "epoch": 0.04025, + "grad_norm": 3.6148953437805176, + "grad_norm_var": 5.20441494141252, + "learning_rate": 0.0001, + "loss": 0.885, + "loss/crossentropy": 2.3615198135375977, + "loss/hidden": 0.53515625, + "loss/logits": 0.10311760008335114, + "loss/reg": 0.02466769702732563, + "step": 322 + }, + { + "epoch": 0.040375, + "grad_norm": 4.596187114715576, + "grad_norm_var": 5.072570501388933, + "learning_rate": 0.0001, + "loss": 1.3642, + "loss/crossentropy": 1.9407044649124146, + "loss/hidden": 0.9140625, + "loss/logits": 0.20351722836494446, + "loss/reg": 0.02465960383415222, + "step": 323 + }, + { + "epoch": 0.0405, + "grad_norm": 4.305622577667236, + "grad_norm_var": 5.060723771971758, + "learning_rate": 0.0001, + "loss": 1.2854, + "loss/crossentropy": 2.1887283325195312, + "loss/hidden": 0.83203125, + "loss/logits": 0.20682212710380554, + "loss/reg": 0.024651024490594864, + "step": 324 + }, + { + "epoch": 0.040625, + "grad_norm": 3.1052956581115723, + "grad_norm_var": 5.059160883569213, + "learning_rate": 0.0001, + "loss": 0.9277, + "loss/crossentropy": 2.7552340030670166, + "loss/hidden": 0.56640625, + "loss/logits": 0.11491444706916809, + "loss/reg": 0.024642454460263252, + "step": 325 + }, + { + "epoch": 0.04075, + "grad_norm": 3.902174234390259, + "grad_norm_var": 5.092472426369553, + "learning_rate": 0.0001, + "loss": 1.0922, + "loss/crossentropy": 2.328878402709961, + "loss/hidden": 0.70703125, + "loss/logits": 0.13887017965316772, + "loss/reg": 0.024634363129734993, + "step": 326 + }, + { + "epoch": 0.040875, + "grad_norm": 3.144326686859131, + "grad_norm_var": 5.168830163995767, + "learning_rate": 0.0001, + "loss": 0.9028, + "loss/crossentropy": 2.765711545944214, + "loss/hidden": 0.55078125, + "loss/logits": 0.10574661940336227, + "loss/reg": 0.02462565153837204, + "step": 327 + }, + { + "epoch": 0.041, + "grad_norm": 4.085259914398193, + "grad_norm_var": 5.136846736797656, + "learning_rate": 0.0001, + "loss": 1.1366, + "loss/crossentropy": 2.2278430461883545, + "loss/hidden": 0.75390625, + "loss/logits": 0.13655412197113037, + "loss/reg": 0.0246175117790699, + "step": 328 + }, + { + "epoch": 0.041125, + "grad_norm": 3.578554391860962, + "grad_norm_var": 5.180404969237465, + "learning_rate": 0.0001, + "loss": 1.0227, + "loss/crossentropy": 2.7795305252075195, + "loss/hidden": 0.640625, + "loss/logits": 0.13600832223892212, + "loss/reg": 0.024609515443444252, + "step": 329 + }, + { + "epoch": 0.04125, + "grad_norm": 3.7418434619903564, + "grad_norm_var": 5.219134310055354, + "learning_rate": 0.0001, + "loss": 1.0244, + "loss/crossentropy": 2.636258840560913, + "loss/hidden": 0.63671875, + "loss/logits": 0.1417178213596344, + "loss/reg": 0.024600951001048088, + "step": 330 + }, + { + "epoch": 0.041375, + "grad_norm": 3.369840383529663, + "grad_norm_var": 1.4852025101017772, + "learning_rate": 0.0001, + "loss": 1.0208, + "loss/crossentropy": 2.448961019515991, + "loss/hidden": 0.6640625, + "loss/logits": 0.1107659712433815, + "loss/reg": 0.024592852219939232, + "step": 331 + }, + { + "epoch": 0.0415, + "grad_norm": 2.90629506111145, + "grad_norm_var": 1.5460412234751968, + "learning_rate": 0.0001, + "loss": 0.9411, + "loss/crossentropy": 2.595834732055664, + "loss/hidden": 0.59375, + "loss/logits": 0.10147438943386078, + "loss/reg": 0.024584442377090454, + "step": 332 + }, + { + "epoch": 0.041625, + "grad_norm": 3.402386426925659, + "grad_norm_var": 1.5068032644485272, + "learning_rate": 0.0001, + "loss": 1.0144, + "loss/crossentropy": 2.363499402999878, + "loss/hidden": 0.62890625, + "loss/logits": 0.1397327035665512, + "loss/reg": 0.02457563206553459, + "step": 333 + }, + { + "epoch": 0.04175, + "grad_norm": 3.597465753555298, + "grad_norm_var": 1.5101723473758888, + "learning_rate": 0.0001, + "loss": 1.1917, + "loss/crossentropy": 2.3256943225860596, + "loss/hidden": 0.7890625, + "loss/logits": 0.15691371262073517, + "loss/reg": 0.02456764318048954, + "step": 334 + }, + { + "epoch": 0.041875, + "grad_norm": 3.512730121612549, + "grad_norm_var": 1.506262203991567, + "learning_rate": 0.0001, + "loss": 1.0631, + "loss/crossentropy": 2.383385181427002, + "loss/hidden": 0.69140625, + "loss/logits": 0.12608817219734192, + "loss/reg": 0.024559510871767998, + "step": 335 + }, + { + "epoch": 0.042, + "grad_norm": 5.968827247619629, + "grad_norm_var": 0.5694964439723395, + "learning_rate": 0.0001, + "loss": 1.258, + "loss/crossentropy": 2.212991952896118, + "loss/hidden": 0.828125, + "loss/logits": 0.18438610434532166, + "loss/reg": 0.02455153875052929, + "step": 336 + }, + { + "epoch": 0.042125, + "grad_norm": 3.644115924835205, + "grad_norm_var": 0.5311677507970897, + "learning_rate": 0.0001, + "loss": 1.1203, + "loss/crossentropy": 2.3620948791503906, + "loss/hidden": 0.72265625, + "loss/logits": 0.15219135582447052, + "loss/reg": 0.02454366721212864, + "step": 337 + }, + { + "epoch": 0.04225, + "grad_norm": 2.716367244720459, + "grad_norm_var": 0.6013761572733584, + "learning_rate": 0.0001, + "loss": 0.919, + "loss/crossentropy": 2.3863117694854736, + "loss/hidden": 0.578125, + "loss/logits": 0.0955524817109108, + "loss/reg": 0.024536145851016045, + "step": 338 + }, + { + "epoch": 0.042375, + "grad_norm": 3.8117287158966064, + "grad_norm_var": 0.5485673092684531, + "learning_rate": 0.0001, + "loss": 1.0529, + "loss/crossentropy": 2.4951469898223877, + "loss/hidden": 0.67578125, + "loss/logits": 0.1318708062171936, + "loss/reg": 0.024528514593839645, + "step": 339 + }, + { + "epoch": 0.0425, + "grad_norm": 2.5218894481658936, + "grad_norm_var": 0.5973356289049185, + "learning_rate": 0.0001, + "loss": 0.9673, + "loss/crossentropy": 2.370839834213257, + "loss/hidden": 0.6015625, + "loss/logits": 0.12049120664596558, + "loss/reg": 0.02452007494866848, + "step": 340 + }, + { + "epoch": 0.042625, + "grad_norm": 2.7343108654022217, + "grad_norm_var": 0.6285810690168129, + "learning_rate": 0.0001, + "loss": 1.1201, + "loss/crossentropy": 2.2990617752075195, + "loss/hidden": 0.7109375, + "loss/logits": 0.16403642296791077, + "loss/reg": 0.024511409923434258, + "step": 341 + }, + { + "epoch": 0.04275, + "grad_norm": 2.5267186164855957, + "grad_norm_var": 0.6803812464423664, + "learning_rate": 0.0001, + "loss": 0.9056, + "loss/crossentropy": 2.5629754066467285, + "loss/hidden": 0.5546875, + "loss/logits": 0.10588675737380981, + "loss/reg": 0.024503152817487717, + "step": 342 + }, + { + "epoch": 0.042875, + "grad_norm": 2.2731988430023193, + "grad_norm_var": 0.7637691760365584, + "learning_rate": 0.0001, + "loss": 0.8154, + "loss/crossentropy": 2.4123644828796387, + "loss/hidden": 0.490234375, + "loss/logits": 0.08022630214691162, + "loss/reg": 0.024494923651218414, + "step": 343 + }, + { + "epoch": 0.043, + "grad_norm": 3.2791945934295654, + "grad_norm_var": 0.7306725618305314, + "learning_rate": 0.0001, + "loss": 0.9879, + "loss/crossentropy": 2.7643816471099854, + "loss/hidden": 0.6171875, + "loss/logits": 0.12586694955825806, + "loss/reg": 0.024486759677529335, + "step": 344 + }, + { + "epoch": 0.043125, + "grad_norm": 3.6740994453430176, + "grad_norm_var": 0.7341663188433093, + "learning_rate": 0.0001, + "loss": 0.8947, + "loss/crossentropy": 2.6802780628204346, + "loss/hidden": 0.5546875, + "loss/logits": 0.0951782613992691, + "loss/reg": 0.024478696286678314, + "step": 345 + }, + { + "epoch": 0.04325, + "grad_norm": 3.234722137451172, + "grad_norm_var": 0.7240869727338347, + "learning_rate": 0.0001, + "loss": 0.8517, + "loss/crossentropy": 2.577921152114868, + "loss/hidden": 0.515625, + "loss/logits": 0.09137749671936035, + "loss/reg": 0.024470962584018707, + "step": 346 + }, + { + "epoch": 0.043375, + "grad_norm": 4.024074554443359, + "grad_norm_var": 0.7548921970504276, + "learning_rate": 0.0001, + "loss": 1.0017, + "loss/crossentropy": 2.8129634857177734, + "loss/hidden": 0.63671875, + "loss/logits": 0.120377317070961, + "loss/reg": 0.02446298860013485, + "step": 347 + }, + { + "epoch": 0.0435, + "grad_norm": 3.4327027797698975, + "grad_norm_var": 0.7400679146500114, + "learning_rate": 0.0001, + "loss": 0.9864, + "loss/crossentropy": 2.4310224056243896, + "loss/hidden": 0.640625, + "loss/logits": 0.10121208429336548, + "loss/reg": 0.0244552381336689, + "step": 348 + }, + { + "epoch": 0.043625, + "grad_norm": 3.2115890979766846, + "grad_norm_var": 0.7422101391295163, + "learning_rate": 0.0001, + "loss": 1.0594, + "loss/crossentropy": 2.3658318519592285, + "loss/hidden": 0.6796875, + "loss/logits": 0.13528358936309814, + "loss/reg": 0.02444704994559288, + "step": 349 + }, + { + "epoch": 0.04375, + "grad_norm": 3.4005136489868164, + "grad_norm_var": 0.739061242813568, + "learning_rate": 0.0001, + "loss": 1.0154, + "loss/crossentropy": 2.9171154499053955, + "loss/hidden": 0.6171875, + "loss/logits": 0.15386469662189484, + "loss/reg": 0.024438532069325447, + "step": 350 + }, + { + "epoch": 0.043875, + "grad_norm": 4.1175127029418945, + "grad_norm_var": 0.7731950105324129, + "learning_rate": 0.0001, + "loss": 0.949, + "loss/crossentropy": 2.3405494689941406, + "loss/hidden": 0.59765625, + "loss/logits": 0.10704682767391205, + "loss/reg": 0.024429937824606895, + "step": 351 + }, + { + "epoch": 0.044, + "grad_norm": 3.9496090412139893, + "grad_norm_var": 0.33930652052577653, + "learning_rate": 0.0001, + "loss": 0.889, + "loss/crossentropy": 2.9135613441467285, + "loss/hidden": 0.54296875, + "loss/logits": 0.1018136739730835, + "loss/reg": 0.024421829730272293, + "step": 352 + }, + { + "epoch": 0.044125, + "grad_norm": 3.794520139694214, + "grad_norm_var": 0.347931624130162, + "learning_rate": 0.0001, + "loss": 1.1771, + "loss/crossentropy": 2.5529844760894775, + "loss/hidden": 0.75390625, + "loss/logits": 0.17909468710422516, + "loss/reg": 0.02441396936774254, + "step": 353 + }, + { + "epoch": 0.04425, + "grad_norm": 3.164349317550659, + "grad_norm_var": 0.3259767305032634, + "learning_rate": 0.0001, + "loss": 1.0804, + "loss/crossentropy": 2.3843231201171875, + "loss/hidden": 0.71484375, + "loss/logits": 0.12148329615592957, + "loss/reg": 0.02440580353140831, + "step": 354 + }, + { + "epoch": 0.044375, + "grad_norm": 2.7847914695739746, + "grad_norm_var": 0.32482231475126633, + "learning_rate": 0.0001, + "loss": 1.0463, + "loss/crossentropy": 2.521649122238159, + "loss/hidden": 0.68359375, + "loss/logits": 0.1187494546175003, + "loss/reg": 0.024397339671850204, + "step": 355 + }, + { + "epoch": 0.0445, + "grad_norm": 3.2391860485076904, + "grad_norm_var": 0.28660331114573767, + "learning_rate": 0.0001, + "loss": 1.0225, + "loss/crossentropy": 2.4073832035064697, + "loss/hidden": 0.640625, + "loss/logits": 0.13795481622219086, + "loss/reg": 0.024389205500483513, + "step": 356 + }, + { + "epoch": 0.044625, + "grad_norm": 5.839352130889893, + "grad_norm_var": 0.6539216724231817, + "learning_rate": 0.0001, + "loss": 1.1058, + "loss/crossentropy": 2.6860220432281494, + "loss/hidden": 0.7265625, + "loss/logits": 0.13543812930583954, + "loss/reg": 0.02438061311841011, + "step": 357 + }, + { + "epoch": 0.04475, + "grad_norm": 4.340758800506592, + "grad_norm_var": 0.6249977794062299, + "learning_rate": 0.0001, + "loss": 1.1, + "loss/crossentropy": 2.770012378692627, + "loss/hidden": 0.703125, + "loss/logits": 0.1531440019607544, + "loss/reg": 0.024372335523366928, + "step": 358 + }, + { + "epoch": 0.044875, + "grad_norm": 8.795727729797363, + "grad_norm_var": 2.1213731683569863, + "learning_rate": 0.0001, + "loss": 1.3149, + "loss/crossentropy": 2.2893428802490234, + "loss/hidden": 0.91796875, + "loss/logits": 0.15326841175556183, + "loss/reg": 0.02436378225684166, + "step": 359 + }, + { + "epoch": 0.045, + "grad_norm": 4.104447841644287, + "grad_norm_var": 2.0826812332104394, + "learning_rate": 0.0001, + "loss": 1.15, + "loss/crossentropy": 2.625378370285034, + "loss/hidden": 0.7578125, + "loss/logits": 0.14858195185661316, + "loss/reg": 0.024355949833989143, + "step": 360 + }, + { + "epoch": 0.045125, + "grad_norm": 3.354828357696533, + "grad_norm_var": 2.105873348197963, + "learning_rate": 0.0001, + "loss": 0.9494, + "loss/crossentropy": 2.4916443824768066, + "loss/hidden": 0.5859375, + "loss/logits": 0.11993111670017242, + "loss/reg": 0.024348480626940727, + "step": 361 + }, + { + "epoch": 0.04525, + "grad_norm": 3.3247056007385254, + "grad_norm_var": 2.096606359520402, + "learning_rate": 0.0001, + "loss": 0.9604, + "loss/crossentropy": 2.282543420791626, + "loss/hidden": 0.609375, + "loss/logits": 0.10765975713729858, + "loss/reg": 0.02434113249182701, + "step": 362 + }, + { + "epoch": 0.045375, + "grad_norm": 4.314214706420898, + "grad_norm_var": 2.1006745469652883, + "learning_rate": 0.0001, + "loss": 1.0684, + "loss/crossentropy": 2.447218179702759, + "loss/hidden": 0.6953125, + "loss/logits": 0.12974585592746735, + "loss/reg": 0.024332784116268158, + "step": 363 + }, + { + "epoch": 0.0455, + "grad_norm": 3.6783666610717773, + "grad_norm_var": 2.083471757970481, + "learning_rate": 0.0001, + "loss": 1.1509, + "loss/crossentropy": 2.6020665168762207, + "loss/hidden": 0.75, + "loss/logits": 0.1576002687215805, + "loss/reg": 0.024325383827090263, + "step": 364 + }, + { + "epoch": 0.045625, + "grad_norm": 3.535550832748413, + "grad_norm_var": 2.0521572529950523, + "learning_rate": 0.0001, + "loss": 0.9672, + "loss/crossentropy": 2.2514841556549072, + "loss/hidden": 0.61328125, + "loss/logits": 0.110772505402565, + "loss/reg": 0.024317855015397072, + "step": 365 + }, + { + "epoch": 0.04575, + "grad_norm": 6.190865993499756, + "grad_norm_var": 2.275325586048537, + "learning_rate": 0.0001, + "loss": 1.2696, + "loss/crossentropy": 2.5250887870788574, + "loss/hidden": 0.87890625, + "loss/logits": 0.1475657969713211, + "loss/reg": 0.02430957928299904, + "step": 366 + }, + { + "epoch": 0.045875, + "grad_norm": 6.9109907150268555, + "grad_norm_var": 2.701389202772653, + "learning_rate": 0.0001, + "loss": 1.42, + "loss/crossentropy": 2.0714285373687744, + "loss/hidden": 0.98828125, + "loss/logits": 0.18868008255958557, + "loss/reg": 0.024301210418343544, + "step": 367 + }, + { + "epoch": 0.046, + "grad_norm": 3.939924955368042, + "grad_norm_var": 2.702051041555256, + "learning_rate": 0.0001, + "loss": 1.0163, + "loss/crossentropy": 2.677281379699707, + "loss/hidden": 0.640625, + "loss/logits": 0.1327013224363327, + "loss/reg": 0.024292904883623123, + "step": 368 + }, + { + "epoch": 0.046125, + "grad_norm": 2.942032814025879, + "grad_norm_var": 2.822776844100568, + "learning_rate": 0.0001, + "loss": 0.9582, + "loss/crossentropy": 2.3630495071411133, + "loss/hidden": 0.6015625, + "loss/logits": 0.11380600929260254, + "loss/reg": 0.02428455464541912, + "step": 369 + }, + { + "epoch": 0.04625, + "grad_norm": 3.125349521636963, + "grad_norm_var": 2.8293167859701627, + "learning_rate": 0.0001, + "loss": 1.0463, + "loss/crossentropy": 2.5113635063171387, + "loss/hidden": 0.67578125, + "loss/logits": 0.12774598598480225, + "loss/reg": 0.02427608147263527, + "step": 370 + }, + { + "epoch": 0.046375, + "grad_norm": 3.15159273147583, + "grad_norm_var": 2.758666518773039, + "learning_rate": 0.0001, + "loss": 0.9198, + "loss/crossentropy": 2.31132435798645, + "loss/hidden": 0.5703125, + "loss/logits": 0.10678299516439438, + "loss/reg": 0.024267377331852913, + "step": 371 + }, + { + "epoch": 0.0465, + "grad_norm": 4.394677639007568, + "grad_norm_var": 2.6595375525429406, + "learning_rate": 0.0001, + "loss": 0.9954, + "loss/crossentropy": 2.4987642765045166, + "loss/hidden": 0.6328125, + "loss/logits": 0.11996030062437057, + "loss/reg": 0.0242580845952034, + "step": 372 + }, + { + "epoch": 0.046625, + "grad_norm": 3.7915477752685547, + "grad_norm_var": 2.5549678839666425, + "learning_rate": 0.0001, + "loss": 0.9193, + "loss/crossentropy": 2.3996403217315674, + "loss/hidden": 0.578125, + "loss/logits": 0.09870465099811554, + "loss/reg": 0.02424856275320053, + "step": 373 + }, + { + "epoch": 0.04675, + "grad_norm": 2.833364725112915, + "grad_norm_var": 2.7025530371611066, + "learning_rate": 0.0001, + "loss": 0.9417, + "loss/crossentropy": 2.336583137512207, + "loss/hidden": 0.59765625, + "loss/logits": 0.10166990756988525, + "loss/reg": 0.024240419268608093, + "step": 374 + }, + { + "epoch": 0.046875, + "grad_norm": 5.302695274353027, + "grad_norm_var": 1.359315799583595, + "learning_rate": 0.0001, + "loss": 0.9495, + "loss/crossentropy": 2.5733697414398193, + "loss/hidden": 0.578125, + "loss/logits": 0.1290503740310669, + "loss/reg": 0.0242319293320179, + "step": 375 + }, + { + "epoch": 0.047, + "grad_norm": 5.087683200836182, + "grad_norm_var": 1.426096117003745, + "learning_rate": 0.0001, + "loss": 0.9067, + "loss/crossentropy": 2.548323631286621, + "loss/hidden": 0.56640625, + "loss/logits": 0.09806410223245621, + "loss/reg": 0.02422359585762024, + "step": 376 + }, + { + "epoch": 0.047125, + "grad_norm": 2.840883255004883, + "grad_norm_var": 1.4948607984557458, + "learning_rate": 0.0001, + "loss": 1.0234, + "loss/crossentropy": 2.484494209289551, + "loss/hidden": 0.65234375, + "loss/logits": 0.12894591689109802, + "loss/reg": 0.024214565753936768, + "step": 377 + }, + { + "epoch": 0.04725, + "grad_norm": 6.647733688354492, + "grad_norm_var": 1.848030946106532, + "learning_rate": 0.0001, + "loss": 1.131, + "loss/crossentropy": 2.832048177719116, + "loss/hidden": 0.7421875, + "loss/logits": 0.14676988124847412, + "loss/reg": 0.024205682799220085, + "step": 378 + }, + { + "epoch": 0.047375, + "grad_norm": 3.465564727783203, + "grad_norm_var": 1.8906396391038638, + "learning_rate": 0.0001, + "loss": 1.0051, + "loss/crossentropy": 2.5129964351654053, + "loss/hidden": 0.62890625, + "loss/logits": 0.134174644947052, + "loss/reg": 0.024197354912757874, + "step": 379 + }, + { + "epoch": 0.0475, + "grad_norm": 3.6985855102539062, + "grad_norm_var": 1.8891513099755568, + "learning_rate": 0.0001, + "loss": 1.4422, + "loss/crossentropy": 2.2384328842163086, + "loss/hidden": 1.0, + "loss/logits": 0.20027095079421997, + "loss/reg": 0.02418883889913559, + "step": 380 + }, + { + "epoch": 0.047625, + "grad_norm": 3.4343388080596924, + "grad_norm_var": 1.8993141107729303, + "learning_rate": 0.0001, + "loss": 0.8648, + "loss/crossentropy": 2.753706932067871, + "loss/hidden": 0.5234375, + "loss/logits": 0.09959565848112106, + "loss/reg": 0.024180689826607704, + "step": 381 + }, + { + "epoch": 0.04775, + "grad_norm": 3.418168067932129, + "grad_norm_var": 1.6566847859375398, + "learning_rate": 0.0001, + "loss": 1.0532, + "loss/crossentropy": 2.1205835342407227, + "loss/hidden": 0.69921875, + "loss/logits": 0.11221310496330261, + "loss/reg": 0.02417258359491825, + "step": 382 + }, + { + "epoch": 0.047875, + "grad_norm": 3.3881125450134277, + "grad_norm_var": 1.093930487598201, + "learning_rate": 0.0001, + "loss": 0.9968, + "loss/crossentropy": 2.540379285812378, + "loss/hidden": 0.62890625, + "loss/logits": 0.1262589991092682, + "loss/reg": 0.024164721369743347, + "step": 383 + }, + { + "epoch": 0.048, + "grad_norm": 3.142486095428467, + "grad_norm_var": 1.1231981378319982, + "learning_rate": 0.0001, + "loss": 0.9186, + "loss/crossentropy": 2.5265657901763916, + "loss/hidden": 0.58203125, + "loss/logits": 0.0949624702334404, + "loss/reg": 0.02415630966424942, + "step": 384 + }, + { + "epoch": 0.048125, + "grad_norm": 7.363763332366943, + "grad_norm_var": 1.8443340238906671, + "learning_rate": 0.0001, + "loss": 1.1703, + "loss/crossentropy": 2.442409038543701, + "loss/hidden": 0.8046875, + "loss/logits": 0.12408800423145294, + "loss/reg": 0.024148130789399147, + "step": 385 + }, + { + "epoch": 0.04825, + "grad_norm": 3.801536798477173, + "grad_norm_var": 1.7879312710551798, + "learning_rate": 0.0001, + "loss": 1.0129, + "loss/crossentropy": 2.461942434310913, + "loss/hidden": 0.640625, + "loss/logits": 0.13092291355133057, + "loss/reg": 0.024139659479260445, + "step": 386 + }, + { + "epoch": 0.048375, + "grad_norm": 4.24082612991333, + "grad_norm_var": 1.7228677295443293, + "learning_rate": 0.0001, + "loss": 1.1161, + "loss/crossentropy": 2.2889018058776855, + "loss/hidden": 0.74609375, + "loss/logits": 0.12870003283023834, + "loss/reg": 0.024130841717123985, + "step": 387 + }, + { + "epoch": 0.0485, + "grad_norm": 4.190494060516357, + "grad_norm_var": 1.7195812284180172, + "learning_rate": 0.0001, + "loss": 1.1649, + "loss/crossentropy": 2.4475483894348145, + "loss/hidden": 0.76171875, + "loss/logits": 0.16198021173477173, + "loss/reg": 0.024122456088662148, + "step": 388 + }, + { + "epoch": 0.048625, + "grad_norm": 15.206843376159668, + "grad_norm_var": 9.294742605150057, + "learning_rate": 0.0001, + "loss": 1.0606, + "loss/crossentropy": 2.3164589405059814, + "loss/hidden": 0.703125, + "loss/logits": 0.116313636302948, + "loss/reg": 0.024113710969686508, + "step": 389 + }, + { + "epoch": 0.04875, + "grad_norm": 10.421854972839355, + "grad_norm_var": 10.824103712955843, + "learning_rate": 0.0001, + "loss": 1.0638, + "loss/crossentropy": 2.1764075756073, + "loss/hidden": 0.70703125, + "loss/logits": 0.11567908525466919, + "loss/reg": 0.024104835465550423, + "step": 390 + }, + { + "epoch": 0.048875, + "grad_norm": 4.923640727996826, + "grad_norm_var": 10.83563756748113, + "learning_rate": 0.0001, + "loss": 0.9465, + "loss/crossentropy": 2.6762068271636963, + "loss/hidden": 0.6015625, + "loss/logits": 0.10399520397186279, + "loss/reg": 0.02409605123102665, + "step": 391 + }, + { + "epoch": 0.049, + "grad_norm": 4.840577125549316, + "grad_norm_var": 10.847422220224528, + "learning_rate": 0.0001, + "loss": 1.2037, + "loss/crossentropy": 2.482144832611084, + "loss/hidden": 0.8125, + "loss/logits": 0.15031081438064575, + "loss/reg": 0.02408732660114765, + "step": 392 + }, + { + "epoch": 0.049125, + "grad_norm": 2.9535470008850098, + "grad_norm_var": 10.811063470934846, + "learning_rate": 0.0001, + "loss": 1.0811, + "loss/crossentropy": 2.5998470783233643, + "loss/hidden": 0.71484375, + "loss/logits": 0.12545132637023926, + "loss/reg": 0.024078134447336197, + "step": 393 + }, + { + "epoch": 0.04925, + "grad_norm": 4.08555793762207, + "grad_norm_var": 10.768160950066, + "learning_rate": 0.0001, + "loss": 1.3211, + "loss/crossentropy": 2.611746072769165, + "loss/hidden": 0.90625, + "loss/logits": 0.17412351071834564, + "loss/reg": 0.02406897209584713, + "step": 394 + }, + { + "epoch": 0.049375, + "grad_norm": 4.730119705200195, + "grad_norm_var": 10.582242923890295, + "learning_rate": 0.0001, + "loss": 0.9463, + "loss/crossentropy": 2.5379066467285156, + "loss/hidden": 0.59765625, + "loss/logits": 0.10806328058242798, + "loss/reg": 0.024059420451521873, + "step": 395 + }, + { + "epoch": 0.0495, + "grad_norm": 4.012064456939697, + "grad_norm_var": 10.523956759484621, + "learning_rate": 0.0001, + "loss": 0.9181, + "loss/crossentropy": 2.567375898361206, + "loss/hidden": 0.57421875, + "loss/logits": 0.10337453335523605, + "loss/reg": 0.024049852043390274, + "step": 396 + }, + { + "epoch": 0.049625, + "grad_norm": 4.57706880569458, + "grad_norm_var": 10.32746400090784, + "learning_rate": 0.0001, + "loss": 1.051, + "loss/crossentropy": 2.6955151557922363, + "loss/hidden": 0.6875, + "loss/logits": 0.12309969961643219, + "loss/reg": 0.02403969317674637, + "step": 397 + }, + { + "epoch": 0.04975, + "grad_norm": 2.7931346893310547, + "grad_norm_var": 10.511295288820635, + "learning_rate": 0.0001, + "loss": 1.0373, + "loss/crossentropy": 2.34134578704834, + "loss/hidden": 0.66796875, + "loss/logits": 0.12901151180267334, + "loss/reg": 0.02403116784989834, + "step": 398 + }, + { + "epoch": 0.049875, + "grad_norm": 2.4789700508117676, + "grad_norm_var": 10.793738555266915, + "learning_rate": 0.0001, + "loss": 1.1463, + "loss/crossentropy": 2.1852893829345703, + "loss/hidden": 0.75390625, + "loss/logits": 0.15216518938541412, + "loss/reg": 0.024022690951824188, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 5.340605735778809, + "grad_norm_var": 10.482396698239997, + "learning_rate": 0.0001, + "loss": 1.2874, + "loss/crossentropy": 2.4223577976226807, + "loss/hidden": 0.91015625, + "loss/logits": 0.13714072108268738, + "loss/reg": 0.024014031514525414, + "step": 400 + }, + { + "epoch": 0.050125, + "grad_norm": 5.600348949432373, + "grad_norm_var": 10.208567826877847, + "learning_rate": 0.0001, + "loss": 0.9905, + "loss/crossentropy": 2.494180917739868, + "loss/hidden": 0.64453125, + "loss/logits": 0.10594967007637024, + "loss/reg": 0.024006787687540054, + "step": 401 + }, + { + "epoch": 0.05025, + "grad_norm": 5.094600677490234, + "grad_norm_var": 10.061216488426146, + "learning_rate": 0.0001, + "loss": 1.2828, + "loss/crossentropy": 2.1899051666259766, + "loss/hidden": 0.89453125, + "loss/logits": 0.1482730507850647, + "loss/reg": 0.02399739809334278, + "step": 402 + }, + { + "epoch": 0.050375, + "grad_norm": 3.915607452392578, + "grad_norm_var": 10.115626051260246, + "learning_rate": 0.0001, + "loss": 1.0375, + "loss/crossentropy": 2.790240526199341, + "loss/hidden": 0.65625, + "loss/logits": 0.14136558771133423, + "loss/reg": 0.02398892305791378, + "step": 403 + }, + { + "epoch": 0.0505, + "grad_norm": 70.49039459228516, + "grad_norm_var": 274.8357269833382, + "learning_rate": 0.0001, + "loss": 1.1534, + "loss/crossentropy": 2.2490358352661133, + "loss/hidden": 0.79296875, + "loss/logits": 0.12061962485313416, + "loss/reg": 0.02398114837706089, + "step": 404 + }, + { + "epoch": 0.050625, + "grad_norm": 6.5985307693481445, + "grad_norm_var": 272.87861181728414, + "learning_rate": 0.0001, + "loss": 1.1479, + "loss/crossentropy": 2.856632709503174, + "loss/hidden": 0.78515625, + "loss/logits": 0.12297768890857697, + "loss/reg": 0.0239717997610569, + "step": 405 + }, + { + "epoch": 0.05075, + "grad_norm": 4.041878700256348, + "grad_norm_var": 274.1523084795167, + "learning_rate": 0.0001, + "loss": 1.1408, + "loss/crossentropy": 2.5744457244873047, + "loss/hidden": 0.76171875, + "loss/logits": 0.13939061760902405, + "loss/reg": 0.02396412193775177, + "step": 406 + }, + { + "epoch": 0.050875, + "grad_norm": 3.5284969806671143, + "grad_norm_var": 274.94477307618513, + "learning_rate": 0.0001, + "loss": 1.0838, + "loss/crossentropy": 2.7705729007720947, + "loss/hidden": 0.72265625, + "loss/logits": 0.12160193920135498, + "loss/reg": 0.023956267163157463, + "step": 407 + }, + { + "epoch": 0.051, + "grad_norm": 3.867558240890503, + "grad_norm_var": 275.4712566581114, + "learning_rate": 0.0001, + "loss": 1.1238, + "loss/crossentropy": 2.2807302474975586, + "loss/hidden": 0.75390625, + "loss/logits": 0.1304117739200592, + "loss/reg": 0.023948216810822487, + "step": 408 + }, + { + "epoch": 0.051125, + "grad_norm": 4.9265217781066895, + "grad_norm_var": 274.2865770164501, + "learning_rate": 0.0001, + "loss": 0.955, + "loss/crossentropy": 2.6176843643188477, + "loss/hidden": 0.62890625, + "loss/logits": 0.08674542605876923, + "loss/reg": 0.02393944188952446, + "step": 409 + }, + { + "epoch": 0.05125, + "grad_norm": 3.1925463676452637, + "grad_norm_var": 274.86264478448203, + "learning_rate": 0.0001, + "loss": 1.0329, + "loss/crossentropy": 2.790286064147949, + "loss/hidden": 0.6796875, + "loss/logits": 0.11385629326105118, + "loss/reg": 0.02393159456551075, + "step": 410 + }, + { + "epoch": 0.051375, + "grad_norm": 2.6993377208709717, + "grad_norm_var": 276.12743945534294, + "learning_rate": 0.0001, + "loss": 0.9268, + "loss/crossentropy": 2.5643763542175293, + "loss/hidden": 0.58203125, + "loss/logits": 0.10557639598846436, + "loss/reg": 0.0239238403737545, + "step": 411 + }, + { + "epoch": 0.0515, + "grad_norm": 4.7841315269470215, + "grad_norm_var": 275.72098389943244, + "learning_rate": 0.0001, + "loss": 1.3399, + "loss/crossentropy": 2.3737518787384033, + "loss/hidden": 0.89453125, + "loss/logits": 0.2062419056892395, + "loss/reg": 0.023915138095617294, + "step": 412 + }, + { + "epoch": 0.051625, + "grad_norm": 2.8743667602539062, + "grad_norm_var": 276.7634192046356, + "learning_rate": 0.0001, + "loss": 1.105, + "loss/crossentropy": 2.225154399871826, + "loss/hidden": 0.734375, + "loss/logits": 0.13158033788204193, + "loss/reg": 0.023907041177153587, + "step": 413 + }, + { + "epoch": 0.05175, + "grad_norm": 4.873403072357178, + "grad_norm_var": 275.51638736026473, + "learning_rate": 0.0001, + "loss": 1.1328, + "loss/crossentropy": 2.444675922393799, + "loss/hidden": 0.734375, + "loss/logits": 0.15947584807872772, + "loss/reg": 0.023899447172880173, + "step": 414 + }, + { + "epoch": 0.051875, + "grad_norm": 3.6676955223083496, + "grad_norm_var": 274.6671585398764, + "learning_rate": 0.0001, + "loss": 0.9504, + "loss/crossentropy": 2.6828341484069824, + "loss/hidden": 0.6015625, + "loss/logits": 0.10996139049530029, + "loss/reg": 0.023892199620604515, + "step": 415 + }, + { + "epoch": 0.052, + "grad_norm": 2.703623056411743, + "grad_norm_var": 276.2015243387772, + "learning_rate": 0.0001, + "loss": 1.049, + "loss/crossentropy": 2.4529809951782227, + "loss/hidden": 0.6796875, + "loss/logits": 0.1305149495601654, + "loss/reg": 0.023883724585175514, + "step": 416 + }, + { + "epoch": 0.052125, + "grad_norm": 2.9288852214813232, + "grad_norm_var": 277.61048629826035, + "learning_rate": 0.0001, + "loss": 1.0151, + "loss/crossentropy": 2.6102705001831055, + "loss/hidden": 0.640625, + "loss/logits": 0.13567429780960083, + "loss/reg": 0.023875238373875618, + "step": 417 + }, + { + "epoch": 0.05225, + "grad_norm": 2.5006144046783447, + "grad_norm_var": 279.0831974622093, + "learning_rate": 0.0001, + "loss": 1.0571, + "loss/crossentropy": 2.457202672958374, + "loss/hidden": 0.69140625, + "loss/logits": 0.12700514495372772, + "loss/reg": 0.023867420852184296, + "step": 418 + }, + { + "epoch": 0.052375, + "grad_norm": 5.180849552154541, + "grad_norm_var": 278.49850212580674, + "learning_rate": 0.0001, + "loss": 1.1214, + "loss/crossentropy": 2.7064497470855713, + "loss/hidden": 0.74609375, + "loss/logits": 0.13676053285598755, + "loss/reg": 0.02385888434946537, + "step": 419 + }, + { + "epoch": 0.0525, + "grad_norm": 3.2954249382019043, + "grad_norm_var": 1.3051375489769337, + "learning_rate": 0.0001, + "loss": 1.0717, + "loss/crossentropy": 2.31532621383667, + "loss/hidden": 0.703125, + "loss/logits": 0.13004590570926666, + "loss/reg": 0.023851698264479637, + "step": 420 + }, + { + "epoch": 0.052625, + "grad_norm": 2.8389148712158203, + "grad_norm_var": 0.8127685868335741, + "learning_rate": 0.0001, + "loss": 1.0781, + "loss/crossentropy": 2.3562088012695312, + "loss/hidden": 0.70703125, + "loss/logits": 0.13266143202781677, + "loss/reg": 0.023843195289373398, + "step": 421 + }, + { + "epoch": 0.05275, + "grad_norm": 4.581103324890137, + "grad_norm_var": 0.8613437167520175, + "learning_rate": 0.0001, + "loss": 1.4299, + "loss/crossentropy": 2.1004514694213867, + "loss/hidden": 1.0546875, + "loss/logits": 0.13683074712753296, + "loss/reg": 0.023835282772779465, + "step": 422 + }, + { + "epoch": 0.052875, + "grad_norm": 4.119724750518799, + "grad_norm_var": 0.8733982923945914, + "learning_rate": 0.0001, + "loss": 1.2152, + "loss/crossentropy": 2.640662670135498, + "loss/hidden": 0.8515625, + "loss/logits": 0.12537327408790588, + "loss/reg": 0.02382684126496315, + "step": 423 + }, + { + "epoch": 0.053, + "grad_norm": 4.544858455657959, + "grad_norm_var": 0.918133871994677, + "learning_rate": 0.0001, + "loss": 1.0957, + "loss/crossentropy": 2.8237485885620117, + "loss/hidden": 0.7265625, + "loss/logits": 0.1309557855129242, + "loss/reg": 0.023818302899599075, + "step": 424 + }, + { + "epoch": 0.053125, + "grad_norm": 2.757870674133301, + "grad_norm_var": 0.8666742418813403, + "learning_rate": 0.0001, + "loss": 0.962, + "loss/crossentropy": 2.5533342361450195, + "loss/hidden": 0.62109375, + "loss/logits": 0.10283903032541275, + "loss/reg": 0.023810207843780518, + "step": 425 + }, + { + "epoch": 0.05325, + "grad_norm": 2.8035690784454346, + "grad_norm_var": 0.8970790990362765, + "learning_rate": 0.0001, + "loss": 1.0877, + "loss/crossentropy": 2.610870361328125, + "loss/hidden": 0.6953125, + "loss/logits": 0.1544169783592224, + "loss/reg": 0.02380150742828846, + "step": 426 + }, + { + "epoch": 0.053375, + "grad_norm": 3.5559544563293457, + "grad_norm_var": 0.8432525593756196, + "learning_rate": 0.0001, + "loss": 1.1169, + "loss/crossentropy": 2.537252187728882, + "loss/hidden": 0.74609375, + "loss/logits": 0.1328693926334381, + "loss/reg": 0.02379263937473297, + "step": 427 + }, + { + "epoch": 0.0535, + "grad_norm": 3.58505916595459, + "grad_norm_var": 0.7479056021171611, + "learning_rate": 0.0001, + "loss": 1.0427, + "loss/crossentropy": 2.240241289138794, + "loss/hidden": 0.68359375, + "loss/logits": 0.1212427169084549, + "loss/reg": 0.02378367818892002, + "step": 428 + }, + { + "epoch": 0.053625, + "grad_norm": 3.6646056175231934, + "grad_norm_var": 0.7156687449512967, + "learning_rate": 0.0001, + "loss": 1.0166, + "loss/crossentropy": 2.765550374984741, + "loss/hidden": 0.6328125, + "loss/logits": 0.14600837230682373, + "loss/reg": 0.023774517700076103, + "step": 429 + }, + { + "epoch": 0.05375, + "grad_norm": 3.2596421241760254, + "grad_norm_var": 0.6044660126437359, + "learning_rate": 0.0001, + "loss": 1.13, + "loss/crossentropy": 2.1970372200012207, + "loss/hidden": 0.7578125, + "loss/logits": 0.13451042771339417, + "loss/reg": 0.023765094578266144, + "step": 430 + }, + { + "epoch": 0.053875, + "grad_norm": 4.039770603179932, + "grad_norm_var": 0.6214738630237046, + "learning_rate": 0.0001, + "loss": 1.2708, + "loss/crossentropy": 2.5254645347595215, + "loss/hidden": 0.83984375, + "loss/logits": 0.19336232542991638, + "loss/reg": 0.023756500333547592, + "step": 431 + }, + { + "epoch": 0.054, + "grad_norm": 2.9176530838012695, + "grad_norm_var": 0.6009675102137348, + "learning_rate": 0.0001, + "loss": 0.9399, + "loss/crossentropy": 2.611178398132324, + "loss/hidden": 0.59375, + "loss/logits": 0.10868553817272186, + "loss/reg": 0.023747922852635384, + "step": 432 + }, + { + "epoch": 0.054125, + "grad_norm": 3.540189027786255, + "grad_norm_var": 0.5748467113480954, + "learning_rate": 0.0001, + "loss": 1.164, + "loss/crossentropy": 2.4766628742218018, + "loss/hidden": 0.80078125, + "loss/logits": 0.1258353739976883, + "loss/reg": 0.023739352822303772, + "step": 433 + }, + { + "epoch": 0.05425, + "grad_norm": 3.7053966522216797, + "grad_norm_var": 0.4931212433281331, + "learning_rate": 0.0001, + "loss": 1.0522, + "loss/crossentropy": 2.491478681564331, + "loss/hidden": 0.67578125, + "loss/logits": 0.13909485936164856, + "loss/reg": 0.023731039837002754, + "step": 434 + }, + { + "epoch": 0.054375, + "grad_norm": 3.4101648330688477, + "grad_norm_var": 0.32751985750053336, + "learning_rate": 0.0001, + "loss": 0.9527, + "loss/crossentropy": 2.5495922565460205, + "loss/hidden": 0.59765625, + "loss/logits": 0.1178436130285263, + "loss/reg": 0.023722674697637558, + "step": 435 + }, + { + "epoch": 0.0545, + "grad_norm": 3.923790693283081, + "grad_norm_var": 0.33181180777142816, + "learning_rate": 0.0001, + "loss": 1.1217, + "loss/crossentropy": 2.350161552429199, + "loss/hidden": 0.75, + "loss/logits": 0.13453412055969238, + "loss/reg": 0.02371453307569027, + "step": 436 + }, + { + "epoch": 0.054625, + "grad_norm": 3.2052547931671143, + "grad_norm_var": 0.3040979482718351, + "learning_rate": 0.0001, + "loss": 1.0641, + "loss/crossentropy": 2.5055789947509766, + "loss/hidden": 0.71484375, + "loss/logits": 0.11222882568836212, + "loss/reg": 0.023706616833806038, + "step": 437 + }, + { + "epoch": 0.05475, + "grad_norm": 2.820063591003418, + "grad_norm_var": 0.26777286633351405, + "learning_rate": 0.0001, + "loss": 1.1811, + "loss/crossentropy": 2.535069704055786, + "loss/hidden": 0.78125, + "loss/logits": 0.1628357172012329, + "loss/reg": 0.023698095232248306, + "step": 438 + }, + { + "epoch": 0.054875, + "grad_norm": 3.095841884613037, + "grad_norm_var": 0.24744105333314317, + "learning_rate": 0.0001, + "loss": 1.1936, + "loss/crossentropy": 2.313276767730713, + "loss/hidden": 0.78125, + "loss/logits": 0.17543524503707886, + "loss/reg": 0.02368931844830513, + "step": 439 + }, + { + "epoch": 0.055, + "grad_norm": 2.955540895462036, + "grad_norm_var": 0.1683967569747227, + "learning_rate": 0.0001, + "loss": 0.991, + "loss/crossentropy": 2.747128486633301, + "loss/hidden": 0.6484375, + "loss/logits": 0.10580303519964218, + "loss/reg": 0.023680580779910088, + "step": 440 + }, + { + "epoch": 0.055125, + "grad_norm": 2.866759777069092, + "grad_norm_var": 0.16086728592038804, + "learning_rate": 0.0001, + "loss": 0.9014, + "loss/crossentropy": 2.4465224742889404, + "loss/hidden": 0.5703125, + "loss/logits": 0.09433356672525406, + "loss/reg": 0.023672088980674744, + "step": 441 + }, + { + "epoch": 0.05525, + "grad_norm": 3.1793012619018555, + "grad_norm_var": 0.14310091597801508, + "learning_rate": 0.0001, + "loss": 1.0742, + "loss/crossentropy": 2.364579677581787, + "loss/hidden": 0.6875, + "loss/logits": 0.15003816783428192, + "loss/reg": 0.023663459345698357, + "step": 442 + }, + { + "epoch": 0.055375, + "grad_norm": 4.3040900230407715, + "grad_norm_var": 0.19784760386154687, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.955233573913574, + "loss/hidden": 0.74609375, + "loss/logits": 0.1579587459564209, + "loss/reg": 0.023654496297240257, + "step": 443 + }, + { + "epoch": 0.0555, + "grad_norm": 3.6495676040649414, + "grad_norm_var": 0.19966009525053988, + "learning_rate": 0.0001, + "loss": 1.0678, + "loss/crossentropy": 2.184626340866089, + "loss/hidden": 0.71484375, + "loss/logits": 0.11654888093471527, + "loss/reg": 0.023645464330911636, + "step": 444 + }, + { + "epoch": 0.055625, + "grad_norm": 2.7992637157440186, + "grad_norm_var": 0.21692371557562992, + "learning_rate": 0.0001, + "loss": 1.0746, + "loss/crossentropy": 1.9573417901992798, + "loss/hidden": 0.71484375, + "loss/logits": 0.1234317272901535, + "loss/reg": 0.023636594414711, + "step": 445 + }, + { + "epoch": 0.05575, + "grad_norm": 3.4131267070770264, + "grad_norm_var": 0.21645445922388262, + "learning_rate": 0.0001, + "loss": 1.0938, + "loss/crossentropy": 2.3615517616271973, + "loss/hidden": 0.71875, + "loss/logits": 0.1387380063533783, + "loss/reg": 0.023627731949090958, + "step": 446 + }, + { + "epoch": 0.055875, + "grad_norm": 2.890436887741089, + "grad_norm_var": 0.19547383544341201, + "learning_rate": 0.0001, + "loss": 0.8903, + "loss/crossentropy": 2.2940785884857178, + "loss/hidden": 0.5546875, + "loss/logits": 0.09945578873157501, + "loss/reg": 0.023619333282113075, + "step": 447 + }, + { + "epoch": 0.056, + "grad_norm": 3.9202187061309814, + "grad_norm_var": 0.20821686288428035, + "learning_rate": 0.0001, + "loss": 1.1124, + "loss/crossentropy": 2.561680316925049, + "loss/hidden": 0.73828125, + "loss/logits": 0.13801740109920502, + "loss/reg": 0.023610329255461693, + "step": 448 + }, + { + "epoch": 0.056125, + "grad_norm": 3.0585479736328125, + "grad_norm_var": 0.21081889060941586, + "learning_rate": 0.0001, + "loss": 0.9545, + "loss/crossentropy": 2.4095382690429688, + "loss/hidden": 0.609375, + "loss/logits": 0.10911431908607483, + "loss/reg": 0.02360081672668457, + "step": 449 + }, + { + "epoch": 0.05625, + "grad_norm": 3.3532161712646484, + "grad_norm_var": 0.2007006666523369, + "learning_rate": 0.0001, + "loss": 1.1045, + "loss/crossentropy": 1.8727831840515137, + "loss/hidden": 0.7421875, + "loss/logits": 0.1264159381389618, + "loss/reg": 0.023592744022607803, + "step": 450 + }, + { + "epoch": 0.056375, + "grad_norm": 3.1456074714660645, + "grad_norm_var": 0.20128870800301848, + "learning_rate": 0.0001, + "loss": 0.9574, + "loss/crossentropy": 2.4792492389678955, + "loss/hidden": 0.609375, + "loss/logits": 0.11215440928936005, + "loss/reg": 0.02358343079686165, + "step": 451 + }, + { + "epoch": 0.0565, + "grad_norm": 3.402637004852295, + "grad_norm_var": 0.1739656178124496, + "learning_rate": 0.0001, + "loss": 1.0743, + "loss/crossentropy": 2.3384110927581787, + "loss/hidden": 0.6875, + "loss/logits": 0.15106429159641266, + "loss/reg": 0.02357417158782482, + "step": 452 + }, + { + "epoch": 0.056625, + "grad_norm": 3.259817361831665, + "grad_norm_var": 0.17379912081048493, + "learning_rate": 0.0001, + "loss": 1.1221, + "loss/crossentropy": 2.456613779067993, + "loss/hidden": 0.734375, + "loss/logits": 0.15212517976760864, + "loss/reg": 0.023564757779240608, + "step": 453 + }, + { + "epoch": 0.05675, + "grad_norm": 2.531987190246582, + "grad_norm_var": 0.195773570863138, + "learning_rate": 0.0001, + "loss": 0.9605, + "loss/crossentropy": 2.394343376159668, + "loss/hidden": 0.609375, + "loss/logits": 0.11555634438991547, + "loss/reg": 0.023556271567940712, + "step": 454 + }, + { + "epoch": 0.056875, + "grad_norm": 2.594336986541748, + "grad_norm_var": 0.22107356191806862, + "learning_rate": 0.0001, + "loss": 0.9207, + "loss/crossentropy": 2.781731367111206, + "loss/hidden": 0.58203125, + "loss/logits": 0.10317155718803406, + "loss/reg": 0.023547139018774033, + "step": 455 + }, + { + "epoch": 0.057, + "grad_norm": 3.1260924339294434, + "grad_norm_var": 0.21715561662650557, + "learning_rate": 0.0001, + "loss": 1.1637, + "loss/crossentropy": 2.5018725395202637, + "loss/hidden": 0.78125, + "loss/logits": 0.14706720411777496, + "loss/reg": 0.02353852428495884, + "step": 456 + }, + { + "epoch": 0.057125, + "grad_norm": 3.159911632537842, + "grad_norm_var": 0.20878072756432645, + "learning_rate": 0.0001, + "loss": 1.0676, + "loss/crossentropy": 2.668788433074951, + "loss/hidden": 0.68359375, + "loss/logits": 0.1487644910812378, + "loss/reg": 0.02352879010140896, + "step": 457 + }, + { + "epoch": 0.05725, + "grad_norm": 3.0937418937683105, + "grad_norm_var": 0.20989373673105333, + "learning_rate": 0.0001, + "loss": 1.0188, + "loss/crossentropy": 2.498286008834839, + "loss/hidden": 0.6640625, + "loss/logits": 0.11953231692314148, + "loss/reg": 0.023519227281212807, + "step": 458 + }, + { + "epoch": 0.057375, + "grad_norm": 2.4465949535369873, + "grad_norm_var": 0.15987096754079838, + "learning_rate": 0.0001, + "loss": 1.0333, + "loss/crossentropy": 2.401630401611328, + "loss/hidden": 0.671875, + "loss/logits": 0.12631601095199585, + "loss/reg": 0.023509083315730095, + "step": 459 + }, + { + "epoch": 0.0575, + "grad_norm": 3.8185830116271973, + "grad_norm_var": 0.17369585396981316, + "learning_rate": 0.0001, + "loss": 1.2185, + "loss/crossentropy": 2.3343100547790527, + "loss/hidden": 0.84375, + "loss/logits": 0.13973468542099, + "loss/reg": 0.023500461131334305, + "step": 460 + }, + { + "epoch": 0.057625, + "grad_norm": 2.769894599914551, + "grad_norm_var": 0.1750287637092998, + "learning_rate": 0.0001, + "loss": 1.0478, + "loss/crossentropy": 2.594421148300171, + "loss/hidden": 0.66796875, + "loss/logits": 0.14495806396007538, + "loss/reg": 0.02349086105823517, + "step": 461 + }, + { + "epoch": 0.05775, + "grad_norm": 3.924386501312256, + "grad_norm_var": 0.21107140664515026, + "learning_rate": 0.0001, + "loss": 1.1201, + "loss/crossentropy": 1.8543033599853516, + "loss/hidden": 0.71875, + "loss/logits": 0.16654378175735474, + "loss/reg": 0.023482073098421097, + "step": 462 + }, + { + "epoch": 0.057875, + "grad_norm": 2.719325304031372, + "grad_norm_var": 0.21896016035892957, + "learning_rate": 0.0001, + "loss": 0.9973, + "loss/crossentropy": 2.7110376358032227, + "loss/hidden": 0.6328125, + "loss/logits": 0.1297917366027832, + "loss/reg": 0.02347267046570778, + "step": 463 + }, + { + "epoch": 0.058, + "grad_norm": 3.110532522201538, + "grad_norm_var": 0.17627651595159174, + "learning_rate": 0.0001, + "loss": 0.9503, + "loss/crossentropy": 2.424137830734253, + "loss/hidden": 0.61328125, + "loss/logits": 0.10241679847240448, + "loss/reg": 0.023463333025574684, + "step": 464 + }, + { + "epoch": 0.058125, + "grad_norm": 3.2945988178253174, + "grad_norm_var": 0.17862116157392785, + "learning_rate": 0.0001, + "loss": 1.1634, + "loss/crossentropy": 2.8107378482818604, + "loss/hidden": 0.75, + "loss/logits": 0.17888996005058289, + "loss/reg": 0.0234534852206707, + "step": 465 + }, + { + "epoch": 0.05825, + "grad_norm": 4.864523887634277, + "grad_norm_var": 0.37049430510921844, + "learning_rate": 0.0001, + "loss": 1.0812, + "loss/crossentropy": 2.5031394958496094, + "loss/hidden": 0.71484375, + "loss/logits": 0.13192051649093628, + "loss/reg": 0.023444540798664093, + "step": 466 + }, + { + "epoch": 0.058375, + "grad_norm": 3.8722984790802, + "grad_norm_var": 0.3978501673810001, + "learning_rate": 0.0001, + "loss": 1.2316, + "loss/crossentropy": 1.9772007465362549, + "loss/hidden": 0.8515625, + "loss/logits": 0.14563970267772675, + "loss/reg": 0.023435747250914574, + "step": 467 + }, + { + "epoch": 0.0585, + "grad_norm": 4.621346473693848, + "grad_norm_var": 0.5155902021721573, + "learning_rate": 0.0001, + "loss": 1.1725, + "loss/crossentropy": 2.4977753162384033, + "loss/hidden": 0.7890625, + "loss/logits": 0.149122953414917, + "loss/reg": 0.023427119478583336, + "step": 468 + }, + { + "epoch": 0.058625, + "grad_norm": 3.36370849609375, + "grad_norm_var": 0.5153549660191058, + "learning_rate": 0.0001, + "loss": 1.2411, + "loss/crossentropy": 2.4750683307647705, + "loss/hidden": 0.8203125, + "loss/logits": 0.18663738667964935, + "loss/reg": 0.023417862132191658, + "step": 469 + }, + { + "epoch": 0.05875, + "grad_norm": 3.391871690750122, + "grad_norm_var": 0.46984604899865395, + "learning_rate": 0.0001, + "loss": 1.1066, + "loss/crossentropy": 2.5189239978790283, + "loss/hidden": 0.71875, + "loss/logits": 0.1537722945213318, + "loss/reg": 0.023408619686961174, + "step": 470 + }, + { + "epoch": 0.058875, + "grad_norm": 3.903122901916504, + "grad_norm_var": 0.43880097595690587, + "learning_rate": 0.0001, + "loss": 0.9258, + "loss/crossentropy": 2.548199415206909, + "loss/hidden": 0.59375, + "loss/logits": 0.09802491217851639, + "loss/reg": 0.02339930646121502, + "step": 471 + }, + { + "epoch": 0.059, + "grad_norm": 5.576291084289551, + "grad_norm_var": 0.7024716555345464, + "learning_rate": 0.0001, + "loss": 1.2843, + "loss/crossentropy": 2.682227849960327, + "loss/hidden": 0.859375, + "loss/logits": 0.19106051325798035, + "loss/reg": 0.023390140384435654, + "step": 472 + }, + { + "epoch": 0.059125, + "grad_norm": 2.4201464653015137, + "grad_norm_var": 0.7821220779043936, + "learning_rate": 0.0001, + "loss": 0.9341, + "loss/crossentropy": 2.5251269340515137, + "loss/hidden": 0.59375, + "loss/logits": 0.10658347606658936, + "loss/reg": 0.023381320759654045, + "step": 473 + }, + { + "epoch": 0.05925, + "grad_norm": 4.325901985168457, + "grad_norm_var": 0.7980385459591806, + "learning_rate": 0.0001, + "loss": 0.9798, + "loss/crossentropy": 2.3592865467071533, + "loss/hidden": 0.640625, + "loss/logits": 0.10548710823059082, + "loss/reg": 0.023372096940875053, + "step": 474 + }, + { + "epoch": 0.059375, + "grad_norm": 3.1382455825805664, + "grad_norm_var": 0.7168259193102716, + "learning_rate": 0.0001, + "loss": 0.9839, + "loss/crossentropy": 2.6056907176971436, + "loss/hidden": 0.62890625, + "loss/logits": 0.121395543217659, + "loss/reg": 0.023363398388028145, + "step": 475 + }, + { + "epoch": 0.0595, + "grad_norm": 3.0779755115509033, + "grad_norm_var": 0.7388713721113239, + "learning_rate": 0.0001, + "loss": 1.0798, + "loss/crossentropy": 2.320854663848877, + "loss/hidden": 0.72265625, + "loss/logits": 0.12356055527925491, + "loss/reg": 0.023354284465312958, + "step": 476 + }, + { + "epoch": 0.059625, + "grad_norm": 3.293567657470703, + "grad_norm_var": 0.6946720185858983, + "learning_rate": 0.0001, + "loss": 0.9685, + "loss/crossentropy": 2.7064430713653564, + "loss/hidden": 0.625, + "loss/logits": 0.11008161306381226, + "loss/reg": 0.023345019668340683, + "step": 477 + }, + { + "epoch": 0.05975, + "grad_norm": 2.664088249206543, + "grad_norm_var": 0.7530647477645431, + "learning_rate": 0.0001, + "loss": 0.9101, + "loss/crossentropy": 2.6147005558013916, + "loss/hidden": 0.5703125, + "loss/logits": 0.10643748193979263, + "loss/reg": 0.02333623729646206, + "step": 478 + }, + { + "epoch": 0.059875, + "grad_norm": 2.9174795150756836, + "grad_norm_var": 0.7321888983535926, + "learning_rate": 0.0001, + "loss": 0.9342, + "loss/crossentropy": 2.566849946975708, + "loss/hidden": 0.58984375, + "loss/logits": 0.11108069121837616, + "loss/reg": 0.023327510803937912, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 3.3596463203430176, + "grad_norm_var": 0.71932045702877, + "learning_rate": 0.0001, + "loss": 1.0961, + "loss/crossentropy": 2.645395040512085, + "loss/hidden": 0.703125, + "loss/logits": 0.15979796648025513, + "loss/reg": 0.023318573832511902, + "step": 480 + }, + { + "epoch": 0.060125, + "grad_norm": 4.0903096199035645, + "grad_norm_var": 0.7232764591548666, + "learning_rate": 0.0001, + "loss": 1.1686, + "loss/crossentropy": 2.077467918395996, + "loss/hidden": 0.79296875, + "loss/logits": 0.14252969622612, + "loss/reg": 0.02330981194972992, + "step": 481 + }, + { + "epoch": 0.06025, + "grad_norm": 3.277656078338623, + "grad_norm_var": 0.6300433507978689, + "learning_rate": 0.0001, + "loss": 1.1136, + "loss/crossentropy": 2.588385581970215, + "loss/hidden": 0.7421875, + "loss/logits": 0.13843819499015808, + "loss/reg": 0.02330118976533413, + "step": 482 + }, + { + "epoch": 0.060375, + "grad_norm": 4.117528915405273, + "grad_norm_var": 0.6433314640873874, + "learning_rate": 0.0001, + "loss": 1.0239, + "loss/crossentropy": 2.766176700592041, + "loss/hidden": 0.6640625, + "loss/logits": 0.12686920166015625, + "loss/reg": 0.023292165249586105, + "step": 483 + }, + { + "epoch": 0.0605, + "grad_norm": 11.29445743560791, + "grad_norm_var": 4.338621670503842, + "learning_rate": 0.0001, + "loss": 1.9614, + "loss/crossentropy": 2.5279059410095215, + "loss/hidden": 1.2109375, + "loss/logits": 0.5176718235015869, + "loss/reg": 0.02328311838209629, + "step": 484 + }, + { + "epoch": 0.060625, + "grad_norm": 3.5054728984832764, + "grad_norm_var": 4.327600163307723, + "learning_rate": 0.0001, + "loss": 1.0719, + "loss/crossentropy": 2.726095199584961, + "loss/hidden": 0.7109375, + "loss/logits": 0.12821289896965027, + "loss/reg": 0.023273879662156105, + "step": 485 + }, + { + "epoch": 0.06075, + "grad_norm": 3.157118558883667, + "grad_norm_var": 4.350771203860321, + "learning_rate": 0.0001, + "loss": 0.9326, + "loss/crossentropy": 2.207131862640381, + "loss/hidden": 0.60546875, + "loss/logits": 0.09451892971992493, + "loss/reg": 0.02326469123363495, + "step": 486 + }, + { + "epoch": 0.060875, + "grad_norm": 5.006563186645508, + "grad_norm_var": 4.411522578027558, + "learning_rate": 0.0001, + "loss": 1.1884, + "loss/crossentropy": 3.005479335784912, + "loss/hidden": 0.78515625, + "loss/logits": 0.17073442041873932, + "loss/reg": 0.02325539104640484, + "step": 487 + }, + { + "epoch": 0.061, + "grad_norm": 8.253157615661621, + "grad_norm_var": 5.3947068177792, + "learning_rate": 0.0001, + "loss": 1.4073, + "loss/crossentropy": 2.329857349395752, + "loss/hidden": 0.984375, + "loss/logits": 0.19045758247375488, + "loss/reg": 0.02324584126472473, + "step": 488 + }, + { + "epoch": 0.061125, + "grad_norm": 4.795626640319824, + "grad_norm_var": 5.169810789054156, + "learning_rate": 0.0001, + "loss": 1.2274, + "loss/crossentropy": 2.354893207550049, + "loss/hidden": 0.8515625, + "loss/logits": 0.14352190494537354, + "loss/reg": 0.023236218839883804, + "step": 489 + }, + { + "epoch": 0.06125, + "grad_norm": 3.1360483169555664, + "grad_norm_var": 5.268809256909969, + "learning_rate": 0.0001, + "loss": 1.0242, + "loss/crossentropy": 2.2810122966766357, + "loss/hidden": 0.671875, + "loss/logits": 0.12006018310785294, + "loss/reg": 0.02322734333574772, + "step": 490 + }, + { + "epoch": 0.061375, + "grad_norm": 3.931467056274414, + "grad_norm_var": 5.1833802842946906, + "learning_rate": 0.0001, + "loss": 1.1688, + "loss/crossentropy": 2.3372247219085693, + "loss/hidden": 0.78515625, + "loss/logits": 0.15147234499454498, + "loss/reg": 0.023217879235744476, + "step": 491 + }, + { + "epoch": 0.0615, + "grad_norm": 3.8858096599578857, + "grad_norm_var": 5.085283642122107, + "learning_rate": 0.0001, + "loss": 1.0309, + "loss/crossentropy": 2.597487688064575, + "loss/hidden": 0.6796875, + "loss/logits": 0.11914543062448502, + "loss/reg": 0.02320869080722332, + "step": 492 + }, + { + "epoch": 0.061625, + "grad_norm": 6.116000175476074, + "grad_norm_var": 5.160062314221837, + "learning_rate": 0.0001, + "loss": 1.2295, + "loss/crossentropy": 2.4041733741760254, + "loss/hidden": 0.8515625, + "loss/logits": 0.14599129557609558, + "loss/reg": 0.023199014365673065, + "step": 493 + }, + { + "epoch": 0.06175, + "grad_norm": 3.4635026454925537, + "grad_norm_var": 4.994267697000357, + "learning_rate": 0.0001, + "loss": 1.0957, + "loss/crossentropy": 2.692230224609375, + "loss/hidden": 0.73046875, + "loss/logits": 0.13329669833183289, + "loss/reg": 0.023189352825284004, + "step": 494 + }, + { + "epoch": 0.061875, + "grad_norm": 6.833379745483398, + "grad_norm_var": 5.051083471594066, + "learning_rate": 0.0001, + "loss": 1.1437, + "loss/crossentropy": 2.470660448074341, + "loss/hidden": 0.7890625, + "loss/logits": 0.12278926372528076, + "loss/reg": 0.023179946467280388, + "step": 495 + }, + { + "epoch": 0.062, + "grad_norm": 3.2948107719421387, + "grad_norm_var": 5.064566926371495, + "learning_rate": 0.0001, + "loss": 1.109, + "loss/crossentropy": 2.3682029247283936, + "loss/hidden": 0.7421875, + "loss/logits": 0.13507232069969177, + "loss/reg": 0.02317013218998909, + "step": 496 + }, + { + "epoch": 0.062125, + "grad_norm": 4.057919025421143, + "grad_norm_var": 5.068064269732227, + "learning_rate": 0.0001, + "loss": 1.1648, + "loss/crossentropy": 2.4153223037719727, + "loss/hidden": 0.765625, + "loss/logits": 0.16754823923110962, + "loss/reg": 0.02316114492714405, + "step": 497 + }, + { + "epoch": 0.06225, + "grad_norm": 3.3985679149627686, + "grad_norm_var": 5.043098814178749, + "learning_rate": 0.0001, + "loss": 1.1033, + "loss/crossentropy": 2.6683197021484375, + "loss/hidden": 0.73828125, + "loss/logits": 0.13353273272514343, + "loss/reg": 0.023151271045207977, + "step": 498 + }, + { + "epoch": 0.062375, + "grad_norm": 2.6574859619140625, + "grad_norm_var": 5.326800856327217, + "learning_rate": 0.0001, + "loss": 0.9413, + "loss/crossentropy": 2.490849733352661, + "loss/hidden": 0.60546875, + "loss/logits": 0.1043705940246582, + "loss/reg": 0.023141290992498398, + "step": 499 + }, + { + "epoch": 0.0625, + "grad_norm": 3.2627739906311035, + "grad_norm_var": 2.402846049322009, + "learning_rate": 0.0001, + "loss": 1.0233, + "loss/crossentropy": 2.3429057598114014, + "loss/hidden": 0.63671875, + "loss/logits": 0.1552838534116745, + "loss/reg": 0.02313125506043434, + "step": 500 + }, + { + "epoch": 0.062625, + "grad_norm": 3.2453906536102295, + "grad_norm_var": 2.4345300369903553, + "learning_rate": 0.0001, + "loss": 1.1719, + "loss/crossentropy": 2.497878313064575, + "loss/hidden": 0.76953125, + "loss/logits": 0.17116406559944153, + "loss/reg": 0.02312229759991169, + "step": 501 + }, + { + "epoch": 0.06275, + "grad_norm": 4.79340934753418, + "grad_norm_var": 2.3566760840149366, + "learning_rate": 0.0001, + "loss": 1.0001, + "loss/crossentropy": 2.360431671142578, + "loss/hidden": 0.6484375, + "loss/logits": 0.12051868438720703, + "loss/reg": 0.02311263047158718, + "step": 502 + }, + { + "epoch": 0.062875, + "grad_norm": 3.1595826148986816, + "grad_norm_var": 2.4163836713766425, + "learning_rate": 0.0001, + "loss": 0.9158, + "loss/crossentropy": 2.266618490219116, + "loss/hidden": 0.58203125, + "loss/logits": 0.1027822494506836, + "loss/reg": 0.023102767765522003, + "step": 503 + }, + { + "epoch": 0.063, + "grad_norm": 3.59019136428833, + "grad_norm_var": 1.2975304557542653, + "learning_rate": 0.0001, + "loss": 0.9685, + "loss/crossentropy": 2.6386334896087646, + "loss/hidden": 0.625, + "loss/logits": 0.11253425478935242, + "loss/reg": 0.023093828931450844, + "step": 504 + }, + { + "epoch": 0.063125, + "grad_norm": 3.1218326091766357, + "grad_norm_var": 1.2897946661694502, + "learning_rate": 0.0001, + "loss": 0.9493, + "loss/crossentropy": 2.2931203842163086, + "loss/hidden": 0.6171875, + "loss/logits": 0.10128200799226761, + "loss/reg": 0.0230838842689991, + "step": 505 + }, + { + "epoch": 0.06325, + "grad_norm": 7.243019104003906, + "grad_norm_var": 1.941121973828988, + "learning_rate": 0.0001, + "loss": 1.1363, + "loss/crossentropy": 2.510519504547119, + "loss/hidden": 0.7890625, + "loss/logits": 0.11650878190994263, + "loss/reg": 0.023074399679899216, + "step": 506 + }, + { + "epoch": 0.063375, + "grad_norm": 2.7458910942077637, + "grad_norm_var": 2.0601092371510408, + "learning_rate": 0.0001, + "loss": 0.8688, + "loss/crossentropy": 2.504798173904419, + "loss/hidden": 0.54296875, + "loss/logits": 0.09517204016447067, + "loss/reg": 0.023064618930220604, + "step": 507 + }, + { + "epoch": 0.0635, + "grad_norm": 3.834894895553589, + "grad_norm_var": 2.061415401484546, + "learning_rate": 0.0001, + "loss": 1.0614, + "loss/crossentropy": 2.504178285598755, + "loss/hidden": 0.69140625, + "loss/logits": 0.13941214978694916, + "loss/reg": 0.023055192083120346, + "step": 508 + }, + { + "epoch": 0.063625, + "grad_norm": 3.0524418354034424, + "grad_norm_var": 1.8045701590720038, + "learning_rate": 0.0001, + "loss": 0.9395, + "loss/crossentropy": 2.6670830249786377, + "loss/hidden": 0.6015625, + "loss/logits": 0.10751838982105255, + "loss/reg": 0.023046277463436127, + "step": 509 + }, + { + "epoch": 0.06375, + "grad_norm": 2.638979196548462, + "grad_norm_var": 1.8906158947457992, + "learning_rate": 0.0001, + "loss": 0.9649, + "loss/crossentropy": 2.5606770515441895, + "loss/hidden": 0.62109375, + "loss/logits": 0.11341118812561035, + "loss/reg": 0.02303677424788475, + "step": 510 + }, + { + "epoch": 0.063875, + "grad_norm": 4.029105186462402, + "grad_norm_var": 1.2509737999906814, + "learning_rate": 0.0001, + "loss": 1.0378, + "loss/crossentropy": 2.446560859680176, + "loss/hidden": 0.7109375, + "loss/logits": 0.09661944955587387, + "loss/reg": 0.023027852177619934, + "step": 511 + }, + { + "epoch": 0.064, + "grad_norm": 3.203378438949585, + "grad_norm_var": 1.255617850639648, + "learning_rate": 0.0001, + "loss": 0.9705, + "loss/crossentropy": 2.08353328704834, + "loss/hidden": 0.63671875, + "loss/logits": 0.10362571477890015, + "loss/reg": 0.02301831543445587, + "step": 512 + }, + { + "epoch": 0.064125, + "grad_norm": 2.5737931728363037, + "grad_norm_var": 1.3080458668089554, + "learning_rate": 0.0001, + "loss": 1.0597, + "loss/crossentropy": 2.3520448207855225, + "loss/hidden": 0.7109375, + "loss/logits": 0.11864635348320007, + "loss/reg": 0.023009376600384712, + "step": 513 + }, + { + "epoch": 0.06425, + "grad_norm": 3.6731107234954834, + "grad_norm_var": 1.307783724921588, + "learning_rate": 0.0001, + "loss": 1.1013, + "loss/crossentropy": 2.77113938331604, + "loss/hidden": 0.73046875, + "loss/logits": 0.1408485472202301, + "loss/reg": 0.022999830543994904, + "step": 514 + }, + { + "epoch": 0.064375, + "grad_norm": 3.06605863571167, + "grad_norm_var": 1.269509965568249, + "learning_rate": 0.0001, + "loss": 0.9224, + "loss/crossentropy": 2.3147072792053223, + "loss/hidden": 0.59375, + "loss/logits": 0.0987289547920227, + "loss/reg": 0.022990131750702858, + "step": 515 + }, + { + "epoch": 0.0645, + "grad_norm": 3.462446689605713, + "grad_norm_var": 1.2636330593023397, + "learning_rate": 0.0001, + "loss": 1.0201, + "loss/crossentropy": 2.4042327404022217, + "loss/hidden": 0.66796875, + "loss/logits": 0.12231434136629105, + "loss/reg": 0.02298046089708805, + "step": 516 + }, + { + "epoch": 0.064625, + "grad_norm": 4.2029500007629395, + "grad_norm_var": 1.2769943636458339, + "learning_rate": 0.0001, + "loss": 1.222, + "loss/crossentropy": 2.514112710952759, + "loss/hidden": 0.82421875, + "loss/logits": 0.16808617115020752, + "loss/reg": 0.022970519959926605, + "step": 517 + }, + { + "epoch": 0.06475, + "grad_norm": 3.433554172515869, + "grad_norm_var": 1.1851525686550586, + "learning_rate": 0.0001, + "loss": 1.2444, + "loss/crossentropy": 2.493224859237671, + "loss/hidden": 0.84375, + "loss/logits": 0.17107471823692322, + "loss/reg": 0.022961357608437538, + "step": 518 + }, + { + "epoch": 0.064875, + "grad_norm": 4.310100078582764, + "grad_norm_var": 1.2057753361074104, + "learning_rate": 0.0001, + "loss": 1.0108, + "loss/crossentropy": 2.5606930255889893, + "loss/hidden": 0.67578125, + "loss/logits": 0.10546360909938812, + "loss/reg": 0.022952331230044365, + "step": 519 + }, + { + "epoch": 0.065, + "grad_norm": 3.674527883529663, + "grad_norm_var": 1.2057007253632908, + "learning_rate": 0.0001, + "loss": 0.9803, + "loss/crossentropy": 2.4196624755859375, + "loss/hidden": 0.64453125, + "loss/logits": 0.10631287097930908, + "loss/reg": 0.02294265851378441, + "step": 520 + }, + { + "epoch": 0.065125, + "grad_norm": 3.101484775543213, + "grad_norm_var": 1.20713683658368, + "learning_rate": 0.0001, + "loss": 1.2375, + "loss/crossentropy": 2.1448891162872314, + "loss/hidden": 0.88671875, + "loss/logits": 0.12145140767097473, + "loss/reg": 0.02293260022997856, + "step": 521 + }, + { + "epoch": 0.06525, + "grad_norm": 3.2564265727996826, + "grad_norm_var": 0.2854656858181736, + "learning_rate": 0.0001, + "loss": 0.9783, + "loss/crossentropy": 2.3986809253692627, + "loss/hidden": 0.609375, + "loss/logits": 0.1396813988685608, + "loss/reg": 0.022922798991203308, + "step": 522 + }, + { + "epoch": 0.065375, + "grad_norm": 3.3007333278656006, + "grad_norm_var": 0.2569672821287893, + "learning_rate": 0.0001, + "loss": 0.9586, + "loss/crossentropy": 2.7955212593078613, + "loss/hidden": 0.62109375, + "loss/logits": 0.10833179950714111, + "loss/reg": 0.02291307970881462, + "step": 523 + }, + { + "epoch": 0.0655, + "grad_norm": 2.9546499252319336, + "grad_norm_var": 0.25738909944094907, + "learning_rate": 0.0001, + "loss": 1.0937, + "loss/crossentropy": 2.409029006958008, + "loss/hidden": 0.71875, + "loss/logits": 0.14591118693351746, + "loss/reg": 0.022904111072421074, + "step": 524 + }, + { + "epoch": 0.065625, + "grad_norm": 3.2193830013275146, + "grad_norm_var": 0.25204334767618, + "learning_rate": 0.0001, + "loss": 1.1221, + "loss/crossentropy": 2.491401195526123, + "loss/hidden": 0.75390625, + "loss/logits": 0.13920898735523224, + "loss/reg": 0.02289445698261261, + "step": 525 + }, + { + "epoch": 0.06575, + "grad_norm": 40.478694915771484, + "grad_norm_var": 85.9971082258447, + "learning_rate": 0.0001, + "loss": 1.1342, + "loss/crossentropy": 2.4311652183532715, + "loss/hidden": 0.796875, + "loss/logits": 0.10845671594142914, + "loss/reg": 0.022885650396347046, + "step": 526 + }, + { + "epoch": 0.065875, + "grad_norm": 3.2905385494232178, + "grad_norm_var": 86.20029999738617, + "learning_rate": 0.0001, + "loss": 1.0101, + "loss/crossentropy": 2.1737422943115234, + "loss/hidden": 0.671875, + "loss/logits": 0.10944204032421112, + "loss/reg": 0.022876843810081482, + "step": 527 + }, + { + "epoch": 0.066, + "grad_norm": 4.666721343994141, + "grad_norm_var": 85.84699165642114, + "learning_rate": 0.0001, + "loss": 1.2142, + "loss/crossentropy": 2.462200880050659, + "loss/hidden": 0.86328125, + "loss/logits": 0.12223749607801437, + "loss/reg": 0.02286742813885212, + "step": 528 + }, + { + "epoch": 0.066125, + "grad_norm": 3.0347273349761963, + "grad_norm_var": 85.66251244998139, + "learning_rate": 0.0001, + "loss": 1.0285, + "loss/crossentropy": 2.38840651512146, + "loss/hidden": 0.6640625, + "loss/logits": 0.13581448793411255, + "loss/reg": 0.022858494892716408, + "step": 529 + }, + { + "epoch": 0.06625, + "grad_norm": 2.5890092849731445, + "grad_norm_var": 86.04634847608627, + "learning_rate": 0.0001, + "loss": 0.967, + "loss/crossentropy": 2.5042731761932373, + "loss/hidden": 0.62890625, + "loss/logits": 0.10961504280567169, + "loss/reg": 0.02284966967999935, + "step": 530 + }, + { + "epoch": 0.066375, + "grad_norm": 3.3963401317596436, + "grad_norm_var": 85.93485657047692, + "learning_rate": 0.0001, + "loss": 1.072, + "loss/crossentropy": 2.7259116172790527, + "loss/hidden": 0.7109375, + "loss/logits": 0.13268503546714783, + "loss/reg": 0.02284088172018528, + "step": 531 + }, + { + "epoch": 0.0665, + "grad_norm": 3.731293201446533, + "grad_norm_var": 85.85653980693046, + "learning_rate": 0.0001, + "loss": 1.0647, + "loss/crossentropy": 2.277968168258667, + "loss/hidden": 0.71484375, + "loss/logits": 0.12150134146213531, + "loss/reg": 0.02283208817243576, + "step": 532 + }, + { + "epoch": 0.066625, + "grad_norm": 4.581428050994873, + "grad_norm_var": 85.78540060231343, + "learning_rate": 0.0001, + "loss": 1.2486, + "loss/crossentropy": 2.067720890045166, + "loss/hidden": 0.84375, + "loss/logits": 0.17660680413246155, + "loss/reg": 0.02282322198152542, + "step": 533 + }, + { + "epoch": 0.06675, + "grad_norm": 3.0526421070098877, + "grad_norm_var": 85.91535378874303, + "learning_rate": 0.0001, + "loss": 1.1318, + "loss/crossentropy": 2.4441521167755127, + "loss/hidden": 0.7734375, + "loss/logits": 0.13023720681667328, + "loss/reg": 0.022814445197582245, + "step": 534 + }, + { + "epoch": 0.066875, + "grad_norm": 3.4852664470672607, + "grad_norm_var": 86.12062292738887, + "learning_rate": 0.0001, + "loss": 0.9936, + "loss/crossentropy": 2.418733596801758, + "loss/hidden": 0.66015625, + "loss/logits": 0.10543158650398254, + "loss/reg": 0.022805610671639442, + "step": 535 + }, + { + "epoch": 0.067, + "grad_norm": 2.7321274280548096, + "grad_norm_var": 86.43545869041343, + "learning_rate": 0.0001, + "loss": 1.0096, + "loss/crossentropy": 2.3275394439697266, + "loss/hidden": 0.65625, + "loss/logits": 0.12536926567554474, + "loss/reg": 0.022797243669629097, + "step": 536 + }, + { + "epoch": 0.067125, + "grad_norm": 3.029811382293701, + "grad_norm_var": 86.46041611877568, + "learning_rate": 0.0001, + "loss": 1.1049, + "loss/crossentropy": 2.2477617263793945, + "loss/hidden": 0.74609375, + "loss/logits": 0.1309519112110138, + "loss/reg": 0.022788099944591522, + "step": 537 + }, + { + "epoch": 0.06725, + "grad_norm": 2.7345895767211914, + "grad_norm_var": 86.64571497988939, + "learning_rate": 0.0001, + "loss": 1.1272, + "loss/crossentropy": 2.4289236068725586, + "loss/hidden": 0.7578125, + "loss/logits": 0.141631618142128, + "loss/reg": 0.022778736427426338, + "step": 538 + }, + { + "epoch": 0.067375, + "grad_norm": 3.2934482097625732, + "grad_norm_var": 86.6479928457627, + "learning_rate": 0.0001, + "loss": 1.0286, + "loss/crossentropy": 2.538973093032837, + "loss/hidden": 0.67578125, + "loss/logits": 0.12509004771709442, + "loss/reg": 0.02277030609548092, + "step": 539 + }, + { + "epoch": 0.0675, + "grad_norm": 3.833656072616577, + "grad_norm_var": 86.38133368804911, + "learning_rate": 0.0001, + "loss": 0.999, + "loss/crossentropy": 2.429593563079834, + "loss/hidden": 0.64453125, + "loss/logits": 0.12682604789733887, + "loss/reg": 0.022761952131986618, + "step": 540 + }, + { + "epoch": 0.067625, + "grad_norm": 3.544104814529419, + "grad_norm_var": 86.28065873545309, + "learning_rate": 0.0001, + "loss": 1.0002, + "loss/crossentropy": 2.166292905807495, + "loss/hidden": 0.66015625, + "loss/logits": 0.11254848539829254, + "loss/reg": 0.02275264821946621, + "step": 541 + }, + { + "epoch": 0.06775, + "grad_norm": 4.461411952972412, + "grad_norm_var": 0.4229304647166086, + "learning_rate": 0.0001, + "loss": 1.6203, + "loss/crossentropy": 2.6347737312316895, + "loss/hidden": 1.1328125, + "loss/logits": 0.2600440979003906, + "loss/reg": 0.022744029760360718, + "step": 542 + }, + { + "epoch": 0.067875, + "grad_norm": 2.6814959049224854, + "grad_norm_var": 0.46036790462302574, + "learning_rate": 0.0001, + "loss": 0.9923, + "loss/crossentropy": 2.580264091491699, + "loss/hidden": 0.65234375, + "loss/logits": 0.11260214447975159, + "loss/reg": 0.022734828293323517, + "step": 543 + }, + { + "epoch": 0.068, + "grad_norm": 3.685408353805542, + "grad_norm_var": 0.35847800648438505, + "learning_rate": 0.0001, + "loss": 1.1452, + "loss/crossentropy": 2.7129862308502197, + "loss/hidden": 0.78515625, + "loss/logits": 0.13279825448989868, + "loss/reg": 0.022726204246282578, + "step": 544 + }, + { + "epoch": 0.068125, + "grad_norm": 6.349724292755127, + "grad_norm_var": 0.8985836730566762, + "learning_rate": 0.0001, + "loss": 1.1353, + "loss/crossentropy": 2.7293214797973633, + "loss/hidden": 0.65625, + "loss/logits": 0.2518823742866516, + "loss/reg": 0.022716930136084557, + "step": 545 + }, + { + "epoch": 0.06825, + "grad_norm": 3.681774616241455, + "grad_norm_var": 0.829722440393675, + "learning_rate": 0.0001, + "loss": 1.1759, + "loss/crossentropy": 2.278223991394043, + "loss/hidden": 0.8046875, + "loss/logits": 0.1441642791032791, + "loss/reg": 0.022707859054207802, + "step": 546 + }, + { + "epoch": 0.068375, + "grad_norm": 3.84778094291687, + "grad_norm_var": 0.8276635905853821, + "learning_rate": 0.0001, + "loss": 1.3398, + "loss/crossentropy": 2.1253304481506348, + "loss/hidden": 0.94921875, + "loss/logits": 0.16354900598526, + "loss/reg": 0.022699227556586266, + "step": 547 + }, + { + "epoch": 0.0685, + "grad_norm": 5.178676605224609, + "grad_norm_var": 0.9703527182714744, + "learning_rate": 0.0001, + "loss": 1.022, + "loss/crossentropy": 2.7832319736480713, + "loss/hidden": 0.67578125, + "loss/logits": 0.11932960152626038, + "loss/reg": 0.02269013226032257, + "step": 548 + }, + { + "epoch": 0.068625, + "grad_norm": 3.0284383296966553, + "grad_norm_var": 0.9511722709094016, + "learning_rate": 0.0001, + "loss": 1.0412, + "loss/crossentropy": 1.9599052667617798, + "loss/hidden": 0.703125, + "loss/logits": 0.11125050485134125, + "loss/reg": 0.022681355476379395, + "step": 549 + }, + { + "epoch": 0.06875, + "grad_norm": 3.167809247970581, + "grad_norm_var": 0.942616955302132, + "learning_rate": 0.0001, + "loss": 1.0545, + "loss/crossentropy": 2.5844783782958984, + "loss/hidden": 0.69140625, + "loss/logits": 0.13634443283081055, + "loss/reg": 0.02267223782837391, + "step": 550 + }, + { + "epoch": 0.068875, + "grad_norm": 3.2949278354644775, + "grad_norm_var": 0.9495941353113788, + "learning_rate": 0.0001, + "loss": 1.0619, + "loss/crossentropy": 2.314173460006714, + "loss/hidden": 0.71484375, + "loss/logits": 0.12046810984611511, + "loss/reg": 0.022662866860628128, + "step": 551 + }, + { + "epoch": 0.069, + "grad_norm": 2.8322856426239014, + "grad_norm_var": 0.9378422714313653, + "learning_rate": 0.0001, + "loss": 1.0272, + "loss/crossentropy": 2.768298387527466, + "loss/hidden": 0.66796875, + "loss/logits": 0.1326732635498047, + "loss/reg": 0.022653890773653984, + "step": 552 + }, + { + "epoch": 0.069125, + "grad_norm": 6.815005779266357, + "grad_norm_var": 1.5125797637252996, + "learning_rate": 0.0001, + "loss": 1.1457, + "loss/crossentropy": 2.8011491298675537, + "loss/hidden": 0.78515625, + "loss/logits": 0.13409599661827087, + "loss/reg": 0.022644398733973503, + "step": 553 + }, + { + "epoch": 0.06925, + "grad_norm": 4.219581127166748, + "grad_norm_var": 1.4192768991356985, + "learning_rate": 0.0001, + "loss": 1.2504, + "loss/crossentropy": 2.5874292850494385, + "loss/hidden": 0.8515625, + "loss/logits": 0.17252284288406372, + "loss/reg": 0.02263464592397213, + "step": 554 + }, + { + "epoch": 0.069375, + "grad_norm": 3.8279531002044678, + "grad_norm_var": 1.3871550629864857, + "learning_rate": 0.0001, + "loss": 1.1814, + "loss/crossentropy": 2.5384669303894043, + "loss/hidden": 0.80078125, + "loss/logits": 0.1543978452682495, + "loss/reg": 0.022625621408224106, + "step": 555 + }, + { + "epoch": 0.0695, + "grad_norm": 3.563680648803711, + "grad_norm_var": 1.3987108056073487, + "learning_rate": 0.0001, + "loss": 1.0727, + "loss/crossentropy": 2.111318349838257, + "loss/hidden": 0.73828125, + "loss/logits": 0.10826431214809418, + "loss/reg": 0.022616824135184288, + "step": 556 + }, + { + "epoch": 0.069625, + "grad_norm": 3.9599223136901855, + "grad_norm_var": 1.3836174934919199, + "learning_rate": 0.0001, + "loss": 1.0309, + "loss/crossentropy": 2.429032325744629, + "loss/hidden": 0.67578125, + "loss/logits": 0.12908612191677094, + "loss/reg": 0.022607678547501564, + "step": 557 + }, + { + "epoch": 0.06975, + "grad_norm": 2.8072519302368164, + "grad_norm_var": 1.4610802306207225, + "learning_rate": 0.0001, + "loss": 1.0694, + "loss/crossentropy": 2.5817906856536865, + "loss/hidden": 0.6875, + "loss/logits": 0.15589380264282227, + "loss/reg": 0.022598396986722946, + "step": 558 + }, + { + "epoch": 0.069875, + "grad_norm": 2.764833927154541, + "grad_norm_var": 1.4475983977607596, + "learning_rate": 0.0001, + "loss": 1.0233, + "loss/crossentropy": 2.202237844467163, + "loss/hidden": 0.66015625, + "loss/logits": 0.13721255958080292, + "loss/reg": 0.02258932963013649, + "step": 559 + }, + { + "epoch": 0.07, + "grad_norm": 5.113494396209717, + "grad_norm_var": 1.5267634464669517, + "learning_rate": 0.0001, + "loss": 1.1672, + "loss/crossentropy": 2.006186008453369, + "loss/hidden": 0.80078125, + "loss/logits": 0.14061376452445984, + "loss/reg": 0.022580046206712723, + "step": 560 + }, + { + "epoch": 0.070125, + "grad_norm": 3.8133227825164795, + "grad_norm_var": 1.1437787263680603, + "learning_rate": 0.0001, + "loss": 1.0148, + "loss/crossentropy": 2.37170672416687, + "loss/hidden": 0.67578125, + "loss/logits": 0.11329137533903122, + "loss/reg": 0.02257111482322216, + "step": 561 + }, + { + "epoch": 0.07025, + "grad_norm": 2.7208938598632812, + "grad_norm_var": 1.2255733087022764, + "learning_rate": 0.0001, + "loss": 1.0621, + "loss/crossentropy": 2.271667242050171, + "loss/hidden": 0.703125, + "loss/logits": 0.1333208978176117, + "loss/reg": 0.02256210334599018, + "step": 562 + }, + { + "epoch": 0.070375, + "grad_norm": 53.38179016113281, + "grad_norm_var": 154.8279377341773, + "learning_rate": 0.0001, + "loss": 0.9618, + "loss/crossentropy": 2.2667322158813477, + "loss/hidden": 0.640625, + "loss/logits": 0.09567096829414368, + "loss/reg": 0.02255306765437126, + "step": 563 + }, + { + "epoch": 0.0705, + "grad_norm": 5.538954257965088, + "grad_norm_var": 154.75309317540348, + "learning_rate": 0.0001, + "loss": 1.782, + "loss/crossentropy": 2.5168824195861816, + "loss/hidden": 1.3671875, + "loss/logits": 0.1893981695175171, + "loss/reg": 0.02254408597946167, + "step": 564 + }, + { + "epoch": 0.070625, + "grad_norm": 2.8311338424682617, + "grad_norm_var": 154.85811657117597, + "learning_rate": 0.0001, + "loss": 1.1104, + "loss/crossentropy": 2.1938281059265137, + "loss/hidden": 0.75, + "loss/logits": 0.13508498668670654, + "loss/reg": 0.02253509685397148, + "step": 565 + }, + { + "epoch": 0.07075, + "grad_norm": 4.604408264160156, + "grad_norm_var": 154.26918998432618, + "learning_rate": 0.0001, + "loss": 1.2995, + "loss/crossentropy": 2.103062391281128, + "loss/hidden": 0.91015625, + "loss/logits": 0.16405051946640015, + "loss/reg": 0.02252543345093727, + "step": 566 + }, + { + "epoch": 0.070875, + "grad_norm": 4.917901992797852, + "grad_norm_var": 153.63084329919155, + "learning_rate": 0.0001, + "loss": 1.2945, + "loss/crossentropy": 2.6932315826416016, + "loss/hidden": 0.87890625, + "loss/logits": 0.1904207468032837, + "loss/reg": 0.022515632212162018, + "step": 567 + }, + { + "epoch": 0.071, + "grad_norm": 4.055351257324219, + "grad_norm_var": 153.02723135387427, + "learning_rate": 0.0001, + "loss": 1.2253, + "loss/crossentropy": 2.3849422931671143, + "loss/hidden": 0.8359375, + "loss/logits": 0.1642536073923111, + "loss/reg": 0.02250652387738228, + "step": 568 + }, + { + "epoch": 0.071125, + "grad_norm": 2.851072072982788, + "grad_norm_var": 154.20402053832456, + "learning_rate": 0.0001, + "loss": 1.0724, + "loss/crossentropy": 2.580428123474121, + "loss/hidden": 0.703125, + "loss/logits": 0.144349604845047, + "loss/reg": 0.022497190162539482, + "step": 569 + }, + { + "epoch": 0.07125, + "grad_norm": 3.257493495941162, + "grad_norm_var": 154.6102933496205, + "learning_rate": 0.0001, + "loss": 0.9328, + "loss/crossentropy": 2.4263837337493896, + "loss/hidden": 0.59375, + "loss/logits": 0.11414404958486557, + "loss/reg": 0.02248740941286087, + "step": 570 + }, + { + "epoch": 0.071375, + "grad_norm": 3.6194608211517334, + "grad_norm_var": 154.69773136421824, + "learning_rate": 0.0001, + "loss": 1.0417, + "loss/crossentropy": 2.7035937309265137, + "loss/hidden": 0.69140625, + "loss/logits": 0.12549756467342377, + "loss/reg": 0.022477447986602783, + "step": 571 + }, + { + "epoch": 0.0715, + "grad_norm": 4.193033695220947, + "grad_norm_var": 154.44566535859553, + "learning_rate": 0.0001, + "loss": 1.6505, + "loss/crossentropy": 2.274057149887085, + "loss/hidden": 1.1640625, + "loss/logits": 0.26177138090133667, + "loss/reg": 0.022467276081442833, + "step": 572 + }, + { + "epoch": 0.071625, + "grad_norm": 2.7127623558044434, + "grad_norm_var": 155.03209308401435, + "learning_rate": 0.0001, + "loss": 0.9118, + "loss/crossentropy": 2.4076664447784424, + "loss/hidden": 0.58203125, + "loss/logits": 0.10521911084651947, + "loss/reg": 0.022456735372543335, + "step": 573 + }, + { + "epoch": 0.07175, + "grad_norm": 3.5871214866638184, + "grad_norm_var": 154.65243889362196, + "learning_rate": 0.0001, + "loss": 0.9755, + "loss/crossentropy": 2.8088905811309814, + "loss/hidden": 0.62890625, + "loss/logits": 0.12216061353683472, + "loss/reg": 0.022446416318416595, + "step": 574 + }, + { + "epoch": 0.071875, + "grad_norm": 20.843276977539062, + "grad_norm_var": 165.17750310305152, + "learning_rate": 0.0001, + "loss": 1.1707, + "loss/crossentropy": 2.569218635559082, + "loss/hidden": 0.82421875, + "loss/logits": 0.1221412867307663, + "loss/reg": 0.02243630215525627, + "step": 575 + }, + { + "epoch": 0.072, + "grad_norm": 3.375251293182373, + "grad_norm_var": 166.03594003131977, + "learning_rate": 0.0001, + "loss": 0.9923, + "loss/crossentropy": 2.7540183067321777, + "loss/hidden": 0.640625, + "loss/logits": 0.12738245725631714, + "loss/reg": 0.022425668314099312, + "step": 576 + }, + { + "epoch": 0.072125, + "grad_norm": 3.457340717315674, + "grad_norm_var": 166.23754433202586, + "learning_rate": 0.0001, + "loss": 1.0371, + "loss/crossentropy": 2.2943716049194336, + "loss/hidden": 0.66796875, + "loss/logits": 0.14497271180152893, + "loss/reg": 0.022415172308683395, + "step": 577 + }, + { + "epoch": 0.07225, + "grad_norm": 2.966371774673462, + "grad_norm_var": 166.07272256293106, + "learning_rate": 0.0001, + "loss": 1.2044, + "loss/crossentropy": 2.760359048843384, + "loss/hidden": 0.8125, + "loss/logits": 0.16780292987823486, + "loss/reg": 0.022406071424484253, + "step": 578 + }, + { + "epoch": 0.072375, + "grad_norm": 2.5532329082489014, + "grad_norm_var": 19.219812763276813, + "learning_rate": 0.0001, + "loss": 0.9996, + "loss/crossentropy": 2.427797794342041, + "loss/hidden": 0.66796875, + "loss/logits": 0.10767525434494019, + "loss/reg": 0.02239692024886608, + "step": 579 + }, + { + "epoch": 0.0725, + "grad_norm": 3.250906229019165, + "grad_norm_var": 19.294198335433887, + "learning_rate": 0.0001, + "loss": 1.1148, + "loss/crossentropy": 2.29978084564209, + "loss/hidden": 0.75, + "loss/logits": 0.1408957540988922, + "loss/reg": 0.0223868228495121, + "step": 580 + }, + { + "epoch": 0.072625, + "grad_norm": 3.8051562309265137, + "grad_norm_var": 19.12802354300362, + "learning_rate": 0.0001, + "loss": 1.2329, + "loss/crossentropy": 2.402350664138794, + "loss/hidden": 0.83203125, + "loss/logits": 0.17713911831378937, + "loss/reg": 0.022376833483576775, + "step": 581 + }, + { + "epoch": 0.07275, + "grad_norm": 4.466115951538086, + "grad_norm_var": 19.129656316190147, + "learning_rate": 0.0001, + "loss": 1.1846, + "loss/crossentropy": 2.5129504203796387, + "loss/hidden": 0.8125, + "loss/logits": 0.14841441810131073, + "loss/reg": 0.02236761339008808, + "step": 582 + }, + { + "epoch": 0.072875, + "grad_norm": 3.1465752124786377, + "grad_norm_var": 19.255278342461487, + "learning_rate": 0.0001, + "loss": 1.1857, + "loss/crossentropy": 2.5782837867736816, + "loss/hidden": 0.8203125, + "loss/logits": 0.14178822934627533, + "loss/reg": 0.022358402609825134, + "step": 583 + }, + { + "epoch": 0.073, + "grad_norm": 2.7474803924560547, + "grad_norm_var": 19.441256858474677, + "learning_rate": 0.0001, + "loss": 1.0532, + "loss/crossentropy": 2.4901747703552246, + "loss/hidden": 0.703125, + "loss/logits": 0.12661507725715637, + "loss/reg": 0.02234930731356144, + "step": 584 + }, + { + "epoch": 0.073125, + "grad_norm": 2.7027359008789062, + "grad_norm_var": 19.47380183903334, + "learning_rate": 0.0001, + "loss": 0.8469, + "loss/crossentropy": 2.694582223892212, + "loss/hidden": 0.5390625, + "loss/logits": 0.08438676595687866, + "loss/reg": 0.022340187802910805, + "step": 585 + }, + { + "epoch": 0.07325, + "grad_norm": 2.847770929336548, + "grad_norm_var": 19.547679388785195, + "learning_rate": 0.0001, + "loss": 0.9885, + "loss/crossentropy": 2.5558888912200928, + "loss/hidden": 0.6484375, + "loss/logits": 0.11670757830142975, + "loss/reg": 0.022330984473228455, + "step": 586 + }, + { + "epoch": 0.073375, + "grad_norm": 3.20444917678833, + "grad_norm_var": 19.601201389954156, + "learning_rate": 0.0001, + "loss": 1.2216, + "loss/crossentropy": 2.0031001567840576, + "loss/hidden": 0.84375, + "loss/logits": 0.1546502560377121, + "loss/reg": 0.022321749478578568, + "step": 587 + }, + { + "epoch": 0.0735, + "grad_norm": 4.657837390899658, + "grad_norm_var": 19.6039707895662, + "learning_rate": 0.0001, + "loss": 1.1377, + "loss/crossentropy": 2.598100185394287, + "loss/hidden": 0.78125, + "loss/logits": 0.13329939544200897, + "loss/reg": 0.02231265790760517, + "step": 588 + }, + { + "epoch": 0.073625, + "grad_norm": 3.1166787147521973, + "grad_norm_var": 19.52355503271277, + "learning_rate": 0.0001, + "loss": 1.304, + "loss/crossentropy": 2.2209084033966064, + "loss/hidden": 0.89453125, + "loss/logits": 0.18648099899291992, + "loss/reg": 0.022303014993667603, + "step": 589 + }, + { + "epoch": 0.07375, + "grad_norm": 2.988344669342041, + "grad_norm_var": 19.61249925539728, + "learning_rate": 0.0001, + "loss": 0.9412, + "loss/crossentropy": 2.4533708095550537, + "loss/hidden": 0.60546875, + "loss/logits": 0.11275988817214966, + "loss/reg": 0.022293319925665855, + "step": 590 + }, + { + "epoch": 0.073875, + "grad_norm": 3.0553901195526123, + "grad_norm_var": 0.3491433903254536, + "learning_rate": 0.0001, + "loss": 1.0883, + "loss/crossentropy": 2.3986990451812744, + "loss/hidden": 0.73046875, + "loss/logits": 0.13498544692993164, + "loss/reg": 0.022283662110567093, + "step": 591 + }, + { + "epoch": 0.074, + "grad_norm": 4.445681095123291, + "grad_norm_var": 0.43558600780207446, + "learning_rate": 0.0001, + "loss": 1.3195, + "loss/crossentropy": 2.335937976837158, + "loss/hidden": 0.90625, + "loss/logits": 0.19050292670726776, + "loss/reg": 0.022273709997534752, + "step": 592 + }, + { + "epoch": 0.074125, + "grad_norm": 41.39726638793945, + "grad_norm_var": 91.00287624901027, + "learning_rate": 0.0001, + "loss": 1.2527, + "loss/crossentropy": 2.3441669940948486, + "loss/hidden": 0.86328125, + "loss/logits": 0.1667976826429367, + "loss/reg": 0.02226419560611248, + "step": 593 + }, + { + "epoch": 0.07425, + "grad_norm": 2.8557939529418945, + "grad_norm_var": 91.04408434440504, + "learning_rate": 0.0001, + "loss": 1.0612, + "loss/crossentropy": 2.380885362625122, + "loss/hidden": 0.703125, + "loss/logits": 0.1354852318763733, + "loss/reg": 0.022254258394241333, + "step": 594 + }, + { + "epoch": 0.074375, + "grad_norm": 6.565737724304199, + "grad_norm_var": 90.36543928633743, + "learning_rate": 0.0001, + "loss": 1.2519, + "loss/crossentropy": 2.3877017498016357, + "loss/hidden": 0.90625, + "loss/logits": 0.1231798455119133, + "loss/reg": 0.022244345396757126, + "step": 595 + }, + { + "epoch": 0.0745, + "grad_norm": 3.6746349334716797, + "grad_norm_var": 90.22397938232942, + "learning_rate": 0.0001, + "loss": 1.1718, + "loss/crossentropy": 2.397080898284912, + "loss/hidden": 0.80078125, + "loss/logits": 0.1486300826072693, + "loss/reg": 0.022234413772821426, + "step": 596 + }, + { + "epoch": 0.074625, + "grad_norm": 2.981614589691162, + "grad_norm_var": 90.50516196939815, + "learning_rate": 0.0001, + "loss": 1.043, + "loss/crossentropy": 2.37412428855896, + "loss/hidden": 0.69140625, + "loss/logits": 0.12931303679943085, + "loss/reg": 0.022224588319659233, + "step": 597 + }, + { + "epoch": 0.07475, + "grad_norm": 3.6531307697296143, + "grad_norm_var": 90.70497774366554, + "learning_rate": 0.0001, + "loss": 1.1272, + "loss/crossentropy": 2.6906161308288574, + "loss/hidden": 0.76953125, + "loss/logits": 0.13550975918769836, + "loss/reg": 0.02221417799592018, + "step": 598 + }, + { + "epoch": 0.074875, + "grad_norm": 3.751652240753174, + "grad_norm_var": 90.50753182721607, + "learning_rate": 0.0001, + "loss": 1.1618, + "loss/crossentropy": 2.6506831645965576, + "loss/hidden": 0.79296875, + "loss/logits": 0.1468081772327423, + "loss/reg": 0.022204989567399025, + "step": 599 + }, + { + "epoch": 0.075, + "grad_norm": 3.228563070297241, + "grad_norm_var": 90.31879350061254, + "learning_rate": 0.0001, + "loss": 1.0791, + "loss/crossentropy": 2.381368398666382, + "loss/hidden": 0.71875, + "loss/logits": 0.13835087418556213, + "loss/reg": 0.022195899859070778, + "step": 600 + }, + { + "epoch": 0.075125, + "grad_norm": 5.392886638641357, + "grad_norm_var": 89.60797997668053, + "learning_rate": 0.0001, + "loss": 1.2309, + "loss/crossentropy": 2.52596378326416, + "loss/hidden": 0.8359375, + "loss/logits": 0.17307403683662415, + "loss/reg": 0.022185994312167168, + "step": 601 + }, + { + "epoch": 0.07525, + "grad_norm": 3.2832775115966797, + "grad_norm_var": 89.43019603463321, + "learning_rate": 0.0001, + "loss": 1.1462, + "loss/crossentropy": 2.4442057609558105, + "loss/hidden": 0.76953125, + "loss/logits": 0.15487736463546753, + "loss/reg": 0.02217610739171505, + "step": 602 + }, + { + "epoch": 0.075375, + "grad_norm": 4.1014084815979, + "grad_norm_var": 89.1293068696746, + "learning_rate": 0.0001, + "loss": 1.0847, + "loss/crossentropy": 2.531064987182617, + "loss/hidden": 0.7265625, + "loss/logits": 0.13643184304237366, + "loss/reg": 0.02216634899377823, + "step": 603 + }, + { + "epoch": 0.0755, + "grad_norm": 3.656172037124634, + "grad_norm_var": 89.39756111673695, + "learning_rate": 0.0001, + "loss": 1.0161, + "loss/crossentropy": 2.470468044281006, + "loss/hidden": 0.68359375, + "loss/logits": 0.11097002029418945, + "loss/reg": 0.02215682342648506, + "step": 604 + }, + { + "epoch": 0.075625, + "grad_norm": 3.8061492443084717, + "grad_norm_var": 89.1498668494714, + "learning_rate": 0.0001, + "loss": 1.1195, + "loss/crossentropy": 2.469788074493408, + "loss/hidden": 0.7734375, + "loss/logits": 0.12457874417304993, + "loss/reg": 0.022147687152028084, + "step": 605 + }, + { + "epoch": 0.07575, + "grad_norm": 2.658135414123535, + "grad_norm_var": 89.29708722871554, + "learning_rate": 0.0001, + "loss": 1.0657, + "loss/crossentropy": 2.2552011013031006, + "loss/hidden": 0.71484375, + "loss/logits": 0.12950366735458374, + "loss/reg": 0.022138802334666252, + "step": 606 + }, + { + "epoch": 0.075875, + "grad_norm": 3.291750907897949, + "grad_norm_var": 89.20284122750789, + "learning_rate": 0.0001, + "loss": 1.133, + "loss/crossentropy": 2.225055456161499, + "loss/hidden": 0.73828125, + "loss/logits": 0.1734064519405365, + "loss/reg": 0.022129878401756287, + "step": 607 + }, + { + "epoch": 0.076, + "grad_norm": 4.001339912414551, + "grad_norm_var": 89.31742762195415, + "learning_rate": 0.0001, + "loss": 0.9658, + "loss/crossentropy": 2.3826169967651367, + "loss/hidden": 0.64453125, + "loss/logits": 0.10007497668266296, + "loss/reg": 0.022121025249361992, + "step": 608 + }, + { + "epoch": 0.076125, + "grad_norm": 2.9729602336883545, + "grad_norm_var": 0.9817241823775095, + "learning_rate": 0.0001, + "loss": 1.1962, + "loss/crossentropy": 2.4926881790161133, + "loss/hidden": 0.82421875, + "loss/logits": 0.1508128046989441, + "loss/reg": 0.022111859172582626, + "step": 609 + }, + { + "epoch": 0.07625, + "grad_norm": 2.5144121646881104, + "grad_norm_var": 1.0293551003726749, + "learning_rate": 0.0001, + "loss": 0.9883, + "loss/crossentropy": 2.478576898574829, + "loss/hidden": 0.65234375, + "loss/logits": 0.11488830298185349, + "loss/reg": 0.02210419811308384, + "step": 610 + }, + { + "epoch": 0.076375, + "grad_norm": 3.7777743339538574, + "grad_norm_var": 0.4576308797359, + "learning_rate": 0.0001, + "loss": 1.0059, + "loss/crossentropy": 2.4356257915496826, + "loss/hidden": 0.6484375, + "loss/logits": 0.13651525974273682, + "loss/reg": 0.022095149382948875, + "step": 611 + }, + { + "epoch": 0.0765, + "grad_norm": 3.232398748397827, + "grad_norm_var": 0.4623055923756754, + "learning_rate": 0.0001, + "loss": 1.0108, + "loss/crossentropy": 2.4058218002319336, + "loss/hidden": 0.67578125, + "loss/logits": 0.11410736292600632, + "loss/reg": 0.022087210789322853, + "step": 612 + }, + { + "epoch": 0.076625, + "grad_norm": 2.61934757232666, + "grad_norm_var": 0.49646373584008807, + "learning_rate": 0.0001, + "loss": 0.9466, + "loss/crossentropy": 2.3981077671051025, + "loss/hidden": 0.59765625, + "loss/logits": 0.1281721591949463, + "loss/reg": 0.022078126668930054, + "step": 613 + }, + { + "epoch": 0.07675, + "grad_norm": 4.228268146514893, + "grad_norm_var": 0.5291615579452734, + "learning_rate": 0.0001, + "loss": 1.0729, + "loss/crossentropy": 2.664149761199951, + "loss/hidden": 0.73046875, + "loss/logits": 0.12176868319511414, + "loss/reg": 0.02207016758620739, + "step": 614 + }, + { + "epoch": 0.076875, + "grad_norm": 2.9137685298919678, + "grad_norm_var": 0.5485319535320492, + "learning_rate": 0.0001, + "loss": 0.8708, + "loss/crossentropy": 2.5457890033721924, + "loss/hidden": 0.55078125, + "loss/logits": 0.09941907972097397, + "loss/reg": 0.022060981020331383, + "step": 615 + }, + { + "epoch": 0.077, + "grad_norm": 3.4499685764312744, + "grad_norm_var": 0.5441756848342263, + "learning_rate": 0.0001, + "loss": 1.1439, + "loss/crossentropy": 2.3822786808013916, + "loss/hidden": 0.78125, + "loss/logits": 0.14211627840995789, + "loss/reg": 0.02205180749297142, + "step": 616 + }, + { + "epoch": 0.077125, + "grad_norm": 2.271921157836914, + "grad_norm_var": 0.36266744154546565, + "learning_rate": 0.0001, + "loss": 0.9061, + "loss/crossentropy": 2.707848072052002, + "loss/hidden": 0.58203125, + "loss/logits": 0.10360105335712433, + "loss/reg": 0.022042402997612953, + "step": 617 + }, + { + "epoch": 0.07725, + "grad_norm": 3.16980242729187, + "grad_norm_var": 0.36370543210803513, + "learning_rate": 0.0001, + "loss": 1.2733, + "loss/crossentropy": 2.1980466842651367, + "loss/hidden": 0.86328125, + "loss/logits": 0.18972548842430115, + "loss/reg": 0.02203306369483471, + "step": 618 + }, + { + "epoch": 0.077375, + "grad_norm": 2.4408295154571533, + "grad_norm_var": 0.3567501583972517, + "learning_rate": 0.0001, + "loss": 1.1352, + "loss/crossentropy": 2.3986692428588867, + "loss/hidden": 0.76953125, + "loss/logits": 0.1454274207353592, + "loss/reg": 0.022023871541023254, + "step": 619 + }, + { + "epoch": 0.0775, + "grad_norm": 2.558687925338745, + "grad_norm_var": 0.36349398943808237, + "learning_rate": 0.0001, + "loss": 0.9776, + "loss/crossentropy": 2.4290616512298584, + "loss/hidden": 0.65625, + "loss/logits": 0.1012360006570816, + "loss/reg": 0.0220141913741827, + "step": 620 + }, + { + "epoch": 0.077625, + "grad_norm": 3.7468619346618652, + "grad_norm_var": 0.3582835152003213, + "learning_rate": 0.0001, + "loss": 1.1635, + "loss/crossentropy": 2.4566454887390137, + "loss/hidden": 0.78515625, + "loss/logits": 0.15830263495445251, + "loss/reg": 0.02200442925095558, + "step": 621 + }, + { + "epoch": 0.07775, + "grad_norm": 3.8364651203155518, + "grad_norm_var": 0.3732032502257139, + "learning_rate": 0.0001, + "loss": 1.3213, + "loss/crossentropy": 2.5934860706329346, + "loss/hidden": 0.9296875, + "loss/logits": 0.17167437076568604, + "loss/reg": 0.021995313465595245, + "step": 622 + }, + { + "epoch": 0.077875, + "grad_norm": 2.9136173725128174, + "grad_norm_var": 0.37696739372618043, + "learning_rate": 0.0001, + "loss": 1.1191, + "loss/crossentropy": 2.1625287532806396, + "loss/hidden": 0.75, + "loss/logits": 0.1492336541414261, + "loss/reg": 0.021986283361911774, + "step": 623 + }, + { + "epoch": 0.078, + "grad_norm": 3.3136067390441895, + "grad_norm_var": 0.3298862344756988, + "learning_rate": 0.0001, + "loss": 1.1487, + "loss/crossentropy": 2.3915464878082275, + "loss/hidden": 0.7890625, + "loss/logits": 0.1398892104625702, + "loss/reg": 0.021976841613650322, + "step": 624 + }, + { + "epoch": 0.078125, + "grad_norm": 3.1731748580932617, + "grad_norm_var": 0.3283984444798058, + "learning_rate": 0.0001, + "loss": 1.0945, + "loss/crossentropy": 2.53873610496521, + "loss/hidden": 0.7109375, + "loss/logits": 0.16392138600349426, + "loss/reg": 0.021966535598039627, + "step": 625 + }, + { + "epoch": 0.07825, + "grad_norm": 4.172556400299072, + "grad_norm_var": 0.3630228628347131, + "learning_rate": 0.0001, + "loss": 1.1888, + "loss/crossentropy": 2.4219956398010254, + "loss/hidden": 0.79296875, + "loss/logits": 0.17627781629562378, + "loss/reg": 0.021956363692879677, + "step": 626 + }, + { + "epoch": 0.078375, + "grad_norm": 3.99817156791687, + "grad_norm_var": 0.3819004722530487, + "learning_rate": 0.0001, + "loss": 1.3074, + "loss/crossentropy": 2.2407050132751465, + "loss/hidden": 0.8828125, + "loss/logits": 0.20513707399368286, + "loss/reg": 0.02194611169397831, + "step": 627 + }, + { + "epoch": 0.0785, + "grad_norm": 3.2869558334350586, + "grad_norm_var": 0.3819405314837089, + "learning_rate": 0.0001, + "loss": 0.923, + "loss/crossentropy": 2.5824477672576904, + "loss/hidden": 0.59375, + "loss/logits": 0.10985252261161804, + "loss/reg": 0.021935785189270973, + "step": 628 + }, + { + "epoch": 0.078625, + "grad_norm": 3.0037360191345215, + "grad_norm_var": 0.35855200267849247, + "learning_rate": 0.0001, + "loss": 1.0636, + "loss/crossentropy": 2.4916722774505615, + "loss/hidden": 0.7109375, + "loss/logits": 0.13343587517738342, + "loss/reg": 0.021925168111920357, + "step": 629 + }, + { + "epoch": 0.07875, + "grad_norm": 3.5274088382720947, + "grad_norm_var": 0.3006291732182412, + "learning_rate": 0.0001, + "loss": 1.1441, + "loss/crossentropy": 1.7279773950576782, + "loss/hidden": 0.78515625, + "loss/logits": 0.13975940644741058, + "loss/reg": 0.02191445231437683, + "step": 630 + }, + { + "epoch": 0.078875, + "grad_norm": 3.2895989418029785, + "grad_norm_var": 0.29330515223301745, + "learning_rate": 0.0001, + "loss": 1.0394, + "loss/crossentropy": 2.370850086212158, + "loss/hidden": 0.68359375, + "loss/logits": 0.13678821921348572, + "loss/reg": 0.02190525084733963, + "step": 631 + }, + { + "epoch": 0.079, + "grad_norm": 3.741135597229004, + "grad_norm_var": 0.30599490652712474, + "learning_rate": 0.0001, + "loss": 1.2655, + "loss/crossentropy": 2.365464210510254, + "loss/hidden": 0.87890625, + "loss/logits": 0.16766133904457092, + "loss/reg": 0.02189476415514946, + "step": 632 + }, + { + "epoch": 0.079125, + "grad_norm": 4.110158443450928, + "grad_norm_var": 0.2706546096444164, + "learning_rate": 0.0001, + "loss": 1.1804, + "loss/crossentropy": 2.658433437347412, + "loss/hidden": 0.796875, + "loss/logits": 0.1646716445684433, + "loss/reg": 0.021885616704821587, + "step": 633 + }, + { + "epoch": 0.07925, + "grad_norm": 3.7503654956817627, + "grad_norm_var": 0.27446839769864156, + "learning_rate": 0.0001, + "loss": 1.1, + "loss/crossentropy": 2.4457831382751465, + "loss/hidden": 0.74609375, + "loss/logits": 0.13510534167289734, + "loss/reg": 0.021875550970435143, + "step": 634 + }, + { + "epoch": 0.079375, + "grad_norm": 2.917835235595703, + "grad_norm_var": 0.22584356567046956, + "learning_rate": 0.0001, + "loss": 1.1498, + "loss/crossentropy": 2.3328001499176025, + "loss/hidden": 0.77734375, + "loss/logits": 0.1537725031375885, + "loss/reg": 0.021865583956241608, + "step": 635 + }, + { + "epoch": 0.0795, + "grad_norm": 2.907240152359009, + "grad_norm_var": 0.19160647764443486, + "learning_rate": 0.0001, + "loss": 0.9073, + "loss/crossentropy": 2.556042432785034, + "loss/hidden": 0.59375, + "loss/logits": 0.09503352642059326, + "loss/reg": 0.02185530960559845, + "step": 636 + }, + { + "epoch": 0.079625, + "grad_norm": 2.5937955379486084, + "grad_norm_var": 0.2337615816576618, + "learning_rate": 0.0001, + "loss": 0.9198, + "loss/crossentropy": 2.363370180130005, + "loss/hidden": 0.5859375, + "loss/logits": 0.11537887156009674, + "loss/reg": 0.021845519542694092, + "step": 637 + }, + { + "epoch": 0.07975, + "grad_norm": 3.9080708026885986, + "grad_norm_var": 0.2381681132369368, + "learning_rate": 0.0001, + "loss": 1.1552, + "loss/crossentropy": 2.5054194927215576, + "loss/hidden": 0.81640625, + "loss/logits": 0.12043754756450653, + "loss/reg": 0.021835271269083023, + "step": 638 + }, + { + "epoch": 0.079875, + "grad_norm": 2.8192946910858154, + "grad_norm_var": 0.24500412598165416, + "learning_rate": 0.0001, + "loss": 0.9375, + "loss/crossentropy": 2.6611719131469727, + "loss/hidden": 0.60546875, + "loss/logits": 0.11382012814283371, + "loss/reg": 0.021824965253472328, + "step": 639 + }, + { + "epoch": 0.08, + "grad_norm": 4.111716270446777, + "grad_norm_var": 0.27486954530744445, + "learning_rate": 0.0001, + "loss": 1.2972, + "loss/crossentropy": 2.472568988800049, + "loss/hidden": 0.91015625, + "loss/logits": 0.16890740394592285, + "loss/reg": 0.02181575633585453, + "step": 640 + }, + { + "epoch": 0.080125, + "grad_norm": 5.715061187744141, + "grad_norm_var": 0.5825168124345791, + "learning_rate": 0.0001, + "loss": 1.0307, + "loss/crossentropy": 2.2123966217041016, + "loss/hidden": 0.68359375, + "loss/logits": 0.12899596989154816, + "loss/reg": 0.021806620061397552, + "step": 641 + }, + { + "epoch": 0.08025, + "grad_norm": 3.8680315017700195, + "grad_norm_var": 0.5657073815126407, + "learning_rate": 0.0001, + "loss": 1.053, + "loss/crossentropy": 2.4526453018188477, + "loss/hidden": 0.703125, + "loss/logits": 0.13190723955631256, + "loss/reg": 0.021797508001327515, + "step": 642 + }, + { + "epoch": 0.080375, + "grad_norm": 4.992859363555908, + "grad_norm_var": 0.680778895488037, + "learning_rate": 0.0001, + "loss": 1.1683, + "loss/crossentropy": 2.3445968627929688, + "loss/hidden": 0.82421875, + "loss/logits": 0.12617573142051697, + "loss/reg": 0.02178841643035412, + "step": 643 + }, + { + "epoch": 0.0805, + "grad_norm": 4.339104175567627, + "grad_norm_var": 0.6977811040599325, + "learning_rate": 0.0001, + "loss": 1.3433, + "loss/crossentropy": 1.8577097654342651, + "loss/hidden": 0.9453125, + "loss/logits": 0.18017446994781494, + "loss/reg": 0.02177964523434639, + "step": 644 + }, + { + "epoch": 0.080625, + "grad_norm": 4.026562213897705, + "grad_norm_var": 0.6648423545945282, + "learning_rate": 0.0001, + "loss": 1.3266, + "loss/crossentropy": 2.415410041809082, + "loss/hidden": 0.859375, + "loss/logits": 0.24951061606407166, + "loss/reg": 0.021770501509308815, + "step": 645 + }, + { + "epoch": 0.08075, + "grad_norm": 2.8517775535583496, + "grad_norm_var": 0.7169049906385166, + "learning_rate": 0.0001, + "loss": 0.9283, + "loss/crossentropy": 2.6426851749420166, + "loss/hidden": 0.6015625, + "loss/logits": 0.10916159301996231, + "loss/reg": 0.021761184558272362, + "step": 646 + }, + { + "epoch": 0.080875, + "grad_norm": 3.3171377182006836, + "grad_norm_var": 0.7152750431492562, + "learning_rate": 0.0001, + "loss": 1.0088, + "loss/crossentropy": 2.638533115386963, + "loss/hidden": 0.62890625, + "loss/logits": 0.1623907834291458, + "loss/reg": 0.02175196446478367, + "step": 647 + }, + { + "epoch": 0.081, + "grad_norm": 3.653881311416626, + "grad_norm_var": 0.7158322952113887, + "learning_rate": 0.0001, + "loss": 1.1132, + "loss/crossentropy": 2.745229482650757, + "loss/hidden": 0.765625, + "loss/logits": 0.1301097273826599, + "loss/reg": 0.021742329001426697, + "step": 648 + }, + { + "epoch": 0.081125, + "grad_norm": 4.31439733505249, + "grad_norm_var": 0.728446489341123, + "learning_rate": 0.0001, + "loss": 1.2706, + "loss/crossentropy": 2.7004082202911377, + "loss/hidden": 0.80078125, + "loss/logits": 0.25250470638275146, + "loss/reg": 0.021732579916715622, + "step": 649 + }, + { + "epoch": 0.08125, + "grad_norm": 4.621687889099121, + "grad_norm_var": 0.7753064642270201, + "learning_rate": 0.0001, + "loss": 1.4091, + "loss/crossentropy": 2.38405179977417, + "loss/hidden": 0.89453125, + "loss/logits": 0.29737186431884766, + "loss/reg": 0.02172265760600567, + "step": 650 + }, + { + "epoch": 0.081375, + "grad_norm": 2.87424898147583, + "grad_norm_var": 0.7806094534209419, + "learning_rate": 0.0001, + "loss": 1.2103, + "loss/crossentropy": 2.0905954837799072, + "loss/hidden": 0.8125, + "loss/logits": 0.18066659569740295, + "loss/reg": 0.021712414920330048, + "step": 651 + }, + { + "epoch": 0.0815, + "grad_norm": 4.724517345428467, + "grad_norm_var": 0.7689569917943563, + "learning_rate": 0.0001, + "loss": 1.2928, + "loss/crossentropy": 2.500594139099121, + "loss/hidden": 0.89453125, + "loss/logits": 0.1812204122543335, + "loss/reg": 0.021703310310840607, + "step": 652 + }, + { + "epoch": 0.081625, + "grad_norm": 2.8673081398010254, + "grad_norm_var": 0.7252403996552116, + "learning_rate": 0.0001, + "loss": 1.0588, + "loss/crossentropy": 2.3599724769592285, + "loss/hidden": 0.703125, + "loss/logits": 0.13873916864395142, + "loss/reg": 0.021693557500839233, + "step": 653 + }, + { + "epoch": 0.08175, + "grad_norm": 3.099315643310547, + "grad_norm_var": 0.7693322976491117, + "learning_rate": 0.0001, + "loss": 1.1961, + "loss/crossentropy": 2.3167355060577393, + "loss/hidden": 0.83203125, + "loss/logits": 0.14727652072906494, + "loss/reg": 0.021683741360902786, + "step": 654 + }, + { + "epoch": 0.081875, + "grad_norm": 3.287602186203003, + "grad_norm_var": 0.7163515778113151, + "learning_rate": 0.0001, + "loss": 1.0429, + "loss/crossentropy": 2.7967636585235596, + "loss/hidden": 0.6875, + "loss/logits": 0.13863959908485413, + "loss/reg": 0.021674364805221558, + "step": 655 + }, + { + "epoch": 0.082, + "grad_norm": 3.487874746322632, + "grad_norm_var": 0.7244436337536264, + "learning_rate": 0.0001, + "loss": 1.0883, + "loss/crossentropy": 2.5409998893737793, + "loss/hidden": 0.71875, + "loss/logits": 0.1528949737548828, + "loss/reg": 0.02166520059108734, + "step": 656 + }, + { + "epoch": 0.082125, + "grad_norm": 3.6202752590179443, + "grad_norm_var": 0.4854858648423857, + "learning_rate": 0.0001, + "loss": 1.2516, + "loss/crossentropy": 2.3633673191070557, + "loss/hidden": 0.88671875, + "loss/logits": 0.14828172326087952, + "loss/reg": 0.021655315533280373, + "step": 657 + }, + { + "epoch": 0.08225, + "grad_norm": 3.7921783924102783, + "grad_norm_var": 0.484617963461113, + "learning_rate": 0.0001, + "loss": 1.07, + "loss/crossentropy": 2.497013807296753, + "loss/hidden": 0.71484375, + "loss/logits": 0.13874131441116333, + "loss/reg": 0.021646033972501755, + "step": 658 + }, + { + "epoch": 0.082375, + "grad_norm": 5.818857669830322, + "grad_norm_var": 0.6650298211735747, + "learning_rate": 0.0001, + "loss": 1.2961, + "loss/crossentropy": 2.6599833965301514, + "loss/hidden": 0.91015625, + "loss/logits": 0.16955840587615967, + "loss/reg": 0.021636882796883583, + "step": 659 + }, + { + "epoch": 0.0825, + "grad_norm": 3.465527057647705, + "grad_norm_var": 0.6491808619433999, + "learning_rate": 0.0001, + "loss": 1.1365, + "loss/crossentropy": 2.5972306728363037, + "loss/hidden": 0.7734375, + "loss/logits": 0.14674408733844757, + "loss/reg": 0.02162766456604004, + "step": 660 + }, + { + "epoch": 0.082625, + "grad_norm": 3.143159866333008, + "grad_norm_var": 0.664078497493661, + "learning_rate": 0.0001, + "loss": 1.2241, + "loss/crossentropy": 2.5945792198181152, + "loss/hidden": 0.83203125, + "loss/logits": 0.17588043212890625, + "loss/reg": 0.021618474274873734, + "step": 661 + }, + { + "epoch": 0.08275, + "grad_norm": 3.7091660499572754, + "grad_norm_var": 0.6149151800980365, + "learning_rate": 0.0001, + "loss": 1.1901, + "loss/crossentropy": 2.631565809249878, + "loss/hidden": 0.8125, + "loss/logits": 0.16149985790252686, + "loss/reg": 0.021609637886285782, + "step": 662 + }, + { + "epoch": 0.082875, + "grad_norm": 3.1449224948883057, + "grad_norm_var": 0.6264170707357067, + "learning_rate": 0.0001, + "loss": 1.1491, + "loss/crossentropy": 2.2890501022338867, + "loss/hidden": 0.78515625, + "loss/logits": 0.14797118306159973, + "loss/reg": 0.021600957959890366, + "step": 663 + }, + { + "epoch": 0.083, + "grad_norm": 3.096752405166626, + "grad_norm_var": 0.6512152784754629, + "learning_rate": 0.0001, + "loss": 1.0925, + "loss/crossentropy": 2.767069101333618, + "loss/hidden": 0.7421875, + "loss/logits": 0.1343650072813034, + "loss/reg": 0.021592585369944572, + "step": 664 + }, + { + "epoch": 0.083125, + "grad_norm": 3.8935883045196533, + "grad_norm_var": 0.6273466460071402, + "learning_rate": 0.0001, + "loss": 1.0996, + "loss/crossentropy": 2.4779109954833984, + "loss/hidden": 0.7578125, + "loss/logits": 0.12599951028823853, + "loss/reg": 0.021583350375294685, + "step": 665 + }, + { + "epoch": 0.08325, + "grad_norm": 3.6881141662597656, + "grad_norm_var": 0.5627883047301658, + "learning_rate": 0.0001, + "loss": 1.1184, + "loss/crossentropy": 2.1603968143463135, + "loss/hidden": 0.76171875, + "loss/logits": 0.14095276594161987, + "loss/reg": 0.021574225276708603, + "step": 666 + }, + { + "epoch": 0.083375, + "grad_norm": 2.9750800132751465, + "grad_norm_var": 0.5535713466115603, + "learning_rate": 0.0001, + "loss": 1.1215, + "loss/crossentropy": 2.4817895889282227, + "loss/hidden": 0.75390625, + "loss/logits": 0.15194162726402283, + "loss/reg": 0.02156493254005909, + "step": 667 + }, + { + "epoch": 0.0835, + "grad_norm": 2.649543046951294, + "grad_norm_var": 0.5152581471177806, + "learning_rate": 0.0001, + "loss": 0.918, + "loss/crossentropy": 2.334226608276367, + "loss/hidden": 0.59375, + "loss/logits": 0.10870292782783508, + "loss/reg": 0.02155502513051033, + "step": 668 + }, + { + "epoch": 0.083625, + "grad_norm": 4.535495281219482, + "grad_norm_var": 0.5520843285134825, + "learning_rate": 0.0001, + "loss": 1.168, + "loss/crossentropy": 2.4633467197418213, + "loss/hidden": 0.796875, + "loss/logits": 0.15570059418678284, + "loss/reg": 0.021545063704252243, + "step": 669 + }, + { + "epoch": 0.08375, + "grad_norm": 3.3291449546813965, + "grad_norm_var": 0.5404115229162234, + "learning_rate": 0.0001, + "loss": 1.1241, + "loss/crossentropy": 2.317607879638672, + "loss/hidden": 0.78515625, + "loss/logits": 0.12363065779209137, + "loss/reg": 0.021535001695156097, + "step": 670 + }, + { + "epoch": 0.083875, + "grad_norm": 4.071917533874512, + "grad_norm_var": 0.5459456401930327, + "learning_rate": 0.0001, + "loss": 1.1652, + "loss/crossentropy": 2.4951958656311035, + "loss/hidden": 0.83203125, + "loss/logits": 0.11796893179416656, + "loss/reg": 0.021524924784898758, + "step": 671 + }, + { + "epoch": 0.084, + "grad_norm": 4.647782802581787, + "grad_norm_var": 0.6047501670354971, + "learning_rate": 0.0001, + "loss": 1.157, + "loss/crossentropy": 2.6994447708129883, + "loss/hidden": 0.78125, + "loss/logits": 0.16059228777885437, + "loss/reg": 0.021514689549803734, + "step": 672 + }, + { + "epoch": 0.084125, + "grad_norm": 4.240062713623047, + "grad_norm_var": 0.6201999433703328, + "learning_rate": 0.0001, + "loss": 1.0984, + "loss/crossentropy": 2.592426300048828, + "loss/hidden": 0.7734375, + "loss/logits": 0.10989370942115784, + "loss/reg": 0.021505359560251236, + "step": 673 + }, + { + "epoch": 0.08425, + "grad_norm": 15.647865295410156, + "grad_norm_var": 9.451818582857973, + "learning_rate": 0.0001, + "loss": 1.6418, + "loss/crossentropy": 2.723198413848877, + "loss/hidden": 1.078125, + "loss/logits": 0.3486996293067932, + "loss/reg": 0.02149534970521927, + "step": 674 + }, + { + "epoch": 0.084375, + "grad_norm": 3.6480963230133057, + "grad_norm_var": 9.365638761154676, + "learning_rate": 0.0001, + "loss": 1.1052, + "loss/crossentropy": 2.618795871734619, + "loss/hidden": 0.734375, + "loss/logits": 0.1559816300868988, + "loss/reg": 0.02148519456386566, + "step": 675 + }, + { + "epoch": 0.0845, + "grad_norm": 3.5789527893066406, + "grad_norm_var": 9.352796045350159, + "learning_rate": 0.0001, + "loss": 1.1092, + "loss/crossentropy": 2.4817607402801514, + "loss/hidden": 0.75, + "loss/logits": 0.14444658160209656, + "loss/reg": 0.021475963294506073, + "step": 676 + }, + { + "epoch": 0.084625, + "grad_norm": 3.143718719482422, + "grad_norm_var": 9.352704277495931, + "learning_rate": 0.0001, + "loss": 1.0394, + "loss/crossentropy": 2.28035044670105, + "loss/hidden": 0.70703125, + "loss/logits": 0.11767329275608063, + "loss/reg": 0.02146601676940918, + "step": 677 + }, + { + "epoch": 0.08475, + "grad_norm": 4.2889580726623535, + "grad_norm_var": 9.322240526517621, + "learning_rate": 0.0001, + "loss": 1.1067, + "loss/crossentropy": 2.28658390045166, + "loss/hidden": 0.7109375, + "loss/logits": 0.18121060729026794, + "loss/reg": 0.021456118673086166, + "step": 678 + }, + { + "epoch": 0.084875, + "grad_norm": 2.800516128540039, + "grad_norm_var": 9.387804829955044, + "learning_rate": 0.0001, + "loss": 0.975, + "loss/crossentropy": 2.699044704437256, + "loss/hidden": 0.63671875, + "loss/logits": 0.12379397451877594, + "loss/reg": 0.02144702896475792, + "step": 679 + }, + { + "epoch": 0.085, + "grad_norm": 3.064215898513794, + "grad_norm_var": 9.393480165725077, + "learning_rate": 0.0001, + "loss": 1.1186, + "loss/crossentropy": 2.547557830810547, + "loss/hidden": 0.76171875, + "loss/logits": 0.14249341189861298, + "loss/reg": 0.02143782004714012, + "step": 680 + }, + { + "epoch": 0.085125, + "grad_norm": 3.7314915657043457, + "grad_norm_var": 9.405801361337378, + "learning_rate": 0.0001, + "loss": 1.1376, + "loss/crossentropy": 2.705601930618286, + "loss/hidden": 0.7734375, + "loss/logits": 0.14988452196121216, + "loss/reg": 0.02142806351184845, + "step": 681 + }, + { + "epoch": 0.08525, + "grad_norm": 2.256387948989868, + "grad_norm_var": 9.665529326305519, + "learning_rate": 0.0001, + "loss": 0.945, + "loss/crossentropy": 2.464989423751831, + "loss/hidden": 0.62109375, + "loss/logits": 0.10975323617458344, + "loss/reg": 0.02141808532178402, + "step": 682 + }, + { + "epoch": 0.085375, + "grad_norm": 3.150348424911499, + "grad_norm_var": 9.636765682886749, + "learning_rate": 0.0001, + "loss": 1.0573, + "loss/crossentropy": 2.2520275115966797, + "loss/hidden": 0.72265625, + "loss/logits": 0.12059713900089264, + "loss/reg": 0.021408328786492348, + "step": 683 + }, + { + "epoch": 0.0855, + "grad_norm": 2.973684549331665, + "grad_norm_var": 9.572043410499656, + "learning_rate": 0.0001, + "loss": 1.0663, + "loss/crossentropy": 2.4148662090301514, + "loss/hidden": 0.73828125, + "loss/logits": 0.11397817730903625, + "loss/reg": 0.021399127319455147, + "step": 684 + }, + { + "epoch": 0.085625, + "grad_norm": 4.39288330078125, + "grad_norm_var": 9.56920341692891, + "learning_rate": 0.0001, + "loss": 1.206, + "loss/crossentropy": 2.5025415420532227, + "loss/hidden": 0.84765625, + "loss/logits": 0.1444375216960907, + "loss/reg": 0.021389208734035492, + "step": 685 + }, + { + "epoch": 0.08575, + "grad_norm": 2.815019369125366, + "grad_norm_var": 9.652987248771876, + "learning_rate": 0.0001, + "loss": 1.0631, + "loss/crossentropy": 2.236358165740967, + "loss/hidden": 0.72265625, + "loss/logits": 0.1266387552022934, + "loss/reg": 0.021379247307777405, + "step": 686 + }, + { + "epoch": 0.085875, + "grad_norm": 3.6744375228881836, + "grad_norm_var": 9.673796390527396, + "learning_rate": 0.0001, + "loss": 1.2523, + "loss/crossentropy": 2.1673667430877686, + "loss/hidden": 0.8671875, + "loss/logits": 0.1714528650045395, + "loss/reg": 0.02136901021003723, + "step": 687 + }, + { + "epoch": 0.086, + "grad_norm": 3.169265031814575, + "grad_norm_var": 9.732675648460468, + "learning_rate": 0.0001, + "loss": 1.1389, + "loss/crossentropy": 2.45385479927063, + "loss/hidden": 0.79296875, + "loss/logits": 0.13236522674560547, + "loss/reg": 0.021359853446483612, + "step": 688 + }, + { + "epoch": 0.086125, + "grad_norm": 7.017651557922363, + "grad_norm_var": 10.2441458601344, + "learning_rate": 0.0001, + "loss": 1.4071, + "loss/crossentropy": 2.344740629196167, + "loss/hidden": 1.0625, + "loss/logits": 0.13107535243034363, + "loss/reg": 0.021349839866161346, + "step": 689 + }, + { + "epoch": 0.08625, + "grad_norm": 3.5523998737335205, + "grad_norm_var": 1.142674868302701, + "learning_rate": 0.0001, + "loss": 1.1408, + "loss/crossentropy": 2.323866367340088, + "loss/hidden": 0.79296875, + "loss/logits": 0.13442741334438324, + "loss/reg": 0.021340306848287582, + "step": 690 + }, + { + "epoch": 0.086375, + "grad_norm": 3.2477400302886963, + "grad_norm_var": 1.1489843436981233, + "learning_rate": 0.0001, + "loss": 1.0067, + "loss/crossentropy": 2.672182083129883, + "loss/hidden": 0.67578125, + "loss/logits": 0.11757320165634155, + "loss/reg": 0.021331045776605606, + "step": 691 + }, + { + "epoch": 0.0865, + "grad_norm": 3.1238696575164795, + "grad_norm_var": 1.1603900529546722, + "learning_rate": 0.0001, + "loss": 1.1122, + "loss/crossentropy": 2.510279893875122, + "loss/hidden": 0.765625, + "loss/logits": 0.13339656591415405, + "loss/reg": 0.021321000531315804, + "step": 692 + }, + { + "epoch": 0.086625, + "grad_norm": 2.552560329437256, + "grad_norm_var": 1.2122975827491755, + "learning_rate": 0.0001, + "loss": 1.0396, + "loss/crossentropy": 2.5303587913513184, + "loss/hidden": 0.6953125, + "loss/logits": 0.13117440044879913, + "loss/reg": 0.021313220262527466, + "step": 693 + }, + { + "epoch": 0.08675, + "grad_norm": 2.7075002193450928, + "grad_norm_var": 1.1997649773340213, + "learning_rate": 0.0001, + "loss": 1.0787, + "loss/crossentropy": 2.4859445095062256, + "loss/hidden": 0.73828125, + "loss/logits": 0.12738223373889923, + "loss/reg": 0.021303845569491386, + "step": 694 + }, + { + "epoch": 0.086875, + "grad_norm": 3.2640225887298584, + "grad_norm_var": 1.1768004922088529, + "learning_rate": 0.0001, + "loss": 1.3383, + "loss/crossentropy": 2.4017555713653564, + "loss/hidden": 0.96875, + "loss/logits": 0.15659019351005554, + "loss/reg": 0.021294469013810158, + "step": 695 + }, + { + "epoch": 0.087, + "grad_norm": 3.5289857387542725, + "grad_norm_var": 1.1683562063707291, + "learning_rate": 0.0001, + "loss": 1.2607, + "loss/crossentropy": 2.2716236114501953, + "loss/hidden": 0.87109375, + "loss/logits": 0.17679363489151, + "loss/reg": 0.021284854039549828, + "step": 696 + }, + { + "epoch": 0.087125, + "grad_norm": 8.924003601074219, + "grad_norm_var": 3.0501856400161764, + "learning_rate": 0.0001, + "loss": 1.474, + "loss/crossentropy": 2.3684239387512207, + "loss/hidden": 1.0546875, + "loss/logits": 0.2065410166978836, + "loss/reg": 0.02127666585147381, + "step": 697 + }, + { + "epoch": 0.08725, + "grad_norm": 4.2463788986206055, + "grad_norm_var": 2.8955696375948605, + "learning_rate": 0.0001, + "loss": 1.1835, + "loss/crossentropy": 2.234137535095215, + "loss/hidden": 0.828125, + "loss/logits": 0.1427059769630432, + "loss/reg": 0.021268585696816444, + "step": 698 + }, + { + "epoch": 0.087375, + "grad_norm": 3.0776546001434326, + "grad_norm_var": 2.903130025314302, + "learning_rate": 0.0001, + "loss": 1.0559, + "loss/crossentropy": 2.4638020992279053, + "loss/hidden": 0.7265625, + "loss/logits": 0.1167045459151268, + "loss/reg": 0.02125934511423111, + "step": 699 + }, + { + "epoch": 0.0875, + "grad_norm": 3.9374022483825684, + "grad_norm_var": 2.843209099820052, + "learning_rate": 0.0001, + "loss": 0.9629, + "loss/crossentropy": 2.88035249710083, + "loss/hidden": 0.63671875, + "loss/logits": 0.11369955539703369, + "loss/reg": 0.021249722689390182, + "step": 700 + }, + { + "epoch": 0.087625, + "grad_norm": 4.235450744628906, + "grad_norm_var": 2.8355032825089417, + "learning_rate": 0.0001, + "loss": 1.278, + "loss/crossentropy": 2.2560718059539795, + "loss/hidden": 0.8828125, + "loss/logits": 0.1828073114156723, + "loss/reg": 0.0212401133030653, + "step": 701 + }, + { + "epoch": 0.08775, + "grad_norm": 2.3705031871795654, + "grad_norm_var": 2.9146564397348773, + "learning_rate": 0.0001, + "loss": 0.9899, + "loss/crossentropy": 2.531632661819458, + "loss/hidden": 0.64453125, + "loss/logits": 0.13305732607841492, + "loss/reg": 0.021231388673186302, + "step": 702 + }, + { + "epoch": 0.087875, + "grad_norm": 3.5063636302948, + "grad_norm_var": 2.921798711310286, + "learning_rate": 0.0001, + "loss": 1.0524, + "loss/crossentropy": 2.548243999481201, + "loss/hidden": 0.72265625, + "loss/logits": 0.11747653782367706, + "loss/reg": 0.02122276835143566, + "step": 703 + }, + { + "epoch": 0.088, + "grad_norm": 3.141618013381958, + "grad_norm_var": 2.924554396554724, + "learning_rate": 0.0001, + "loss": 0.9087, + "loss/crossentropy": 2.3723928928375244, + "loss/hidden": 0.59375, + "loss/logits": 0.10285645723342896, + "loss/reg": 0.021213354542851448, + "step": 704 + }, + { + "epoch": 0.088125, + "grad_norm": 4.195517063140869, + "grad_norm_var": 2.2500098957229806, + "learning_rate": 0.0001, + "loss": 1.1263, + "loss/crossentropy": 2.383502244949341, + "loss/hidden": 0.765625, + "loss/logits": 0.14867368340492249, + "loss/reg": 0.021204529330134392, + "step": 705 + }, + { + "epoch": 0.08825, + "grad_norm": 3.646482467651367, + "grad_norm_var": 2.2483885758775686, + "learning_rate": 0.0001, + "loss": 1.1459, + "loss/crossentropy": 2.239716053009033, + "loss/hidden": 0.79296875, + "loss/logits": 0.14099720120429993, + "loss/reg": 0.021195242181420326, + "step": 706 + }, + { + "epoch": 0.088375, + "grad_norm": 3.3426883220672607, + "grad_norm_var": 2.242826109053838, + "learning_rate": 0.0001, + "loss": 1.0323, + "loss/crossentropy": 2.270444869995117, + "loss/hidden": 0.7109375, + "loss/logits": 0.10951918363571167, + "loss/reg": 0.021186839789152145, + "step": 707 + }, + { + "epoch": 0.0885, + "grad_norm": 3.083806037902832, + "grad_norm_var": 2.2462046620558, + "learning_rate": 0.0001, + "loss": 1.086, + "loss/crossentropy": 2.5013253688812256, + "loss/hidden": 0.75390625, + "loss/logits": 0.12026840448379517, + "loss/reg": 0.021178435534238815, + "step": 708 + }, + { + "epoch": 0.088625, + "grad_norm": 3.71588134765625, + "grad_norm_var": 2.147370219186798, + "learning_rate": 0.0001, + "loss": 1.1125, + "loss/crossentropy": 2.342475175857544, + "loss/hidden": 0.76953125, + "loss/logits": 0.13124068081378937, + "loss/reg": 0.021169869229197502, + "step": 709 + }, + { + "epoch": 0.08875, + "grad_norm": 3.4402713775634766, + "grad_norm_var": 2.0734307300644255, + "learning_rate": 0.0001, + "loss": 1.1229, + "loss/crossentropy": 2.4113690853118896, + "loss/hidden": 0.76171875, + "loss/logits": 0.14960095286369324, + "loss/reg": 0.021161576732993126, + "step": 710 + }, + { + "epoch": 0.088875, + "grad_norm": 10.640914916992188, + "grad_norm_var": 4.894724677276531, + "learning_rate": 0.0001, + "loss": 1.7804, + "loss/crossentropy": 2.6555004119873047, + "loss/hidden": 1.328125, + "loss/logits": 0.240725576877594, + "loss/reg": 0.02115357480943203, + "step": 711 + }, + { + "epoch": 0.089, + "grad_norm": 3.23919415473938, + "grad_norm_var": 4.930329406483421, + "learning_rate": 0.0001, + "loss": 1.1368, + "loss/crossentropy": 2.4652106761932373, + "loss/hidden": 0.77734375, + "loss/logits": 0.14798855781555176, + "loss/reg": 0.021144360303878784, + "step": 712 + }, + { + "epoch": 0.089125, + "grad_norm": 2.8362622261047363, + "grad_norm_var": 3.4904838717428524, + "learning_rate": 0.0001, + "loss": 1.0668, + "loss/crossentropy": 2.216999053955078, + "loss/hidden": 0.7265625, + "loss/logits": 0.1288391500711441, + "loss/reg": 0.021136239171028137, + "step": 713 + }, + { + "epoch": 0.08925, + "grad_norm": 5.395811080932617, + "grad_norm_var": 3.623687874884585, + "learning_rate": 0.0001, + "loss": 1.2022, + "loss/crossentropy": 2.463663101196289, + "loss/hidden": 0.82421875, + "loss/logits": 0.1667168289422989, + "loss/reg": 0.02112707309424877, + "step": 714 + }, + { + "epoch": 0.089375, + "grad_norm": 3.475656032562256, + "grad_norm_var": 3.585286252049487, + "learning_rate": 0.0001, + "loss": 1.1524, + "loss/crossentropy": 2.4688560962677, + "loss/hidden": 0.796875, + "loss/logits": 0.14434993267059326, + "loss/reg": 0.02111782319843769, + "step": 715 + }, + { + "epoch": 0.0895, + "grad_norm": 4.29054594039917, + "grad_norm_var": 3.5895333664829043, + "learning_rate": 0.0001, + "loss": 1.1214, + "loss/crossentropy": 2.4820876121520996, + "loss/hidden": 0.7578125, + "loss/logits": 0.15247562527656555, + "loss/reg": 0.021108638495206833, + "step": 716 + }, + { + "epoch": 0.089625, + "grad_norm": 2.927002429962158, + "grad_norm_var": 3.661532010616088, + "learning_rate": 0.0001, + "loss": 1.0904, + "loss/crossentropy": 2.499390125274658, + "loss/hidden": 0.734375, + "loss/logits": 0.14508014917373657, + "loss/reg": 0.021099381148815155, + "step": 717 + }, + { + "epoch": 0.08975, + "grad_norm": 2.529557943344116, + "grad_norm_var": 3.629551988733732, + "learning_rate": 0.0001, + "loss": 1.1194, + "loss/crossentropy": 2.395061492919922, + "loss/hidden": 0.76953125, + "loss/logits": 0.13898837566375732, + "loss/reg": 0.021089982241392136, + "step": 718 + }, + { + "epoch": 0.089875, + "grad_norm": 3.2681446075439453, + "grad_norm_var": 3.6476018392648397, + "learning_rate": 0.0001, + "loss": 1.0036, + "loss/crossentropy": 2.65079927444458, + "loss/hidden": 0.6640625, + "loss/logits": 0.12875254452228546, + "loss/reg": 0.021080130711197853, + "step": 719 + }, + { + "epoch": 0.09, + "grad_norm": 2.6700246334075928, + "grad_norm_var": 3.7122117675621022, + "learning_rate": 0.0001, + "loss": 0.9929, + "loss/crossentropy": 2.428039789199829, + "loss/hidden": 0.65234375, + "loss/logits": 0.12986770272254944, + "loss/reg": 0.021070368587970734, + "step": 720 + }, + { + "epoch": 0.090125, + "grad_norm": 3.296482563018799, + "grad_norm_var": 3.7295350110356558, + "learning_rate": 0.0001, + "loss": 1.1571, + "loss/crossentropy": 2.317190408706665, + "loss/hidden": 0.80078125, + "loss/logits": 0.14570315182209015, + "loss/reg": 0.02106117643415928, + "step": 721 + }, + { + "epoch": 0.09025, + "grad_norm": 4.451722145080566, + "grad_norm_var": 3.74687645800365, + "learning_rate": 0.0001, + "loss": 1.1385, + "loss/crossentropy": 2.2958080768585205, + "loss/hidden": 0.78125, + "loss/logits": 0.146757572889328, + "loss/reg": 0.021051928400993347, + "step": 722 + }, + { + "epoch": 0.090375, + "grad_norm": 2.4288058280944824, + "grad_norm_var": 3.8685376080960414, + "learning_rate": 0.0001, + "loss": 0.9788, + "loss/crossentropy": 2.4692001342773438, + "loss/hidden": 0.65234375, + "loss/logits": 0.11606692522764206, + "loss/reg": 0.021042969077825546, + "step": 723 + }, + { + "epoch": 0.0905, + "grad_norm": 45.21147918701172, + "grad_norm_var": 110.45448625779909, + "learning_rate": 0.0001, + "loss": 1.0823, + "loss/crossentropy": 2.4428551197052, + "loss/hidden": 0.7421875, + "loss/logits": 0.12983539700508118, + "loss/reg": 0.021031970158219337, + "step": 724 + }, + { + "epoch": 0.090625, + "grad_norm": 3.7111551761627197, + "grad_norm_var": 110.45623490585022, + "learning_rate": 0.0001, + "loss": 1.4512, + "loss/crossentropy": 2.1149277687072754, + "loss/hidden": 1.046875, + "loss/logits": 0.1941366195678711, + "loss/reg": 0.021020574495196342, + "step": 725 + }, + { + "epoch": 0.09075, + "grad_norm": 3.3139920234680176, + "grad_norm_var": 110.50855221427315, + "learning_rate": 0.0001, + "loss": 1.0686, + "loss/crossentropy": 2.798910140991211, + "loss/hidden": 0.71875, + "loss/logits": 0.13971024751663208, + "loss/reg": 0.021009519696235657, + "step": 726 + }, + { + "epoch": 0.090875, + "grad_norm": 2.5206596851348877, + "grad_norm_var": 110.12514261997971, + "learning_rate": 0.0001, + "loss": 1.0391, + "loss/crossentropy": 2.400813579559326, + "loss/hidden": 0.6953125, + "loss/logits": 0.13384617865085602, + "loss/reg": 0.020998528227210045, + "step": 727 + }, + { + "epoch": 0.091, + "grad_norm": 2.698018789291382, + "grad_norm_var": 110.34070270952832, + "learning_rate": 0.0001, + "loss": 0.928, + "loss/crossentropy": 2.505309820175171, + "loss/hidden": 0.61328125, + "loss/logits": 0.10482652485370636, + "loss/reg": 0.020989248529076576, + "step": 728 + }, + { + "epoch": 0.091125, + "grad_norm": 6.086211204528809, + "grad_norm_var": 109.65630388036335, + "learning_rate": 0.0001, + "loss": 1.4543, + "loss/crossentropy": 2.065880537033081, + "loss/hidden": 1.1171875, + "loss/logits": 0.12730881571769714, + "loss/reg": 0.020978538319468498, + "step": 729 + }, + { + "epoch": 0.09125, + "grad_norm": 2.527377128601074, + "grad_norm_var": 110.45601242879228, + "learning_rate": 0.0001, + "loss": 0.9281, + "loss/crossentropy": 2.575125217437744, + "loss/hidden": 0.60546875, + "loss/logits": 0.11296658217906952, + "loss/reg": 0.020967954769730568, + "step": 730 + }, + { + "epoch": 0.091375, + "grad_norm": 2.405383825302124, + "grad_norm_var": 110.88254605251709, + "learning_rate": 0.0001, + "loss": 1.0005, + "loss/crossentropy": 2.1985392570495605, + "loss/hidden": 0.671875, + "loss/logits": 0.1190965548157692, + "loss/reg": 0.02095715142786503, + "step": 731 + }, + { + "epoch": 0.0915, + "grad_norm": 5.095945835113525, + "grad_norm_var": 110.75067974759948, + "learning_rate": 0.0001, + "loss": 1.2535, + "loss/crossentropy": 2.451446056365967, + "loss/hidden": 0.88671875, + "loss/logits": 0.15732397139072418, + "loss/reg": 0.020946422591805458, + "step": 732 + }, + { + "epoch": 0.091625, + "grad_norm": 2.497173547744751, + "grad_norm_var": 110.93526847423995, + "learning_rate": 0.0001, + "loss": 1.1301, + "loss/crossentropy": 2.4981319904327393, + "loss/hidden": 0.76953125, + "loss/logits": 0.15116086602210999, + "loss/reg": 0.020936597138643265, + "step": 733 + }, + { + "epoch": 0.09175, + "grad_norm": 4.977592468261719, + "grad_norm_var": 110.20332761050602, + "learning_rate": 0.0001, + "loss": 1.0306, + "loss/crossentropy": 2.37424898147583, + "loss/hidden": 0.703125, + "loss/logits": 0.11818103492259979, + "loss/reg": 0.020926134660840034, + "step": 734 + }, + { + "epoch": 0.091875, + "grad_norm": 2.8039772510528564, + "grad_norm_var": 110.39035266849663, + "learning_rate": 0.0001, + "loss": 0.9389, + "loss/crossentropy": 2.5503880977630615, + "loss/hidden": 0.62109375, + "loss/logits": 0.1085958182811737, + "loss/reg": 0.02091672271490097, + "step": 735 + }, + { + "epoch": 0.092, + "grad_norm": 3.418717861175537, + "grad_norm_var": 110.08862675247053, + "learning_rate": 0.0001, + "loss": 0.9544, + "loss/crossentropy": 2.6695592403411865, + "loss/hidden": 0.62890625, + "loss/logits": 0.11641087383031845, + "loss/reg": 0.02090657874941826, + "step": 736 + }, + { + "epoch": 0.092125, + "grad_norm": 2.807041883468628, + "grad_norm_var": 110.28591938740921, + "learning_rate": 0.0001, + "loss": 1.1135, + "loss/crossentropy": 2.4328622817993164, + "loss/hidden": 0.76953125, + "loss/logits": 0.13500146567821503, + "loss/reg": 0.02089635282754898, + "step": 737 + }, + { + "epoch": 0.09225, + "grad_norm": 3.713057518005371, + "grad_norm_var": 110.4783888232826, + "learning_rate": 0.0001, + "loss": 1.1755, + "loss/crossentropy": 2.0374624729156494, + "loss/hidden": 0.83984375, + "loss/logits": 0.12679257988929749, + "loss/reg": 0.02088700234889984, + "step": 738 + }, + { + "epoch": 0.092375, + "grad_norm": 3.618948459625244, + "grad_norm_var": 109.99807079993953, + "learning_rate": 0.0001, + "loss": 0.9255, + "loss/crossentropy": 2.536527395248413, + "loss/hidden": 0.609375, + "loss/logits": 0.10737244784832001, + "loss/reg": 0.020877836272120476, + "step": 739 + }, + { + "epoch": 0.0925, + "grad_norm": 2.815113067626953, + "grad_norm_var": 1.1792510177041378, + "learning_rate": 0.0001, + "loss": 0.9755, + "loss/crossentropy": 2.464996337890625, + "loss/hidden": 0.640625, + "loss/logits": 0.12616491317749023, + "loss/reg": 0.020868681371212006, + "step": 740 + }, + { + "epoch": 0.092625, + "grad_norm": 3.194117546081543, + "grad_norm_var": 1.1771383378848062, + "learning_rate": 0.0001, + "loss": 1.1989, + "loss/crossentropy": 2.325310707092285, + "loss/hidden": 0.84375, + "loss/logits": 0.14658081531524658, + "loss/reg": 0.020859118551015854, + "step": 741 + }, + { + "epoch": 0.09275, + "grad_norm": 2.970301628112793, + "grad_norm_var": 1.1887296793511715, + "learning_rate": 0.0001, + "loss": 0.9939, + "loss/crossentropy": 2.4747390747070312, + "loss/hidden": 0.66015625, + "loss/logits": 0.1252739280462265, + "loss/reg": 0.020850006490945816, + "step": 742 + }, + { + "epoch": 0.092875, + "grad_norm": 3.7745604515075684, + "grad_norm_var": 1.1425983881417718, + "learning_rate": 0.0001, + "loss": 0.9835, + "loss/crossentropy": 2.4191653728485107, + "loss/hidden": 0.67578125, + "loss/logits": 0.09930374473333359, + "loss/reg": 0.020840618759393692, + "step": 743 + }, + { + "epoch": 0.093, + "grad_norm": 2.8048055171966553, + "grad_norm_var": 1.132423092522494, + "learning_rate": 0.0001, + "loss": 1.083, + "loss/crossentropy": 2.764143228530884, + "loss/hidden": 0.75390625, + "loss/logits": 0.12075912207365036, + "loss/reg": 0.020830942317843437, + "step": 744 + }, + { + "epoch": 0.093125, + "grad_norm": 9.04628849029541, + "grad_norm_var": 2.712848654929009, + "learning_rate": 0.0001, + "loss": 1.4779, + "loss/crossentropy": 2.2979469299316406, + "loss/hidden": 1.125, + "loss/logits": 0.1446615606546402, + "loss/reg": 0.02082117274403572, + "step": 745 + }, + { + "epoch": 0.09325, + "grad_norm": 3.1999406814575195, + "grad_norm_var": 2.6400540651182967, + "learning_rate": 0.0001, + "loss": 1.0424, + "loss/crossentropy": 2.6157429218292236, + "loss/hidden": 0.71875, + "loss/logits": 0.11555971205234528, + "loss/reg": 0.020811092108488083, + "step": 746 + }, + { + "epoch": 0.093375, + "grad_norm": 3.6833674907684326, + "grad_norm_var": 2.522139333113606, + "learning_rate": 0.0001, + "loss": 1.1046, + "loss/crossentropy": 2.6110498905181885, + "loss/hidden": 0.7578125, + "loss/logits": 0.13881847262382507, + "loss/reg": 0.020801017060875893, + "step": 747 + }, + { + "epoch": 0.0935, + "grad_norm": 3.073922872543335, + "grad_norm_var": 2.4218973518929827, + "learning_rate": 0.0001, + "loss": 0.9672, + "loss/crossentropy": 2.0898826122283936, + "loss/hidden": 0.65625, + "loss/logits": 0.10303568840026855, + "loss/reg": 0.020791731774806976, + "step": 748 + }, + { + "epoch": 0.093625, + "grad_norm": 2.9481358528137207, + "grad_norm_var": 2.365294319547022, + "learning_rate": 0.0001, + "loss": 0.9589, + "loss/crossentropy": 2.477987766265869, + "loss/hidden": 0.640625, + "loss/logits": 0.11042475700378418, + "loss/reg": 0.02078239433467388, + "step": 749 + }, + { + "epoch": 0.09375, + "grad_norm": 5.792114734649658, + "grad_norm_var": 2.5478865053407236, + "learning_rate": 0.0001, + "loss": 1.2395, + "loss/crossentropy": 2.0092225074768066, + "loss/hidden": 0.87109375, + "loss/logits": 0.16063663363456726, + "loss/reg": 0.020772725343704224, + "step": 750 + }, + { + "epoch": 0.093875, + "grad_norm": 3.148350954055786, + "grad_norm_var": 2.512823601683457, + "learning_rate": 0.0001, + "loss": 1.14, + "loss/crossentropy": 2.669532537460327, + "loss/hidden": 0.78125, + "loss/logits": 0.15109741687774658, + "loss/reg": 0.020762871950864792, + "step": 751 + }, + { + "epoch": 0.094, + "grad_norm": 3.2779595851898193, + "grad_norm_var": 2.5202896391695124, + "learning_rate": 0.0001, + "loss": 1.1492, + "loss/crossentropy": 2.477522373199463, + "loss/hidden": 0.8125, + "loss/logits": 0.1291784942150116, + "loss/reg": 0.020753389224410057, + "step": 752 + }, + { + "epoch": 0.094125, + "grad_norm": 2.3718457221984863, + "grad_norm_var": 2.586364485192132, + "learning_rate": 0.0001, + "loss": 1.0399, + "loss/crossentropy": 2.497688055038452, + "loss/hidden": 0.69921875, + "loss/logits": 0.13327988982200623, + "loss/reg": 0.02074403502047062, + "step": 753 + }, + { + "epoch": 0.09425, + "grad_norm": 4.972538948059082, + "grad_norm_var": 2.6852568725766104, + "learning_rate": 0.0001, + "loss": 1.2511, + "loss/crossentropy": 2.232241630554199, + "loss/hidden": 0.90234375, + "loss/logits": 0.1414394974708557, + "loss/reg": 0.020734604448080063, + "step": 754 + }, + { + "epoch": 0.094375, + "grad_norm": 2.6991426944732666, + "grad_norm_var": 2.7595134043336267, + "learning_rate": 0.0001, + "loss": 1.001, + "loss/crossentropy": 2.596348285675049, + "loss/hidden": 0.6640625, + "loss/logits": 0.1297152191400528, + "loss/reg": 0.020725268870592117, + "step": 755 + }, + { + "epoch": 0.0945, + "grad_norm": 2.8633017539978027, + "grad_norm_var": 2.753743097466793, + "learning_rate": 0.0001, + "loss": 1.065, + "loss/crossentropy": 2.3198070526123047, + "loss/hidden": 0.74609375, + "loss/logits": 0.11169925332069397, + "loss/reg": 0.020716087892651558, + "step": 756 + }, + { + "epoch": 0.094625, + "grad_norm": 3.628239154815674, + "grad_norm_var": 2.733994536045853, + "learning_rate": 0.0001, + "loss": 1.3141, + "loss/crossentropy": 2.1950411796569824, + "loss/hidden": 0.92578125, + "loss/logits": 0.1812862753868103, + "loss/reg": 0.020707255229353905, + "step": 757 + }, + { + "epoch": 0.09475, + "grad_norm": 2.805727958679199, + "grad_norm_var": 2.753145827217212, + "learning_rate": 0.0001, + "loss": 1.109, + "loss/crossentropy": 2.40027117729187, + "loss/hidden": 0.76953125, + "loss/logits": 0.13251210749149323, + "loss/reg": 0.02069801278412342, + "step": 758 + }, + { + "epoch": 0.094875, + "grad_norm": 3.1892051696777344, + "grad_norm_var": 2.7730842000576295, + "learning_rate": 0.0001, + "loss": 1.1224, + "loss/crossentropy": 2.2343788146972656, + "loss/hidden": 0.77734375, + "loss/logits": 0.1382053792476654, + "loss/reg": 0.020689615979790688, + "step": 759 + }, + { + "epoch": 0.095, + "grad_norm": 2.925913095474243, + "grad_norm_var": 2.759237877311041, + "learning_rate": 0.0001, + "loss": 0.9857, + "loss/crossentropy": 2.5283889770507812, + "loss/hidden": 0.6640625, + "loss/logits": 0.11487281322479248, + "loss/reg": 0.02068025805056095, + "step": 760 + }, + { + "epoch": 0.095125, + "grad_norm": 4.760406970977783, + "grad_norm_var": 0.8673601536624308, + "learning_rate": 0.0001, + "loss": 1.3807, + "loss/crossentropy": 2.8629026412963867, + "loss/hidden": 0.9765625, + "loss/logits": 0.1973596215248108, + "loss/reg": 0.020673030987381935, + "step": 761 + }, + { + "epoch": 0.09525, + "grad_norm": 2.6182596683502197, + "grad_norm_var": 0.9085803501248163, + "learning_rate": 0.0001, + "loss": 0.9554, + "loss/crossentropy": 2.6935925483703613, + "loss/hidden": 0.6328125, + "loss/logits": 0.11598716676235199, + "loss/reg": 0.020663931965827942, + "step": 762 + }, + { + "epoch": 0.095375, + "grad_norm": 3.404240131378174, + "grad_norm_var": 0.9037375089777697, + "learning_rate": 0.0001, + "loss": 1.0173, + "loss/crossentropy": 2.5958027839660645, + "loss/hidden": 0.6953125, + "loss/logits": 0.11542315781116486, + "loss/reg": 0.020654823631048203, + "step": 763 + }, + { + "epoch": 0.0955, + "grad_norm": 3.0021934509277344, + "grad_norm_var": 0.9072250591900151, + "learning_rate": 0.0001, + "loss": 0.9811, + "loss/crossentropy": 2.4611666202545166, + "loss/hidden": 0.6640625, + "loss/logits": 0.11060373485088348, + "loss/reg": 0.020645687356591225, + "step": 764 + }, + { + "epoch": 0.095625, + "grad_norm": 4.123987674713135, + "grad_norm_var": 0.9227216736856043, + "learning_rate": 0.0001, + "loss": 1.2112, + "loss/crossentropy": 2.327146530151367, + "loss/hidden": 0.87109375, + "loss/logits": 0.13372407853603363, + "loss/reg": 0.020637821406126022, + "step": 765 + }, + { + "epoch": 0.09575, + "grad_norm": 3.835116147994995, + "grad_norm_var": 0.5572045887439032, + "learning_rate": 0.0001, + "loss": 1.0062, + "loss/crossentropy": 2.5438876152038574, + "loss/hidden": 0.67578125, + "loss/logits": 0.12415439635515213, + "loss/reg": 0.02063015103340149, + "step": 766 + }, + { + "epoch": 0.095875, + "grad_norm": 3.2649309635162354, + "grad_norm_var": 0.5548939110280107, + "learning_rate": 0.0001, + "loss": 1.1664, + "loss/crossentropy": 2.6621615886688232, + "loss/hidden": 0.80078125, + "loss/logits": 0.15938454866409302, + "loss/reg": 0.020622732117772102, + "step": 767 + }, + { + "epoch": 0.096, + "grad_norm": 3.398061752319336, + "grad_norm_var": 0.554498685348062, + "learning_rate": 0.0001, + "loss": 0.852, + "loss/crossentropy": 2.3231897354125977, + "loss/hidden": 0.5625, + "loss/logits": 0.08329755067825317, + "loss/reg": 0.0206154715269804, + "step": 768 + }, + { + "epoch": 0.096125, + "grad_norm": 2.8744146823883057, + "grad_norm_var": 0.5036373977995244, + "learning_rate": 0.0001, + "loss": 0.9094, + "loss/crossentropy": 2.5140726566314697, + "loss/hidden": 0.6171875, + "loss/logits": 0.0860939770936966, + "loss/reg": 0.020607706159353256, + "step": 769 + }, + { + "epoch": 0.09625, + "grad_norm": 8.404803276062012, + "grad_norm_var": 1.9605456650252857, + "learning_rate": 0.0001, + "loss": 1.614, + "loss/crossentropy": 2.4072842597961426, + "loss/hidden": 1.1328125, + "loss/logits": 0.2751774787902832, + "loss/reg": 0.020600339397788048, + "step": 770 + }, + { + "epoch": 0.096375, + "grad_norm": 2.581511974334717, + "grad_norm_var": 1.975733645477993, + "learning_rate": 0.0001, + "loss": 1.08, + "loss/crossentropy": 2.3724427223205566, + "loss/hidden": 0.73828125, + "loss/logits": 0.1358003169298172, + "loss/reg": 0.02059323526918888, + "step": 771 + }, + { + "epoch": 0.0965, + "grad_norm": 3.9817845821380615, + "grad_norm_var": 1.9433082266341482, + "learning_rate": 0.0001, + "loss": 1.0926, + "loss/crossentropy": 2.459740400314331, + "loss/hidden": 0.765625, + "loss/logits": 0.12109124660491943, + "loss/reg": 0.02058546058833599, + "step": 772 + }, + { + "epoch": 0.096625, + "grad_norm": 7.001491069793701, + "grad_norm_var": 2.633487351928299, + "learning_rate": 0.0001, + "loss": 1.1253, + "loss/crossentropy": 2.6342828273773193, + "loss/hidden": 0.8125, + "loss/logits": 0.10705184936523438, + "loss/reg": 0.020576275885105133, + "step": 773 + }, + { + "epoch": 0.09675, + "grad_norm": 2.4826319217681885, + "grad_norm_var": 2.6865387021083524, + "learning_rate": 0.0001, + "loss": 1.1239, + "loss/crossentropy": 2.537883996963501, + "loss/hidden": 0.7734375, + "loss/logits": 0.1447601616382599, + "loss/reg": 0.02056770585477352, + "step": 774 + }, + { + "epoch": 0.096875, + "grad_norm": 3.5470290184020996, + "grad_norm_var": 2.6622723084153237, + "learning_rate": 0.0001, + "loss": 1.1572, + "loss/crossentropy": 2.572312831878662, + "loss/hidden": 0.80078125, + "loss/logits": 0.15083444118499756, + "loss/reg": 0.020558428019285202, + "step": 775 + }, + { + "epoch": 0.097, + "grad_norm": 2.8388142585754395, + "grad_norm_var": 2.673918444962514, + "learning_rate": 0.0001, + "loss": 1.1426, + "loss/crossentropy": 2.2697503566741943, + "loss/hidden": 0.79296875, + "loss/logits": 0.14412574470043182, + "loss/reg": 0.020549749955534935, + "step": 776 + }, + { + "epoch": 0.097125, + "grad_norm": 2.9166338443756104, + "grad_norm_var": 2.670560695292122, + "learning_rate": 0.0001, + "loss": 0.8926, + "loss/crossentropy": 2.4288480281829834, + "loss/hidden": 0.58203125, + "loss/logits": 0.10516718029975891, + "loss/reg": 0.02054043672978878, + "step": 777 + }, + { + "epoch": 0.09725, + "grad_norm": 3.565744161605835, + "grad_norm_var": 2.58151597609506, + "learning_rate": 0.0001, + "loss": 1.1299, + "loss/crossentropy": 2.444842576980591, + "loss/hidden": 0.77734375, + "loss/logits": 0.1472683846950531, + "loss/reg": 0.020531047135591507, + "step": 778 + }, + { + "epoch": 0.097375, + "grad_norm": 2.8829805850982666, + "grad_norm_var": 2.627842889624617, + "learning_rate": 0.0001, + "loss": 1.1382, + "loss/crossentropy": 2.548234462738037, + "loss/hidden": 0.78515625, + "loss/logits": 0.1478501409292221, + "loss/reg": 0.02052178978919983, + "step": 779 + }, + { + "epoch": 0.0975, + "grad_norm": 2.908299207687378, + "grad_norm_var": 2.6383052442278556, + "learning_rate": 0.0001, + "loss": 1.069, + "loss/crossentropy": 2.3388137817382812, + "loss/hidden": 0.734375, + "loss/logits": 0.12949424982070923, + "loss/reg": 0.020512979477643967, + "step": 780 + }, + { + "epoch": 0.097625, + "grad_norm": 2.5852208137512207, + "grad_norm_var": 2.717361748364273, + "learning_rate": 0.0001, + "loss": 0.9782, + "loss/crossentropy": 2.4337894916534424, + "loss/hidden": 0.65625, + "loss/logits": 0.11689651757478714, + "loss/reg": 0.020503604784607887, + "step": 781 + }, + { + "epoch": 0.09775, + "grad_norm": 5.328097820281982, + "grad_norm_var": 2.885194693951976, + "learning_rate": 0.0001, + "loss": 1.2053, + "loss/crossentropy": 2.5564181804656982, + "loss/hidden": 0.80078125, + "loss/logits": 0.19961750507354736, + "loss/reg": 0.020494818687438965, + "step": 782 + }, + { + "epoch": 0.097875, + "grad_norm": 3.647054672241211, + "grad_norm_var": 2.8678156226553613, + "learning_rate": 0.0001, + "loss": 1.0039, + "loss/crossentropy": 2.5830650329589844, + "loss/hidden": 0.66796875, + "loss/logits": 0.13103139400482178, + "loss/reg": 0.020485466346144676, + "step": 783 + }, + { + "epoch": 0.098, + "grad_norm": 4.480771541595459, + "grad_norm_var": 2.8817531456144723, + "learning_rate": 0.0001, + "loss": 1.3811, + "loss/crossentropy": 2.4550743103027344, + "loss/hidden": 0.9765625, + "loss/logits": 0.1997724324464798, + "loss/reg": 0.020477164536714554, + "step": 784 + }, + { + "epoch": 0.098125, + "grad_norm": 3.1495656967163086, + "grad_norm_var": 2.8497140664534366, + "learning_rate": 0.0001, + "loss": 1.1386, + "loss/crossentropy": 2.846536636352539, + "loss/hidden": 0.76171875, + "loss/logits": 0.1722334325313568, + "loss/reg": 0.020469149574637413, + "step": 785 + }, + { + "epoch": 0.09825, + "grad_norm": 3.611919641494751, + "grad_norm_var": 1.4027508562338329, + "learning_rate": 0.0001, + "loss": 1.0783, + "loss/crossentropy": 2.7328994274139404, + "loss/hidden": 0.73046875, + "loss/logits": 0.14321856200695038, + "loss/reg": 0.020459884777665138, + "step": 786 + }, + { + "epoch": 0.098375, + "grad_norm": 4.535567283630371, + "grad_norm_var": 1.3775118805215696, + "learning_rate": 0.0001, + "loss": 1.3035, + "loss/crossentropy": 2.616641044616699, + "loss/hidden": 0.8515625, + "loss/logits": 0.24746158719062805, + "loss/reg": 0.020450593903660774, + "step": 787 + }, + { + "epoch": 0.0985, + "grad_norm": 2.7010657787323, + "grad_norm_var": 1.4347220572574786, + "learning_rate": 0.0001, + "loss": 0.9439, + "loss/crossentropy": 2.7851712703704834, + "loss/hidden": 0.62890625, + "loss/logits": 0.11055716872215271, + "loss/reg": 0.020441319793462753, + "step": 788 + }, + { + "epoch": 0.098625, + "grad_norm": 3.3690221309661865, + "grad_norm_var": 0.6296018822433316, + "learning_rate": 0.0001, + "loss": 0.9858, + "loss/crossentropy": 2.5199151039123535, + "loss/hidden": 0.6640625, + "loss/logits": 0.11739970743656158, + "loss/reg": 0.020432572811841965, + "step": 789 + }, + { + "epoch": 0.09875, + "grad_norm": 3.5332016944885254, + "grad_norm_var": 0.5687648370759459, + "learning_rate": 0.0001, + "loss": 1.2114, + "loss/crossentropy": 2.3861191272735596, + "loss/hidden": 0.80859375, + "loss/logits": 0.19853942096233368, + "loss/reg": 0.020423252135515213, + "step": 790 + }, + { + "epoch": 0.098875, + "grad_norm": 3.4778192043304443, + "grad_norm_var": 0.5684000998912779, + "learning_rate": 0.0001, + "loss": 1.3337, + "loss/crossentropy": 2.177149772644043, + "loss/hidden": 0.94921875, + "loss/logits": 0.18037351965904236, + "loss/reg": 0.020413951948285103, + "step": 791 + }, + { + "epoch": 0.099, + "grad_norm": 3.1103479862213135, + "grad_norm_var": 0.5501298461305394, + "learning_rate": 0.0001, + "loss": 1.1811, + "loss/crossentropy": 2.381289005279541, + "loss/hidden": 0.8125, + "loss/logits": 0.16453775763511658, + "loss/reg": 0.020404649898409843, + "step": 792 + }, + { + "epoch": 0.099125, + "grad_norm": 2.7654690742492676, + "grad_norm_var": 0.563068172749172, + "learning_rate": 0.0001, + "loss": 0.9788, + "loss/crossentropy": 2.6645448207855225, + "loss/hidden": 0.65234375, + "loss/logits": 0.12248219549655914, + "loss/reg": 0.020395854488015175, + "step": 793 + }, + { + "epoch": 0.09925, + "grad_norm": 2.9942195415496826, + "grad_norm_var": 0.5768165563916947, + "learning_rate": 0.0001, + "loss": 1.1054, + "loss/crossentropy": 2.388904571533203, + "loss/hidden": 0.73828125, + "loss/logits": 0.16325941681861877, + "loss/reg": 0.020387381315231323, + "step": 794 + }, + { + "epoch": 0.099375, + "grad_norm": 3.1070520877838135, + "grad_norm_var": 0.5632370819485725, + "learning_rate": 0.0001, + "loss": 1.0819, + "loss/crossentropy": 2.4645302295684814, + "loss/hidden": 0.74609375, + "loss/logits": 0.13203924894332886, + "loss/reg": 0.020378144457936287, + "step": 795 + }, + { + "epoch": 0.0995, + "grad_norm": 3.5182595252990723, + "grad_norm_var": 0.5419026805153273, + "learning_rate": 0.0001, + "loss": 1.3242, + "loss/crossentropy": 2.337388038635254, + "loss/hidden": 0.96875, + "loss/logits": 0.15180304646492004, + "loss/reg": 0.020368557423353195, + "step": 796 + }, + { + "epoch": 0.099625, + "grad_norm": 2.4217121601104736, + "grad_norm_var": 0.5634005753459926, + "learning_rate": 0.0001, + "loss": 0.976, + "loss/crossentropy": 2.520176887512207, + "loss/hidden": 0.6640625, + "loss/logits": 0.1083153486251831, + "loss/reg": 0.020359758287668228, + "step": 797 + }, + { + "epoch": 0.09975, + "grad_norm": 3.818143129348755, + "grad_norm_var": 0.33472096860269646, + "learning_rate": 0.0001, + "loss": 1.2273, + "loss/crossentropy": 2.452592134475708, + "loss/hidden": 0.86328125, + "loss/logits": 0.1604856252670288, + "loss/reg": 0.02035023830831051, + "step": 798 + }, + { + "epoch": 0.099875, + "grad_norm": 2.8298075199127197, + "grad_norm_var": 0.3484620943588491, + "learning_rate": 0.0001, + "loss": 1.094, + "loss/crossentropy": 2.6919286251068115, + "loss/hidden": 0.73828125, + "loss/logits": 0.15227138996124268, + "loss/reg": 0.020340625196695328, + "step": 799 + }, + { + "epoch": 0.1, + "grad_norm": 45.50175094604492, + "grad_norm_var": 111.76340644728633, + "learning_rate": 0.0001, + "loss": 1.6213, + "loss/crossentropy": 2.2076241970062256, + "loss/hidden": 1.125, + "loss/logits": 0.2930048406124115, + "loss/reg": 0.020331410691142082, + "step": 800 + }, + { + "epoch": 0.100125, + "grad_norm": 3.073981523513794, + "grad_norm_var": 111.7915103772579, + "learning_rate": 0.0001, + "loss": 1.1146, + "loss/crossentropy": 2.5723347663879395, + "loss/hidden": 0.77734375, + "loss/logits": 0.13403840363025665, + "loss/reg": 0.020322071388363838, + "step": 801 + }, + { + "epoch": 0.10025, + "grad_norm": 4.337286472320557, + "grad_norm_var": 111.60328751499571, + "learning_rate": 0.0001, + "loss": 1.2228, + "loss/crossentropy": 2.7784037590026855, + "loss/hidden": 0.83984375, + "loss/logits": 0.17983263731002808, + "loss/reg": 0.020312372595071793, + "step": 802 + }, + { + "epoch": 0.100375, + "grad_norm": 3.656367778778076, + "grad_norm_var": 111.81663718658596, + "learning_rate": 0.0001, + "loss": 1.1003, + "loss/crossentropy": 2.4638426303863525, + "loss/hidden": 0.76953125, + "loss/logits": 0.12776704132556915, + "loss/reg": 0.020302986726164818, + "step": 803 + }, + { + "epoch": 0.1005, + "grad_norm": 3.38935923576355, + "grad_norm_var": 111.55373057700994, + "learning_rate": 0.0001, + "loss": 0.9749, + "loss/crossentropy": 2.4947969913482666, + "loss/hidden": 0.6640625, + "loss/logits": 0.10794391483068466, + "loss/reg": 0.020293867215514183, + "step": 804 + }, + { + "epoch": 0.100625, + "grad_norm": 3.2775380611419678, + "grad_norm_var": 111.58551029522327, + "learning_rate": 0.0001, + "loss": 1.133, + "loss/crossentropy": 2.4812419414520264, + "loss/hidden": 0.77734375, + "loss/logits": 0.15277597308158875, + "loss/reg": 0.020284701138734818, + "step": 805 + }, + { + "epoch": 0.10075, + "grad_norm": 2.996157646179199, + "grad_norm_var": 111.77485823890761, + "learning_rate": 0.0001, + "loss": 1.0998, + "loss/crossentropy": 2.516984224319458, + "loss/hidden": 0.75, + "loss/logits": 0.14709463715553284, + "loss/reg": 0.020275365561246872, + "step": 806 + }, + { + "epoch": 0.100875, + "grad_norm": 5.575490951538086, + "grad_norm_var": 111.37459403701224, + "learning_rate": 0.0001, + "loss": 1.0629, + "loss/crossentropy": 2.6265506744384766, + "loss/hidden": 0.73828125, + "loss/logits": 0.12198775261640549, + "loss/reg": 0.02026602067053318, + "step": 807 + }, + { + "epoch": 0.101, + "grad_norm": 2.653712749481201, + "grad_norm_var": 111.56498102164149, + "learning_rate": 0.0001, + "loss": 0.957, + "loss/crossentropy": 2.755121946334839, + "loss/hidden": 0.6328125, + "loss/logits": 0.12166983634233475, + "loss/reg": 0.02025618776679039, + "step": 808 + }, + { + "epoch": 0.101125, + "grad_norm": 2.5454790592193604, + "grad_norm_var": 111.66272758702647, + "learning_rate": 0.0001, + "loss": 1.0209, + "loss/crossentropy": 2.475360870361328, + "loss/hidden": 0.6953125, + "loss/logits": 0.1231006383895874, + "loss/reg": 0.02024705521762371, + "step": 809 + }, + { + "epoch": 0.10125, + "grad_norm": 12.002355575561523, + "grad_norm_var": 113.1469842386682, + "learning_rate": 0.0001, + "loss": 1.1289, + "loss/crossentropy": 2.485873222351074, + "loss/hidden": 0.77734375, + "loss/logits": 0.1492297500371933, + "loss/reg": 0.020237451419234276, + "step": 810 + }, + { + "epoch": 0.101375, + "grad_norm": 3.0118675231933594, + "grad_norm_var": 113.19117010752396, + "learning_rate": 0.0001, + "loss": 1.0673, + "loss/crossentropy": 2.610520124435425, + "loss/hidden": 0.7265625, + "loss/logits": 0.13847574591636658, + "loss/reg": 0.020227529108524323, + "step": 811 + }, + { + "epoch": 0.1015, + "grad_norm": 37.33988952636719, + "grad_norm_var": 171.06705552642327, + "learning_rate": 0.0001, + "loss": 4.5438, + "loss/crossentropy": 3.938170909881592, + "loss/hidden": 2.203125, + "loss/logits": 2.1385304927825928, + "loss/reg": 0.02021711878478527, + "step": 812 + }, + { + "epoch": 0.101625, + "grad_norm": 2.306735038757324, + "grad_norm_var": 171.16339278078715, + "learning_rate": 0.0001, + "loss": 0.928, + "loss/crossentropy": 2.3164937496185303, + "loss/hidden": 0.625, + "loss/logits": 0.10087703168392181, + "loss/reg": 0.020207591354846954, + "step": 813 + }, + { + "epoch": 0.10175, + "grad_norm": 3.00903058052063, + "grad_norm_var": 171.72501112959992, + "learning_rate": 0.0001, + "loss": 1.1753, + "loss/crossentropy": 2.4386990070343018, + "loss/hidden": 0.828125, + "loss/logits": 0.1452336609363556, + "loss/reg": 0.02019745111465454, + "step": 814 + }, + { + "epoch": 0.101875, + "grad_norm": 2.680140972137451, + "grad_norm_var": 171.84144221114087, + "learning_rate": 0.0001, + "loss": 1.157, + "loss/crossentropy": 2.531343460083008, + "loss/hidden": 0.796875, + "loss/logits": 0.15824541449546814, + "loss/reg": 0.020187031477689743, + "step": 815 + }, + { + "epoch": 0.102, + "grad_norm": 2.582740068435669, + "grad_norm_var": 75.71062264039246, + "learning_rate": 0.0001, + "loss": 1.0065, + "loss/crossentropy": 2.5258586406707764, + "loss/hidden": 0.69140625, + "loss/logits": 0.11330173909664154, + "loss/reg": 0.02017681486904621, + "step": 816 + }, + { + "epoch": 0.102125, + "grad_norm": 3.5809519290924072, + "grad_norm_var": 75.53549752812218, + "learning_rate": 0.0001, + "loss": 1.2756, + "loss/crossentropy": 2.080437183380127, + "loss/hidden": 0.90625, + "loss/logits": 0.16763544082641602, + "loss/reg": 0.02016652189195156, + "step": 817 + }, + { + "epoch": 0.10225, + "grad_norm": 3.093592405319214, + "grad_norm_var": 75.89695881356819, + "learning_rate": 0.0001, + "loss": 1.1484, + "loss/crossentropy": 2.630833625793457, + "loss/hidden": 0.80078125, + "loss/logits": 0.14600682258605957, + "loss/reg": 0.020157409831881523, + "step": 818 + }, + { + "epoch": 0.102375, + "grad_norm": 3.1120946407318115, + "grad_norm_var": 76.0751246894024, + "learning_rate": 0.0001, + "loss": 1.038, + "loss/crossentropy": 2.5411901473999023, + "loss/hidden": 0.69140625, + "loss/logits": 0.14511412382125854, + "loss/reg": 0.020148303359746933, + "step": 819 + }, + { + "epoch": 0.1025, + "grad_norm": 2.9903945922851562, + "grad_norm_var": 76.21449508483448, + "learning_rate": 0.0001, + "loss": 1.0769, + "loss/crossentropy": 2.556246042251587, + "loss/hidden": 0.73828125, + "loss/logits": 0.13721047341823578, + "loss/reg": 0.020139139145612717, + "step": 820 + }, + { + "epoch": 0.102625, + "grad_norm": 3.297760248184204, + "grad_norm_var": 76.2077263993312, + "learning_rate": 0.0001, + "loss": 1.025, + "loss/crossentropy": 2.5704898834228516, + "loss/hidden": 0.6796875, + "loss/logits": 0.14399868249893188, + "loss/reg": 0.020129989832639694, + "step": 821 + }, + { + "epoch": 0.10275, + "grad_norm": 3.810601234436035, + "grad_norm_var": 75.94485425030823, + "learning_rate": 0.0001, + "loss": 1.2142, + "loss/crossentropy": 2.4684388637542725, + "loss/hidden": 0.83984375, + "loss/logits": 0.17318351566791534, + "loss/reg": 0.020120643079280853, + "step": 822 + }, + { + "epoch": 0.102875, + "grad_norm": 2.798835515975952, + "grad_norm_var": 76.52818091118122, + "learning_rate": 0.0001, + "loss": 0.9558, + "loss/crossentropy": 2.476940393447876, + "loss/hidden": 0.6484375, + "loss/logits": 0.10626688599586487, + "loss/reg": 0.020111503079533577, + "step": 823 + }, + { + "epoch": 0.103, + "grad_norm": 4.541812419891357, + "grad_norm_var": 75.99013496754353, + "learning_rate": 0.0001, + "loss": 1.1311, + "loss/crossentropy": 2.339744806289673, + "loss/hidden": 0.7265625, + "loss/logits": 0.2034740447998047, + "loss/reg": 0.020102351903915405, + "step": 824 + }, + { + "epoch": 0.103125, + "grad_norm": 2.6360268592834473, + "grad_norm_var": 75.95142766348106, + "learning_rate": 0.0001, + "loss": 1.0849, + "loss/crossentropy": 2.3795337677001953, + "loss/hidden": 0.74609375, + "loss/logits": 0.1378898024559021, + "loss/reg": 0.02009383775293827, + "step": 825 + }, + { + "epoch": 0.10325, + "grad_norm": 3.2206642627716064, + "grad_norm_var": 73.50864103963063, + "learning_rate": 0.0001, + "loss": 1.09, + "loss/crossentropy": 2.645775556564331, + "loss/hidden": 0.7578125, + "loss/logits": 0.13137856125831604, + "loss/reg": 0.020084405317902565, + "step": 826 + }, + { + "epoch": 0.103375, + "grad_norm": 3.2071568965911865, + "grad_norm_var": 73.45272548167614, + "learning_rate": 0.0001, + "loss": 0.9383, + "loss/crossentropy": 2.7224762439727783, + "loss/hidden": 0.625, + "loss/logits": 0.11251779645681381, + "loss/reg": 0.020074598491191864, + "step": 827 + }, + { + "epoch": 0.1035, + "grad_norm": 3.2850613594055176, + "grad_norm_var": 0.2863261800221416, + "learning_rate": 0.0001, + "loss": 1.1194, + "loss/crossentropy": 2.522642135620117, + "loss/hidden": 0.7421875, + "loss/logits": 0.17652815580368042, + "loss/reg": 0.02006435953080654, + "step": 828 + }, + { + "epoch": 0.103625, + "grad_norm": 3.5068256855010986, + "grad_norm_var": 0.24387138774257647, + "learning_rate": 0.0001, + "loss": 0.9457, + "loss/crossentropy": 2.6463444232940674, + "loss/hidden": 0.6328125, + "loss/logits": 0.11229754984378815, + "loss/reg": 0.020054515451192856, + "step": 829 + }, + { + "epoch": 0.10375, + "grad_norm": 4.16710090637207, + "grad_norm_var": 0.29672115328222404, + "learning_rate": 0.0001, + "loss": 1.301, + "loss/crossentropy": 2.519270420074463, + "loss/hidden": 0.921875, + "loss/logits": 0.1786651611328125, + "loss/reg": 0.020044928416609764, + "step": 830 + }, + { + "epoch": 0.103875, + "grad_norm": 3.5839273929595947, + "grad_norm_var": 0.27524789373512704, + "learning_rate": 0.0001, + "loss": 0.9949, + "loss/crossentropy": 2.0604701042175293, + "loss/hidden": 0.68359375, + "loss/logits": 0.1109282597899437, + "loss/reg": 0.02003585919737816, + "step": 831 + }, + { + "epoch": 0.104, + "grad_norm": 2.8121650218963623, + "grad_norm_var": 0.25541980739101955, + "learning_rate": 0.0001, + "loss": 1.1057, + "loss/crossentropy": 2.670191764831543, + "loss/hidden": 0.75390625, + "loss/logits": 0.1515616476535797, + "loss/reg": 0.020026110112667084, + "step": 832 + }, + { + "epoch": 0.104125, + "grad_norm": 3.116018295288086, + "grad_norm_var": 0.2547872758708628, + "learning_rate": 0.0001, + "loss": 1.0301, + "loss/crossentropy": 3.016143321990967, + "loss/hidden": 0.6796875, + "loss/logits": 0.1502160131931305, + "loss/reg": 0.020016156136989594, + "step": 833 + }, + { + "epoch": 0.10425, + "grad_norm": 6.202447414398193, + "grad_norm_var": 0.7634439694535559, + "learning_rate": 0.0001, + "loss": 1.2045, + "loss/crossentropy": 2.8383448123931885, + "loss/hidden": 0.84765625, + "loss/logits": 0.15678462386131287, + "loss/reg": 0.020007088780403137, + "step": 834 + }, + { + "epoch": 0.104375, + "grad_norm": 3.4904069900512695, + "grad_norm_var": 0.7519116349075012, + "learning_rate": 0.0001, + "loss": 1.2658, + "loss/crossentropy": 2.597175359725952, + "loss/hidden": 0.84375, + "loss/logits": 0.22209098935127258, + "loss/reg": 0.019998185336589813, + "step": 835 + }, + { + "epoch": 0.1045, + "grad_norm": 4.6242594718933105, + "grad_norm_var": 0.7986550791863064, + "learning_rate": 0.0001, + "loss": 1.1411, + "loss/crossentropy": 2.6017813682556152, + "loss/hidden": 0.796875, + "loss/logits": 0.14435435831546783, + "loss/reg": 0.019988389685750008, + "step": 836 + }, + { + "epoch": 0.104625, + "grad_norm": 3.2676827907562256, + "grad_norm_var": 0.800099420481778, + "learning_rate": 0.0001, + "loss": 1.1908, + "loss/crossentropy": 2.454334259033203, + "loss/hidden": 0.82421875, + "loss/logits": 0.16681547462940216, + "loss/reg": 0.019979091361165047, + "step": 837 + }, + { + "epoch": 0.10475, + "grad_norm": 3.4931089878082275, + "grad_norm_var": 0.7992595598721048, + "learning_rate": 0.0001, + "loss": 1.1803, + "loss/crossentropy": 2.2890915870666504, + "loss/hidden": 0.828125, + "loss/logits": 0.15247204899787903, + "loss/reg": 0.019969170913100243, + "step": 838 + }, + { + "epoch": 0.104875, + "grad_norm": 2.7106313705444336, + "grad_norm_var": 0.8094277801425143, + "learning_rate": 0.0001, + "loss": 1.097, + "loss/crossentropy": 2.56706166267395, + "loss/hidden": 0.76953125, + "loss/logits": 0.12783397734165192, + "loss/reg": 0.019960079342126846, + "step": 839 + }, + { + "epoch": 0.105, + "grad_norm": 3.2825114727020264, + "grad_norm_var": 0.7531900707246374, + "learning_rate": 0.0001, + "loss": 1.2726, + "loss/crossentropy": 2.324145555496216, + "loss/hidden": 0.88671875, + "loss/logits": 0.18636029958724976, + "loss/reg": 0.019951237365603447, + "step": 840 + }, + { + "epoch": 0.105125, + "grad_norm": 2.752570152282715, + "grad_norm_var": 0.7400250579900473, + "learning_rate": 0.0001, + "loss": 1.0885, + "loss/crossentropy": 2.4273452758789062, + "loss/hidden": 0.7578125, + "loss/logits": 0.1312946379184723, + "loss/reg": 0.01994233950972557, + "step": 841 + }, + { + "epoch": 0.10525, + "grad_norm": 3.451720714569092, + "grad_norm_var": 0.733364881032247, + "learning_rate": 0.0001, + "loss": 1.1042, + "loss/crossentropy": 2.5271100997924805, + "loss/hidden": 0.76171875, + "loss/logits": 0.1431449055671692, + "loss/reg": 0.019932815805077553, + "step": 842 + }, + { + "epoch": 0.105375, + "grad_norm": 3.0586564540863037, + "grad_norm_var": 0.741721542830341, + "learning_rate": 0.0001, + "loss": 1.0412, + "loss/crossentropy": 2.9447247982025146, + "loss/hidden": 0.7109375, + "loss/logits": 0.13100013136863708, + "loss/reg": 0.019923273473978043, + "step": 843 + }, + { + "epoch": 0.1055, + "grad_norm": 3.4214084148406982, + "grad_norm_var": 0.7380611813534226, + "learning_rate": 0.0001, + "loss": 1.2281, + "loss/crossentropy": 2.3005340099334717, + "loss/hidden": 0.86328125, + "loss/logits": 0.1657242327928543, + "loss/reg": 0.01991339959204197, + "step": 844 + }, + { + "epoch": 0.105625, + "grad_norm": 2.6607367992401123, + "grad_norm_var": 0.7886706735221035, + "learning_rate": 0.0001, + "loss": 1.1345, + "loss/crossentropy": 2.5435853004455566, + "loss/hidden": 0.80078125, + "loss/logits": 0.1347101330757141, + "loss/reg": 0.019903138279914856, + "step": 845 + }, + { + "epoch": 0.10575, + "grad_norm": 3.0965163707733154, + "grad_norm_var": 0.7659307635756494, + "learning_rate": 0.0001, + "loss": 1.0956, + "loss/crossentropy": 2.6083555221557617, + "loss/hidden": 0.74609375, + "loss/logits": 0.1505199819803238, + "loss/reg": 0.019893797114491463, + "step": 846 + }, + { + "epoch": 0.105875, + "grad_norm": 4.309004306793213, + "grad_norm_var": 0.8127957898222188, + "learning_rate": 0.0001, + "loss": 1.078, + "loss/crossentropy": 2.515653371810913, + "loss/hidden": 0.76171875, + "loss/logits": 0.11742238700389862, + "loss/reg": 0.019883660599589348, + "step": 847 + }, + { + "epoch": 0.106, + "grad_norm": 3.248624563217163, + "grad_norm_var": 0.7855834171862691, + "learning_rate": 0.0001, + "loss": 1.0838, + "loss/crossentropy": 2.593562126159668, + "loss/hidden": 0.75390625, + "loss/logits": 0.13119123876094818, + "loss/reg": 0.019874349236488342, + "step": 848 + }, + { + "epoch": 0.106125, + "grad_norm": 3.347926378250122, + "grad_norm_var": 0.7767115778534122, + "learning_rate": 0.0001, + "loss": 1.0816, + "loss/crossentropy": 2.3947036266326904, + "loss/hidden": 0.75, + "loss/logits": 0.13295108079910278, + "loss/reg": 0.019864298403263092, + "step": 849 + }, + { + "epoch": 0.10625, + "grad_norm": 4.369658946990967, + "grad_norm_var": 0.33264170947598637, + "learning_rate": 0.0001, + "loss": 1.0708, + "loss/crossentropy": 2.449446201324463, + "loss/hidden": 0.7421875, + "loss/logits": 0.13007891178131104, + "loss/reg": 0.019854165613651276, + "step": 850 + }, + { + "epoch": 0.106375, + "grad_norm": 3.361130475997925, + "grad_norm_var": 0.332327660409797, + "learning_rate": 0.0001, + "loss": 0.9761, + "loss/crossentropy": 2.566408157348633, + "loss/hidden": 0.66796875, + "loss/logits": 0.10966208577156067, + "loss/reg": 0.019843947142362595, + "step": 851 + }, + { + "epoch": 0.1065, + "grad_norm": 2.6885204315185547, + "grad_norm_var": 0.25144665871681204, + "learning_rate": 0.0001, + "loss": 0.9331, + "loss/crossentropy": 2.3620035648345947, + "loss/hidden": 0.625, + "loss/logits": 0.10973039269447327, + "loss/reg": 0.019834715873003006, + "step": 852 + }, + { + "epoch": 0.106625, + "grad_norm": 2.615734338760376, + "grad_norm_var": 0.2793016853206145, + "learning_rate": 0.0001, + "loss": 1.0168, + "loss/crossentropy": 2.605545997619629, + "loss/hidden": 0.69140625, + "loss/logits": 0.12715153396129608, + "loss/reg": 0.01982559822499752, + "step": 853 + }, + { + "epoch": 0.10675, + "grad_norm": 3.073012113571167, + "grad_norm_var": 0.276254032788457, + "learning_rate": 0.0001, + "loss": 0.9855, + "loss/crossentropy": 2.634042739868164, + "loss/hidden": 0.671875, + "loss/logits": 0.11542729288339615, + "loss/reg": 0.019816165789961815, + "step": 854 + }, + { + "epoch": 0.106875, + "grad_norm": 2.759152412414551, + "grad_norm_var": 0.27313479552050995, + "learning_rate": 0.0001, + "loss": 0.9145, + "loss/crossentropy": 2.4920616149902344, + "loss/hidden": 0.61328125, + "loss/logits": 0.10316716134548187, + "loss/reg": 0.019806833937764168, + "step": 855 + }, + { + "epoch": 0.107, + "grad_norm": 3.647084951400757, + "grad_norm_var": 0.28455080731760823, + "learning_rate": 0.0001, + "loss": 1.1336, + "loss/crossentropy": 2.34993314743042, + "loss/hidden": 0.78515625, + "loss/logits": 0.150485098361969, + "loss/reg": 0.019797123968601227, + "step": 856 + }, + { + "epoch": 0.107125, + "grad_norm": 5.309474468231201, + "grad_norm_var": 0.5265287098230971, + "learning_rate": 0.0001, + "loss": 1.3513, + "loss/crossentropy": 2.4511518478393555, + "loss/hidden": 0.94140625, + "loss/logits": 0.21205759048461914, + "loss/reg": 0.01978708617389202, + "step": 857 + }, + { + "epoch": 0.10725, + "grad_norm": 3.1982548236846924, + "grad_norm_var": 0.5288348795583182, + "learning_rate": 0.0001, + "loss": 1.1847, + "loss/crossentropy": 2.2767844200134277, + "loss/hidden": 0.8359375, + "loss/logits": 0.15097308158874512, + "loss/reg": 0.01977648213505745, + "step": 858 + }, + { + "epoch": 0.107375, + "grad_norm": 3.7261335849761963, + "grad_norm_var": 0.5276094221235986, + "learning_rate": 0.0001, + "loss": 1.1138, + "loss/crossentropy": 2.660844087600708, + "loss/hidden": 0.78515625, + "loss/logits": 0.13101539015769958, + "loss/reg": 0.01976662687957287, + "step": 859 + }, + { + "epoch": 0.1075, + "grad_norm": 3.3435637950897217, + "grad_norm_var": 0.5280464375318101, + "learning_rate": 0.0001, + "loss": 1.2877, + "loss/crossentropy": 2.4701263904571533, + "loss/hidden": 0.90625, + "loss/logits": 0.18388135731220245, + "loss/reg": 0.019757471978664398, + "step": 860 + }, + { + "epoch": 0.107625, + "grad_norm": 2.630688190460205, + "grad_norm_var": 0.5311534898567278, + "learning_rate": 0.0001, + "loss": 1.063, + "loss/crossentropy": 2.5359132289886475, + "loss/hidden": 0.734375, + "loss/logits": 0.1311398446559906, + "loss/reg": 0.019748201593756676, + "step": 861 + }, + { + "epoch": 0.10775, + "grad_norm": 2.5543456077575684, + "grad_norm_var": 0.5729300014134933, + "learning_rate": 0.0001, + "loss": 0.987, + "loss/crossentropy": 2.3763418197631836, + "loss/hidden": 0.671875, + "loss/logits": 0.11773102730512619, + "loss/reg": 0.01973855495452881, + "step": 862 + }, + { + "epoch": 0.107875, + "grad_norm": 2.8768351078033447, + "grad_norm_var": 0.5249464789316676, + "learning_rate": 0.0001, + "loss": 1.11, + "loss/crossentropy": 2.467682361602783, + "loss/hidden": 0.7734375, + "loss/logits": 0.13924749195575714, + "loss/reg": 0.019729435443878174, + "step": 863 + }, + { + "epoch": 0.108, + "grad_norm": 9.608988761901855, + "grad_norm_var": 3.012409881249372, + "learning_rate": 0.0001, + "loss": 1.8371, + "loss/crossentropy": 2.4410533905029297, + "loss/hidden": 1.359375, + "loss/logits": 0.280517578125, + "loss/reg": 0.019719891250133514, + "step": 864 + }, + { + "epoch": 0.108125, + "grad_norm": 2.6161296367645264, + "grad_norm_var": 3.079687357926654, + "learning_rate": 0.0001, + "loss": 0.9254, + "loss/crossentropy": 2.656090497970581, + "loss/hidden": 0.6171875, + "loss/logits": 0.11112320423126221, + "loss/reg": 0.01971041038632393, + "step": 865 + }, + { + "epoch": 0.10825, + "grad_norm": 6.282230377197266, + "grad_norm_var": 3.4921671952336317, + "learning_rate": 0.0001, + "loss": 1.1269, + "loss/crossentropy": 2.4799087047576904, + "loss/hidden": 0.80078125, + "loss/logits": 0.12914146482944489, + "loss/reg": 0.01970127783715725, + "step": 866 + }, + { + "epoch": 0.108375, + "grad_norm": 3.052783727645874, + "grad_norm_var": 3.514845564297898, + "learning_rate": 0.0001, + "loss": 1.1383, + "loss/crossentropy": 2.4700145721435547, + "loss/hidden": 0.78515625, + "loss/logits": 0.15620394051074982, + "loss/reg": 0.019691679626703262, + "step": 867 + }, + { + "epoch": 0.1085, + "grad_norm": 3.2105300426483154, + "grad_norm_var": 3.458070348929603, + "learning_rate": 0.0001, + "loss": 1.067, + "loss/crossentropy": 2.135777473449707, + "loss/hidden": 0.7578125, + "loss/logits": 0.11233663558959961, + "loss/reg": 0.019682079553604126, + "step": 868 + }, + { + "epoch": 0.108625, + "grad_norm": 2.7341737747192383, + "grad_norm_var": 3.4405364793380135, + "learning_rate": 0.0001, + "loss": 1.0495, + "loss/crossentropy": 2.4592809677124023, + "loss/hidden": 0.73046875, + "loss/logits": 0.1223248764872551, + "loss/reg": 0.019672293215990067, + "step": 869 + }, + { + "epoch": 0.10875, + "grad_norm": 4.007035732269287, + "grad_norm_var": 3.4058996890488658, + "learning_rate": 0.0001, + "loss": 1.1341, + "loss/crossentropy": 2.5008175373077393, + "loss/hidden": 0.80078125, + "loss/logits": 0.13670845329761505, + "loss/reg": 0.01966211199760437, + "step": 870 + }, + { + "epoch": 0.108875, + "grad_norm": 3.146348476409912, + "grad_norm_var": 3.359090924722083, + "learning_rate": 0.0001, + "loss": 1.2459, + "loss/crossentropy": 2.0440711975097656, + "loss/hidden": 0.8984375, + "loss/logits": 0.15090960264205933, + "loss/reg": 0.019652366638183594, + "step": 871 + }, + { + "epoch": 0.109, + "grad_norm": 4.632335662841797, + "grad_norm_var": 3.3902752468766453, + "learning_rate": 0.0001, + "loss": 1.1446, + "loss/crossentropy": 2.4574947357177734, + "loss/hidden": 0.80859375, + "loss/logits": 0.13958214223384857, + "loss/reg": 0.01964336633682251, + "step": 872 + }, + { + "epoch": 0.109125, + "grad_norm": 5.440892219543457, + "grad_norm_var": 3.415471723579617, + "learning_rate": 0.0001, + "loss": 1.1098, + "loss/crossentropy": 2.565347671508789, + "loss/hidden": 0.8046875, + "loss/logits": 0.10876456648111343, + "loss/reg": 0.01963435485959053, + "step": 873 + }, + { + "epoch": 0.10925, + "grad_norm": 2.873284101486206, + "grad_norm_var": 3.454269091905695, + "learning_rate": 0.0001, + "loss": 1.0607, + "loss/crossentropy": 2.561459541320801, + "loss/hidden": 0.72265625, + "loss/logits": 0.14177533984184265, + "loss/reg": 0.01962495781481266, + "step": 874 + }, + { + "epoch": 0.109375, + "grad_norm": 3.0819554328918457, + "grad_norm_var": 3.496943197417559, + "learning_rate": 0.0001, + "loss": 1.2422, + "loss/crossentropy": 2.2944936752319336, + "loss/hidden": 0.8828125, + "loss/logits": 0.16319133341312408, + "loss/reg": 0.019616009667515755, + "step": 875 + }, + { + "epoch": 0.1095, + "grad_norm": 4.361453533172607, + "grad_norm_var": 3.488792217244569, + "learning_rate": 0.0001, + "loss": 1.3828, + "loss/crossentropy": 2.4624311923980713, + "loss/hidden": 1.0078125, + "loss/logits": 0.1789003610610962, + "loss/reg": 0.019607286900281906, + "step": 876 + }, + { + "epoch": 0.109625, + "grad_norm": 3.34078049659729, + "grad_norm_var": 3.3959280790074216, + "learning_rate": 0.0001, + "loss": 0.9589, + "loss/crossentropy": 2.4549739360809326, + "loss/hidden": 0.64453125, + "loss/logits": 0.11839590966701508, + "loss/reg": 0.019598115235567093, + "step": 877 + }, + { + "epoch": 0.10975, + "grad_norm": 3.434715986251831, + "grad_norm_var": 3.2759937907981884, + "learning_rate": 0.0001, + "loss": 1.1192, + "loss/crossentropy": 2.571798086166382, + "loss/hidden": 0.7578125, + "loss/logits": 0.16550706326961517, + "loss/reg": 0.01958884485065937, + "step": 878 + }, + { + "epoch": 0.109875, + "grad_norm": 3.593993663787842, + "grad_norm_var": 3.196554005024426, + "learning_rate": 0.0001, + "loss": 1.1792, + "loss/crossentropy": 2.447843551635742, + "loss/hidden": 0.83203125, + "loss/logits": 0.15137381851673126, + "loss/reg": 0.019579457119107246, + "step": 879 + }, + { + "epoch": 0.11, + "grad_norm": 3.3654563426971436, + "grad_norm_var": 1.0373482238282006, + "learning_rate": 0.0001, + "loss": 1.0326, + "loss/crossentropy": 2.6210246086120605, + "loss/hidden": 0.71484375, + "loss/logits": 0.12209475785493851, + "loss/reg": 0.019569827243685722, + "step": 880 + }, + { + "epoch": 0.110125, + "grad_norm": 2.6370863914489746, + "grad_norm_var": 1.0343516088559113, + "learning_rate": 0.0001, + "loss": 1.1442, + "loss/crossentropy": 2.2681713104248047, + "loss/hidden": 0.81640625, + "loss/logits": 0.132216215133667, + "loss/reg": 0.019560784101486206, + "step": 881 + }, + { + "epoch": 0.11025, + "grad_norm": 2.152343273162842, + "grad_norm_var": 0.6782700998492743, + "learning_rate": 0.0001, + "loss": 1.0136, + "loss/crossentropy": 2.437594175338745, + "loss/hidden": 0.70703125, + "loss/logits": 0.11108942329883575, + "loss/reg": 0.019551947712898254, + "step": 882 + }, + { + "epoch": 0.110375, + "grad_norm": 3.2755074501037598, + "grad_norm_var": 0.6698247850929626, + "learning_rate": 0.0001, + "loss": 0.9972, + "loss/crossentropy": 2.481198310852051, + "loss/hidden": 0.6796875, + "loss/logits": 0.12207823246717453, + "loss/reg": 0.01954270713031292, + "step": 883 + }, + { + "epoch": 0.1105, + "grad_norm": 3.770535707473755, + "grad_norm_var": 0.6711344077538037, + "learning_rate": 0.0001, + "loss": 1.0475, + "loss/crossentropy": 2.434851884841919, + "loss/hidden": 0.71875, + "loss/logits": 0.13345816731452942, + "loss/reg": 0.019533507525920868, + "step": 884 + }, + { + "epoch": 0.110625, + "grad_norm": 2.7666234970092773, + "grad_norm_var": 0.6679279033368438, + "learning_rate": 0.0001, + "loss": 1.1166, + "loss/crossentropy": 2.433852434158325, + "loss/hidden": 0.78125, + "loss/logits": 0.1401294767856598, + "loss/reg": 0.01952442154288292, + "step": 885 + }, + { + "epoch": 0.11075, + "grad_norm": 2.7105534076690674, + "grad_norm_var": 0.6840409496040507, + "learning_rate": 0.0001, + "loss": 1.0924, + "loss/crossentropy": 2.219019889831543, + "loss/hidden": 0.765625, + "loss/logits": 0.13160449266433716, + "loss/reg": 0.019515201449394226, + "step": 886 + }, + { + "epoch": 0.110875, + "grad_norm": 2.8931920528411865, + "grad_norm_var": 0.6969961519386968, + "learning_rate": 0.0001, + "loss": 1.1134, + "loss/crossentropy": 2.511371612548828, + "loss/hidden": 0.78125, + "loss/logits": 0.13708999752998352, + "loss/reg": 0.019505700096488, + "step": 887 + }, + { + "epoch": 0.111, + "grad_norm": 2.821718215942383, + "grad_norm_var": 0.6033415037749854, + "learning_rate": 0.0001, + "loss": 1.2446, + "loss/crossentropy": 2.389580011367798, + "loss/hidden": 0.8828125, + "loss/logits": 0.16678079962730408, + "loss/reg": 0.019496839493513107, + "step": 888 + }, + { + "epoch": 0.111125, + "grad_norm": 2.833461284637451, + "grad_norm_var": 0.2778808504856213, + "learning_rate": 0.0001, + "loss": 1.029, + "loss/crossentropy": 2.1618683338165283, + "loss/hidden": 0.70703125, + "loss/logits": 0.12710769474506378, + "loss/reg": 0.019487854093313217, + "step": 889 + }, + { + "epoch": 0.11125, + "grad_norm": 3.3806753158569336, + "grad_norm_var": 0.2773113837378702, + "learning_rate": 0.0001, + "loss": 1.1349, + "loss/crossentropy": 2.656121253967285, + "loss/hidden": 0.79296875, + "loss/logits": 0.1471368372440338, + "loss/reg": 0.019478676840662956, + "step": 890 + }, + { + "epoch": 0.111375, + "grad_norm": 2.932758092880249, + "grad_norm_var": 0.2800811641911004, + "learning_rate": 0.0001, + "loss": 1.1, + "loss/crossentropy": 2.5325851440429688, + "loss/hidden": 0.76171875, + "loss/logits": 0.14360320568084717, + "loss/reg": 0.019469575956463814, + "step": 891 + }, + { + "epoch": 0.1115, + "grad_norm": 2.3603265285491943, + "grad_norm_var": 0.20497304301798067, + "learning_rate": 0.0001, + "loss": 1.0739, + "loss/crossentropy": 2.3697421550750732, + "loss/hidden": 0.75, + "loss/logits": 0.1292482614517212, + "loss/reg": 0.01946048066020012, + "step": 892 + }, + { + "epoch": 0.111625, + "grad_norm": 4.056443214416504, + "grad_norm_var": 0.2678930990243863, + "learning_rate": 0.0001, + "loss": 1.0008, + "loss/crossentropy": 2.7553114891052246, + "loss/hidden": 0.68359375, + "loss/logits": 0.12268239259719849, + "loss/reg": 0.019451187923550606, + "step": 893 + }, + { + "epoch": 0.11175, + "grad_norm": 3.346064329147339, + "grad_norm_var": 0.2639738255705176, + "learning_rate": 0.0001, + "loss": 1.1791, + "loss/crossentropy": 2.4348855018615723, + "loss/hidden": 0.7890625, + "loss/logits": 0.19565626978874207, + "loss/reg": 0.019442636519670486, + "step": 894 + }, + { + "epoch": 0.111875, + "grad_norm": 6.878971576690674, + "grad_norm_var": 1.1740357353356365, + "learning_rate": 0.0001, + "loss": 1.7085, + "loss/crossentropy": 2.9299001693725586, + "loss/hidden": 1.1640625, + "loss/logits": 0.3501082956790924, + "loss/reg": 0.019433531910181046, + "step": 895 + }, + { + "epoch": 0.112, + "grad_norm": 3.1834142208099365, + "grad_norm_var": 1.1735802221223497, + "learning_rate": 0.0001, + "loss": 1.2953, + "loss/crossentropy": 2.4165709018707275, + "loss/hidden": 0.92578125, + "loss/logits": 0.17523059248924255, + "loss/reg": 0.019424354657530785, + "step": 896 + }, + { + "epoch": 0.112125, + "grad_norm": 2.5551555156707764, + "grad_norm_var": 1.180695081530242, + "learning_rate": 0.0001, + "loss": 1.0171, + "loss/crossentropy": 2.4345083236694336, + "loss/hidden": 0.6953125, + "loss/logits": 0.1276446282863617, + "loss/reg": 0.019415004178881645, + "step": 897 + }, + { + "epoch": 0.11225, + "grad_norm": 3.5786855220794678, + "grad_norm_var": 1.1000748366509354, + "learning_rate": 0.0001, + "loss": 0.9925, + "loss/crossentropy": 2.464219808578491, + "loss/hidden": 0.69140625, + "loss/logits": 0.10706112533807755, + "loss/reg": 0.01940576173365116, + "step": 898 + }, + { + "epoch": 0.112375, + "grad_norm": 8.784457206726074, + "grad_norm_var": 2.9538895197120096, + "learning_rate": 0.0001, + "loss": 1.6799, + "loss/crossentropy": 2.3273468017578125, + "loss/hidden": 1.2265625, + "loss/logits": 0.25934016704559326, + "loss/reg": 0.019396713003516197, + "step": 899 + }, + { + "epoch": 0.1125, + "grad_norm": 4.44765567779541, + "grad_norm_var": 2.990871190956643, + "learning_rate": 0.0001, + "loss": 1.0685, + "loss/crossentropy": 2.5760438442230225, + "loss/hidden": 0.73046875, + "loss/logits": 0.14414295554161072, + "loss/reg": 0.019387517124414444, + "step": 900 + }, + { + "epoch": 0.112625, + "grad_norm": 4.9651265144348145, + "grad_norm_var": 3.013306784613239, + "learning_rate": 0.0001, + "loss": 1.1917, + "loss/crossentropy": 2.5148589611053467, + "loss/hidden": 0.828125, + "loss/logits": 0.1697455495595932, + "loss/reg": 0.01937839388847351, + "step": 901 + }, + { + "epoch": 0.11275, + "grad_norm": 3.8322513103485107, + "grad_norm_var": 2.9203267227302745, + "learning_rate": 0.0001, + "loss": 1.2309, + "loss/crossentropy": 2.536973476409912, + "loss/hidden": 0.8828125, + "loss/logits": 0.15436521172523499, + "loss/reg": 0.019369108602404594, + "step": 902 + }, + { + "epoch": 0.112875, + "grad_norm": 3.5994656085968018, + "grad_norm_var": 2.8540415836766555, + "learning_rate": 0.0001, + "loss": 1.0761, + "loss/crossentropy": 2.3160693645477295, + "loss/hidden": 0.76171875, + "loss/logits": 0.12077254056930542, + "loss/reg": 0.019359666854143143, + "step": 903 + }, + { + "epoch": 0.113, + "grad_norm": 4.031139373779297, + "grad_norm_var": 2.7599236229360753, + "learning_rate": 0.0001, + "loss": 1.0988, + "loss/crossentropy": 2.5636203289031982, + "loss/hidden": 0.78515625, + "loss/logits": 0.12018904089927673, + "loss/reg": 0.019350115209817886, + "step": 904 + }, + { + "epoch": 0.113125, + "grad_norm": 3.2178378105163574, + "grad_norm_var": 2.7069185907568794, + "learning_rate": 0.0001, + "loss": 1.0472, + "loss/crossentropy": 2.3457703590393066, + "loss/hidden": 0.72265625, + "loss/logits": 0.13114379346370697, + "loss/reg": 0.019340479746460915, + "step": 905 + }, + { + "epoch": 0.11325, + "grad_norm": 3.096679925918579, + "grad_norm_var": 2.7381334427643536, + "learning_rate": 0.0001, + "loss": 0.9662, + "loss/crossentropy": 2.3510820865631104, + "loss/hidden": 0.671875, + "loss/logits": 0.10106582939624786, + "loss/reg": 0.019330844283103943, + "step": 906 + }, + { + "epoch": 0.113375, + "grad_norm": 7.590158462524414, + "grad_norm_var": 3.3974738441650887, + "learning_rate": 0.0001, + "loss": 1.6666, + "loss/crossentropy": 2.3520147800445557, + "loss/hidden": 1.2265625, + "loss/logits": 0.2468346357345581, + "loss/reg": 0.019321195781230927, + "step": 907 + }, + { + "epoch": 0.1135, + "grad_norm": 3.3216323852539062, + "grad_norm_var": 3.20081618522182, + "learning_rate": 0.0001, + "loss": 1.2612, + "loss/crossentropy": 2.330040216445923, + "loss/hidden": 0.90234375, + "loss/logits": 0.16571447253227234, + "loss/reg": 0.019311606884002686, + "step": 908 + }, + { + "epoch": 0.113625, + "grad_norm": 3.4920501708984375, + "grad_norm_var": 3.246978809627046, + "learning_rate": 0.0001, + "loss": 1.1161, + "loss/crossentropy": 2.598465919494629, + "loss/hidden": 0.76953125, + "loss/logits": 0.15353354811668396, + "loss/reg": 0.019302019849419594, + "step": 909 + }, + { + "epoch": 0.11375, + "grad_norm": 3.0414321422576904, + "grad_norm_var": 3.294370585536838, + "learning_rate": 0.0001, + "loss": 1.0693, + "loss/crossentropy": 2.459712028503418, + "loss/hidden": 0.7421875, + "loss/logits": 0.13418710231781006, + "loss/reg": 0.019292324781417847, + "step": 910 + }, + { + "epoch": 0.113875, + "grad_norm": 3.131361722946167, + "grad_norm_var": 2.908980195007492, + "learning_rate": 0.0001, + "loss": 1.2301, + "loss/crossentropy": 2.268738269805908, + "loss/hidden": 0.875, + "loss/logits": 0.16223490238189697, + "loss/reg": 0.019283456727862358, + "step": 911 + }, + { + "epoch": 0.114, + "grad_norm": 3.160707950592041, + "grad_norm_var": 2.9118381902992776, + "learning_rate": 0.0001, + "loss": 1.3173, + "loss/crossentropy": 2.2185730934143066, + "loss/hidden": 0.9453125, + "loss/logits": 0.17927365005016327, + "loss/reg": 0.019274268299341202, + "step": 912 + }, + { + "epoch": 0.114125, + "grad_norm": 14.604296684265137, + "grad_norm_var": 9.479147248477696, + "learning_rate": 0.0001, + "loss": 1.4278, + "loss/crossentropy": 2.2313976287841797, + "loss/hidden": 1.078125, + "loss/logits": 0.15699920058250427, + "loss/reg": 0.019266733899712563, + "step": 913 + }, + { + "epoch": 0.11425, + "grad_norm": 3.744290590286255, + "grad_norm_var": 9.452382803070204, + "learning_rate": 0.0001, + "loss": 1.1776, + "loss/crossentropy": 2.689699172973633, + "loss/hidden": 0.83203125, + "loss/logits": 0.15302729606628418, + "loss/reg": 0.019258547574281693, + "step": 914 + }, + { + "epoch": 0.114375, + "grad_norm": 2.999354839324951, + "grad_norm_var": 8.531466626401157, + "learning_rate": 0.0001, + "loss": 1.3107, + "loss/crossentropy": 1.8664510250091553, + "loss/hidden": 0.9453125, + "loss/logits": 0.1728517711162567, + "loss/reg": 0.01925109326839447, + "step": 915 + }, + { + "epoch": 0.1145, + "grad_norm": 3.194913148880005, + "grad_norm_var": 8.64117053500835, + "learning_rate": 0.0001, + "loss": 1.3016, + "loss/crossentropy": 2.25707745552063, + "loss/hidden": 0.95703125, + "loss/logits": 0.15214993059635162, + "loss/reg": 0.019243914633989334, + "step": 916 + }, + { + "epoch": 0.114625, + "grad_norm": 2.335845708847046, + "grad_norm_var": 8.88876728908843, + "learning_rate": 0.0001, + "loss": 0.9484, + "loss/crossentropy": 2.6386678218841553, + "loss/hidden": 0.63671875, + "loss/logits": 0.11931365728378296, + "loss/reg": 0.019234785810112953, + "step": 917 + }, + { + "epoch": 0.11475, + "grad_norm": 3.6922056674957275, + "grad_norm_var": 8.898252742921356, + "learning_rate": 0.0001, + "loss": 1.0667, + "loss/crossentropy": 2.5539944171905518, + "loss/hidden": 0.73046875, + "loss/logits": 0.14395025372505188, + "loss/reg": 0.019226964563131332, + "step": 918 + }, + { + "epoch": 0.114875, + "grad_norm": 2.946389675140381, + "grad_norm_var": 8.982934878513694, + "learning_rate": 0.0001, + "loss": 1.0911, + "loss/crossentropy": 2.3651351928710938, + "loss/hidden": 0.76171875, + "loss/logits": 0.1371677815914154, + "loss/reg": 0.019219111651182175, + "step": 919 + }, + { + "epoch": 0.115, + "grad_norm": 3.050600051879883, + "grad_norm_var": 9.068373446668678, + "learning_rate": 0.0001, + "loss": 1.0419, + "loss/crossentropy": 2.3772196769714355, + "loss/hidden": 0.734375, + "loss/logits": 0.11545050889253616, + "loss/reg": 0.019211286678910255, + "step": 920 + }, + { + "epoch": 0.115125, + "grad_norm": 2.954526424407959, + "grad_norm_var": 9.105915478669973, + "learning_rate": 0.0001, + "loss": 1.0807, + "loss/crossentropy": 2.2472290992736816, + "loss/hidden": 0.75390625, + "loss/logits": 0.13482055068016052, + "loss/reg": 0.019202249124646187, + "step": 921 + }, + { + "epoch": 0.11525, + "grad_norm": 6.214420795440674, + "grad_norm_var": 9.27670245999236, + "learning_rate": 0.0001, + "loss": 1.8562, + "loss/crossentropy": 2.9924590587615967, + "loss/hidden": 1.2734375, + "loss/logits": 0.3907894492149353, + "loss/reg": 0.019193273037672043, + "step": 922 + }, + { + "epoch": 0.115375, + "grad_norm": 4.534095287322998, + "grad_norm_var": 8.536934613221737, + "learning_rate": 0.0001, + "loss": 1.4885, + "loss/crossentropy": 2.394090414047241, + "loss/hidden": 1.078125, + "loss/logits": 0.2185368537902832, + "loss/reg": 0.019184142351150513, + "step": 923 + }, + { + "epoch": 0.1155, + "grad_norm": 3.6761627197265625, + "grad_norm_var": 8.505579278095963, + "learning_rate": 0.0001, + "loss": 1.1306, + "loss/crossentropy": 2.3566806316375732, + "loss/hidden": 0.7890625, + "loss/logits": 0.14977289736270905, + "loss/reg": 0.019175738096237183, + "step": 924 + }, + { + "epoch": 0.115625, + "grad_norm": 4.592898368835449, + "grad_norm_var": 8.481328607270026, + "learning_rate": 0.0001, + "loss": 1.2942, + "loss/crossentropy": 2.7458250522613525, + "loss/hidden": 0.953125, + "loss/logits": 0.14940449595451355, + "loss/reg": 0.01916695386171341, + "step": 925 + }, + { + "epoch": 0.11575, + "grad_norm": 2.7483372688293457, + "grad_norm_var": 8.533618684340597, + "learning_rate": 0.0001, + "loss": 1.0727, + "loss/crossentropy": 2.4450652599334717, + "loss/hidden": 0.75390625, + "loss/logits": 0.12724441289901733, + "loss/reg": 0.019158538430929184, + "step": 926 + }, + { + "epoch": 0.115875, + "grad_norm": 3.075195074081421, + "grad_norm_var": 8.541996814909611, + "learning_rate": 0.0001, + "loss": 1.1251, + "loss/crossentropy": 2.3960814476013184, + "loss/hidden": 0.76171875, + "loss/logits": 0.1718631386756897, + "loss/reg": 0.019150495529174805, + "step": 927 + }, + { + "epoch": 0.116, + "grad_norm": 3.313359022140503, + "grad_norm_var": 8.521887542243068, + "learning_rate": 0.0001, + "loss": 1.1316, + "loss/crossentropy": 2.3464736938476562, + "loss/hidden": 0.8125, + "loss/logits": 0.1276848018169403, + "loss/reg": 0.019141457974910736, + "step": 928 + }, + { + "epoch": 0.116125, + "grad_norm": 4.252639293670654, + "grad_norm_var": 0.9000980544993648, + "learning_rate": 0.0001, + "loss": 1.1818, + "loss/crossentropy": 2.523444414138794, + "loss/hidden": 0.83203125, + "loss/logits": 0.15845248103141785, + "loss/reg": 0.01913331262767315, + "step": 929 + }, + { + "epoch": 0.11625, + "grad_norm": 3.097456455230713, + "grad_norm_var": 0.9123223599265882, + "learning_rate": 0.0001, + "loss": 1.1264, + "loss/crossentropy": 2.595120668411255, + "loss/hidden": 0.7734375, + "loss/logits": 0.16165900230407715, + "loss/reg": 0.019125619903206825, + "step": 930 + }, + { + "epoch": 0.116375, + "grad_norm": 3.292982816696167, + "grad_norm_var": 0.8964505136113113, + "learning_rate": 0.0001, + "loss": 1.1132, + "loss/crossentropy": 2.4093523025512695, + "loss/hidden": 0.77734375, + "loss/logits": 0.14468303322792053, + "loss/reg": 0.019116582348942757, + "step": 931 + }, + { + "epoch": 0.1165, + "grad_norm": 2.559980869293213, + "grad_norm_var": 0.952617731514821, + "learning_rate": 0.0001, + "loss": 0.9697, + "loss/crossentropy": 2.5491104125976562, + "loss/hidden": 0.6484375, + "loss/logits": 0.13018551468849182, + "loss/reg": 0.019108334556221962, + "step": 932 + }, + { + "epoch": 0.116625, + "grad_norm": 3.058579683303833, + "grad_norm_var": 0.871050822267735, + "learning_rate": 0.0001, + "loss": 1.2919, + "loss/crossentropy": 2.303837537765503, + "loss/hidden": 0.9296875, + "loss/logits": 0.17123734951019287, + "loss/reg": 0.019099365919828415, + "step": 933 + }, + { + "epoch": 0.11675, + "grad_norm": 4.866446495056152, + "grad_norm_var": 0.9769503909617764, + "learning_rate": 0.0001, + "loss": 1.1876, + "loss/crossentropy": 2.364741325378418, + "loss/hidden": 0.8359375, + "loss/logits": 0.1607905626296997, + "loss/reg": 0.019090238958597183, + "step": 934 + }, + { + "epoch": 0.116875, + "grad_norm": 5.912527561187744, + "grad_norm_var": 1.252657817578581, + "learning_rate": 0.0001, + "loss": 1.4831, + "loss/crossentropy": 2.641162633895874, + "loss/hidden": 1.09375, + "loss/logits": 0.19855040311813354, + "loss/reg": 0.019081177189946175, + "step": 935 + }, + { + "epoch": 0.117, + "grad_norm": 2.8301663398742676, + "grad_norm_var": 1.2784556528629765, + "learning_rate": 0.0001, + "loss": 1.0878, + "loss/crossentropy": 2.277157783508301, + "loss/hidden": 0.765625, + "loss/logits": 0.13142800331115723, + "loss/reg": 0.019072722643613815, + "step": 936 + }, + { + "epoch": 0.117125, + "grad_norm": 2.9044833183288574, + "grad_norm_var": 1.2843284928455583, + "learning_rate": 0.0001, + "loss": 1.1381, + "loss/crossentropy": 2.3311665058135986, + "loss/hidden": 0.796875, + "loss/logits": 0.1506001055240631, + "loss/reg": 0.01906409114599228, + "step": 937 + }, + { + "epoch": 0.11725, + "grad_norm": 3.78202223777771, + "grad_norm_var": 0.8736988295376342, + "learning_rate": 0.0001, + "loss": 1.2869, + "loss/crossentropy": 2.4287989139556885, + "loss/hidden": 0.8984375, + "loss/logits": 0.197871595621109, + "loss/reg": 0.019055521115660667, + "step": 938 + }, + { + "epoch": 0.117375, + "grad_norm": 3.1086387634277344, + "grad_norm_var": 0.8338185014655397, + "learning_rate": 0.0001, + "loss": 1.0576, + "loss/crossentropy": 2.335935354232788, + "loss/hidden": 0.73828125, + "loss/logits": 0.1288556456565857, + "loss/reg": 0.019047552719712257, + "step": 939 + }, + { + "epoch": 0.1175, + "grad_norm": 4.869723320007324, + "grad_norm_var": 0.9402287231159085, + "learning_rate": 0.0001, + "loss": 1.0378, + "loss/crossentropy": 2.9672138690948486, + "loss/hidden": 0.7109375, + "loss/logits": 0.13644808530807495, + "loss/reg": 0.01903851516544819, + "step": 940 + }, + { + "epoch": 0.117625, + "grad_norm": 2.8750882148742676, + "grad_norm_var": 0.9067692046415797, + "learning_rate": 0.0001, + "loss": 1.0037, + "loss/crossentropy": 2.3138959407806396, + "loss/hidden": 0.69140625, + "loss/logits": 0.12202918529510498, + "loss/reg": 0.019029438495635986, + "step": 941 + }, + { + "epoch": 0.11775, + "grad_norm": 3.044121742248535, + "grad_norm_var": 0.8812433820018912, + "learning_rate": 0.0001, + "loss": 1.0173, + "loss/crossentropy": 2.4543912410736084, + "loss/hidden": 0.70703125, + "loss/logits": 0.12005805224180222, + "loss/reg": 0.01902030035853386, + "step": 942 + }, + { + "epoch": 0.117875, + "grad_norm": 2.945160150527954, + "grad_norm_var": 0.8905794039935603, + "learning_rate": 0.0001, + "loss": 1.1688, + "loss/crossentropy": 2.3482954502105713, + "loss/hidden": 0.828125, + "loss/logits": 0.15057498216629028, + "loss/reg": 0.019011201336979866, + "step": 943 + }, + { + "epoch": 0.118, + "grad_norm": 3.0109965801239014, + "grad_norm_var": 0.9056152589294107, + "learning_rate": 0.0001, + "loss": 1.3721, + "loss/crossentropy": 2.353516101837158, + "loss/hidden": 0.984375, + "loss/logits": 0.1977054923772812, + "loss/reg": 0.0190016757696867, + "step": 944 + }, + { + "epoch": 0.118125, + "grad_norm": 3.424039363861084, + "grad_norm_var": 0.8682128423744849, + "learning_rate": 0.0001, + "loss": 1.1647, + "loss/crossentropy": 2.513850450515747, + "loss/hidden": 0.81640625, + "loss/logits": 0.15838034451007843, + "loss/reg": 0.018992552533745766, + "step": 945 + }, + { + "epoch": 0.11825, + "grad_norm": 2.7656776905059814, + "grad_norm_var": 0.8917454992029661, + "learning_rate": 0.0001, + "loss": 1.0819, + "loss/crossentropy": 2.650881767272949, + "loss/hidden": 0.75, + "loss/logits": 0.1420745849609375, + "loss/reg": 0.01898341253399849, + "step": 946 + }, + { + "epoch": 0.118375, + "grad_norm": 4.2130560874938965, + "grad_norm_var": 0.9250033835133629, + "learning_rate": 0.0001, + "loss": 1.2469, + "loss/crossentropy": 2.683312177658081, + "loss/hidden": 0.90625, + "loss/logits": 0.15088841319084167, + "loss/reg": 0.018973875790834427, + "step": 947 + }, + { + "epoch": 0.1185, + "grad_norm": 3.834226608276367, + "grad_norm_var": 0.8649633510209883, + "learning_rate": 0.0001, + "loss": 0.9767, + "loss/crossentropy": 2.5706145763397217, + "loss/hidden": 0.6640625, + "loss/logits": 0.12295837700366974, + "loss/reg": 0.018963845446705818, + "step": 948 + }, + { + "epoch": 0.118625, + "grad_norm": 3.3009235858917236, + "grad_norm_var": 0.8514524765901378, + "learning_rate": 0.0001, + "loss": 1.1857, + "loss/crossentropy": 2.6797893047332764, + "loss/hidden": 0.82421875, + "loss/logits": 0.17191748321056366, + "loss/reg": 0.018954817205667496, + "step": 949 + }, + { + "epoch": 0.11875, + "grad_norm": 6.452317237854004, + "grad_norm_var": 1.2752747995844325, + "learning_rate": 0.0001, + "loss": 1.4528, + "loss/crossentropy": 2.7505640983581543, + "loss/hidden": 1.0546875, + "loss/logits": 0.20869939029216766, + "loss/reg": 0.018945740535855293, + "step": 950 + }, + { + "epoch": 0.118875, + "grad_norm": 3.405714273452759, + "grad_norm_var": 0.9300412257064863, + "learning_rate": 0.0001, + "loss": 1.0922, + "loss/crossentropy": 2.802401065826416, + "loss/hidden": 0.7578125, + "loss/logits": 0.14506830275058746, + "loss/reg": 0.018936749547719955, + "step": 951 + }, + { + "epoch": 0.119, + "grad_norm": 9.662891387939453, + "grad_norm_var": 3.194050081601077, + "learning_rate": 0.0001, + "loss": 1.5003, + "loss/crossentropy": 2.700052499771118, + "loss/hidden": 1.1171875, + "loss/logits": 0.19381779432296753, + "loss/reg": 0.01892753876745701, + "step": 952 + }, + { + "epoch": 0.119125, + "grad_norm": 2.8670711517333984, + "grad_norm_var": 3.199477320796336, + "learning_rate": 0.0001, + "loss": 1.0153, + "loss/crossentropy": 2.490739107131958, + "loss/hidden": 0.69921875, + "loss/logits": 0.12687504291534424, + "loss/reg": 0.01891852729022503, + "step": 953 + }, + { + "epoch": 0.11925, + "grad_norm": 4.066836357116699, + "grad_norm_var": 3.1973098694543274, + "learning_rate": 0.0001, + "loss": 1.2704, + "loss/crossentropy": 2.4122891426086426, + "loss/hidden": 0.9140625, + "loss/logits": 0.1672634333372116, + "loss/reg": 0.018909232690930367, + "step": 954 + }, + { + "epoch": 0.119375, + "grad_norm": 2.8044497966766357, + "grad_norm_var": 3.2388562001879793, + "learning_rate": 0.0001, + "loss": 1.0385, + "loss/crossentropy": 2.473945379257202, + "loss/hidden": 0.71484375, + "loss/logits": 0.13464638590812683, + "loss/reg": 0.018900100141763687, + "step": 955 + }, + { + "epoch": 0.1195, + "grad_norm": 3.7214255332946777, + "grad_norm_var": 3.1837278954586044, + "learning_rate": 0.0001, + "loss": 1.0832, + "loss/crossentropy": 2.6148641109466553, + "loss/hidden": 0.75390625, + "loss/logits": 0.1403505802154541, + "loss/reg": 0.01889113523066044, + "step": 956 + }, + { + "epoch": 0.119625, + "grad_norm": 3.0579674243927, + "grad_norm_var": 3.160836005262268, + "learning_rate": 0.0001, + "loss": 1.1761, + "loss/crossentropy": 2.5016398429870605, + "loss/hidden": 0.83984375, + "loss/logits": 0.1474056839942932, + "loss/reg": 0.0188821442425251, + "step": 957 + }, + { + "epoch": 0.11975, + "grad_norm": 2.429112672805786, + "grad_norm_var": 3.255565314691306, + "learning_rate": 0.0001, + "loss": 0.9853, + "loss/crossentropy": 2.3262507915496826, + "loss/hidden": 0.66796875, + "loss/logits": 0.1286056935787201, + "loss/reg": 0.018873048946261406, + "step": 958 + }, + { + "epoch": 0.119875, + "grad_norm": 2.9561123847961426, + "grad_norm_var": 3.2542184489207098, + "learning_rate": 0.0001, + "loss": 0.9739, + "loss/crossentropy": 2.5669143199920654, + "loss/hidden": 0.65234375, + "loss/logits": 0.13291960954666138, + "loss/reg": 0.018863873556256294, + "step": 959 + }, + { + "epoch": 0.12, + "grad_norm": 3.1904752254486084, + "grad_norm_var": 3.2355963683487268, + "learning_rate": 0.0001, + "loss": 1.074, + "loss/crossentropy": 2.5522408485412598, + "loss/hidden": 0.75390625, + "loss/logits": 0.1315881609916687, + "loss/reg": 0.018855126574635506, + "step": 960 + }, + { + "epoch": 0.120125, + "grad_norm": 7.435352802276611, + "grad_norm_var": 3.9949775747956586, + "learning_rate": 0.0001, + "loss": 1.6007, + "loss/crossentropy": 2.7422168254852295, + "loss/hidden": 1.1875, + "loss/logits": 0.2246999442577362, + "loss/reg": 0.018846556544303894, + "step": 961 + }, + { + "epoch": 0.12025, + "grad_norm": 4.614249229431152, + "grad_norm_var": 3.8709926395951384, + "learning_rate": 0.0001, + "loss": 1.3908, + "loss/crossentropy": 2.356748104095459, + "loss/hidden": 0.953125, + "loss/logits": 0.24928607046604156, + "loss/reg": 0.018837420269846916, + "step": 962 + }, + { + "epoch": 0.120375, + "grad_norm": 4.978577136993408, + "grad_norm_var": 3.903770487124885, + "learning_rate": 0.0001, + "loss": 1.4333, + "loss/crossentropy": 2.2456307411193848, + "loss/hidden": 1.0703125, + "loss/logits": 0.17467612028121948, + "loss/reg": 0.01882883533835411, + "step": 963 + }, + { + "epoch": 0.1205, + "grad_norm": 5.51161003112793, + "grad_norm_var": 3.97576236618078, + "learning_rate": 0.0001, + "loss": 1.2552, + "loss/crossentropy": 2.7595887184143066, + "loss/hidden": 0.8828125, + "loss/logits": 0.18415382504463196, + "loss/reg": 0.01882052607834339, + "step": 964 + }, + { + "epoch": 0.120625, + "grad_norm": 3.2307283878326416, + "grad_norm_var": 3.9863892013288393, + "learning_rate": 0.0001, + "loss": 1.2728, + "loss/crossentropy": 2.48502779006958, + "loss/hidden": 0.88671875, + "loss/logits": 0.19795754551887512, + "loss/reg": 0.01881156861782074, + "step": 965 + }, + { + "epoch": 0.12075, + "grad_norm": 3.441767454147339, + "grad_norm_var": 3.7286595116638916, + "learning_rate": 0.0001, + "loss": 1.182, + "loss/crossentropy": 2.508680582046509, + "loss/hidden": 0.8359375, + "loss/logits": 0.15802894532680511, + "loss/reg": 0.018803071230649948, + "step": 966 + }, + { + "epoch": 0.120875, + "grad_norm": 4.063246726989746, + "grad_norm_var": 3.685090208705704, + "learning_rate": 0.0001, + "loss": 1.3354, + "loss/crossentropy": 2.47729229927063, + "loss/hidden": 0.984375, + "loss/logits": 0.16310644149780273, + "loss/reg": 0.018794314935803413, + "step": 967 + }, + { + "epoch": 0.121, + "grad_norm": 5.8381171226501465, + "grad_norm_var": 1.8400005684538645, + "learning_rate": 0.0001, + "loss": 1.1025, + "loss/crossentropy": 2.5666096210479736, + "loss/hidden": 0.77734375, + "loss/logits": 0.1373094618320465, + "loss/reg": 0.018785255029797554, + "step": 968 + }, + { + "epoch": 0.121125, + "grad_norm": 2.7132527828216553, + "grad_norm_var": 1.8649801572693356, + "learning_rate": 0.0001, + "loss": 1.0912, + "loss/crossentropy": 2.4335010051727295, + "loss/hidden": 0.76953125, + "loss/logits": 0.13391214609146118, + "loss/reg": 0.018776265904307365, + "step": 969 + }, + { + "epoch": 0.12125, + "grad_norm": 2.9154088497161865, + "grad_norm_var": 1.938092020210782, + "learning_rate": 0.0001, + "loss": 1.1314, + "loss/crossentropy": 2.523137331008911, + "loss/hidden": 0.80078125, + "loss/logits": 0.1429774910211563, + "loss/reg": 0.01876768097281456, + "step": 970 + }, + { + "epoch": 0.121375, + "grad_norm": 3.3754584789276123, + "grad_norm_var": 1.8726730225127388, + "learning_rate": 0.0001, + "loss": 1.0747, + "loss/crossentropy": 2.6033432483673096, + "loss/hidden": 0.74609375, + "loss/logits": 0.14106187224388123, + "loss/reg": 0.018759164959192276, + "step": 971 + }, + { + "epoch": 0.1215, + "grad_norm": 2.7394940853118896, + "grad_norm_var": 1.9650935524715956, + "learning_rate": 0.0001, + "loss": 1.0871, + "loss/crossentropy": 2.2976291179656982, + "loss/hidden": 0.76953125, + "loss/logits": 0.1300249695777893, + "loss/reg": 0.01874978095293045, + "step": 972 + }, + { + "epoch": 0.121625, + "grad_norm": 2.9742624759674072, + "grad_norm_var": 1.9749925269591906, + "learning_rate": 0.0001, + "loss": 1.1221, + "loss/crossentropy": 2.7049221992492676, + "loss/hidden": 0.79296875, + "loss/logits": 0.1416921317577362, + "loss/reg": 0.018740687519311905, + "step": 973 + }, + { + "epoch": 0.12175, + "grad_norm": 2.927666187286377, + "grad_norm_var": 1.892721758937742, + "learning_rate": 0.0001, + "loss": 1.0252, + "loss/crossentropy": 2.6492226123809814, + "loss/hidden": 0.71875, + "loss/logits": 0.11909263581037521, + "loss/reg": 0.018731672316789627, + "step": 974 + }, + { + "epoch": 0.121875, + "grad_norm": 16.453693389892578, + "grad_norm_var": 11.523681815422924, + "learning_rate": 0.0001, + "loss": 1.5788, + "loss/crossentropy": 2.375779867172241, + "loss/hidden": 1.171875, + "loss/logits": 0.2197396457195282, + "loss/reg": 0.01872306317090988, + "step": 975 + }, + { + "epoch": 0.122, + "grad_norm": 2.928443193435669, + "grad_norm_var": 11.583339951760102, + "learning_rate": 0.0001, + "loss": 1.0995, + "loss/crossentropy": 2.8136162757873535, + "loss/hidden": 0.77734375, + "loss/logits": 0.13505280017852783, + "loss/reg": 0.018714020028710365, + "step": 976 + }, + { + "epoch": 0.122125, + "grad_norm": 4.540535926818848, + "grad_norm_var": 11.07401646408874, + "learning_rate": 0.0001, + "loss": 1.1806, + "loss/crossentropy": 2.5312135219573975, + "loss/hidden": 0.83984375, + "loss/logits": 0.15369677543640137, + "loss/reg": 0.01870504766702652, + "step": 977 + }, + { + "epoch": 0.12225, + "grad_norm": 2.4433298110961914, + "grad_norm_var": 11.358052675820652, + "learning_rate": 0.0001, + "loss": 1.0167, + "loss/crossentropy": 2.578800678253174, + "loss/hidden": 0.6953125, + "loss/logits": 0.13442841172218323, + "loss/reg": 0.018696293234825134, + "step": 978 + }, + { + "epoch": 0.122375, + "grad_norm": 3.341182231903076, + "grad_norm_var": 11.408522912728658, + "learning_rate": 0.0001, + "loss": 1.1346, + "loss/crossentropy": 2.4482579231262207, + "loss/hidden": 0.8046875, + "loss/logits": 0.14306378364562988, + "loss/reg": 0.018688105046749115, + "step": 979 + }, + { + "epoch": 0.1225, + "grad_norm": 2.433337450027466, + "grad_norm_var": 11.51984045745032, + "learning_rate": 0.0001, + "loss": 0.9764, + "loss/crossentropy": 2.321781873703003, + "loss/hidden": 0.65625, + "loss/logits": 0.13331879675388336, + "loss/reg": 0.018679112195968628, + "step": 980 + }, + { + "epoch": 0.122625, + "grad_norm": 3.6000678539276123, + "grad_norm_var": 11.483219758864427, + "learning_rate": 0.0001, + "loss": 1.1731, + "loss/crossentropy": 2.5233397483825684, + "loss/hidden": 0.8203125, + "loss/logits": 0.16610421240329742, + "loss/reg": 0.018670594319701195, + "step": 981 + }, + { + "epoch": 0.12275, + "grad_norm": 2.8748631477355957, + "grad_norm_var": 11.558394893606714, + "learning_rate": 0.0001, + "loss": 1.0362, + "loss/crossentropy": 2.4155187606811523, + "loss/hidden": 0.7265625, + "loss/logits": 0.12304510176181793, + "loss/reg": 0.018661517649888992, + "step": 982 + }, + { + "epoch": 0.122875, + "grad_norm": 2.2293035984039307, + "grad_norm_var": 11.786185692154314, + "learning_rate": 0.0001, + "loss": 1.0077, + "loss/crossentropy": 2.618363618850708, + "loss/hidden": 0.6875, + "loss/logits": 0.13366106152534485, + "loss/reg": 0.01865258812904358, + "step": 983 + }, + { + "epoch": 0.123, + "grad_norm": 3.615424633026123, + "grad_norm_var": 11.556298836968558, + "learning_rate": 0.0001, + "loss": 1.0544, + "loss/crossentropy": 2.224966526031494, + "loss/hidden": 0.734375, + "loss/logits": 0.1335442066192627, + "loss/reg": 0.01864360086619854, + "step": 984 + }, + { + "epoch": 0.123125, + "grad_norm": 2.93835186958313, + "grad_norm_var": 11.524399601900045, + "learning_rate": 0.0001, + "loss": 1.232, + "loss/crossentropy": 2.348299026489258, + "loss/hidden": 0.87890625, + "loss/logits": 0.1667439341545105, + "loss/reg": 0.018634630367159843, + "step": 985 + }, + { + "epoch": 0.12325, + "grad_norm": 3.2750635147094727, + "grad_norm_var": 11.485476360611186, + "learning_rate": 0.0001, + "loss": 1.0639, + "loss/crossentropy": 2.5712990760803223, + "loss/hidden": 0.75, + "loss/logits": 0.12762659788131714, + "loss/reg": 0.018625380471348763, + "step": 986 + }, + { + "epoch": 0.123375, + "grad_norm": 3.81864595413208, + "grad_norm_var": 11.465683474564775, + "learning_rate": 0.0001, + "loss": 1.1284, + "loss/crossentropy": 2.4274511337280273, + "loss/hidden": 0.7890625, + "loss/logits": 0.15321871638298035, + "loss/reg": 0.01861615665256977, + "step": 987 + }, + { + "epoch": 0.1235, + "grad_norm": 3.2677698135375977, + "grad_norm_var": 11.398153583229371, + "learning_rate": 0.0001, + "loss": 1.1315, + "loss/crossentropy": 2.656604290008545, + "loss/hidden": 0.8046875, + "loss/logits": 0.14076048135757446, + "loss/reg": 0.018606893718242645, + "step": 988 + }, + { + "epoch": 0.123625, + "grad_norm": 3.561713457107544, + "grad_norm_var": 11.341034456038914, + "learning_rate": 0.0001, + "loss": 1.0152, + "loss/crossentropy": 2.6205251216888428, + "loss/hidden": 0.70703125, + "loss/logits": 0.12218683958053589, + "loss/reg": 0.01859763078391552, + "step": 989 + }, + { + "epoch": 0.12375, + "grad_norm": 2.146240472793579, + "grad_norm_var": 11.49254916357394, + "learning_rate": 0.0001, + "loss": 1.0449, + "loss/crossentropy": 2.4804821014404297, + "loss/hidden": 0.71484375, + "loss/logits": 0.14422178268432617, + "loss/reg": 0.018588390201330185, + "step": 990 + }, + { + "epoch": 0.123875, + "grad_norm": 2.6142220497131348, + "grad_norm_var": 0.4215380256098586, + "learning_rate": 0.0001, + "loss": 1.0371, + "loss/crossentropy": 2.3303298950195312, + "loss/hidden": 0.73828125, + "loss/logits": 0.11297546327114105, + "loss/reg": 0.018579507246613503, + "step": 991 + }, + { + "epoch": 0.124, + "grad_norm": 4.17028284072876, + "grad_norm_var": 0.4892223582938262, + "learning_rate": 0.0001, + "loss": 1.6898, + "loss/crossentropy": 2.250098943710327, + "loss/hidden": 1.203125, + "loss/logits": 0.30099910497665405, + "loss/reg": 0.018569782376289368, + "step": 992 + }, + { + "epoch": 0.124125, + "grad_norm": 2.3814339637756348, + "grad_norm_var": 0.3887345955884323, + "learning_rate": 0.0001, + "loss": 1.0837, + "loss/crossentropy": 2.6447057723999023, + "loss/hidden": 0.74609375, + "loss/logits": 0.15198630094528198, + "loss/reg": 0.01855996623635292, + "step": 993 + }, + { + "epoch": 0.12425, + "grad_norm": 4.670334815979004, + "grad_norm_var": 0.5202129226035586, + "learning_rate": 0.0001, + "loss": 1.1908, + "loss/crossentropy": 2.3941245079040527, + "loss/hidden": 0.85546875, + "loss/logits": 0.1498216688632965, + "loss/reg": 0.018550006672739983, + "step": 994 + }, + { + "epoch": 0.124375, + "grad_norm": 2.498332977294922, + "grad_norm_var": 0.5469080049785059, + "learning_rate": 0.0001, + "loss": 0.9226, + "loss/crossentropy": 2.489830255508423, + "loss/hidden": 0.63671875, + "loss/logits": 0.10051175206899643, + "loss/reg": 0.01854090392589569, + "step": 995 + }, + { + "epoch": 0.1245, + "grad_norm": 2.987192392349243, + "grad_norm_var": 0.5145625202891589, + "learning_rate": 0.0001, + "loss": 1.1638, + "loss/crossentropy": 2.094637632369995, + "loss/hidden": 0.8515625, + "loss/logits": 0.12690997123718262, + "loss/reg": 0.018531804904341698, + "step": 996 + }, + { + "epoch": 0.124625, + "grad_norm": 2.738851547241211, + "grad_norm_var": 0.5110263660772335, + "learning_rate": 0.0001, + "loss": 1.1959, + "loss/crossentropy": 2.1406211853027344, + "loss/hidden": 0.84375, + "loss/logits": 0.16688010096549988, + "loss/reg": 0.018522722646594048, + "step": 997 + }, + { + "epoch": 0.12475, + "grad_norm": 5.02874231338501, + "grad_norm_var": 0.732945509426732, + "learning_rate": 0.0001, + "loss": 1.089, + "loss/crossentropy": 2.442840337753296, + "loss/hidden": 0.76171875, + "loss/logits": 0.142162024974823, + "loss/reg": 0.018513953313231468, + "step": 998 + }, + { + "epoch": 0.124875, + "grad_norm": 2.947786331176758, + "grad_norm_var": 0.6677765621166297, + "learning_rate": 0.0001, + "loss": 1.3459, + "loss/crossentropy": 2.483854055404663, + "loss/hidden": 0.96875, + "loss/logits": 0.19213837385177612, + "loss/reg": 0.018505612388253212, + "step": 999 + }, + { + "epoch": 0.125, + "grad_norm": 4.166240692138672, + "grad_norm_var": 0.7105452516630361, + "learning_rate": 0.0001, + "loss": 1.2402, + "loss/crossentropy": 2.454993963241577, + "loss/hidden": 0.87890625, + "loss/logits": 0.17627671360969543, + "loss/reg": 0.018497284501791, + "step": 1000 + }, + { + "epoch": 0.125125, + "grad_norm": 2.7403693199157715, + "grad_norm_var": 0.723220167440765, + "learning_rate": 0.0001, + "loss": 1.0711, + "loss/crossentropy": 2.3623507022857666, + "loss/hidden": 0.75, + "loss/logits": 0.1362101435661316, + "loss/reg": 0.018489044159650803, + "step": 1001 + }, + { + "epoch": 0.12525, + "grad_norm": 4.017945766448975, + "grad_norm_var": 0.7539223188067105, + "learning_rate": 0.0001, + "loss": 1.2595, + "loss/crossentropy": 2.199939727783203, + "loss/hidden": 0.93359375, + "loss/logits": 0.14108574390411377, + "loss/reg": 0.018480053171515465, + "step": 1002 + }, + { + "epoch": 0.125375, + "grad_norm": 3.8252949714660645, + "grad_norm_var": 0.7543319037149537, + "learning_rate": 0.0001, + "loss": 1.2766, + "loss/crossentropy": 2.4151408672332764, + "loss/hidden": 0.91796875, + "loss/logits": 0.1738898754119873, + "loss/reg": 0.018471699208021164, + "step": 1003 + }, + { + "epoch": 0.1255, + "grad_norm": 3.2934703826904297, + "grad_norm_var": 0.754056547294514, + "learning_rate": 0.0001, + "loss": 1.1773, + "loss/crossentropy": 2.4902610778808594, + "loss/hidden": 0.83984375, + "loss/logits": 0.1528603434562683, + "loss/reg": 0.018463551998138428, + "step": 1004 + }, + { + "epoch": 0.125625, + "grad_norm": 2.8537371158599854, + "grad_norm_var": 0.7665102142099857, + "learning_rate": 0.0001, + "loss": 1.0491, + "loss/crossentropy": 2.5475528240203857, + "loss/hidden": 0.734375, + "loss/logits": 0.1301451176404953, + "loss/reg": 0.01845443621277809, + "step": 1005 + }, + { + "epoch": 0.12575, + "grad_norm": 10.271280288696289, + "grad_norm_var": 3.623624147504192, + "learning_rate": 0.0001, + "loss": 1.9301, + "loss/crossentropy": 2.789032459259033, + "loss/hidden": 1.359375, + "loss/logits": 0.386294960975647, + "loss/reg": 0.018445348367094994, + "step": 1006 + }, + { + "epoch": 0.125875, + "grad_norm": 3.5450735092163086, + "grad_norm_var": 3.5274627001684156, + "learning_rate": 0.0001, + "loss": 1.0646, + "loss/crossentropy": 2.7634024620056152, + "loss/hidden": 0.73828125, + "loss/logits": 0.1419609636068344, + "loss/reg": 0.01843627728521824, + "step": 1007 + }, + { + "epoch": 0.126, + "grad_norm": 3.8805432319641113, + "grad_norm_var": 3.521631426981449, + "learning_rate": 0.0001, + "loss": 1.2255, + "loss/crossentropy": 2.313877820968628, + "loss/hidden": 0.859375, + "loss/logits": 0.18188925087451935, + "loss/reg": 0.018427575007081032, + "step": 1008 + }, + { + "epoch": 0.126125, + "grad_norm": 3.56695556640625, + "grad_norm_var": 3.3749006612486068, + "learning_rate": 0.0001, + "loss": 1.1632, + "loss/crossentropy": 3.036876678466797, + "loss/hidden": 0.8125, + "loss/logits": 0.1664760708808899, + "loss/reg": 0.018418410792946815, + "step": 1009 + }, + { + "epoch": 0.12625, + "grad_norm": 3.4208462238311768, + "grad_norm_var": 3.350722625996883, + "learning_rate": 0.0001, + "loss": 1.3539, + "loss/crossentropy": 2.335106372833252, + "loss/hidden": 1.0, + "loss/logits": 0.16979742050170898, + "loss/reg": 0.018409088253974915, + "step": 1010 + }, + { + "epoch": 0.126375, + "grad_norm": 4.440703392028809, + "grad_norm_var": 3.2335077439479125, + "learning_rate": 0.0001, + "loss": 1.5031, + "loss/crossentropy": 1.8046929836273193, + "loss/hidden": 1.0703125, + "loss/logits": 0.2487386018037796, + "loss/reg": 0.018400251865386963, + "step": 1011 + }, + { + "epoch": 0.1265, + "grad_norm": 3.6042182445526123, + "grad_norm_var": 3.175392851042456, + "learning_rate": 0.0001, + "loss": 1.0341, + "loss/crossentropy": 2.732485294342041, + "loss/hidden": 0.70703125, + "loss/logits": 0.14311768114566803, + "loss/reg": 0.018391618505120277, + "step": 1012 + }, + { + "epoch": 0.126625, + "grad_norm": 3.0008747577667236, + "grad_norm_var": 3.1348769442621385, + "learning_rate": 0.0001, + "loss": 1.2818, + "loss/crossentropy": 2.4243671894073486, + "loss/hidden": 0.8984375, + "loss/logits": 0.19954946637153625, + "loss/reg": 0.01838279701769352, + "step": 1013 + }, + { + "epoch": 0.12675, + "grad_norm": 3.548919677734375, + "grad_norm_var": 3.076212765414932, + "learning_rate": 0.0001, + "loss": 1.0597, + "loss/crossentropy": 2.646744966506958, + "loss/hidden": 0.7578125, + "loss/logits": 0.11818855255842209, + "loss/reg": 0.018373781815171242, + "step": 1014 + }, + { + "epoch": 0.126875, + "grad_norm": 3.4126574993133545, + "grad_norm_var": 3.027892721971917, + "learning_rate": 0.0001, + "loss": 1.2446, + "loss/crossentropy": 2.2376763820648193, + "loss/hidden": 0.90234375, + "loss/logits": 0.15857847034931183, + "loss/reg": 0.018364954739809036, + "step": 1015 + }, + { + "epoch": 0.127, + "grad_norm": 2.8473548889160156, + "grad_norm_var": 3.1028595438739974, + "learning_rate": 0.0001, + "loss": 1.171, + "loss/crossentropy": 2.4969823360443115, + "loss/hidden": 0.80859375, + "loss/logits": 0.17886783182621002, + "loss/reg": 0.01835593394935131, + "step": 1016 + }, + { + "epoch": 0.127125, + "grad_norm": 3.4492313861846924, + "grad_norm_var": 3.0254289441294913, + "learning_rate": 0.0001, + "loss": 1.1521, + "loss/crossentropy": 2.6258046627044678, + "loss/hidden": 0.828125, + "loss/logits": 0.14048799872398376, + "loss/reg": 0.018347129225730896, + "step": 1017 + }, + { + "epoch": 0.12725, + "grad_norm": 2.7578630447387695, + "grad_norm_var": 3.1109318052612775, + "learning_rate": 0.0001, + "loss": 1.1909, + "loss/crossentropy": 2.5233936309814453, + "loss/hidden": 0.84375, + "loss/logits": 0.16377520561218262, + "loss/reg": 0.018338393419981003, + "step": 1018 + }, + { + "epoch": 0.127375, + "grad_norm": 4.195294380187988, + "grad_norm_var": 3.117902257815278, + "learning_rate": 0.0001, + "loss": 1.3224, + "loss/crossentropy": 1.8772588968276978, + "loss/hidden": 0.984375, + "loss/logits": 0.1546936333179474, + "loss/reg": 0.01832934282720089, + "step": 1019 + }, + { + "epoch": 0.1275, + "grad_norm": 3.7895658016204834, + "grad_norm_var": 3.0944502488485717, + "learning_rate": 0.0001, + "loss": 1.1973, + "loss/crossentropy": 2.3982863426208496, + "loss/hidden": 0.84375, + "loss/logits": 0.17036345601081848, + "loss/reg": 0.01832025870680809, + "step": 1020 + }, + { + "epoch": 0.127625, + "grad_norm": 6.021730422973633, + "grad_norm_var": 3.274883958363541, + "learning_rate": 0.0001, + "loss": 1.5031, + "loss/crossentropy": 2.428899049758911, + "loss/hidden": 1.109375, + "loss/logits": 0.21065470576286316, + "loss/reg": 0.01831124909222126, + "step": 1021 + }, + { + "epoch": 0.12775, + "grad_norm": 4.54056453704834, + "grad_norm_var": 0.6193178360798726, + "learning_rate": 0.0001, + "loss": 1.6077, + "loss/crossentropy": 2.3732786178588867, + "loss/hidden": 1.1953125, + "loss/logits": 0.22934843599796295, + "loss/reg": 0.01830223761498928, + "step": 1022 + }, + { + "epoch": 0.127875, + "grad_norm": 4.612347602844238, + "grad_norm_var": 0.6611490686092556, + "learning_rate": 0.0001, + "loss": 1.1245, + "loss/crossentropy": 2.393007516860962, + "loss/hidden": 0.8046875, + "loss/logits": 0.13689909875392914, + "loss/reg": 0.0182929839938879, + "step": 1023 + }, + { + "epoch": 0.128, + "grad_norm": 4.8808794021606445, + "grad_norm_var": 0.7320190710671227, + "learning_rate": 0.0001, + "loss": 1.6498, + "loss/crossentropy": 1.8295843601226807, + "loss/hidden": 1.265625, + "loss/logits": 0.2013380229473114, + "loss/reg": 0.01828295737504959, + "step": 1024 + }, + { + "epoch": 0.128125, + "grad_norm": 6.514127731323242, + "grad_norm_var": 1.1516245124796, + "learning_rate": 0.0001, + "loss": 1.4269, + "loss/crossentropy": 2.6622331142425537, + "loss/hidden": 1.0390625, + "loss/logits": 0.20514041185379028, + "loss/reg": 0.018273649737238884, + "step": 1025 + }, + { + "epoch": 0.12825, + "grad_norm": 3.6572396755218506, + "grad_norm_var": 1.1348195216000618, + "learning_rate": 0.0001, + "loss": 1.2673, + "loss/crossentropy": 2.410374164581299, + "loss/hidden": 0.9140625, + "loss/logits": 0.17059262096881866, + "loss/reg": 0.018263790756464005, + "step": 1026 + }, + { + "epoch": 0.128375, + "grad_norm": 2.8987033367156982, + "grad_norm_var": 1.209186568114529, + "learning_rate": 0.0001, + "loss": 1.2157, + "loss/crossentropy": 2.236950397491455, + "loss/hidden": 0.8828125, + "loss/logits": 0.15036781132221222, + "loss/reg": 0.018253570422530174, + "step": 1027 + }, + { + "epoch": 0.1285, + "grad_norm": 3.276984453201294, + "grad_norm_var": 1.2324156239644137, + "learning_rate": 0.0001, + "loss": 1.2494, + "loss/crossentropy": 2.683966636657715, + "loss/hidden": 0.90234375, + "loss/logits": 0.16460160911083221, + "loss/reg": 0.0182446651160717, + "step": 1028 + }, + { + "epoch": 0.128625, + "grad_norm": 5.419056415557861, + "grad_norm_var": 1.2877520831133287, + "learning_rate": 0.0001, + "loss": 1.4861, + "loss/crossentropy": 2.306896209716797, + "loss/hidden": 1.1015625, + "loss/logits": 0.20220112800598145, + "loss/reg": 0.018234653398394585, + "step": 1029 + }, + { + "epoch": 0.12875, + "grad_norm": 3.4227371215820312, + "grad_norm_var": 1.298252758406007, + "learning_rate": 0.0001, + "loss": 1.1322, + "loss/crossentropy": 2.6958277225494385, + "loss/hidden": 0.78125, + "loss/logits": 0.16871951520442963, + "loss/reg": 0.018225453794002533, + "step": 1030 + }, + { + "epoch": 0.128875, + "grad_norm": 2.5001378059387207, + "grad_norm_var": 1.4346570797964828, + "learning_rate": 0.0001, + "loss": 1.0835, + "loss/crossentropy": 2.743760824203491, + "loss/hidden": 0.75390625, + "loss/logits": 0.14742383360862732, + "loss/reg": 0.01821640320122242, + "step": 1031 + }, + { + "epoch": 0.129, + "grad_norm": 3.422490119934082, + "grad_norm_var": 1.3631839436174078, + "learning_rate": 0.0001, + "loss": 1.213, + "loss/crossentropy": 2.6142783164978027, + "loss/hidden": 0.8671875, + "loss/logits": 0.1637578308582306, + "loss/reg": 0.01820731721818447, + "step": 1032 + }, + { + "epoch": 0.129125, + "grad_norm": 3.200204849243164, + "grad_norm_var": 1.3881674273527314, + "learning_rate": 0.0001, + "loss": 1.28, + "loss/crossentropy": 2.4325146675109863, + "loss/hidden": 0.91796875, + "loss/logits": 0.18004098534584045, + "loss/reg": 0.018198398873209953, + "step": 1033 + }, + { + "epoch": 0.12925, + "grad_norm": 3.3802099227905273, + "grad_norm_var": 1.3035463186707592, + "learning_rate": 0.0001, + "loss": 1.1984, + "loss/crossentropy": 2.424464225769043, + "loss/hidden": 0.8671875, + "loss/logits": 0.14932216703891754, + "loss/reg": 0.018189454451203346, + "step": 1034 + }, + { + "epoch": 0.129375, + "grad_norm": 3.2020263671875, + "grad_norm_var": 1.3536821307745555, + "learning_rate": 0.0001, + "loss": 1.085, + "loss/crossentropy": 2.539475440979004, + "loss/hidden": 0.77734375, + "loss/logits": 0.1258580982685089, + "loss/reg": 0.018180513754487038, + "step": 1035 + }, + { + "epoch": 0.1295, + "grad_norm": 3.5899553298950195, + "grad_norm_var": 1.3630023284114352, + "learning_rate": 0.0001, + "loss": 1.1137, + "loss/crossentropy": 2.277602434158325, + "loss/hidden": 0.78125, + "loss/logits": 0.15075945854187012, + "loss/reg": 0.018171606585383415, + "step": 1036 + }, + { + "epoch": 0.129625, + "grad_norm": 3.1815500259399414, + "grad_norm_var": 1.1143223174368395, + "learning_rate": 0.0001, + "loss": 1.2827, + "loss/crossentropy": 2.5251145362854004, + "loss/hidden": 0.91015625, + "loss/logits": 0.19092242419719696, + "loss/reg": 0.018162967637181282, + "step": 1037 + }, + { + "epoch": 0.12975, + "grad_norm": 3.6182315349578857, + "grad_norm_var": 1.083329466089586, + "learning_rate": 0.0001, + "loss": 1.5649, + "loss/crossentropy": 2.0126724243164062, + "loss/hidden": 1.1796875, + "loss/logits": 0.20370075106620789, + "loss/reg": 0.018154380843043327, + "step": 1038 + }, + { + "epoch": 0.129875, + "grad_norm": 3.239778518676758, + "grad_norm_var": 1.0521445613054454, + "learning_rate": 0.0001, + "loss": 1.1521, + "loss/crossentropy": 2.286173105239868, + "loss/hidden": 0.8203125, + "loss/logits": 0.1503629982471466, + "loss/reg": 0.01814563386142254, + "step": 1039 + }, + { + "epoch": 0.13, + "grad_norm": 5.980926990509033, + "grad_norm_var": 1.2991062966869322, + "learning_rate": 0.0001, + "loss": 1.4125, + "loss/crossentropy": 2.5735340118408203, + "loss/hidden": 0.87109375, + "loss/logits": 0.36005836725234985, + "loss/reg": 0.018136821687221527, + "step": 1040 + }, + { + "epoch": 0.130125, + "grad_norm": 3.8425281047821045, + "grad_norm_var": 0.7718063043351868, + "learning_rate": 0.0001, + "loss": 1.3336, + "loss/crossentropy": 2.002030372619629, + "loss/hidden": 0.9921875, + "loss/logits": 0.16016829013824463, + "loss/reg": 0.01812821812927723, + "step": 1041 + }, + { + "epoch": 0.13025, + "grad_norm": 2.6303024291992188, + "grad_norm_var": 0.8318731912874341, + "learning_rate": 0.0001, + "loss": 1.0674, + "loss/crossentropy": 2.5561156272888184, + "loss/hidden": 0.75390625, + "loss/logits": 0.13234438002109528, + "loss/reg": 0.01811978593468666, + "step": 1042 + }, + { + "epoch": 0.130375, + "grad_norm": 4.081605911254883, + "grad_norm_var": 0.816546710723537, + "learning_rate": 0.0001, + "loss": 1.1508, + "loss/crossentropy": 2.5789449214935303, + "loss/hidden": 0.83203125, + "loss/logits": 0.13762518763542175, + "loss/reg": 0.01811092346906662, + "step": 1043 + }, + { + "epoch": 0.1305, + "grad_norm": 3.746345281600952, + "grad_norm_var": 0.808580216385354, + "learning_rate": 0.0001, + "loss": 1.0868, + "loss/crossentropy": 2.2884302139282227, + "loss/hidden": 0.7578125, + "loss/logits": 0.14801663160324097, + "loss/reg": 0.018101971596479416, + "step": 1044 + }, + { + "epoch": 0.130625, + "grad_norm": 3.1717352867126465, + "grad_norm_var": 0.5952362637338294, + "learning_rate": 0.0001, + "loss": 1.0943, + "loss/crossentropy": 2.4526915550231934, + "loss/hidden": 0.76953125, + "loss/logits": 0.1438664197921753, + "loss/reg": 0.018093010410666466, + "step": 1045 + }, + { + "epoch": 0.13075, + "grad_norm": 5.281366348266602, + "grad_norm_var": 0.7887311446436568, + "learning_rate": 0.0001, + "loss": 1.1712, + "loss/crossentropy": 2.400421142578125, + "loss/hidden": 0.8359375, + "loss/logits": 0.1543923020362854, + "loss/reg": 0.01808418706059456, + "step": 1046 + }, + { + "epoch": 0.130875, + "grad_norm": 5.1911540031433105, + "grad_norm_var": 0.8361699826228097, + "learning_rate": 0.0001, + "loss": 1.2646, + "loss/crossentropy": 2.6032376289367676, + "loss/hidden": 0.91796875, + "loss/logits": 0.16584512591362, + "loss/reg": 0.01807507872581482, + "step": 1047 + }, + { + "epoch": 0.131, + "grad_norm": 2.902432680130005, + "grad_norm_var": 0.8790790548012068, + "learning_rate": 0.0001, + "loss": 1.2484, + "loss/crossentropy": 2.4166929721832275, + "loss/hidden": 0.90234375, + "loss/logits": 0.1653715819120407, + "loss/reg": 0.018065867945551872, + "step": 1048 + }, + { + "epoch": 0.131125, + "grad_norm": 3.408224582672119, + "grad_norm_var": 0.8661178167301301, + "learning_rate": 0.0001, + "loss": 1.1045, + "loss/crossentropy": 2.5960233211517334, + "loss/hidden": 0.78515625, + "loss/logits": 0.13876128196716309, + "loss/reg": 0.01805703714489937, + "step": 1049 + }, + { + "epoch": 0.13125, + "grad_norm": 2.8764824867248535, + "grad_norm_var": 0.9086952536899503, + "learning_rate": 0.0001, + "loss": 1.2047, + "loss/crossentropy": 2.280080556869507, + "loss/hidden": 0.875, + "loss/logits": 0.1491965353488922, + "loss/reg": 0.018048083409667015, + "step": 1050 + }, + { + "epoch": 0.131375, + "grad_norm": 4.48457670211792, + "grad_norm_var": 0.9183881653108216, + "learning_rate": 0.0001, + "loss": 1.0941, + "loss/crossentropy": 2.5797057151794434, + "loss/hidden": 0.7421875, + "loss/logits": 0.17152699828147888, + "loss/reg": 0.018038896843791008, + "step": 1051 + }, + { + "epoch": 0.1315, + "grad_norm": 3.204270839691162, + "grad_norm_var": 0.9398596856778226, + "learning_rate": 0.0001, + "loss": 1.423, + "loss/crossentropy": 2.1369965076446533, + "loss/hidden": 1.0625, + "loss/logits": 0.18023288249969482, + "loss/reg": 0.018029624596238136, + "step": 1052 + }, + { + "epoch": 0.131625, + "grad_norm": 3.225297212600708, + "grad_norm_var": 0.9363567728280238, + "learning_rate": 0.0001, + "loss": 1.1299, + "loss/crossentropy": 2.5327794551849365, + "loss/hidden": 0.79296875, + "loss/logits": 0.1567188799381256, + "loss/reg": 0.018020475283265114, + "step": 1053 + }, + { + "epoch": 0.13175, + "grad_norm": 2.6159298419952393, + "grad_norm_var": 1.0241485205327874, + "learning_rate": 0.0001, + "loss": 1.0505, + "loss/crossentropy": 2.5744223594665527, + "loss/hidden": 0.73828125, + "loss/logits": 0.1320829838514328, + "loss/reg": 0.018011758103966713, + "step": 1054 + }, + { + "epoch": 0.131875, + "grad_norm": 4.383845806121826, + "grad_norm_var": 1.0292396555670902, + "learning_rate": 0.0001, + "loss": 1.3117, + "loss/crossentropy": 2.4903180599212646, + "loss/hidden": 0.90625, + "loss/logits": 0.22538328170776367, + "loss/reg": 0.0180036798119545, + "step": 1055 + }, + { + "epoch": 0.132, + "grad_norm": 2.4342551231384277, + "grad_norm_var": 0.7907919306719045, + "learning_rate": 0.0001, + "loss": 1.0574, + "loss/crossentropy": 2.3163890838623047, + "loss/hidden": 0.75, + "loss/logits": 0.12748895585536957, + "loss/reg": 0.01799560710787773, + "step": 1056 + }, + { + "epoch": 0.132125, + "grad_norm": 3.542881727218628, + "grad_norm_var": 0.7864152227556851, + "learning_rate": 0.0001, + "loss": 1.294, + "loss/crossentropy": 2.3338446617126465, + "loss/hidden": 0.93359375, + "loss/logits": 0.18051280081272125, + "loss/reg": 0.017986783757805824, + "step": 1057 + }, + { + "epoch": 0.13225, + "grad_norm": 4.0569915771484375, + "grad_norm_var": 0.7341544247946139, + "learning_rate": 0.0001, + "loss": 1.3818, + "loss/crossentropy": 2.330078363418579, + "loss/hidden": 1.015625, + "loss/logits": 0.18636882305145264, + "loss/reg": 0.017977885901927948, + "step": 1058 + }, + { + "epoch": 0.132375, + "grad_norm": 3.663426637649536, + "grad_norm_var": 0.7217416281732681, + "learning_rate": 0.0001, + "loss": 1.1508, + "loss/crossentropy": 2.5138370990753174, + "loss/hidden": 0.796875, + "loss/logits": 0.1742265522480011, + "loss/reg": 0.017969388514757156, + "step": 1059 + }, + { + "epoch": 0.1325, + "grad_norm": 2.9652159214019775, + "grad_norm_var": 0.7484703245510673, + "learning_rate": 0.0001, + "loss": 1.1313, + "loss/crossentropy": 2.2722136974334717, + "loss/hidden": 0.80859375, + "loss/logits": 0.1430921107530594, + "loss/reg": 0.0179609302431345, + "step": 1060 + }, + { + "epoch": 0.132625, + "grad_norm": 5.577633857727051, + "grad_norm_var": 0.9767082401115801, + "learning_rate": 0.0001, + "loss": 1.7627, + "loss/crossentropy": 2.562012195587158, + "loss/hidden": 1.390625, + "loss/logits": 0.19251598417758942, + "loss/reg": 0.01795242354273796, + "step": 1061 + }, + { + "epoch": 0.13275, + "grad_norm": 5.364041328430176, + "grad_norm_var": 0.9941443511093354, + "learning_rate": 0.0001, + "loss": 1.5266, + "loss/crossentropy": 2.507934808731079, + "loss/hidden": 1.125, + "loss/logits": 0.22217199206352234, + "loss/reg": 0.017944158986210823, + "step": 1062 + }, + { + "epoch": 0.132875, + "grad_norm": 6.644399642944336, + "grad_norm_var": 1.4066377839550304, + "learning_rate": 0.0001, + "loss": 1.5347, + "loss/crossentropy": 2.4112086296081543, + "loss/hidden": 1.109375, + "loss/logits": 0.24593724310398102, + "loss/reg": 0.017935609444975853, + "step": 1063 + }, + { + "epoch": 0.133, + "grad_norm": 2.846679210662842, + "grad_norm_var": 1.4137598873748003, + "learning_rate": 0.0001, + "loss": 1.1476, + "loss/crossentropy": 2.2182581424713135, + "loss/hidden": 0.81640625, + "loss/logits": 0.1519462913274765, + "loss/reg": 0.01792830042541027, + "step": 1064 + }, + { + "epoch": 0.133125, + "grad_norm": 6.004873752593994, + "grad_norm_var": 1.688838288773405, + "learning_rate": 0.0001, + "loss": 1.6923, + "loss/crossentropy": 2.5660815238952637, + "loss/hidden": 1.2578125, + "loss/logits": 0.25529351830482483, + "loss/reg": 0.01792137697339058, + "step": 1065 + }, + { + "epoch": 0.13325, + "grad_norm": 2.905362844467163, + "grad_norm_var": 1.684590354160513, + "learning_rate": 0.0001, + "loss": 1.1266, + "loss/crossentropy": 2.679647207260132, + "loss/hidden": 0.78125, + "loss/logits": 0.16615256667137146, + "loss/reg": 0.017914744094014168, + "step": 1066 + }, + { + "epoch": 0.133375, + "grad_norm": 2.774467945098877, + "grad_norm_var": 1.7557347328903679, + "learning_rate": 0.0001, + "loss": 0.9917, + "loss/crossentropy": 2.4071364402770996, + "loss/hidden": 0.69921875, + "loss/logits": 0.11337558180093765, + "loss/reg": 0.017907986417412758, + "step": 1067 + }, + { + "epoch": 0.1335, + "grad_norm": 3.0080766677856445, + "grad_norm_var": 1.7760288881986261, + "learning_rate": 0.0001, + "loss": 1.1718, + "loss/crossentropy": 2.4299240112304688, + "loss/hidden": 0.83984375, + "loss/logits": 0.15295377373695374, + "loss/reg": 0.01789918728172779, + "step": 1068 + }, + { + "epoch": 0.133625, + "grad_norm": 3.646827220916748, + "grad_norm_var": 1.750571466335814, + "learning_rate": 0.0001, + "loss": 1.2924, + "loss/crossentropy": 2.461158514022827, + "loss/hidden": 0.93359375, + "loss/logits": 0.17990370094776154, + "loss/reg": 0.01789136230945587, + "step": 1069 + }, + { + "epoch": 0.13375, + "grad_norm": 4.164073944091797, + "grad_norm_var": 1.6348612297712255, + "learning_rate": 0.0001, + "loss": 1.171, + "loss/crossentropy": 2.4830503463745117, + "loss/hidden": 0.82421875, + "loss/logits": 0.16796636581420898, + "loss/reg": 0.017882652580738068, + "step": 1070 + }, + { + "epoch": 0.133875, + "grad_norm": 3.7383811473846436, + "grad_norm_var": 1.6277745939444626, + "learning_rate": 0.0001, + "loss": 1.2749, + "loss/crossentropy": 2.3813910484313965, + "loss/hidden": 0.94921875, + "loss/logits": 0.14690269529819489, + "loss/reg": 0.017873771488666534, + "step": 1071 + }, + { + "epoch": 0.134, + "grad_norm": 4.526171684265137, + "grad_norm_var": 1.476108335704568, + "learning_rate": 0.0001, + "loss": 1.6415, + "loss/crossentropy": 2.1495041847229004, + "loss/hidden": 1.234375, + "loss/logits": 0.22847777605056763, + "loss/reg": 0.017865996807813644, + "step": 1072 + }, + { + "epoch": 0.134125, + "grad_norm": 3.382908582687378, + "grad_norm_var": 1.4893637052056738, + "learning_rate": 0.0001, + "loss": 1.0784, + "loss/crossentropy": 2.468477725982666, + "loss/hidden": 0.765625, + "loss/logits": 0.13421444594860077, + "loss/reg": 0.01785840094089508, + "step": 1073 + }, + { + "epoch": 0.13425, + "grad_norm": 5.163125991821289, + "grad_norm_var": 1.5625376434966463, + "learning_rate": 0.0001, + "loss": 1.2499, + "loss/crossentropy": 2.2978320121765137, + "loss/hidden": 0.91796875, + "loss/logits": 0.1534654200077057, + "loss/reg": 0.01785091683268547, + "step": 1074 + }, + { + "epoch": 0.134375, + "grad_norm": 3.1810903549194336, + "grad_norm_var": 1.6082726182831408, + "learning_rate": 0.0001, + "loss": 1.0638, + "loss/crossentropy": 2.5080108642578125, + "loss/hidden": 0.7578125, + "loss/logits": 0.1275557279586792, + "loss/reg": 0.017842529341578484, + "step": 1075 + }, + { + "epoch": 0.1345, + "grad_norm": 2.6275007724761963, + "grad_norm_var": 1.6673241917556567, + "learning_rate": 0.0001, + "loss": 1.023, + "loss/crossentropy": 2.725398302078247, + "loss/hidden": 0.7265625, + "loss/logits": 0.11811243742704391, + "loss/reg": 0.01783500798046589, + "step": 1076 + }, + { + "epoch": 0.134625, + "grad_norm": 2.4731173515319824, + "grad_norm_var": 1.656907168261868, + "learning_rate": 0.0001, + "loss": 1.1398, + "loss/crossentropy": 2.2775444984436035, + "loss/hidden": 0.8125, + "loss/logits": 0.14902615547180176, + "loss/reg": 0.01782614178955555, + "step": 1077 + }, + { + "epoch": 0.13475, + "grad_norm": 2.6200878620147705, + "grad_norm_var": 1.593020801522063, + "learning_rate": 0.0001, + "loss": 1.0744, + "loss/crossentropy": 2.6091089248657227, + "loss/hidden": 0.74609375, + "loss/logits": 0.1501138061285019, + "loss/reg": 0.017817262560129166, + "step": 1078 + }, + { + "epoch": 0.134875, + "grad_norm": 2.637160062789917, + "grad_norm_var": 1.040390657280697, + "learning_rate": 0.0001, + "loss": 1.0533, + "loss/crossentropy": 2.6109707355499268, + "loss/hidden": 0.734375, + "loss/logits": 0.1408311128616333, + "loss/reg": 0.01780843175947666, + "step": 1079 + }, + { + "epoch": 0.135, + "grad_norm": 2.4874284267425537, + "grad_norm_var": 1.0788527015533946, + "learning_rate": 0.0001, + "loss": 1.0129, + "loss/crossentropy": 2.4900012016296387, + "loss/hidden": 0.71484375, + "loss/logits": 0.12004341185092926, + "loss/reg": 0.017799846827983856, + "step": 1080 + }, + { + "epoch": 0.135125, + "grad_norm": 2.8923330307006836, + "grad_norm_var": 0.6277088581300307, + "learning_rate": 0.0001, + "loss": 1.0085, + "loss/crossentropy": 2.877713441848755, + "loss/hidden": 0.69140625, + "loss/logits": 0.1391412615776062, + "loss/reg": 0.017792070284485817, + "step": 1081 + }, + { + "epoch": 0.13525, + "grad_norm": 3.0681328773498535, + "grad_norm_var": 0.6215757739924422, + "learning_rate": 0.0001, + "loss": 1.1841, + "loss/crossentropy": 2.4870967864990234, + "loss/hidden": 0.83984375, + "loss/logits": 0.16639447212219238, + "loss/reg": 0.017783144488930702, + "step": 1082 + }, + { + "epoch": 0.135375, + "grad_norm": 2.943787097930908, + "grad_norm_var": 0.6120804925495794, + "learning_rate": 0.0001, + "loss": 0.9977, + "loss/crossentropy": 2.4002597332000732, + "loss/hidden": 0.69921875, + "loss/logits": 0.12076494097709656, + "loss/reg": 0.017775090411305428, + "step": 1083 + }, + { + "epoch": 0.1355, + "grad_norm": 3.1173973083496094, + "grad_norm_var": 0.6087907870581594, + "learning_rate": 0.0001, + "loss": 1.1729, + "loss/crossentropy": 2.6328957080841064, + "loss/hidden": 0.85546875, + "loss/logits": 0.13975805044174194, + "loss/reg": 0.017766445875167847, + "step": 1084 + }, + { + "epoch": 0.135625, + "grad_norm": 4.231754779815674, + "grad_norm_var": 0.6578597190419477, + "learning_rate": 0.0001, + "loss": 1.5044, + "loss/crossentropy": 2.6625559329986572, + "loss/hidden": 1.109375, + "loss/logits": 0.2174825519323349, + "loss/reg": 0.01775727979838848, + "step": 1085 + }, + { + "epoch": 0.13575, + "grad_norm": 3.1539108753204346, + "grad_norm_var": 0.6090813956553401, + "learning_rate": 0.0001, + "loss": 1.0553, + "loss/crossentropy": 2.685743808746338, + "loss/hidden": 0.74609375, + "loss/logits": 0.1317606419324875, + "loss/reg": 0.017748642712831497, + "step": 1086 + }, + { + "epoch": 0.135875, + "grad_norm": 3.386591672897339, + "grad_norm_var": 0.5946246391748462, + "learning_rate": 0.0001, + "loss": 1.4794, + "loss/crossentropy": 2.3431785106658936, + "loss/hidden": 1.0625, + "loss/logits": 0.23952066898345947, + "loss/reg": 0.01773969829082489, + "step": 1087 + }, + { + "epoch": 0.136, + "grad_norm": 2.8175418376922607, + "grad_norm_var": 0.4848234667031941, + "learning_rate": 0.0001, + "loss": 1.0416, + "loss/crossentropy": 2.676400899887085, + "loss/hidden": 0.73046875, + "loss/logits": 0.13378942012786865, + "loss/reg": 0.01773088052868843, + "step": 1088 + }, + { + "epoch": 0.136125, + "grad_norm": 3.5386815071105957, + "grad_norm_var": 0.4914580502239192, + "learning_rate": 0.0001, + "loss": 1.1391, + "loss/crossentropy": 2.6673851013183594, + "loss/hidden": 0.81640625, + "loss/logits": 0.14545656740665436, + "loss/reg": 0.01772209256887436, + "step": 1089 + }, + { + "epoch": 0.13625, + "grad_norm": 2.944378614425659, + "grad_norm_var": 0.2024704804136301, + "learning_rate": 0.0001, + "loss": 1.0518, + "loss/crossentropy": 2.3069140911102295, + "loss/hidden": 0.7578125, + "loss/logits": 0.11681585013866425, + "loss/reg": 0.017712950706481934, + "step": 1090 + }, + { + "epoch": 0.136375, + "grad_norm": 2.4177603721618652, + "grad_norm_var": 0.221225648364459, + "learning_rate": 0.0001, + "loss": 1.0819, + "loss/crossentropy": 2.5605850219726562, + "loss/hidden": 0.75390625, + "loss/logits": 0.15096938610076904, + "loss/reg": 0.01770433411002159, + "step": 1091 + }, + { + "epoch": 0.1365, + "grad_norm": 2.8017842769622803, + "grad_norm_var": 0.2154010561648003, + "learning_rate": 0.0001, + "loss": 1.0446, + "loss/crossentropy": 2.837974786758423, + "loss/hidden": 0.7421875, + "loss/logits": 0.12545213103294373, + "loss/reg": 0.017695914953947067, + "step": 1092 + }, + { + "epoch": 0.136625, + "grad_norm": 2.30322265625, + "grad_norm_var": 0.2284775401570935, + "learning_rate": 0.0001, + "loss": 1.0164, + "loss/crossentropy": 2.645153760910034, + "loss/hidden": 0.7109375, + "loss/logits": 0.1285744607448578, + "loss/reg": 0.017688019201159477, + "step": 1093 + }, + { + "epoch": 0.13675, + "grad_norm": 3.5715980529785156, + "grad_norm_var": 0.24192379822145516, + "learning_rate": 0.0001, + "loss": 1.1766, + "loss/crossentropy": 2.2281081676483154, + "loss/hidden": 0.8359375, + "loss/logits": 0.16386280953884125, + "loss/reg": 0.017679255455732346, + "step": 1094 + }, + { + "epoch": 0.136875, + "grad_norm": 3.5261693000793457, + "grad_norm_var": 0.24598854725778285, + "learning_rate": 0.0001, + "loss": 1.4345, + "loss/crossentropy": 2.311053514480591, + "loss/hidden": 1.03125, + "loss/logits": 0.22651731967926025, + "loss/reg": 0.017671290785074234, + "step": 1095 + }, + { + "epoch": 0.137, + "grad_norm": 4.736113548278809, + "grad_norm_var": 0.3858102993473473, + "learning_rate": 0.0001, + "loss": 1.1132, + "loss/crossentropy": 2.483963966369629, + "loss/hidden": 0.8046875, + "loss/logits": 0.1318715214729309, + "loss/reg": 0.01766252890229225, + "step": 1096 + }, + { + "epoch": 0.137125, + "grad_norm": 3.013045310974121, + "grad_norm_var": 0.3815164758052994, + "learning_rate": 0.0001, + "loss": 1.0531, + "loss/crossentropy": 2.3578484058380127, + "loss/hidden": 0.75390625, + "loss/logits": 0.12263330817222595, + "loss/reg": 0.017653891816735268, + "step": 1097 + }, + { + "epoch": 0.13725, + "grad_norm": 3.655559539794922, + "grad_norm_var": 0.3909346674988434, + "learning_rate": 0.0001, + "loss": 1.0558, + "loss/crossentropy": 2.7702715396881104, + "loss/hidden": 0.7421875, + "loss/logits": 0.13715983927249908, + "loss/reg": 0.01764553412795067, + "step": 1098 + }, + { + "epoch": 0.137375, + "grad_norm": 4.493204593658447, + "grad_norm_var": 0.47566105167650385, + "learning_rate": 0.0001, + "loss": 1.1521, + "loss/crossentropy": 3.274940013885498, + "loss/hidden": 0.765625, + "loss/logits": 0.21013642847537994, + "loss/reg": 0.01763724535703659, + "step": 1099 + }, + { + "epoch": 0.1375, + "grad_norm": 2.9585206508636475, + "grad_norm_var": 0.482309950085633, + "learning_rate": 0.0001, + "loss": 1.0561, + "loss/crossentropy": 2.3324267864227295, + "loss/hidden": 0.75, + "loss/logits": 0.12980124354362488, + "loss/reg": 0.017628395929932594, + "step": 1100 + }, + { + "epoch": 0.137625, + "grad_norm": 4.262050628662109, + "grad_norm_var": 0.4859417805331835, + "learning_rate": 0.0001, + "loss": 1.0991, + "loss/crossentropy": 2.4208176136016846, + "loss/hidden": 0.80078125, + "loss/logits": 0.12208649516105652, + "loss/reg": 0.017619585618376732, + "step": 1101 + }, + { + "epoch": 0.13775, + "grad_norm": 2.6123526096343994, + "grad_norm_var": 0.5183416158931512, + "learning_rate": 0.0001, + "loss": 0.9988, + "loss/crossentropy": 2.6298210620880127, + "loss/hidden": 0.69140625, + "loss/logits": 0.13125354051589966, + "loss/reg": 0.017610682174563408, + "step": 1102 + }, + { + "epoch": 0.137875, + "grad_norm": 3.968358039855957, + "grad_norm_var": 0.5450550638000642, + "learning_rate": 0.0001, + "loss": 1.1401, + "loss/crossentropy": 2.4403319358825684, + "loss/hidden": 0.80859375, + "loss/logits": 0.15549315512180328, + "loss/reg": 0.01760167069733143, + "step": 1103 + }, + { + "epoch": 0.138, + "grad_norm": 2.6694836616516113, + "grad_norm_var": 0.5569615426057339, + "learning_rate": 0.0001, + "loss": 1.2515, + "loss/crossentropy": 2.35309100151062, + "loss/hidden": 0.8984375, + "loss/logits": 0.1771024614572525, + "loss/reg": 0.01759263686835766, + "step": 1104 + }, + { + "epoch": 0.138125, + "grad_norm": 2.5527474880218506, + "grad_norm_var": 0.5918626570387079, + "learning_rate": 0.0001, + "loss": 0.9445, + "loss/crossentropy": 2.5089659690856934, + "loss/hidden": 0.65625, + "loss/logits": 0.11239723861217499, + "loss/reg": 0.017583860084414482, + "step": 1105 + }, + { + "epoch": 0.13825, + "grad_norm": 3.055800676345825, + "grad_norm_var": 0.5876466077321024, + "learning_rate": 0.0001, + "loss": 0.9616, + "loss/crossentropy": 3.0698087215423584, + "loss/hidden": 0.66796875, + "loss/logits": 0.11786890029907227, + "loss/reg": 0.01757502555847168, + "step": 1106 + }, + { + "epoch": 0.138375, + "grad_norm": 2.3708860874176025, + "grad_norm_var": 0.5932188518407192, + "learning_rate": 0.0001, + "loss": 1.1363, + "loss/crossentropy": 2.364453077316284, + "loss/hidden": 0.82421875, + "loss/logits": 0.13644903898239136, + "loss/reg": 0.017566362395882607, + "step": 1107 + }, + { + "epoch": 0.1385, + "grad_norm": 3.3530497550964355, + "grad_norm_var": 0.5767366681943767, + "learning_rate": 0.0001, + "loss": 1.1719, + "loss/crossentropy": 2.339834451675415, + "loss/hidden": 0.86328125, + "loss/logits": 0.13302001357078552, + "loss/reg": 0.017557917162775993, + "step": 1108 + }, + { + "epoch": 0.138625, + "grad_norm": 4.9073686599731445, + "grad_norm_var": 0.6479273995103992, + "learning_rate": 0.0001, + "loss": 1.144, + "loss/crossentropy": 2.608236074447632, + "loss/hidden": 0.8359375, + "loss/logits": 0.1325811743736267, + "loss/reg": 0.017549151554703712, + "step": 1109 + }, + { + "epoch": 0.13875, + "grad_norm": 4.318938732147217, + "grad_norm_var": 0.6917982612527337, + "learning_rate": 0.0001, + "loss": 1.0295, + "loss/crossentropy": 2.6003918647766113, + "loss/hidden": 0.73046875, + "loss/logits": 0.12366551160812378, + "loss/reg": 0.017540371045470238, + "step": 1110 + }, + { + "epoch": 0.138875, + "grad_norm": 3.439058303833008, + "grad_norm_var": 0.692297895774635, + "learning_rate": 0.0001, + "loss": 1.1891, + "loss/crossentropy": 2.3828976154327393, + "loss/hidden": 0.859375, + "loss/logits": 0.1544424593448639, + "loss/reg": 0.01753184385597706, + "step": 1111 + }, + { + "epoch": 0.139, + "grad_norm": 3.090857744216919, + "grad_norm_var": 0.5953394071265308, + "learning_rate": 0.0001, + "loss": 1.1475, + "loss/crossentropy": 2.2759549617767334, + "loss/hidden": 0.8125, + "loss/logits": 0.1597966104745865, + "loss/reg": 0.017523299902677536, + "step": 1112 + }, + { + "epoch": 0.139125, + "grad_norm": 2.421586513519287, + "grad_norm_var": 0.6493026217043355, + "learning_rate": 0.0001, + "loss": 1.0266, + "loss/crossentropy": 2.580230236053467, + "loss/hidden": 0.7265625, + "loss/logits": 0.12484898418188095, + "loss/reg": 0.017514871433377266, + "step": 1113 + }, + { + "epoch": 0.13925, + "grad_norm": 2.3141372203826904, + "grad_norm_var": 0.7130373793975477, + "learning_rate": 0.0001, + "loss": 1.0674, + "loss/crossentropy": 2.7415308952331543, + "loss/hidden": 0.75, + "loss/logits": 0.14231345057487488, + "loss/reg": 0.01750599592924118, + "step": 1114 + }, + { + "epoch": 0.139375, + "grad_norm": 3.1447672843933105, + "grad_norm_var": 0.6120215321394992, + "learning_rate": 0.0001, + "loss": 1.0712, + "loss/crossentropy": 2.4831013679504395, + "loss/hidden": 0.76171875, + "loss/logits": 0.13447736203670502, + "loss/reg": 0.017497511580586433, + "step": 1115 + }, + { + "epoch": 0.1395, + "grad_norm": 3.2429728507995605, + "grad_norm_var": 0.6073512012070741, + "learning_rate": 0.0001, + "loss": 1.1311, + "loss/crossentropy": 2.567009925842285, + "loss/hidden": 0.80078125, + "loss/logits": 0.1554429829120636, + "loss/reg": 0.01748875342309475, + "step": 1116 + }, + { + "epoch": 0.139625, + "grad_norm": 3.240558624267578, + "grad_norm_var": 0.532380465942029, + "learning_rate": 0.0001, + "loss": 1.1099, + "loss/crossentropy": 2.860992431640625, + "loss/hidden": 0.79296875, + "loss/logits": 0.1421348601579666, + "loss/reg": 0.01748001016676426, + "step": 1117 + }, + { + "epoch": 0.13975, + "grad_norm": 3.012918710708618, + "grad_norm_var": 0.5126825052838881, + "learning_rate": 0.0001, + "loss": 1.1268, + "loss/crossentropy": 2.4657137393951416, + "loss/hidden": 0.8125, + "loss/logits": 0.13955920934677124, + "loss/reg": 0.017471779137849808, + "step": 1118 + }, + { + "epoch": 0.139875, + "grad_norm": 2.7599992752075195, + "grad_norm_var": 0.47917524489163554, + "learning_rate": 0.0001, + "loss": 1.0584, + "loss/crossentropy": 2.366868019104004, + "loss/hidden": 0.73828125, + "loss/logits": 0.1455003321170807, + "loss/reg": 0.017463646829128265, + "step": 1119 + }, + { + "epoch": 0.14, + "grad_norm": 3.73720645904541, + "grad_norm_var": 0.48651163922628105, + "learning_rate": 0.0001, + "loss": 1.2863, + "loss/crossentropy": 2.9208223819732666, + "loss/hidden": 0.93359375, + "loss/logits": 0.17812727391719818, + "loss/reg": 0.017455046996474266, + "step": 1120 + }, + { + "epoch": 0.140125, + "grad_norm": 2.549912691116333, + "grad_norm_var": 0.48675118323949296, + "learning_rate": 0.0001, + "loss": 1.0037, + "loss/crossentropy": 2.217289447784424, + "loss/hidden": 0.70703125, + "loss/logits": 0.12221944332122803, + "loss/reg": 0.017446177080273628, + "step": 1121 + }, + { + "epoch": 0.14025, + "grad_norm": 3.35829758644104, + "grad_norm_var": 0.48725917149039616, + "learning_rate": 0.0001, + "loss": 0.9516, + "loss/crossentropy": 2.5682172775268555, + "loss/hidden": 0.66015625, + "loss/logits": 0.1170571893453598, + "loss/reg": 0.017437297850847244, + "step": 1122 + }, + { + "epoch": 0.140375, + "grad_norm": 3.384793519973755, + "grad_norm_var": 0.4388955051274242, + "learning_rate": 0.0001, + "loss": 1.107, + "loss/crossentropy": 2.8147690296173096, + "loss/hidden": 0.78515625, + "loss/logits": 0.14758194983005524, + "loss/reg": 0.017428115010261536, + "step": 1123 + }, + { + "epoch": 0.1405, + "grad_norm": 2.5823116302490234, + "grad_norm_var": 0.4672083375473524, + "learning_rate": 0.0001, + "loss": 1.0724, + "loss/crossentropy": 2.696993827819824, + "loss/hidden": 0.75390625, + "loss/logits": 0.14429491758346558, + "loss/reg": 0.017419347539544106, + "step": 1124 + }, + { + "epoch": 0.140625, + "grad_norm": 5.868249416351318, + "grad_norm_var": 0.7412100386445967, + "learning_rate": 0.0001, + "loss": 1.2337, + "loss/crossentropy": 2.6103923320770264, + "loss/hidden": 0.8984375, + "loss/logits": 0.16118671000003815, + "loss/reg": 0.017410660162568092, + "step": 1125 + }, + { + "epoch": 0.14075, + "grad_norm": 2.4435040950775146, + "grad_norm_var": 0.7010336436922195, + "learning_rate": 0.0001, + "loss": 0.9667, + "loss/crossentropy": 2.3706815242767334, + "loss/hidden": 0.6640625, + "loss/logits": 0.12864476442337036, + "loss/reg": 0.01740197278559208, + "step": 1126 + }, + { + "epoch": 0.140875, + "grad_norm": 2.7513833045959473, + "grad_norm_var": 0.7051812497689844, + "learning_rate": 0.0001, + "loss": 1.1451, + "loss/crossentropy": 2.4533605575561523, + "loss/hidden": 0.81640625, + "loss/logits": 0.15479305386543274, + "loss/reg": 0.017393220216035843, + "step": 1127 + }, + { + "epoch": 0.141, + "grad_norm": 2.8634872436523438, + "grad_norm_var": 0.7092644673154382, + "learning_rate": 0.0001, + "loss": 1.1193, + "loss/crossentropy": 2.1886956691741943, + "loss/hidden": 0.8046875, + "loss/logits": 0.14075767993927002, + "loss/reg": 0.01738525740802288, + "step": 1128 + }, + { + "epoch": 0.141125, + "grad_norm": 2.704045057296753, + "grad_norm_var": 0.6885219755537205, + "learning_rate": 0.0001, + "loss": 0.973, + "loss/crossentropy": 2.4812088012695312, + "loss/hidden": 0.65625, + "loss/logits": 0.1429394781589508, + "loss/reg": 0.01737692952156067, + "step": 1129 + }, + { + "epoch": 0.14125, + "grad_norm": 2.6875903606414795, + "grad_norm_var": 0.656991790963667, + "learning_rate": 0.0001, + "loss": 1.0697, + "loss/crossentropy": 2.3832883834838867, + "loss/hidden": 0.76953125, + "loss/logits": 0.126431405544281, + "loss/reg": 0.017368923872709274, + "step": 1130 + }, + { + "epoch": 0.141375, + "grad_norm": 7.774624824523926, + "grad_norm_var": 1.9961090220751256, + "learning_rate": 0.0001, + "loss": 1.383, + "loss/crossentropy": 2.5491535663604736, + "loss/hidden": 1.0390625, + "loss/logits": 0.1702873259782791, + "loss/reg": 0.01736014150083065, + "step": 1131 + }, + { + "epoch": 0.1415, + "grad_norm": 3.5188465118408203, + "grad_norm_var": 1.9937980339815395, + "learning_rate": 0.0001, + "loss": 1.1816, + "loss/crossentropy": 2.6813807487487793, + "loss/hidden": 0.84375, + "loss/logits": 0.16434520483016968, + "loss/reg": 0.017351284623146057, + "step": 1132 + }, + { + "epoch": 0.141625, + "grad_norm": 30.482587814331055, + "grad_norm_var": 47.607494749063044, + "learning_rate": 0.0001, + "loss": 1.0592, + "loss/crossentropy": 2.607267379760742, + "loss/hidden": 0.75390625, + "loss/logits": 0.13182219862937927, + "loss/reg": 0.01734289340674877, + "step": 1133 + }, + { + "epoch": 0.14175, + "grad_norm": 3.333674907684326, + "grad_norm_var": 47.52231423180526, + "learning_rate": 0.0001, + "loss": 1.1791, + "loss/crossentropy": 2.5332696437835693, + "loss/hidden": 0.85546875, + "loss/logits": 0.15026560425758362, + "loss/reg": 0.017334245145320892, + "step": 1134 + }, + { + "epoch": 0.141875, + "grad_norm": 2.8526668548583984, + "grad_norm_var": 47.4930115697571, + "learning_rate": 0.0001, + "loss": 1.1235, + "loss/crossentropy": 2.583221673965454, + "loss/hidden": 0.8125, + "loss/logits": 0.13772843778133392, + "loss/reg": 0.017325541004538536, + "step": 1135 + }, + { + "epoch": 0.142, + "grad_norm": 3.022386074066162, + "grad_norm_var": 47.66253737043978, + "learning_rate": 0.0001, + "loss": 1.2714, + "loss/crossentropy": 2.3197121620178223, + "loss/hidden": 0.9453125, + "loss/logits": 0.15295147895812988, + "loss/reg": 0.017316767945885658, + "step": 1136 + }, + { + "epoch": 0.142125, + "grad_norm": 3.3717892169952393, + "grad_norm_var": 47.421346164152666, + "learning_rate": 0.0001, + "loss": 1.131, + "loss/crossentropy": 2.543901205062866, + "loss/hidden": 0.8203125, + "loss/logits": 0.13759344816207886, + "loss/reg": 0.017307989299297333, + "step": 1137 + }, + { + "epoch": 0.14225, + "grad_norm": 3.5415263175964355, + "grad_norm_var": 47.37875577313657, + "learning_rate": 0.0001, + "loss": 1.0293, + "loss/crossentropy": 2.4758806228637695, + "loss/hidden": 0.73046875, + "loss/logits": 0.12582623958587646, + "loss/reg": 0.01730157807469368, + "step": 1138 + }, + { + "epoch": 0.142375, + "grad_norm": 2.9773669242858887, + "grad_norm_var": 47.48768287025903, + "learning_rate": 0.0001, + "loss": 1.0163, + "loss/crossentropy": 2.638218402862549, + "loss/hidden": 0.71484375, + "loss/logits": 0.12850065529346466, + "loss/reg": 0.017293203622102737, + "step": 1139 + }, + { + "epoch": 0.1425, + "grad_norm": 2.8083419799804688, + "grad_norm_var": 47.41278427285148, + "learning_rate": 0.0001, + "loss": 1.0945, + "loss/crossentropy": 2.503974676132202, + "loss/hidden": 0.7578125, + "loss/logits": 0.1638229787349701, + "loss/reg": 0.01728462241590023, + "step": 1140 + }, + { + "epoch": 0.142625, + "grad_norm": 2.425929069519043, + "grad_norm_var": 47.84099408884167, + "learning_rate": 0.0001, + "loss": 1.0449, + "loss/crossentropy": 2.3581292629241943, + "loss/hidden": 0.75, + "loss/logits": 0.12217840552330017, + "loss/reg": 0.017276806756854057, + "step": 1141 + }, + { + "epoch": 0.14275, + "grad_norm": 39.945011138916016, + "grad_norm_var": 123.09327375389867, + "learning_rate": 0.0001, + "loss": 1.0453, + "loss/crossentropy": 2.5906147956848145, + "loss/hidden": 0.75390625, + "loss/logits": 0.11872614920139313, + "loss/reg": 0.017268478870391846, + "step": 1142 + }, + { + "epoch": 0.142875, + "grad_norm": 3.0662145614624023, + "grad_norm_var": 122.90784367859821, + "learning_rate": 0.0001, + "loss": 1.0595, + "loss/crossentropy": 2.2695305347442627, + "loss/hidden": 0.7578125, + "loss/logits": 0.1291281282901764, + "loss/reg": 0.017260266467928886, + "step": 1143 + }, + { + "epoch": 0.143, + "grad_norm": 4.583246231079102, + "grad_norm_var": 122.06713805652959, + "learning_rate": 0.0001, + "loss": 1.1495, + "loss/crossentropy": 2.480984926223755, + "loss/hidden": 0.8125, + "loss/logits": 0.16442613303661346, + "loss/reg": 0.017252640798687935, + "step": 1144 + }, + { + "epoch": 0.143125, + "grad_norm": 2.981353521347046, + "grad_norm_var": 121.89670586108006, + "learning_rate": 0.0001, + "loss": 1.1551, + "loss/crossentropy": 2.2689995765686035, + "loss/hidden": 0.85546875, + "loss/logits": 0.12721604108810425, + "loss/reg": 0.01724405400454998, + "step": 1145 + }, + { + "epoch": 0.14325, + "grad_norm": 2.3794004917144775, + "grad_norm_var": 122.09878373545942, + "learning_rate": 0.0001, + "loss": 1.0441, + "loss/crossentropy": 2.5703084468841553, + "loss/hidden": 0.73828125, + "loss/logits": 0.13347238302230835, + "loss/reg": 0.017235582694411278, + "step": 1146 + }, + { + "epoch": 0.143375, + "grad_norm": 2.4713571071624756, + "grad_norm_var": 123.62106362597764, + "learning_rate": 0.0001, + "loss": 0.9492, + "loss/crossentropy": 2.447471857070923, + "loss/hidden": 0.65625, + "loss/logits": 0.12064240872859955, + "loss/reg": 0.017227739095687866, + "step": 1147 + }, + { + "epoch": 0.1435, + "grad_norm": 2.6587135791778564, + "grad_norm_var": 124.07916434426059, + "learning_rate": 0.0001, + "loss": 1.0818, + "loss/crossentropy": 2.6435859203338623, + "loss/hidden": 0.75390625, + "loss/logits": 0.15571220219135284, + "loss/reg": 0.017219962552189827, + "step": 1148 + }, + { + "epoch": 0.143625, + "grad_norm": 2.5300068855285645, + "grad_norm_var": 85.6034890468201, + "learning_rate": 0.0001, + "loss": 1.0955, + "loss/crossentropy": 2.370417356491089, + "loss/hidden": 0.7890625, + "loss/logits": 0.13428199291229248, + "loss/reg": 0.017211301252245903, + "step": 1149 + }, + { + "epoch": 0.14375, + "grad_norm": 2.821120023727417, + "grad_norm_var": 85.75492487355808, + "learning_rate": 0.0001, + "loss": 1.1666, + "loss/crossentropy": 2.331489324569702, + "loss/hidden": 0.8359375, + "loss/logits": 0.15861350297927856, + "loss/reg": 0.017202915623784065, + "step": 1150 + }, + { + "epoch": 0.143875, + "grad_norm": 2.5067081451416016, + "grad_norm_var": 85.8742473316794, + "learning_rate": 0.0001, + "loss": 0.9844, + "loss/crossentropy": 2.466968297958374, + "loss/hidden": 0.69140625, + "loss/logits": 0.12101612985134125, + "loss/reg": 0.017194462940096855, + "step": 1151 + }, + { + "epoch": 0.144, + "grad_norm": 3.062662124633789, + "grad_norm_var": 85.8623557526669, + "learning_rate": 0.0001, + "loss": 1.0339, + "loss/crossentropy": 2.409207820892334, + "loss/hidden": 0.7265625, + "loss/logits": 0.13548848032951355, + "loss/reg": 0.017186442390084267, + "step": 1152 + }, + { + "epoch": 0.144125, + "grad_norm": 3.1716883182525635, + "grad_norm_var": 85.9151871866652, + "learning_rate": 0.0001, + "loss": 1.0846, + "loss/crossentropy": 2.4929358959198, + "loss/hidden": 0.75390625, + "loss/logits": 0.15896344184875488, + "loss/reg": 0.0171778816729784, + "step": 1153 + }, + { + "epoch": 0.14425, + "grad_norm": 9.434453010559082, + "grad_norm_var": 86.74661652378359, + "learning_rate": 0.0001, + "loss": 2.0602, + "loss/crossentropy": 2.4357495307922363, + "loss/hidden": 1.6015625, + "loss/logits": 0.28698208928108215, + "loss/reg": 0.017169814556837082, + "step": 1154 + }, + { + "epoch": 0.144375, + "grad_norm": 3.7739198207855225, + "grad_norm_var": 86.506246361283, + "learning_rate": 0.0001, + "loss": 1.0704, + "loss/crossentropy": 2.6996419429779053, + "loss/hidden": 0.75390625, + "loss/logits": 0.1448526829481125, + "loss/reg": 0.017161287367343903, + "step": 1155 + }, + { + "epoch": 0.1445, + "grad_norm": 4.359219551086426, + "grad_norm_var": 86.06611929299979, + "learning_rate": 0.0001, + "loss": 1.2325, + "loss/crossentropy": 2.506620168685913, + "loss/hidden": 0.87109375, + "loss/logits": 0.18992647528648376, + "loss/reg": 0.017152708023786545, + "step": 1156 + }, + { + "epoch": 0.144625, + "grad_norm": 3.83707594871521, + "grad_norm_var": 85.56313319362654, + "learning_rate": 0.0001, + "loss": 1.1268, + "loss/crossentropy": 2.392063856124878, + "loss/hidden": 0.80859375, + "loss/logits": 0.14678636193275452, + "loss/reg": 0.017143724486231804, + "step": 1157 + }, + { + "epoch": 0.14475, + "grad_norm": 2.9818544387817383, + "grad_norm_var": 2.9152543868814065, + "learning_rate": 0.0001, + "loss": 0.9894, + "loss/crossentropy": 2.590273380279541, + "loss/hidden": 0.6953125, + "loss/logits": 0.1227588877081871, + "loss/reg": 0.01713474653661251, + "step": 1158 + }, + { + "epoch": 0.144875, + "grad_norm": 2.5520756244659424, + "grad_norm_var": 2.964164435968295, + "learning_rate": 0.0001, + "loss": 1.1502, + "loss/crossentropy": 2.3040196895599365, + "loss/hidden": 0.83203125, + "loss/logits": 0.1469496786594391, + "loss/reg": 0.017125625163316727, + "step": 1159 + }, + { + "epoch": 0.145, + "grad_norm": 3.3840601444244385, + "grad_norm_var": 2.881888386237869, + "learning_rate": 0.0001, + "loss": 1.0954, + "loss/crossentropy": 2.4384047985076904, + "loss/hidden": 0.78515625, + "loss/logits": 0.13907559216022491, + "loss/reg": 0.017117124050855637, + "step": 1160 + }, + { + "epoch": 0.145125, + "grad_norm": 6.576678276062012, + "grad_norm_var": 3.47394619120245, + "learning_rate": 0.0001, + "loss": 1.7801, + "loss/crossentropy": 2.66701078414917, + "loss/hidden": 1.2578125, + "loss/logits": 0.35119134187698364, + "loss/reg": 0.017108280211687088, + "step": 1161 + }, + { + "epoch": 0.14525, + "grad_norm": 2.326526165008545, + "grad_norm_var": 3.483123034262428, + "learning_rate": 0.0001, + "loss": 1.0016, + "loss/crossentropy": 2.351609706878662, + "loss/hidden": 0.70703125, + "loss/logits": 0.12352467328310013, + "loss/reg": 0.01709994673728943, + "step": 1162 + }, + { + "epoch": 0.145375, + "grad_norm": 9.206579208374023, + "grad_norm_var": 5.257167082686383, + "learning_rate": 0.0001, + "loss": 1.8176, + "loss/crossentropy": 2.4763855934143066, + "loss/hidden": 1.3515625, + "loss/logits": 0.29511886835098267, + "loss/reg": 0.017091669142246246, + "step": 1163 + }, + { + "epoch": 0.1455, + "grad_norm": 2.9634292125701904, + "grad_norm_var": 5.205470661734309, + "learning_rate": 0.0001, + "loss": 1.1448, + "loss/crossentropy": 2.5422465801239014, + "loss/hidden": 0.828125, + "loss/logits": 0.14583711326122284, + "loss/reg": 0.01708296500146389, + "step": 1164 + }, + { + "epoch": 0.145625, + "grad_norm": 3.018620014190674, + "grad_norm_var": 5.118565326969832, + "learning_rate": 0.0001, + "loss": 1.1524, + "loss/crossentropy": 2.559739828109741, + "loss/hidden": 0.82421875, + "loss/logits": 0.15742075443267822, + "loss/reg": 0.01707414537668228, + "step": 1165 + }, + { + "epoch": 0.14575, + "grad_norm": 8.845293045043945, + "grad_norm_var": 6.340596335373259, + "learning_rate": 0.0001, + "loss": 1.558, + "loss/crossentropy": 2.8445377349853516, + "loss/hidden": 1.140625, + "loss/logits": 0.24674071371555328, + "loss/reg": 0.017065750434994698, + "step": 1166 + }, + { + "epoch": 0.145875, + "grad_norm": 4.030366897583008, + "grad_norm_var": 6.0807354199013615, + "learning_rate": 0.0001, + "loss": 1.2387, + "loss/crossentropy": 2.525634765625, + "loss/hidden": 0.8828125, + "loss/logits": 0.18527851998806, + "loss/reg": 0.017056919634342194, + "step": 1167 + }, + { + "epoch": 0.146, + "grad_norm": 4.10527229309082, + "grad_norm_var": 5.935618580704363, + "learning_rate": 0.0001, + "loss": 1.3889, + "loss/crossentropy": 2.657388687133789, + "loss/hidden": 0.9765625, + "loss/logits": 0.24190323054790497, + "loss/reg": 0.01704811304807663, + "step": 1168 + }, + { + "epoch": 0.146125, + "grad_norm": 3.8508963584899902, + "grad_norm_var": 5.829627947687252, + "learning_rate": 0.0001, + "loss": 1.2453, + "loss/crossentropy": 2.405853509902954, + "loss/hidden": 0.90234375, + "loss/logits": 0.17257630825042725, + "loss/reg": 0.017038943246006966, + "step": 1169 + }, + { + "epoch": 0.14625, + "grad_norm": 13.325060844421387, + "grad_norm_var": 9.23016466799555, + "learning_rate": 0.0001, + "loss": 1.3331, + "loss/crossentropy": 2.4716832637786865, + "loss/hidden": 0.94140625, + "loss/logits": 0.2213602215051651, + "loss/reg": 0.01702967844903469, + "step": 1170 + }, + { + "epoch": 0.146375, + "grad_norm": 6.264571189880371, + "grad_norm_var": 9.228622011623397, + "learning_rate": 0.0001, + "loss": 1.1745, + "loss/crossentropy": 2.685593605041504, + "loss/hidden": 0.83984375, + "loss/logits": 0.16446860134601593, + "loss/reg": 0.017020443454384804, + "step": 1171 + }, + { + "epoch": 0.1465, + "grad_norm": 4.161008358001709, + "grad_norm_var": 9.25070050922357, + "learning_rate": 0.0001, + "loss": 1.2714, + "loss/crossentropy": 2.329845428466797, + "loss/hidden": 0.91796875, + "loss/logits": 0.18332120776176453, + "loss/reg": 0.01701117865741253, + "step": 1172 + }, + { + "epoch": 0.146625, + "grad_norm": 2.7158303260803223, + "grad_norm_var": 9.516487065581881, + "learning_rate": 0.0001, + "loss": 1.1028, + "loss/crossentropy": 2.3517086505889893, + "loss/hidden": 0.8203125, + "loss/logits": 0.11249354481697083, + "loss/reg": 0.01700259931385517, + "step": 1173 + }, + { + "epoch": 0.14675, + "grad_norm": 4.040686130523682, + "grad_norm_var": 9.298921738225228, + "learning_rate": 0.0001, + "loss": 1.4656, + "loss/crossentropy": 1.98357093334198, + "loss/hidden": 1.1328125, + "loss/logits": 0.16282862424850464, + "loss/reg": 0.01699363812804222, + "step": 1174 + }, + { + "epoch": 0.146875, + "grad_norm": 3.484989643096924, + "grad_norm_var": 9.038196456147912, + "learning_rate": 0.0001, + "loss": 1.1281, + "loss/crossentropy": 2.7924225330352783, + "loss/hidden": 0.83984375, + "loss/logits": 0.11844426393508911, + "loss/reg": 0.016985056921839714, + "step": 1175 + }, + { + "epoch": 0.147, + "grad_norm": 2.9667844772338867, + "grad_norm_var": 9.146981868389197, + "learning_rate": 0.0001, + "loss": 1.202, + "loss/crossentropy": 2.405456781387329, + "loss/hidden": 0.875, + "loss/logits": 0.15722399950027466, + "loss/reg": 0.01697635091841221, + "step": 1176 + }, + { + "epoch": 0.147125, + "grad_norm": 5.164290904998779, + "grad_norm_var": 8.996899765603606, + "learning_rate": 0.0001, + "loss": 1.1927, + "loss/crossentropy": 2.6594996452331543, + "loss/hidden": 0.85546875, + "loss/logits": 0.16758760809898376, + "loss/reg": 0.01696733944118023, + "step": 1177 + }, + { + "epoch": 0.14725, + "grad_norm": 3.2219250202178955, + "grad_norm_var": 8.724323229467464, + "learning_rate": 0.0001, + "loss": 1.216, + "loss/crossentropy": 2.5696029663085938, + "loss/hidden": 0.8828125, + "loss/logits": 0.16358155012130737, + "loss/reg": 0.016958681866526604, + "step": 1178 + }, + { + "epoch": 0.147375, + "grad_norm": 2.848184108734131, + "grad_norm_var": 7.7572272221691225, + "learning_rate": 0.0001, + "loss": 1.0605, + "loss/crossentropy": 2.317523241043091, + "loss/hidden": 0.7578125, + "loss/logits": 0.1332237273454666, + "loss/reg": 0.016949651762843132, + "step": 1179 + }, + { + "epoch": 0.1475, + "grad_norm": 5.728630065917969, + "grad_norm_var": 7.5993034900551715, + "learning_rate": 0.0001, + "loss": 1.1138, + "loss/crossentropy": 2.771470308303833, + "loss/hidden": 0.80859375, + "loss/logits": 0.13583886623382568, + "loss/reg": 0.016941089183092117, + "step": 1180 + }, + { + "epoch": 0.147625, + "grad_norm": 3.538257122039795, + "grad_norm_var": 7.488546256518004, + "learning_rate": 0.0001, + "loss": 1.0898, + "loss/crossentropy": 2.7069919109344482, + "loss/hidden": 0.77734375, + "loss/logits": 0.1431659609079361, + "loss/reg": 0.01693253219127655, + "step": 1181 + }, + { + "epoch": 0.14775, + "grad_norm": 3.1879353523254395, + "grad_norm_var": 6.50782164977926, + "learning_rate": 0.0001, + "loss": 1.0471, + "loss/crossentropy": 2.6199276447296143, + "loss/hidden": 0.73828125, + "loss/logits": 0.13955026865005493, + "loss/reg": 0.01692408137023449, + "step": 1182 + }, + { + "epoch": 0.147875, + "grad_norm": 4.523275375366211, + "grad_norm_var": 6.489534724108, + "learning_rate": 0.0001, + "loss": 1.2453, + "loss/crossentropy": 2.120800018310547, + "loss/hidden": 0.92578125, + "loss/logits": 0.1503942906856537, + "loss/reg": 0.016915684565901756, + "step": 1183 + }, + { + "epoch": 0.148, + "grad_norm": 3.735788345336914, + "grad_norm_var": 6.520985106875844, + "learning_rate": 0.0001, + "loss": 1.0708, + "loss/crossentropy": 2.6711180210113525, + "loss/hidden": 0.78515625, + "loss/logits": 0.11657284200191498, + "loss/reg": 0.016907010227441788, + "step": 1184 + }, + { + "epoch": 0.148125, + "grad_norm": 5.269720077514648, + "grad_norm_var": 6.515042671209793, + "learning_rate": 0.0001, + "loss": 1.2068, + "loss/crossentropy": 2.2848961353302, + "loss/hidden": 0.875, + "loss/logits": 0.1628110110759735, + "loss/reg": 0.01689821295440197, + "step": 1185 + }, + { + "epoch": 0.14825, + "grad_norm": 3.4472532272338867, + "grad_norm_var": 1.169463016821809, + "learning_rate": 0.0001, + "loss": 1.0442, + "loss/crossentropy": 2.6003897190093994, + "loss/hidden": 0.7421875, + "loss/logits": 0.13315384089946747, + "loss/reg": 0.016889235004782677, + "step": 1186 + }, + { + "epoch": 0.148375, + "grad_norm": 3.267479658126831, + "grad_norm_var": 0.8333935781311614, + "learning_rate": 0.0001, + "loss": 1.135, + "loss/crossentropy": 2.3998661041259766, + "loss/hidden": 0.83203125, + "loss/logits": 0.1341383457183838, + "loss/reg": 0.01688062585890293, + "step": 1187 + }, + { + "epoch": 0.1485, + "grad_norm": 3.262617826461792, + "grad_norm_var": 0.8443526957342724, + "learning_rate": 0.0001, + "loss": 1.0085, + "loss/crossentropy": 2.5633139610290527, + "loss/hidden": 0.72265625, + "loss/logits": 0.11711567640304565, + "loss/reg": 0.016872059553861618, + "step": 1188 + }, + { + "epoch": 0.148625, + "grad_norm": 4.104423522949219, + "grad_norm_var": 0.7687216542662267, + "learning_rate": 0.0001, + "loss": 1.2662, + "loss/crossentropy": 2.4028804302215576, + "loss/hidden": 0.9296875, + "loss/logits": 0.16792353987693787, + "loss/reg": 0.016863279044628143, + "step": 1189 + }, + { + "epoch": 0.14875, + "grad_norm": 3.4790539741516113, + "grad_norm_var": 0.7750564154152146, + "learning_rate": 0.0001, + "loss": 1.3253, + "loss/crossentropy": 2.6870651245117188, + "loss/hidden": 0.9140625, + "loss/logits": 0.24266795814037323, + "loss/reg": 0.016854623332619667, + "step": 1190 + }, + { + "epoch": 0.148875, + "grad_norm": 3.920074701309204, + "grad_norm_var": 0.7670521683778655, + "learning_rate": 0.0001, + "loss": 1.7384, + "loss/crossentropy": 2.006319761276245, + "loss/hidden": 1.296875, + "loss/logits": 0.27303898334503174, + "loss/reg": 0.016845691949129105, + "step": 1191 + }, + { + "epoch": 0.149, + "grad_norm": 3.1460444927215576, + "grad_norm_var": 0.7478523869744016, + "learning_rate": 0.0001, + "loss": 1.1471, + "loss/crossentropy": 2.322908401489258, + "loss/hidden": 0.828125, + "loss/logits": 0.15057553350925446, + "loss/reg": 0.016836855560541153, + "step": 1192 + }, + { + "epoch": 0.149125, + "grad_norm": 2.8855838775634766, + "grad_norm_var": 0.6777176205160399, + "learning_rate": 0.0001, + "loss": 1.1942, + "loss/crossentropy": 2.3052327632904053, + "loss/hidden": 0.859375, + "loss/logits": 0.16649743914604187, + "loss/reg": 0.016828451305627823, + "step": 1193 + }, + { + "epoch": 0.14925, + "grad_norm": 5.441497325897217, + "grad_norm_var": 0.837366755929357, + "learning_rate": 0.0001, + "loss": 1.2583, + "loss/crossentropy": 2.8180274963378906, + "loss/hidden": 0.91015625, + "loss/logits": 0.1799638569355011, + "loss/reg": 0.016819985583424568, + "step": 1194 + }, + { + "epoch": 0.149375, + "grad_norm": 2.6638519763946533, + "grad_norm_var": 0.864398086647378, + "learning_rate": 0.0001, + "loss": 1.2269, + "loss/crossentropy": 2.1777124404907227, + "loss/hidden": 0.8984375, + "loss/logits": 0.1603836864233017, + "loss/reg": 0.016811655834317207, + "step": 1195 + }, + { + "epoch": 0.1495, + "grad_norm": 3.1290552616119385, + "grad_norm_var": 0.6356402025764699, + "learning_rate": 0.0001, + "loss": 1.063, + "loss/crossentropy": 2.31772518157959, + "loss/hidden": 0.76171875, + "loss/logits": 0.13322719931602478, + "loss/reg": 0.016803618520498276, + "step": 1196 + }, + { + "epoch": 0.149625, + "grad_norm": 2.8640289306640625, + "grad_norm_var": 0.6774789250719541, + "learning_rate": 0.0001, + "loss": 1.2119, + "loss/crossentropy": 2.603073835372925, + "loss/hidden": 0.8828125, + "loss/logits": 0.16111129522323608, + "loss/reg": 0.016795063391327858, + "step": 1197 + }, + { + "epoch": 0.14975, + "grad_norm": 2.704012155532837, + "grad_norm_var": 0.7216374904878362, + "learning_rate": 0.0001, + "loss": 1.0581, + "loss/crossentropy": 2.5079526901245117, + "loss/hidden": 0.76171875, + "loss/logits": 0.12854108214378357, + "loss/reg": 0.016786765307188034, + "step": 1198 + }, + { + "epoch": 0.149875, + "grad_norm": 3.260575532913208, + "grad_norm_var": 0.6684105203930727, + "learning_rate": 0.0001, + "loss": 1.1244, + "loss/crossentropy": 2.43037486076355, + "loss/hidden": 0.80859375, + "loss/logits": 0.14800330996513367, + "loss/reg": 0.01677859202027321, + "step": 1199 + }, + { + "epoch": 0.15, + "grad_norm": 2.9981093406677246, + "grad_norm_var": 0.6828016535629103, + "learning_rate": 0.0001, + "loss": 1.4566, + "loss/crossentropy": 2.5041277408599854, + "loss/hidden": 1.0625, + "loss/logits": 0.2263844907283783, + "loss/reg": 0.016770560294389725, + "step": 1200 + }, + { + "epoch": 0.150125, + "grad_norm": 3.0788891315460205, + "grad_norm_var": 0.4629717181381217, + "learning_rate": 0.0001, + "loss": 1.085, + "loss/crossentropy": 2.586244821548462, + "loss/hidden": 0.76953125, + "loss/logits": 0.1478727161884308, + "loss/reg": 0.016761835664510727, + "step": 1201 + }, + { + "epoch": 0.15025, + "grad_norm": 2.979407787322998, + "grad_norm_var": 0.47078996164599, + "learning_rate": 0.0001, + "loss": 1.1119, + "loss/crossentropy": 2.566535472869873, + "loss/hidden": 0.79296875, + "loss/logits": 0.1514027863740921, + "loss/reg": 0.016753433272242546, + "step": 1202 + }, + { + "epoch": 0.150375, + "grad_norm": 3.7025930881500244, + "grad_norm_var": 0.4793410999759106, + "learning_rate": 0.0001, + "loss": 1.1125, + "loss/crossentropy": 2.696070909500122, + "loss/hidden": 0.80078125, + "loss/logits": 0.14427737891674042, + "loss/reg": 0.016745014116168022, + "step": 1203 + }, + { + "epoch": 0.1505, + "grad_norm": 3.7526400089263916, + "grad_norm_var": 0.48855855062033865, + "learning_rate": 0.0001, + "loss": 1.4721, + "loss/crossentropy": 2.3625986576080322, + "loss/hidden": 1.0859375, + "loss/logits": 0.21884310245513916, + "loss/reg": 0.016736432909965515, + "step": 1204 + }, + { + "epoch": 0.150625, + "grad_norm": 2.6353213787078857, + "grad_norm_var": 0.4819149135234577, + "learning_rate": 0.0001, + "loss": 1.0125, + "loss/crossentropy": 2.5642147064208984, + "loss/hidden": 0.72265625, + "loss/logits": 0.1225152313709259, + "loss/reg": 0.01672798953950405, + "step": 1205 + }, + { + "epoch": 0.15075, + "grad_norm": 2.7348835468292236, + "grad_norm_var": 0.4977728974757267, + "learning_rate": 0.0001, + "loss": 1.0998, + "loss/crossentropy": 2.2675602436065674, + "loss/hidden": 0.80078125, + "loss/logits": 0.13179215788841248, + "loss/reg": 0.016719957813620567, + "step": 1206 + }, + { + "epoch": 0.150875, + "grad_norm": 2.585419178009033, + "grad_norm_var": 0.4887115845786629, + "learning_rate": 0.0001, + "loss": 1.0439, + "loss/crossentropy": 2.3835902214050293, + "loss/hidden": 0.73828125, + "loss/logits": 0.13845473527908325, + "loss/reg": 0.01671142503619194, + "step": 1207 + }, + { + "epoch": 0.151, + "grad_norm": 2.457369804382324, + "grad_norm_var": 0.519646055542415, + "learning_rate": 0.0001, + "loss": 1.0538, + "loss/crossentropy": 2.719109296798706, + "loss/hidden": 0.7578125, + "loss/logits": 0.129006028175354, + "loss/reg": 0.01670280657708645, + "step": 1208 + }, + { + "epoch": 0.151125, + "grad_norm": 2.794466733932495, + "grad_norm_var": 0.5229773551564895, + "learning_rate": 0.0001, + "loss": 1.1287, + "loss/crossentropy": 2.4790573120117188, + "loss/hidden": 0.8203125, + "loss/logits": 0.1414172500371933, + "loss/reg": 0.016694119200110435, + "step": 1209 + }, + { + "epoch": 0.15125, + "grad_norm": 2.713892698287964, + "grad_norm_var": 0.14054897219938098, + "learning_rate": 0.0001, + "loss": 1.1539, + "loss/crossentropy": 2.6969070434570312, + "loss/hidden": 0.84765625, + "loss/logits": 0.1394297033548355, + "loss/reg": 0.01668514870107174, + "step": 1210 + }, + { + "epoch": 0.151375, + "grad_norm": 3.024172782897949, + "grad_norm_var": 0.13535290931035035, + "learning_rate": 0.0001, + "loss": 0.9908, + "loss/crossentropy": 2.757359027862549, + "loss/hidden": 0.703125, + "loss/logits": 0.1209060549736023, + "loss/reg": 0.016675440594553947, + "step": 1211 + }, + { + "epoch": 0.1515, + "grad_norm": 2.4915895462036133, + "grad_norm_var": 0.14667295132687527, + "learning_rate": 0.0001, + "loss": 1.0311, + "loss/crossentropy": 2.311927080154419, + "loss/hidden": 0.734375, + "loss/logits": 0.13000784814357758, + "loss/reg": 0.01666680909693241, + "step": 1212 + }, + { + "epoch": 0.151625, + "grad_norm": 3.0987884998321533, + "grad_norm_var": 0.1482532510455627, + "learning_rate": 0.0001, + "loss": 0.9663, + "loss/crossentropy": 2.665640354156494, + "loss/hidden": 0.68359375, + "loss/logits": 0.11612722277641296, + "loss/reg": 0.016658229753375053, + "step": 1213 + }, + { + "epoch": 0.15175, + "grad_norm": 3.144266128540039, + "grad_norm_var": 0.1466168566420831, + "learning_rate": 0.0001, + "loss": 1.2371, + "loss/crossentropy": 2.5783543586730957, + "loss/hidden": 0.9140625, + "loss/logits": 0.1565355509519577, + "loss/reg": 0.01664978824555874, + "step": 1214 + }, + { + "epoch": 0.151875, + "grad_norm": 2.5145692825317383, + "grad_norm_var": 0.15207652538272973, + "learning_rate": 0.0001, + "loss": 0.9714, + "loss/crossentropy": 2.4898011684417725, + "loss/hidden": 0.6796875, + "loss/logits": 0.12530556321144104, + "loss/reg": 0.01664099656045437, + "step": 1215 + }, + { + "epoch": 0.152, + "grad_norm": 2.5222647190093994, + "grad_norm_var": 0.16121854801189647, + "learning_rate": 0.0001, + "loss": 1.073, + "loss/crossentropy": 2.417036533355713, + "loss/hidden": 0.76171875, + "loss/logits": 0.14500710368156433, + "loss/reg": 0.01663181744515896, + "step": 1216 + }, + { + "epoch": 0.152125, + "grad_norm": 2.7117919921875, + "grad_norm_var": 0.16036668917481525, + "learning_rate": 0.0001, + "loss": 1.0801, + "loss/crossentropy": 2.658942461013794, + "loss/hidden": 0.76953125, + "loss/logits": 0.14434798061847687, + "loss/reg": 0.016622193157672882, + "step": 1217 + }, + { + "epoch": 0.15225, + "grad_norm": 2.9744222164154053, + "grad_norm_var": 0.16029316464901414, + "learning_rate": 0.0001, + "loss": 1.1878, + "loss/crossentropy": 2.491722822189331, + "loss/hidden": 0.84375, + "loss/logits": 0.17787563800811768, + "loss/reg": 0.016612496227025986, + "step": 1218 + }, + { + "epoch": 0.152375, + "grad_norm": 3.3779635429382324, + "grad_norm_var": 0.1306752736974545, + "learning_rate": 0.0001, + "loss": 1.0725, + "loss/crossentropy": 2.4862918853759766, + "loss/hidden": 0.7578125, + "loss/logits": 0.14869090914726257, + "loss/reg": 0.01660403423011303, + "step": 1219 + }, + { + "epoch": 0.1525, + "grad_norm": 2.5530524253845215, + "grad_norm_var": 0.07557910361394207, + "learning_rate": 0.0001, + "loss": 1.0191, + "loss/crossentropy": 2.614441394805908, + "loss/hidden": 0.72265625, + "loss/logits": 0.13050062954425812, + "loss/reg": 0.016595516353845596, + "step": 1220 + }, + { + "epoch": 0.152625, + "grad_norm": 2.378697156906128, + "grad_norm_var": 0.08433378351046841, + "learning_rate": 0.0001, + "loss": 1.0027, + "loss/crossentropy": 2.529618501663208, + "loss/hidden": 0.72265625, + "loss/logits": 0.11422193050384521, + "loss/reg": 0.016587061807513237, + "step": 1221 + }, + { + "epoch": 0.15275, + "grad_norm": 3.0336711406707764, + "grad_norm_var": 0.08911795415122749, + "learning_rate": 0.0001, + "loss": 1.0727, + "loss/crossentropy": 2.5050387382507324, + "loss/hidden": 0.75390625, + "loss/logits": 0.15298862755298615, + "loss/reg": 0.016578199341893196, + "step": 1222 + }, + { + "epoch": 0.152875, + "grad_norm": 5.338172435760498, + "grad_norm_var": 0.4936799710714484, + "learning_rate": 0.0001, + "loss": 1.1124, + "loss/crossentropy": 2.6853761672973633, + "loss/hidden": 0.80859375, + "loss/logits": 0.13811737298965454, + "loss/reg": 0.016569815576076508, + "step": 1223 + }, + { + "epoch": 0.153, + "grad_norm": 2.962609052658081, + "grad_norm_var": 0.47674628875148056, + "learning_rate": 0.0001, + "loss": 1.0689, + "loss/crossentropy": 2.441126585006714, + "loss/hidden": 0.765625, + "loss/logits": 0.13769929111003876, + "loss/reg": 0.016561318188905716, + "step": 1224 + }, + { + "epoch": 0.153125, + "grad_norm": 2.7154529094696045, + "grad_norm_var": 0.4790610818976868, + "learning_rate": 0.0001, + "loss": 1.1047, + "loss/crossentropy": 2.4226393699645996, + "loss/hidden": 0.796875, + "loss/logits": 0.14224863052368164, + "loss/reg": 0.016552859917283058, + "step": 1225 + }, + { + "epoch": 0.15325, + "grad_norm": 3.163189172744751, + "grad_norm_var": 0.4762029205706464, + "learning_rate": 0.0001, + "loss": 1.1356, + "loss/crossentropy": 2.642491340637207, + "loss/hidden": 0.81640625, + "loss/logits": 0.1537851244211197, + "loss/reg": 0.016544297337532043, + "step": 1226 + }, + { + "epoch": 0.153375, + "grad_norm": 8.438536643981934, + "grad_norm_var": 2.3256512762465564, + "learning_rate": 0.0001, + "loss": 1.4483, + "loss/crossentropy": 2.159141778945923, + "loss/hidden": 1.0703125, + "loss/logits": 0.21258942782878876, + "loss/reg": 0.016535377129912376, + "step": 1227 + }, + { + "epoch": 0.1535, + "grad_norm": 16.965858459472656, + "grad_norm_var": 13.784859138236738, + "learning_rate": 0.0001, + "loss": 3.1548, + "loss/crossentropy": 2.990753173828125, + "loss/hidden": 1.8203125, + "loss/logits": 1.1692469120025635, + "loss/reg": 0.016526464372873306, + "step": 1228 + }, + { + "epoch": 0.153625, + "grad_norm": 3.789302110671997, + "grad_norm_var": 13.709283357450609, + "learning_rate": 0.0001, + "loss": 1.0345, + "loss/crossentropy": 2.7094292640686035, + "loss/hidden": 0.7265625, + "loss/logits": 0.1427900195121765, + "loss/reg": 0.016517426818609238, + "step": 1229 + }, + { + "epoch": 0.15375, + "grad_norm": 2.838264226913452, + "grad_norm_var": 13.761738651197222, + "learning_rate": 0.0001, + "loss": 1.1093, + "loss/crossentropy": 2.543126106262207, + "loss/hidden": 0.7890625, + "loss/logits": 0.1551959365606308, + "loss/reg": 0.01650911010801792, + "step": 1230 + }, + { + "epoch": 0.153875, + "grad_norm": 3.218651533126831, + "grad_norm_var": 13.628173707948106, + "learning_rate": 0.0001, + "loss": 1.0906, + "loss/crossentropy": 2.112255096435547, + "loss/hidden": 0.78515625, + "loss/logits": 0.1404722034931183, + "loss/reg": 0.016500860452651978, + "step": 1231 + }, + { + "epoch": 0.154, + "grad_norm": 4.796919345855713, + "grad_norm_var": 13.40893956577282, + "learning_rate": 0.0001, + "loss": 1.2598, + "loss/crossentropy": 2.6373000144958496, + "loss/hidden": 0.93359375, + "loss/logits": 0.16131797432899475, + "loss/reg": 0.01649186760187149, + "step": 1232 + }, + { + "epoch": 0.154125, + "grad_norm": 3.640801191329956, + "grad_norm_var": 13.24713470324538, + "learning_rate": 0.0001, + "loss": 1.4181, + "loss/crossentropy": 2.4901435375213623, + "loss/hidden": 1.03125, + "loss/logits": 0.2219938188791275, + "loss/reg": 0.01648273505270481, + "step": 1233 + }, + { + "epoch": 0.15425, + "grad_norm": 3.2018325328826904, + "grad_norm_var": 13.203757643215535, + "learning_rate": 0.0001, + "loss": 1.1419, + "loss/crossentropy": 2.504349708557129, + "loss/hidden": 0.8046875, + "loss/logits": 0.1725234091281891, + "loss/reg": 0.016473697498440742, + "step": 1234 + }, + { + "epoch": 0.154375, + "grad_norm": 2.982257127761841, + "grad_norm_var": 13.274105522819255, + "learning_rate": 0.0001, + "loss": 1.0759, + "loss/crossentropy": 2.3660614490509033, + "loss/hidden": 0.76953125, + "loss/logits": 0.14176063239574432, + "loss/reg": 0.01646505668759346, + "step": 1235 + }, + { + "epoch": 0.1545, + "grad_norm": 4.535284519195557, + "grad_norm_var": 13.004824447407662, + "learning_rate": 0.0001, + "loss": 1.4385, + "loss/crossentropy": 2.8622822761535645, + "loss/hidden": 1.0, + "loss/logits": 0.2738988697528839, + "loss/reg": 0.016456691548228264, + "step": 1236 + }, + { + "epoch": 0.154625, + "grad_norm": 3.0818183422088623, + "grad_norm_var": 12.825136343225884, + "learning_rate": 0.0001, + "loss": 1.0988, + "loss/crossentropy": 2.449317693710327, + "loss/hidden": 0.79296875, + "loss/logits": 0.14135484397411346, + "loss/reg": 0.01644827052950859, + "step": 1237 + }, + { + "epoch": 0.15475, + "grad_norm": 4.102293491363525, + "grad_norm_var": 12.663514204467905, + "learning_rate": 0.0001, + "loss": 1.1842, + "loss/crossentropy": 2.7557554244995117, + "loss/hidden": 0.8515625, + "loss/logits": 0.16828957200050354, + "loss/reg": 0.01643945835530758, + "step": 1238 + }, + { + "epoch": 0.154875, + "grad_norm": 4.042688369750977, + "grad_norm_var": 12.664341312944845, + "learning_rate": 0.0001, + "loss": 1.2182, + "loss/crossentropy": 2.4376473426818848, + "loss/hidden": 0.8828125, + "loss/logits": 0.17108118534088135, + "loss/reg": 0.016430867835879326, + "step": 1239 + }, + { + "epoch": 0.155, + "grad_norm": 3.330106735229492, + "grad_norm_var": 12.589868576516837, + "learning_rate": 0.0001, + "loss": 1.1139, + "loss/crossentropy": 2.705000638961792, + "loss/hidden": 0.80078125, + "loss/logits": 0.14893847703933716, + "loss/reg": 0.01642204262316227, + "step": 1240 + }, + { + "epoch": 0.155125, + "grad_norm": 2.7572247982025146, + "grad_norm_var": 12.579048710159194, + "learning_rate": 0.0001, + "loss": 1.0319, + "loss/crossentropy": 2.726616859436035, + "loss/hidden": 0.73828125, + "loss/logits": 0.12947078049182892, + "loss/reg": 0.016413651406764984, + "step": 1241 + }, + { + "epoch": 0.15525, + "grad_norm": 2.501053810119629, + "grad_norm_var": 12.740389120966254, + "learning_rate": 0.0001, + "loss": 1.0246, + "loss/crossentropy": 2.1374268531799316, + "loss/hidden": 0.7421875, + "loss/logits": 0.11834150552749634, + "loss/reg": 0.016405310481786728, + "step": 1242 + }, + { + "epoch": 0.155375, + "grad_norm": 3.1334445476531982, + "grad_norm_var": 11.811754750464528, + "learning_rate": 0.0001, + "loss": 1.1323, + "loss/crossentropy": 2.204963445663452, + "loss/hidden": 0.828125, + "loss/logits": 0.14021529257297516, + "loss/reg": 0.01639643684029579, + "step": 1243 + }, + { + "epoch": 0.1555, + "grad_norm": 2.888942003250122, + "grad_norm_var": 0.43771643055907344, + "learning_rate": 0.0001, + "loss": 1.0436, + "loss/crossentropy": 2.6868066787719727, + "loss/hidden": 0.75, + "loss/logits": 0.12971553206443787, + "loss/reg": 0.016387417912483215, + "step": 1244 + }, + { + "epoch": 0.155625, + "grad_norm": 3.695841073989868, + "grad_norm_var": 0.43375446821376706, + "learning_rate": 0.0001, + "loss": 1.1949, + "loss/crossentropy": 2.679568290710449, + "loss/hidden": 0.8671875, + "loss/logits": 0.16394542157649994, + "loss/reg": 0.01637819968163967, + "step": 1245 + }, + { + "epoch": 0.15575, + "grad_norm": 3.01936411857605, + "grad_norm_var": 0.4217159331567084, + "learning_rate": 0.0001, + "loss": 1.2859, + "loss/crossentropy": 2.3360393047332764, + "loss/hidden": 0.921875, + "loss/logits": 0.20038416981697083, + "loss/reg": 0.01636892557144165, + "step": 1246 + }, + { + "epoch": 0.155875, + "grad_norm": 2.5124776363372803, + "grad_norm_var": 0.47306891797669076, + "learning_rate": 0.0001, + "loss": 0.9599, + "loss/crossentropy": 2.558067560195923, + "loss/hidden": 0.6796875, + "loss/logits": 0.11665983498096466, + "loss/reg": 0.016359377652406693, + "step": 1247 + }, + { + "epoch": 0.156, + "grad_norm": 3.540679693222046, + "grad_norm_var": 0.3358607220080972, + "learning_rate": 0.0001, + "loss": 1.1319, + "loss/crossentropy": 2.760929584503174, + "loss/hidden": 0.8203125, + "loss/logits": 0.1480470895767212, + "loss/reg": 0.01635100692510605, + "step": 1248 + }, + { + "epoch": 0.156125, + "grad_norm": 3.4042372703552246, + "grad_norm_var": 0.3289363389964431, + "learning_rate": 0.0001, + "loss": 1.1718, + "loss/crossentropy": 2.771622896194458, + "loss/hidden": 0.859375, + "loss/logits": 0.14904874563217163, + "loss/reg": 0.01634254865348339, + "step": 1249 + }, + { + "epoch": 0.15625, + "grad_norm": 3.164729118347168, + "grad_norm_var": 0.32948624287587125, + "learning_rate": 0.0001, + "loss": 1.2872, + "loss/crossentropy": 2.3246641159057617, + "loss/hidden": 0.93359375, + "loss/logits": 0.19026914238929749, + "loss/reg": 0.016334179788827896, + "step": 1250 + }, + { + "epoch": 0.156375, + "grad_norm": 3.409289598464966, + "grad_norm_var": 0.32317475604933343, + "learning_rate": 0.0001, + "loss": 1.0262, + "loss/crossentropy": 2.498500347137451, + "loss/hidden": 0.73828125, + "loss/logits": 0.12468535453081131, + "loss/reg": 0.01632508635520935, + "step": 1251 + }, + { + "epoch": 0.1565, + "grad_norm": 7.274641990661621, + "grad_norm_var": 1.2360715279843908, + "learning_rate": 0.0001, + "loss": 1.5887, + "loss/crossentropy": 2.7439510822296143, + "loss/hidden": 1.2734375, + "loss/logits": 0.15212371945381165, + "loss/reg": 0.016316639259457588, + "step": 1252 + }, + { + "epoch": 0.156625, + "grad_norm": 3.9700396060943604, + "grad_norm_var": 1.2368999449904527, + "learning_rate": 0.0001, + "loss": 1.3172, + "loss/crossentropy": 2.9382898807525635, + "loss/hidden": 0.99609375, + "loss/logits": 0.15806305408477783, + "loss/reg": 0.016308104619383812, + "step": 1253 + }, + { + "epoch": 0.15675, + "grad_norm": 3.035047769546509, + "grad_norm_var": 1.2290263478015266, + "learning_rate": 0.0001, + "loss": 1.0668, + "loss/crossentropy": 2.486111879348755, + "loss/hidden": 0.74609375, + "loss/logits": 0.15766200423240662, + "loss/reg": 0.016299735754728317, + "step": 1254 + }, + { + "epoch": 0.156875, + "grad_norm": 4.703797340393066, + "grad_norm_var": 1.30594374893626, + "learning_rate": 0.0001, + "loss": 1.1914, + "loss/crossentropy": 2.6573047637939453, + "loss/hidden": 0.87890625, + "loss/logits": 0.14956605434417725, + "loss/reg": 0.0162909384816885, + "step": 1255 + }, + { + "epoch": 0.157, + "grad_norm": 2.9516775608062744, + "grad_norm_var": 1.3245417395020618, + "learning_rate": 0.0001, + "loss": 1.1959, + "loss/crossentropy": 2.3165104389190674, + "loss/hidden": 0.87890625, + "loss/logits": 0.1542087197303772, + "loss/reg": 0.01628235913813114, + "step": 1256 + }, + { + "epoch": 0.157125, + "grad_norm": 2.8121533393859863, + "grad_norm_var": 1.3193075406315065, + "learning_rate": 0.0001, + "loss": 1.2046, + "loss/crossentropy": 2.269421100616455, + "loss/hidden": 0.8828125, + "loss/logits": 0.15904302895069122, + "loss/reg": 0.01627359353005886, + "step": 1257 + }, + { + "epoch": 0.15725, + "grad_norm": 2.7715342044830322, + "grad_norm_var": 1.287814713649868, + "learning_rate": 0.0001, + "loss": 1.0294, + "loss/crossentropy": 2.6235570907592773, + "loss/hidden": 0.71484375, + "loss/logits": 0.15194407105445862, + "loss/reg": 0.01626538671553135, + "step": 1258 + }, + { + "epoch": 0.157375, + "grad_norm": 1.9729806184768677, + "grad_norm_var": 1.4314826970209784, + "learning_rate": 0.0001, + "loss": 0.9952, + "loss/crossentropy": 2.2795357704162598, + "loss/hidden": 0.71875, + "loss/logits": 0.11390725523233414, + "loss/reg": 0.016256939619779587, + "step": 1259 + }, + { + "epoch": 0.1575, + "grad_norm": 4.472729206085205, + "grad_norm_var": 1.4707347924489687, + "learning_rate": 0.0001, + "loss": 1.5205, + "loss/crossentropy": 2.3265631198883057, + "loss/hidden": 1.1328125, + "loss/logits": 0.22519487142562866, + "loss/reg": 0.016248464584350586, + "step": 1260 + }, + { + "epoch": 0.157625, + "grad_norm": 2.59954571723938, + "grad_norm_var": 1.5237222016228462, + "learning_rate": 0.0001, + "loss": 0.9663, + "loss/crossentropy": 2.6342809200286865, + "loss/hidden": 0.671875, + "loss/logits": 0.13200603425502777, + "loss/reg": 0.01624012365937233, + "step": 1261 + }, + { + "epoch": 0.15775, + "grad_norm": 3.7313549518585205, + "grad_norm_var": 1.5120623570669227, + "learning_rate": 0.0001, + "loss": 1.1542, + "loss/crossentropy": 2.5860178470611572, + "loss/hidden": 0.83984375, + "loss/logits": 0.15201479196548462, + "loss/reg": 0.016231315210461617, + "step": 1262 + }, + { + "epoch": 0.157875, + "grad_norm": 12.554633140563965, + "grad_norm_var": 6.465262907364731, + "learning_rate": 0.0001, + "loss": 1.0956, + "loss/crossentropy": 2.696977376937866, + "loss/hidden": 0.796875, + "loss/logits": 0.13646942377090454, + "loss/reg": 0.016222581267356873, + "step": 1263 + }, + { + "epoch": 0.158, + "grad_norm": 3.071765422821045, + "grad_norm_var": 6.516980451118831, + "learning_rate": 0.0001, + "loss": 0.9936, + "loss/crossentropy": 2.407457113265991, + "loss/hidden": 0.71484375, + "loss/logits": 0.1166088730096817, + "loss/reg": 0.01621370017528534, + "step": 1264 + }, + { + "epoch": 0.158125, + "grad_norm": 3.9861507415771484, + "grad_norm_var": 6.482705701616392, + "learning_rate": 0.0001, + "loss": 1.2125, + "loss/crossentropy": 2.4362969398498535, + "loss/hidden": 0.89453125, + "loss/logits": 0.15587326884269714, + "loss/reg": 0.01620490849018097, + "step": 1265 + }, + { + "epoch": 0.15825, + "grad_norm": 4.503188610076904, + "grad_norm_var": 6.417924727941712, + "learning_rate": 0.0001, + "loss": 1.1854, + "loss/crossentropy": 2.550220012664795, + "loss/hidden": 0.875, + "loss/logits": 0.14846490323543549, + "loss/reg": 0.016196196898818016, + "step": 1266 + }, + { + "epoch": 0.158375, + "grad_norm": 3.923295021057129, + "grad_norm_var": 6.3775887710365, + "learning_rate": 0.0001, + "loss": 1.5051, + "loss/crossentropy": 2.2857823371887207, + "loss/hidden": 1.109375, + "loss/logits": 0.23387819528579712, + "loss/reg": 0.01618727669119835, + "step": 1267 + }, + { + "epoch": 0.1585, + "grad_norm": 2.639031171798706, + "grad_norm_var": 5.8640922918796825, + "learning_rate": 0.0001, + "loss": 1.0417, + "loss/crossentropy": 2.541623115539551, + "loss/hidden": 0.7421875, + "loss/logits": 0.13770245015621185, + "loss/reg": 0.01617906242609024, + "step": 1268 + }, + { + "epoch": 0.158625, + "grad_norm": 3.2652931213378906, + "grad_norm_var": 5.896181098711848, + "learning_rate": 0.0001, + "loss": 1.0331, + "loss/crossentropy": 2.8000566959381104, + "loss/hidden": 0.74609375, + "loss/logits": 0.12530115246772766, + "loss/reg": 0.01617092825472355, + "step": 1269 + }, + { + "epoch": 0.15875, + "grad_norm": 7.7591872215271, + "grad_norm_var": 6.722812290225987, + "learning_rate": 0.0001, + "loss": 1.3254, + "loss/crossentropy": 2.975308895111084, + "loss/hidden": 0.9453125, + "loss/logits": 0.21848054230213165, + "loss/reg": 0.016162917017936707, + "step": 1270 + }, + { + "epoch": 0.158875, + "grad_norm": 20.061229705810547, + "grad_norm_var": 22.42875378589538, + "learning_rate": 0.0001, + "loss": 1.5526, + "loss/crossentropy": 2.5683257579803467, + "loss/hidden": 1.1796875, + "loss/logits": 0.21141059696674347, + "loss/reg": 0.016154874116182327, + "step": 1271 + }, + { + "epoch": 0.159, + "grad_norm": 3.1273183822631836, + "grad_norm_var": 22.37821079380713, + "learning_rate": 0.0001, + "loss": 1.101, + "loss/crossentropy": 2.2700870037078857, + "loss/hidden": 0.80859375, + "loss/logits": 0.13093486428260803, + "loss/reg": 0.01614651456475258, + "step": 1272 + }, + { + "epoch": 0.159125, + "grad_norm": 5.0759968757629395, + "grad_norm_var": 21.976791517132078, + "learning_rate": 0.0001, + "loss": 1.342, + "loss/crossentropy": 2.440864324569702, + "loss/hidden": 1.0234375, + "loss/logits": 0.157205730676651, + "loss/reg": 0.016138622537255287, + "step": 1273 + }, + { + "epoch": 0.15925, + "grad_norm": 3.844823122024536, + "grad_norm_var": 21.680554653297506, + "learning_rate": 0.0001, + "loss": 1.1638, + "loss/crossentropy": 2.461087942123413, + "loss/hidden": 0.8828125, + "loss/logits": 0.11964771896600723, + "loss/reg": 0.016129910945892334, + "step": 1274 + }, + { + "epoch": 0.159375, + "grad_norm": 3.4437410831451416, + "grad_norm_var": 21.141396790594833, + "learning_rate": 0.0001, + "loss": 1.1072, + "loss/crossentropy": 2.2671289443969727, + "loss/hidden": 0.82421875, + "loss/logits": 0.12177357822656631, + "loss/reg": 0.01612204499542713, + "step": 1275 + }, + { + "epoch": 0.1595, + "grad_norm": 2.6474483013153076, + "grad_norm_var": 21.600534383242742, + "learning_rate": 0.0001, + "loss": 1.0477, + "loss/crossentropy": 2.425645112991333, + "loss/hidden": 0.75390625, + "loss/logits": 0.1326972246170044, + "loss/reg": 0.016113854944705963, + "step": 1276 + }, + { + "epoch": 0.159625, + "grad_norm": 3.2777504920959473, + "grad_norm_var": 21.37698263032081, + "learning_rate": 0.0001, + "loss": 1.1394, + "loss/crossentropy": 2.5559988021850586, + "loss/hidden": 0.8125, + "loss/logits": 0.16583070158958435, + "loss/reg": 0.016105838119983673, + "step": 1277 + }, + { + "epoch": 0.15975, + "grad_norm": 2.806304693222046, + "grad_norm_var": 21.640224221543477, + "learning_rate": 0.0001, + "loss": 1.2464, + "loss/crossentropy": 2.5195603370666504, + "loss/hidden": 0.90625, + "loss/logits": 0.17914444208145142, + "loss/reg": 0.016097450628876686, + "step": 1278 + }, + { + "epoch": 0.159875, + "grad_norm": 2.830876350402832, + "grad_norm_var": 18.240248060375865, + "learning_rate": 0.0001, + "loss": 1.0662, + "loss/crossentropy": 2.4745335578918457, + "loss/hidden": 0.77734375, + "loss/logits": 0.12796689569950104, + "loss/reg": 0.016089415177702904, + "step": 1279 + }, + { + "epoch": 0.16, + "grad_norm": 3.508394479751587, + "grad_norm_var": 18.153502836014415, + "learning_rate": 0.0001, + "loss": 1.344, + "loss/crossentropy": 2.2748706340789795, + "loss/hidden": 0.9921875, + "loss/logits": 0.19095824658870697, + "loss/reg": 0.016081009060144424, + "step": 1280 + }, + { + "epoch": 0.160125, + "grad_norm": 2.568115234375, + "grad_norm_var": 18.431873650050928, + "learning_rate": 0.0001, + "loss": 1.1068, + "loss/crossentropy": 2.393493413925171, + "loss/hidden": 0.8046875, + "loss/logits": 0.14136919379234314, + "loss/reg": 0.016072595492005348, + "step": 1281 + }, + { + "epoch": 0.16025, + "grad_norm": 2.587556838989258, + "grad_norm_var": 18.712804471683103, + "learning_rate": 0.0001, + "loss": 1.0643, + "loss/crossentropy": 2.619617462158203, + "loss/hidden": 0.76171875, + "loss/logits": 0.14193680882453918, + "loss/reg": 0.016064899042248726, + "step": 1282 + }, + { + "epoch": 0.160375, + "grad_norm": 4.345118522644043, + "grad_norm_var": 18.686686687831273, + "learning_rate": 0.0001, + "loss": 1.087, + "loss/crossentropy": 2.4879989624023438, + "loss/hidden": 0.77734375, + "loss/logits": 0.14906466007232666, + "loss/reg": 0.016057275235652924, + "step": 1283 + }, + { + "epoch": 0.1605, + "grad_norm": 3.326270818710327, + "grad_norm_var": 18.535440191895614, + "learning_rate": 0.0001, + "loss": 1.2065, + "loss/crossentropy": 2.540086507797241, + "loss/hidden": 0.8828125, + "loss/logits": 0.16315226256847382, + "loss/reg": 0.01604924537241459, + "step": 1284 + }, + { + "epoch": 0.160625, + "grad_norm": 12.10493278503418, + "grad_norm_var": 21.781544615658156, + "learning_rate": 0.0001, + "loss": 1.4967, + "loss/crossentropy": 2.0798144340515137, + "loss/hidden": 1.140625, + "loss/logits": 0.19571265578269958, + "loss/reg": 0.016041060909628868, + "step": 1285 + }, + { + "epoch": 0.16075, + "grad_norm": 3.7908778190612793, + "grad_norm_var": 21.41548096635543, + "learning_rate": 0.0001, + "loss": 1.2948, + "loss/crossentropy": 2.554609775543213, + "loss/hidden": 0.95703125, + "loss/logits": 0.17741422355175018, + "loss/reg": 0.01603337749838829, + "step": 1286 + }, + { + "epoch": 0.160875, + "grad_norm": 2.7631125450134277, + "grad_norm_var": 5.285413244775635, + "learning_rate": 0.0001, + "loss": 1.1375, + "loss/crossentropy": 2.4389936923980713, + "loss/hidden": 0.8046875, + "loss/logits": 0.17257535457611084, + "loss/reg": 0.016025440767407417, + "step": 1287 + }, + { + "epoch": 0.161, + "grad_norm": 3.0926973819732666, + "grad_norm_var": 5.288953588764335, + "learning_rate": 0.0001, + "loss": 1.1024, + "loss/crossentropy": 2.507418394088745, + "loss/hidden": 0.8203125, + "loss/logits": 0.12186664342880249, + "loss/reg": 0.016017207875847816, + "step": 1288 + }, + { + "epoch": 0.161125, + "grad_norm": 3.1981067657470703, + "grad_norm_var": 5.208865380747667, + "learning_rate": 0.0001, + "loss": 1.0863, + "loss/crossentropy": 2.4531667232513428, + "loss/hidden": 0.79296875, + "loss/logits": 0.13326312601566315, + "loss/reg": 0.01600871980190277, + "step": 1289 + }, + { + "epoch": 0.16125, + "grad_norm": 2.445035457611084, + "grad_norm_var": 5.315218503488533, + "learning_rate": 0.0001, + "loss": 1.0727, + "loss/crossentropy": 2.083653450012207, + "loss/hidden": 0.77734375, + "loss/logits": 0.1353735327720642, + "loss/reg": 0.016000347211956978, + "step": 1290 + }, + { + "epoch": 0.161375, + "grad_norm": 3.601950168609619, + "grad_norm_var": 5.3119885145812225, + "learning_rate": 0.0001, + "loss": 1.3367, + "loss/crossentropy": 2.5701797008514404, + "loss/hidden": 0.9765625, + "loss/logits": 0.20016901195049286, + "loss/reg": 0.015991859138011932, + "step": 1291 + }, + { + "epoch": 0.1615, + "grad_norm": 2.633711099624634, + "grad_norm_var": 5.313893223941083, + "learning_rate": 0.0001, + "loss": 0.9594, + "loss/crossentropy": 2.5569510459899902, + "loss/hidden": 0.69140625, + "loss/logits": 0.10816207528114319, + "loss/reg": 0.015983374789357185, + "step": 1292 + }, + { + "epoch": 0.161625, + "grad_norm": 3.0227303504943848, + "grad_norm_var": 5.331637216482173, + "learning_rate": 0.0001, + "loss": 1.0746, + "loss/crossentropy": 2.440446138381958, + "loss/hidden": 0.7734375, + "loss/logits": 0.1414303183555603, + "loss/reg": 0.01597476750612259, + "step": 1293 + }, + { + "epoch": 0.16175, + "grad_norm": 3.4858055114746094, + "grad_norm_var": 5.282777369926362, + "learning_rate": 0.0001, + "loss": 1.221, + "loss/crossentropy": 2.63417649269104, + "loss/hidden": 0.90234375, + "loss/logits": 0.15903231501579285, + "loss/reg": 0.015966549515724182, + "step": 1294 + }, + { + "epoch": 0.161875, + "grad_norm": 2.918198823928833, + "grad_norm_var": 5.273058122497643, + "learning_rate": 0.0001, + "loss": 1.097, + "loss/crossentropy": 2.473909854888916, + "loss/hidden": 0.796875, + "loss/logits": 0.14054538309574127, + "loss/reg": 0.015958301723003387, + "step": 1295 + }, + { + "epoch": 0.162, + "grad_norm": 2.7234513759613037, + "grad_norm_var": 5.332879789031237, + "learning_rate": 0.0001, + "loss": 1.279, + "loss/crossentropy": 2.241210460662842, + "loss/hidden": 0.9375, + "loss/logits": 0.182043194770813, + "loss/reg": 0.015949726104736328, + "step": 1296 + }, + { + "epoch": 0.162125, + "grad_norm": 3.3515610694885254, + "grad_norm_var": 5.256872590146133, + "learning_rate": 0.0001, + "loss": 1.1463, + "loss/crossentropy": 2.6752541065216064, + "loss/hidden": 0.83203125, + "loss/logits": 0.15490111708641052, + "loss/reg": 0.015941519290208817, + "step": 1297 + }, + { + "epoch": 0.16225, + "grad_norm": 3.6638059616088867, + "grad_norm_var": 5.167917555355179, + "learning_rate": 0.0001, + "loss": 1.1893, + "loss/crossentropy": 2.2823452949523926, + "loss/hidden": 0.86328125, + "loss/logits": 0.16667145490646362, + "loss/reg": 0.015933820977807045, + "step": 1298 + }, + { + "epoch": 0.162375, + "grad_norm": 4.22912073135376, + "grad_norm_var": 5.1600059777442855, + "learning_rate": 0.0001, + "loss": 1.1752, + "loss/crossentropy": 2.6061580181121826, + "loss/hidden": 0.82421875, + "loss/logits": 0.1916988492012024, + "loss/reg": 0.01592625491321087, + "step": 1299 + }, + { + "epoch": 0.1625, + "grad_norm": 6.90070915222168, + "grad_norm_var": 5.7461320078663105, + "learning_rate": 0.0001, + "loss": 1.5428, + "loss/crossentropy": 1.8802989721298218, + "loss/hidden": 1.1015625, + "loss/logits": 0.2820611596107483, + "loss/reg": 0.015918578952550888, + "step": 1300 + }, + { + "epoch": 0.162625, + "grad_norm": 3.0886518955230713, + "grad_norm_var": 1.077876623419837, + "learning_rate": 0.0001, + "loss": 1.2216, + "loss/crossentropy": 2.5785911083221436, + "loss/hidden": 0.8984375, + "loss/logits": 0.16405634582042694, + "loss/reg": 0.01591094397008419, + "step": 1301 + }, + { + "epoch": 0.16275, + "grad_norm": 3.582451581954956, + "grad_norm_var": 1.0706141462880387, + "learning_rate": 0.0001, + "loss": 1.2677, + "loss/crossentropy": 2.2569782733917236, + "loss/hidden": 0.92578125, + "loss/logits": 0.18291030824184418, + "loss/reg": 0.01590333878993988, + "step": 1302 + }, + { + "epoch": 0.162875, + "grad_norm": 3.291792392730713, + "grad_norm_var": 1.0418618914724893, + "learning_rate": 0.0001, + "loss": 1.1212, + "loss/crossentropy": 2.5871994495391846, + "loss/hidden": 0.80859375, + "loss/logits": 0.15368321537971497, + "loss/reg": 0.01589547097682953, + "step": 1303 + }, + { + "epoch": 0.163, + "grad_norm": 12.274004936218262, + "grad_norm_var": 5.870708246927601, + "learning_rate": 0.0001, + "loss": 1.5367, + "loss/crossentropy": 2.2327094078063965, + "loss/hidden": 1.1171875, + "loss/logits": 0.2606501579284668, + "loss/reg": 0.015888074412941933, + "step": 1304 + }, + { + "epoch": 0.163125, + "grad_norm": 3.356614112854004, + "grad_norm_var": 5.854788067612952, + "learning_rate": 0.0001, + "loss": 1.1269, + "loss/crossentropy": 2.7387585639953613, + "loss/hidden": 0.80078125, + "loss/logits": 0.16729386150836945, + "loss/reg": 0.015879716724157333, + "step": 1305 + }, + { + "epoch": 0.16325, + "grad_norm": 2.274139404296875, + "grad_norm_var": 5.892856228313144, + "learning_rate": 0.0001, + "loss": 1.013, + "loss/crossentropy": 2.458984136581421, + "loss/hidden": 0.73046875, + "loss/logits": 0.12379375100135803, + "loss/reg": 0.015871398150920868, + "step": 1306 + }, + { + "epoch": 0.163375, + "grad_norm": 3.3769636154174805, + "grad_norm_var": 5.908708209046858, + "learning_rate": 0.0001, + "loss": 1.096, + "loss/crossentropy": 2.4599368572235107, + "loss/hidden": 0.80078125, + "loss/logits": 0.1365453004837036, + "loss/reg": 0.015863511711359024, + "step": 1307 + }, + { + "epoch": 0.1635, + "grad_norm": 5.5954132080078125, + "grad_norm_var": 5.91311204762225, + "learning_rate": 0.0001, + "loss": 1.4203, + "loss/crossentropy": 3.034935474395752, + "loss/hidden": 1.03125, + "loss/logits": 0.23048382997512817, + "loss/reg": 0.015855222940444946, + "step": 1308 + }, + { + "epoch": 0.163625, + "grad_norm": 2.820237636566162, + "grad_norm_var": 5.947350905923982, + "learning_rate": 0.0001, + "loss": 1.1673, + "loss/crossentropy": 2.4714202880859375, + "loss/hidden": 0.84765625, + "loss/logits": 0.16117262840270996, + "loss/reg": 0.01584673300385475, + "step": 1309 + }, + { + "epoch": 0.16375, + "grad_norm": 2.900300979614258, + "grad_norm_var": 6.0232289618053185, + "learning_rate": 0.0001, + "loss": 1.0603, + "loss/crossentropy": 2.634578227996826, + "loss/hidden": 0.76171875, + "loss/logits": 0.14015381038188934, + "loss/reg": 0.015838829800486565, + "step": 1310 + }, + { + "epoch": 0.163875, + "grad_norm": 3.875929594039917, + "grad_norm_var": 5.923678794810188, + "learning_rate": 0.0001, + "loss": 1.1148, + "loss/crossentropy": 2.8410253524780273, + "loss/hidden": 0.8046875, + "loss/logits": 0.15175345540046692, + "loss/reg": 0.01583097316324711, + "step": 1311 + }, + { + "epoch": 0.164, + "grad_norm": 3.126546621322632, + "grad_norm_var": 5.854122320902001, + "learning_rate": 0.0001, + "loss": 1.1639, + "loss/crossentropy": 2.5248920917510986, + "loss/hidden": 0.85546875, + "loss/logits": 0.15023675560951233, + "loss/reg": 0.015823420137166977, + "step": 1312 + }, + { + "epoch": 0.164125, + "grad_norm": 3.225511312484741, + "grad_norm_var": 5.869908623309258, + "learning_rate": 0.0001, + "loss": 1.2341, + "loss/crossentropy": 2.382220506668091, + "loss/hidden": 0.9140625, + "loss/logits": 0.16187481582164764, + "loss/reg": 0.01581561006605625, + "step": 1313 + }, + { + "epoch": 0.16425, + "grad_norm": 5.516601085662842, + "grad_norm_var": 5.946099660897706, + "learning_rate": 0.0001, + "loss": 1.5097, + "loss/crossentropy": 2.2973639965057373, + "loss/hidden": 1.1328125, + "loss/logits": 0.2187841385602951, + "loss/reg": 0.015808099880814552, + "step": 1314 + }, + { + "epoch": 0.164375, + "grad_norm": 3.7441492080688477, + "grad_norm_var": 5.967949014164251, + "learning_rate": 0.0001, + "loss": 1.0893, + "loss/crossentropy": 2.7098612785339355, + "loss/hidden": 0.78125, + "loss/logits": 0.15003027021884918, + "loss/reg": 0.015800559893250465, + "step": 1315 + }, + { + "epoch": 0.1645, + "grad_norm": 3.2912096977233887, + "grad_norm_var": 5.5351073509179205, + "learning_rate": 0.0001, + "loss": 1.1089, + "loss/crossentropy": 2.5598583221435547, + "loss/hidden": 0.80078125, + "loss/logits": 0.15022039413452148, + "loss/reg": 0.015792248770594597, + "step": 1316 + }, + { + "epoch": 0.164625, + "grad_norm": 2.697364091873169, + "grad_norm_var": 5.596594138613236, + "learning_rate": 0.0001, + "loss": 1.1302, + "loss/crossentropy": 2.364313840866089, + "loss/hidden": 0.81640625, + "loss/logits": 0.1559445559978485, + "loss/reg": 0.015783967450261116, + "step": 1317 + }, + { + "epoch": 0.16475, + "grad_norm": 3.0158493518829346, + "grad_norm_var": 5.652685497198369, + "learning_rate": 0.0001, + "loss": 1.1872, + "loss/crossentropy": 2.7219552993774414, + "loss/hidden": 0.87109375, + "loss/logits": 0.15835720300674438, + "loss/reg": 0.015775663778185844, + "step": 1318 + }, + { + "epoch": 0.164875, + "grad_norm": 307.651611328125, + "grad_norm_var": 5765.62343534609, + "learning_rate": 0.0001, + "loss": 3.7364, + "loss/crossentropy": 3.080000400543213, + "loss/hidden": 3.09375, + "loss/logits": 0.48493900895118713, + "loss/reg": 0.015767192468047142, + "step": 1319 + }, + { + "epoch": 0.165, + "grad_norm": 2.4901864528656006, + "grad_norm_var": 5785.658820843435, + "learning_rate": 0.0001, + "loss": 1.1422, + "loss/crossentropy": 2.167736053466797, + "loss/hidden": 0.8359375, + "loss/logits": 0.1487167328596115, + "loss/reg": 0.015759125351905823, + "step": 1320 + }, + { + "epoch": 0.165125, + "grad_norm": 2.829414129257202, + "grad_norm_var": 5787.017269350288, + "learning_rate": 0.0001, + "loss": 1.1765, + "loss/crossentropy": 2.35046648979187, + "loss/hidden": 0.83984375, + "loss/logits": 0.17911039292812347, + "loss/reg": 0.01575067639350891, + "step": 1321 + }, + { + "epoch": 0.16525, + "grad_norm": 3.047290325164795, + "grad_norm_var": 5784.979716656826, + "learning_rate": 0.0001, + "loss": 1.1083, + "loss/crossentropy": 2.5440683364868164, + "loss/hidden": 0.79296875, + "loss/logits": 0.15790146589279175, + "loss/reg": 0.015742652118206024, + "step": 1322 + }, + { + "epoch": 0.165375, + "grad_norm": 2.9854726791381836, + "grad_norm_var": 5785.984900115946, + "learning_rate": 0.0001, + "loss": 1.0518, + "loss/crossentropy": 2.0691964626312256, + "loss/hidden": 0.7734375, + "loss/logits": 0.12100932002067566, + "loss/reg": 0.015734149143099785, + "step": 1323 + }, + { + "epoch": 0.1655, + "grad_norm": 2.8492424488067627, + "grad_norm_var": 5792.618796374745, + "learning_rate": 0.0001, + "loss": 1.321, + "loss/crossentropy": 2.045762538909912, + "loss/hidden": 1.0390625, + "loss/logits": 0.12468324601650238, + "loss/reg": 0.01572607271373272, + "step": 1324 + }, + { + "epoch": 0.165625, + "grad_norm": 3.2983663082122803, + "grad_norm_var": 5791.394160827106, + "learning_rate": 0.0001, + "loss": 1.1712, + "loss/crossentropy": 2.6199800968170166, + "loss/hidden": 0.87109375, + "loss/logits": 0.14290478825569153, + "loss/reg": 0.01571841724216938, + "step": 1325 + }, + { + "epoch": 0.16575, + "grad_norm": 3.7790138721466064, + "grad_norm_var": 5789.171384194312, + "learning_rate": 0.0001, + "loss": 1.3719, + "loss/crossentropy": 2.5326523780822754, + "loss/hidden": 0.984375, + "loss/logits": 0.23037347197532654, + "loss/reg": 0.015710193663835526, + "step": 1326 + }, + { + "epoch": 0.165875, + "grad_norm": 3.0544376373291016, + "grad_norm_var": 5791.235862450414, + "learning_rate": 0.0001, + "loss": 1.3355, + "loss/crossentropy": 2.679947853088379, + "loss/hidden": 0.94140625, + "loss/logits": 0.2371068298816681, + "loss/reg": 0.015702618286013603, + "step": 1327 + }, + { + "epoch": 0.166, + "grad_norm": 3.107823133468628, + "grad_norm_var": 5791.283719365005, + "learning_rate": 0.0001, + "loss": 1.184, + "loss/crossentropy": 2.564558506011963, + "loss/hidden": 0.8515625, + "loss/logits": 0.17545363306999207, + "loss/reg": 0.01569523848593235, + "step": 1328 + }, + { + "epoch": 0.166125, + "grad_norm": 3.589580774307251, + "grad_norm_var": 5790.3667350596925, + "learning_rate": 0.0001, + "loss": 1.4278, + "loss/crossentropy": 2.2362802028656006, + "loss/hidden": 1.03125, + "loss/logits": 0.2396482676267624, + "loss/reg": 0.01568804495036602, + "step": 1329 + }, + { + "epoch": 0.16625, + "grad_norm": 3.2111618518829346, + "grad_norm_var": 5795.860842463791, + "learning_rate": 0.0001, + "loss": 1.0855, + "loss/crossentropy": 2.5137100219726562, + "loss/hidden": 0.79296875, + "loss/logits": 0.1356830894947052, + "loss/reg": 0.01568055897951126, + "step": 1330 + }, + { + "epoch": 0.166375, + "grad_norm": 2.7476325035095215, + "grad_norm_var": 5798.370483928043, + "learning_rate": 0.0001, + "loss": 1.2698, + "loss/crossentropy": 2.456242322921753, + "loss/hidden": 0.9453125, + "loss/logits": 0.16777516901493073, + "loss/reg": 0.01567363552749157, + "step": 1331 + }, + { + "epoch": 0.1665, + "grad_norm": 2.6416473388671875, + "grad_norm_var": 5800.026099397797, + "learning_rate": 0.0001, + "loss": 1.1194, + "loss/crossentropy": 2.7138493061065674, + "loss/hidden": 0.8125, + "loss/logits": 0.15024441480636597, + "loss/reg": 0.015665553510189056, + "step": 1332 + }, + { + "epoch": 0.166625, + "grad_norm": 3.000983476638794, + "grad_norm_var": 5799.247920072332, + "learning_rate": 0.0001, + "loss": 1.2997, + "loss/crossentropy": 2.4222257137298584, + "loss/hidden": 0.9609375, + "loss/logits": 0.1821441352367401, + "loss/reg": 0.015658436343073845, + "step": 1333 + }, + { + "epoch": 0.16675, + "grad_norm": 4.0280961990356445, + "grad_norm_var": 5796.738777158668, + "learning_rate": 0.0001, + "loss": 1.4751, + "loss/crossentropy": 2.4873077869415283, + "loss/hidden": 1.1015625, + "loss/logits": 0.2170398086309433, + "loss/reg": 0.015650250017642975, + "step": 1334 + }, + { + "epoch": 0.166875, + "grad_norm": 3.3132269382476807, + "grad_norm_var": 0.16804106807275332, + "learning_rate": 0.0001, + "loss": 1.0521, + "loss/crossentropy": 2.3228578567504883, + "loss/hidden": 0.7578125, + "loss/logits": 0.13784348964691162, + "loss/reg": 0.015642434358596802, + "step": 1335 + }, + { + "epoch": 0.167, + "grad_norm": 2.9326908588409424, + "grad_norm_var": 0.14292226941004174, + "learning_rate": 0.0001, + "loss": 1.0604, + "loss/crossentropy": 2.6680498123168945, + "loss/hidden": 0.7578125, + "loss/logits": 0.1462496519088745, + "loss/reg": 0.015634268522262573, + "step": 1336 + }, + { + "epoch": 0.167125, + "grad_norm": 3.7127490043640137, + "grad_norm_var": 0.1538134730442567, + "learning_rate": 0.0001, + "loss": 1.1028, + "loss/crossentropy": 2.439518690109253, + "loss/hidden": 0.8046875, + "loss/logits": 0.14183643460273743, + "loss/reg": 0.015626052394509315, + "step": 1337 + }, + { + "epoch": 0.16725, + "grad_norm": 3.0613186359405518, + "grad_norm_var": 0.15352851622272493, + "learning_rate": 0.0001, + "loss": 1.1624, + "loss/crossentropy": 2.4576408863067627, + "loss/hidden": 0.84375, + "loss/logits": 0.1624375730752945, + "loss/reg": 0.015617795288562775, + "step": 1338 + }, + { + "epoch": 0.167375, + "grad_norm": 2.435100793838501, + "grad_norm_var": 0.18872328446350153, + "learning_rate": 0.0001, + "loss": 1.1516, + "loss/crossentropy": 2.6004011631011963, + "loss/hidden": 0.84375, + "loss/logits": 0.1517818719148636, + "loss/reg": 0.015609413385391235, + "step": 1339 + }, + { + "epoch": 0.1675, + "grad_norm": 2.6485419273376465, + "grad_norm_var": 0.19989636027441596, + "learning_rate": 0.0001, + "loss": 1.0442, + "loss/crossentropy": 2.560258626937866, + "loss/hidden": 0.7578125, + "loss/logits": 0.13042010366916656, + "loss/reg": 0.015600843355059624, + "step": 1340 + }, + { + "epoch": 0.167625, + "grad_norm": 2.89918851852417, + "grad_norm_var": 0.20249881233273162, + "learning_rate": 0.0001, + "loss": 1.0632, + "loss/crossentropy": 2.5941762924194336, + "loss/hidden": 0.77734375, + "loss/logits": 0.12995409965515137, + "loss/reg": 0.015592261217534542, + "step": 1341 + }, + { + "epoch": 0.16775, + "grad_norm": 2.8898985385894775, + "grad_norm_var": 0.175583338922907, + "learning_rate": 0.0001, + "loss": 1.11, + "loss/crossentropy": 2.4292283058166504, + "loss/hidden": 0.8125, + "loss/logits": 0.14161787927150726, + "loss/reg": 0.015583738684654236, + "step": 1342 + }, + { + "epoch": 0.167875, + "grad_norm": 2.9340474605560303, + "grad_norm_var": 0.1768935876133876, + "learning_rate": 0.0001, + "loss": 1.0755, + "loss/crossentropy": 2.2534961700439453, + "loss/hidden": 0.77734375, + "loss/logits": 0.14238594472408295, + "loss/reg": 0.015575552359223366, + "step": 1343 + }, + { + "epoch": 0.168, + "grad_norm": 2.4977829456329346, + "grad_norm_var": 0.19724767622532605, + "learning_rate": 0.0001, + "loss": 1.1106, + "loss/crossentropy": 2.5671305656433105, + "loss/hidden": 0.796875, + "loss/logits": 0.15805330872535706, + "loss/reg": 0.015566708520054817, + "step": 1344 + }, + { + "epoch": 0.168125, + "grad_norm": 3.79781174659729, + "grad_norm_var": 0.2153835126984556, + "learning_rate": 0.0001, + "loss": 1.2096, + "loss/crossentropy": 2.341862440109253, + "loss/hidden": 0.90234375, + "loss/logits": 0.15165603160858154, + "loss/reg": 0.015558542683720589, + "step": 1345 + }, + { + "epoch": 0.16825, + "grad_norm": 4.044692039489746, + "grad_norm_var": 0.2770521554047564, + "learning_rate": 0.0001, + "loss": 1.2426, + "loss/crossentropy": 2.053048849105835, + "loss/hidden": 0.96484375, + "loss/logits": 0.1222817674279213, + "loss/reg": 0.015550050884485245, + "step": 1346 + }, + { + "epoch": 0.168375, + "grad_norm": 2.795452117919922, + "grad_norm_var": 0.27495421257927977, + "learning_rate": 0.0001, + "loss": 1.1152, + "loss/crossentropy": 2.381235361099243, + "loss/hidden": 0.796875, + "loss/logits": 0.16290614008903503, + "loss/reg": 0.015541622415184975, + "step": 1347 + }, + { + "epoch": 0.1685, + "grad_norm": 2.5820398330688477, + "grad_norm_var": 0.27883561860861783, + "learning_rate": 0.0001, + "loss": 0.9562, + "loss/crossentropy": 2.6334848403930664, + "loss/hidden": 0.6796875, + "loss/logits": 0.12118068337440491, + "loss/reg": 0.01553329173475504, + "step": 1348 + }, + { + "epoch": 0.168625, + "grad_norm": 3.5735538005828857, + "grad_norm_var": 0.29189209249198955, + "learning_rate": 0.0001, + "loss": 1.1534, + "loss/crossentropy": 2.587275505065918, + "loss/hidden": 0.8046875, + "loss/logits": 0.1934923231601715, + "loss/reg": 0.01552544254809618, + "step": 1349 + }, + { + "epoch": 0.16875, + "grad_norm": 2.8559420108795166, + "grad_norm_var": 0.23804927371853257, + "learning_rate": 0.0001, + "loss": 1.1208, + "loss/crossentropy": 2.4297351837158203, + "loss/hidden": 0.81640625, + "loss/logits": 0.14923590421676636, + "loss/reg": 0.01551780290901661, + "step": 1350 + }, + { + "epoch": 0.168875, + "grad_norm": 2.5194828510284424, + "grad_norm_var": 0.25071932648220735, + "learning_rate": 0.0001, + "loss": 1.0241, + "loss/crossentropy": 2.4934616088867188, + "loss/hidden": 0.74609375, + "loss/logits": 0.12285730242729187, + "loss/reg": 0.015510031953454018, + "step": 1351 + }, + { + "epoch": 0.169, + "grad_norm": 2.714785575866699, + "grad_norm_var": 0.2559699884583568, + "learning_rate": 0.0001, + "loss": 0.9928, + "loss/crossentropy": 2.7447316646575928, + "loss/hidden": 0.703125, + "loss/logits": 0.13469095528125763, + "loss/reg": 0.015501690097153187, + "step": 1352 + }, + { + "epoch": 0.169125, + "grad_norm": 2.226409912109375, + "grad_norm_var": 0.2523278002535238, + "learning_rate": 0.0001, + "loss": 1.0071, + "loss/crossentropy": 2.3574349880218506, + "loss/hidden": 0.72265625, + "loss/logits": 0.1294814646244049, + "loss/reg": 0.015494225546717644, + "step": 1353 + }, + { + "epoch": 0.16925, + "grad_norm": 2.2927169799804688, + "grad_norm_var": 0.27320470544451986, + "learning_rate": 0.0001, + "loss": 0.9263, + "loss/crossentropy": 2.6081109046936035, + "loss/hidden": 0.66015625, + "loss/logits": 0.11125902831554413, + "loss/reg": 0.015486053191125393, + "step": 1354 + }, + { + "epoch": 0.169375, + "grad_norm": 2.4285078048706055, + "grad_norm_var": 0.27357804892559606, + "learning_rate": 0.0001, + "loss": 1.0351, + "loss/crossentropy": 2.654952049255371, + "loss/hidden": 0.75, + "loss/logits": 0.13033051788806915, + "loss/reg": 0.015478008426725864, + "step": 1355 + }, + { + "epoch": 0.1695, + "grad_norm": 2.5576510429382324, + "grad_norm_var": 0.27661218725713915, + "learning_rate": 0.0001, + "loss": 0.9944, + "loss/crossentropy": 2.525481939315796, + "loss/hidden": 0.6953125, + "loss/logits": 0.14441752433776855, + "loss/reg": 0.015470432117581367, + "step": 1356 + }, + { + "epoch": 0.169625, + "grad_norm": 3.3350884914398193, + "grad_norm_var": 0.2913103816812319, + "learning_rate": 0.0001, + "loss": 1.0548, + "loss/crossentropy": 2.649492025375366, + "loss/hidden": 0.75, + "loss/logits": 0.15016667544841766, + "loss/reg": 0.015463395975530148, + "step": 1357 + }, + { + "epoch": 0.16975, + "grad_norm": 39.05031967163086, + "grad_norm_var": 82.07282531411327, + "learning_rate": 0.0001, + "loss": 1.0599, + "loss/crossentropy": 2.447631359100342, + "loss/hidden": 0.796875, + "loss/logits": 0.1085047721862793, + "loss/reg": 0.015455886721611023, + "step": 1358 + }, + { + "epoch": 0.169875, + "grad_norm": 2.6993017196655273, + "grad_norm_var": 82.14524851838627, + "learning_rate": 0.0001, + "loss": 0.9833, + "loss/crossentropy": 2.4683852195739746, + "loss/hidden": 0.70703125, + "loss/logits": 0.12176641821861267, + "loss/reg": 0.015447665005922318, + "step": 1359 + }, + { + "epoch": 0.17, + "grad_norm": 2.8238816261291504, + "grad_norm_var": 82.03774119861214, + "learning_rate": 0.0001, + "loss": 0.9911, + "loss/crossentropy": 2.6632418632507324, + "loss/hidden": 0.71875, + "loss/logits": 0.11792933940887451, + "loss/reg": 0.015439565293490887, + "step": 1360 + }, + { + "epoch": 0.170125, + "grad_norm": 3.259418249130249, + "grad_norm_var": 82.15246657395147, + "learning_rate": 0.0001, + "loss": 1.1184, + "loss/crossentropy": 2.7344815731048584, + "loss/hidden": 0.8046875, + "loss/logits": 0.15935808420181274, + "loss/reg": 0.015431756153702736, + "step": 1361 + }, + { + "epoch": 0.17025, + "grad_norm": 3.7520382404327393, + "grad_norm_var": 82.19938647618869, + "learning_rate": 0.0001, + "loss": 0.9883, + "loss/crossentropy": 2.7563743591308594, + "loss/hidden": 0.6953125, + "loss/logits": 0.13872626423835754, + "loss/reg": 0.015424099750816822, + "step": 1362 + }, + { + "epoch": 0.170375, + "grad_norm": 2.766862154006958, + "grad_norm_var": 82.20819070334429, + "learning_rate": 0.0001, + "loss": 0.9816, + "loss/crossentropy": 2.5083751678466797, + "loss/hidden": 0.703125, + "loss/logits": 0.12434659898281097, + "loss/reg": 0.01541648618876934, + "step": 1363 + }, + { + "epoch": 0.1705, + "grad_norm": 4.0485734939575195, + "grad_norm_var": 81.85223413984265, + "learning_rate": 0.0001, + "loss": 1.2789, + "loss/crossentropy": 2.2484068870544434, + "loss/hidden": 0.9296875, + "loss/logits": 0.19513945281505585, + "loss/reg": 0.015408649109303951, + "step": 1364 + }, + { + "epoch": 0.170625, + "grad_norm": 3.177393913269043, + "grad_norm_var": 81.94697865555716, + "learning_rate": 0.0001, + "loss": 1.0776, + "loss/crossentropy": 2.5238068103790283, + "loss/hidden": 0.796875, + "loss/logits": 0.1267687976360321, + "loss/reg": 0.01540052518248558, + "step": 1365 + }, + { + "epoch": 0.17075, + "grad_norm": 8.407809257507324, + "grad_norm_var": 82.17024249924071, + "learning_rate": 0.0001, + "loss": 1.3745, + "loss/crossentropy": 2.6552045345306396, + "loss/hidden": 1.0, + "loss/logits": 0.2205539047718048, + "loss/reg": 0.01539271418005228, + "step": 1366 + }, + { + "epoch": 0.170875, + "grad_norm": 3.5715489387512207, + "grad_norm_var": 81.82079857540536, + "learning_rate": 0.0001, + "loss": 1.1968, + "loss/crossentropy": 2.3991222381591797, + "loss/hidden": 0.88671875, + "loss/logits": 0.1562199890613556, + "loss/reg": 0.01538482028990984, + "step": 1367 + }, + { + "epoch": 0.171, + "grad_norm": 3.431995153427124, + "grad_norm_var": 81.57995561859133, + "learning_rate": 0.0001, + "loss": 1.1711, + "loss/crossentropy": 2.475353479385376, + "loss/hidden": 0.8515625, + "loss/logits": 0.16577771306037903, + "loss/reg": 0.015377094969153404, + "step": 1368 + }, + { + "epoch": 0.171125, + "grad_norm": 2.35178279876709, + "grad_norm_var": 81.52430399273271, + "learning_rate": 0.0001, + "loss": 0.9426, + "loss/crossentropy": 2.6334617137908936, + "loss/hidden": 0.67578125, + "loss/logits": 0.113157257437706, + "loss/reg": 0.015369528904557228, + "step": 1369 + }, + { + "epoch": 0.17125, + "grad_norm": 2.277183771133423, + "grad_norm_var": 81.53121470659494, + "learning_rate": 0.0001, + "loss": 1.0905, + "loss/crossentropy": 2.4363012313842773, + "loss/hidden": 0.796875, + "loss/logits": 0.14004433155059814, + "loss/reg": 0.015361781232059002, + "step": 1370 + }, + { + "epoch": 0.171375, + "grad_norm": 2.6015970706939697, + "grad_norm_var": 81.45940420807128, + "learning_rate": 0.0001, + "loss": 1.1042, + "loss/crossentropy": 2.4151899814605713, + "loss/hidden": 0.80859375, + "loss/logits": 0.14207510650157928, + "loss/reg": 0.015353736467659473, + "step": 1371 + }, + { + "epoch": 0.1715, + "grad_norm": 2.7182395458221436, + "grad_norm_var": 81.39518805728484, + "learning_rate": 0.0001, + "loss": 1.0125, + "loss/crossentropy": 2.546245813369751, + "loss/hidden": 0.734375, + "loss/logits": 0.12465515732765198, + "loss/reg": 0.015345688909292221, + "step": 1372 + }, + { + "epoch": 0.171625, + "grad_norm": 2.4946951866149902, + "grad_norm_var": 81.69783167090145, + "learning_rate": 0.0001, + "loss": 1.0365, + "loss/crossentropy": 2.4872238636016846, + "loss/hidden": 0.76171875, + "loss/logits": 0.121379055082798, + "loss/reg": 0.015337930992245674, + "step": 1373 + }, + { + "epoch": 0.17175, + "grad_norm": 2.35099720954895, + "grad_norm_var": 2.1436230027036345, + "learning_rate": 0.0001, + "loss": 1.0129, + "loss/crossentropy": 2.5447275638580322, + "loss/hidden": 0.73046875, + "loss/logits": 0.12912125885486603, + "loss/reg": 0.015329563058912754, + "step": 1374 + }, + { + "epoch": 0.171875, + "grad_norm": 3.531275987625122, + "grad_norm_var": 2.120711236961161, + "learning_rate": 0.0001, + "loss": 1.1698, + "loss/crossentropy": 2.6748950481414795, + "loss/hidden": 0.88671875, + "loss/logits": 0.1298878937959671, + "loss/reg": 0.015320966020226479, + "step": 1375 + }, + { + "epoch": 0.172, + "grad_norm": 2.5129640102386475, + "grad_norm_var": 2.1484737670750205, + "learning_rate": 0.0001, + "loss": 0.9549, + "loss/crossentropy": 2.4481074810028076, + "loss/hidden": 0.67578125, + "loss/logits": 0.1260184943675995, + "loss/reg": 0.015312742441892624, + "step": 1376 + }, + { + "epoch": 0.172125, + "grad_norm": 2.2456185817718506, + "grad_norm_var": 2.222034906196358, + "learning_rate": 0.0001, + "loss": 0.9583, + "loss/crossentropy": 2.697751760482788, + "loss/hidden": 0.6875, + "loss/logits": 0.11780545860528946, + "loss/reg": 0.01530410535633564, + "step": 1377 + }, + { + "epoch": 0.17225, + "grad_norm": 3.371694564819336, + "grad_norm_var": 2.206379139706632, + "learning_rate": 0.0001, + "loss": 1.0815, + "loss/crossentropy": 2.5198638439178467, + "loss/hidden": 0.77734375, + "loss/logits": 0.1512106955051422, + "loss/reg": 0.015296036377549171, + "step": 1378 + }, + { + "epoch": 0.172375, + "grad_norm": 3.3469138145446777, + "grad_norm_var": 2.190717504538292, + "learning_rate": 0.0001, + "loss": 1.1007, + "loss/crossentropy": 2.500404119491577, + "loss/hidden": 0.84375, + "loss/logits": 0.10408136248588562, + "loss/reg": 0.015288051217794418, + "step": 1379 + }, + { + "epoch": 0.1725, + "grad_norm": 2.7227180004119873, + "grad_norm_var": 2.1642779969536603, + "learning_rate": 0.0001, + "loss": 1.0844, + "loss/crossentropy": 2.3428802490234375, + "loss/hidden": 0.79296875, + "loss/logits": 0.1386614292860031, + "loss/reg": 0.015280190855264664, + "step": 1380 + }, + { + "epoch": 0.172625, + "grad_norm": 2.9242119789123535, + "grad_norm_var": 2.1688668977830954, + "learning_rate": 0.0001, + "loss": 1.239, + "loss/crossentropy": 2.668134927749634, + "loss/hidden": 0.88671875, + "loss/logits": 0.19956323504447937, + "loss/reg": 0.015271900221705437, + "step": 1381 + }, + { + "epoch": 0.17275, + "grad_norm": 2.492800712585449, + "grad_norm_var": 0.23164549730800724, + "learning_rate": 0.0001, + "loss": 0.9934, + "loss/crossentropy": 2.5607941150665283, + "loss/hidden": 0.71484375, + "loss/logits": 0.12592297792434692, + "loss/reg": 0.01526356115937233, + "step": 1382 + }, + { + "epoch": 0.172875, + "grad_norm": 3.9195237159729004, + "grad_norm_var": 0.2745866186604149, + "learning_rate": 0.0001, + "loss": 1.2783, + "loss/crossentropy": 2.569718837738037, + "loss/hidden": 0.9453125, + "loss/logits": 0.18045136332511902, + "loss/reg": 0.015255567617714405, + "step": 1383 + }, + { + "epoch": 0.173, + "grad_norm": 3.3682620525360107, + "grad_norm_var": 0.2697324337180568, + "learning_rate": 0.0001, + "loss": 1.0247, + "loss/crossentropy": 2.756709337234497, + "loss/hidden": 0.7421875, + "loss/logits": 0.1299922913312912, + "loss/reg": 0.015247280709445477, + "step": 1384 + }, + { + "epoch": 0.173125, + "grad_norm": 75.24736022949219, + "grad_norm_var": 327.7621509720236, + "learning_rate": 0.0001, + "loss": 1.1005, + "loss/crossentropy": 2.492774248123169, + "loss/hidden": 0.828125, + "loss/logits": 0.11999952048063278, + "loss/reg": 0.015239723026752472, + "step": 1385 + }, + { + "epoch": 0.17325, + "grad_norm": 2.639838218688965, + "grad_norm_var": 327.5234904743987, + "learning_rate": 0.0001, + "loss": 1.123, + "loss/crossentropy": 2.783590793609619, + "loss/hidden": 0.81640625, + "loss/logits": 0.15431414544582367, + "loss/reg": 0.015231656841933727, + "step": 1386 + }, + { + "epoch": 0.173375, + "grad_norm": 8.296279907226562, + "grad_norm_var": 325.9027345524763, + "learning_rate": 0.0001, + "loss": 1.0279, + "loss/crossentropy": 2.591702938079834, + "loss/hidden": 0.75390625, + "loss/logits": 0.12174837291240692, + "loss/reg": 0.01522404421120882, + "step": 1387 + }, + { + "epoch": 0.1735, + "grad_norm": 5.515318393707275, + "grad_norm_var": 324.5108738623049, + "learning_rate": 0.0001, + "loss": 1.2457, + "loss/crossentropy": 2.321329355239868, + "loss/hidden": 0.9453125, + "loss/logits": 0.14820685982704163, + "loss/reg": 0.01521630771458149, + "step": 1388 + }, + { + "epoch": 0.173625, + "grad_norm": 2.965728759765625, + "grad_norm_var": 324.1829850455801, + "learning_rate": 0.0001, + "loss": 1.0633, + "loss/crossentropy": 2.864595413208008, + "loss/hidden": 0.76171875, + "loss/logits": 0.14945918321609497, + "loss/reg": 0.015208802185952663, + "step": 1389 + }, + { + "epoch": 0.17375, + "grad_norm": 2.8366498947143555, + "grad_norm_var": 323.8341522332258, + "learning_rate": 0.0001, + "loss": 1.178, + "loss/crossentropy": 2.634251594543457, + "loss/hidden": 0.8671875, + "loss/logits": 0.1587602198123932, + "loss/reg": 0.015202116221189499, + "step": 1390 + }, + { + "epoch": 0.173875, + "grad_norm": 3.016205310821533, + "grad_norm_var": 324.1573581089945, + "learning_rate": 0.0001, + "loss": 1.0881, + "loss/crossentropy": 2.1916353702545166, + "loss/hidden": 0.79296875, + "loss/logits": 0.14321959018707275, + "loss/reg": 0.015194511041045189, + "step": 1391 + }, + { + "epoch": 0.174, + "grad_norm": 34.42859649658203, + "grad_norm_var": 364.624406562687, + "learning_rate": 0.0001, + "loss": 1.0421, + "loss/crossentropy": 2.4397382736206055, + "loss/hidden": 0.76953125, + "loss/logits": 0.12064860761165619, + "loss/reg": 0.015187704935669899, + "step": 1392 + }, + { + "epoch": 0.174125, + "grad_norm": 3.928023099899292, + "grad_norm_var": 363.0711295434095, + "learning_rate": 0.0001, + "loss": 1.2257, + "loss/crossentropy": 2.1384785175323486, + "loss/hidden": 0.921875, + "loss/logits": 0.15198791027069092, + "loss/reg": 0.015179669484496117, + "step": 1393 + }, + { + "epoch": 0.17425, + "grad_norm": 5.9230194091796875, + "grad_norm_var": 361.2014745642014, + "learning_rate": 0.0001, + "loss": 1.2374, + "loss/crossentropy": 2.474677801132202, + "loss/hidden": 0.9453125, + "loss/logits": 0.14034368097782135, + "loss/reg": 0.015171775594353676, + "step": 1394 + }, + { + "epoch": 0.174375, + "grad_norm": 5.671228885650635, + "grad_norm_var": 359.4081015077895, + "learning_rate": 0.0001, + "loss": 1.6994, + "loss/crossentropy": 2.1997835636138916, + "loss/hidden": 1.34375, + "loss/logits": 0.20398974418640137, + "loss/reg": 0.015164447948336601, + "step": 1395 + }, + { + "epoch": 0.1745, + "grad_norm": 2.8046653270721436, + "grad_norm_var": 359.32498119248385, + "learning_rate": 0.0001, + "loss": 1.2227, + "loss/crossentropy": 2.2905519008636475, + "loss/hidden": 0.92578125, + "loss/logits": 0.14532050490379333, + "loss/reg": 0.015156446024775505, + "step": 1396 + }, + { + "epoch": 0.174625, + "grad_norm": 3.8924100399017334, + "grad_norm_var": 358.42190384848453, + "learning_rate": 0.0001, + "loss": 1.2919, + "loss/crossentropy": 2.5372776985168457, + "loss/hidden": 0.89453125, + "loss/logits": 0.2458799183368683, + "loss/reg": 0.015149053186178207, + "step": 1397 + }, + { + "epoch": 0.17475, + "grad_norm": 3.204949378967285, + "grad_norm_var": 357.6995478125139, + "learning_rate": 0.0001, + "loss": 1.2475, + "loss/crossentropy": 2.3744595050811768, + "loss/hidden": 0.9296875, + "loss/logits": 0.16641047596931458, + "loss/reg": 0.015141867101192474, + "step": 1398 + }, + { + "epoch": 0.174875, + "grad_norm": 2.5903260707855225, + "grad_norm_var": 358.97241696361874, + "learning_rate": 0.0001, + "loss": 1.079, + "loss/crossentropy": 2.613074541091919, + "loss/hidden": 0.7890625, + "loss/logits": 0.1385442167520523, + "loss/reg": 0.015134819783270359, + "step": 1399 + }, + { + "epoch": 0.175, + "grad_norm": 2.877967357635498, + "grad_norm_var": 359.4468337869737, + "learning_rate": 0.0001, + "loss": 1.0731, + "loss/crossentropy": 2.320415735244751, + "loss/hidden": 0.79296875, + "loss/logits": 0.12884414196014404, + "loss/reg": 0.015128130093216896, + "step": 1400 + }, + { + "epoch": 0.175125, + "grad_norm": 2.7650208473205566, + "grad_norm_var": 60.75819602934475, + "learning_rate": 0.0001, + "loss": 1.2011, + "loss/crossentropy": 2.242631673812866, + "loss/hidden": 0.8828125, + "loss/logits": 0.1670370101928711, + "loss/reg": 0.015121539123356342, + "step": 1401 + }, + { + "epoch": 0.17525, + "grad_norm": 3.245616912841797, + "grad_norm_var": 60.52307577230558, + "learning_rate": 0.0001, + "loss": 1.2625, + "loss/crossentropy": 2.2215981483459473, + "loss/hidden": 0.9453125, + "loss/logits": 0.16608265042304993, + "loss/reg": 0.015114927664399147, + "step": 1402 + }, + { + "epoch": 0.175375, + "grad_norm": 62.871517181396484, + "grad_norm_var": 264.3128112734296, + "learning_rate": 0.0001, + "loss": 3.0131, + "loss/crossentropy": 2.5125503540039062, + "loss/hidden": 2.4375, + "loss/logits": 0.42451030015945435, + "loss/reg": 0.015108034946024418, + "step": 1403 + }, + { + "epoch": 0.1755, + "grad_norm": 3.219738006591797, + "grad_norm_var": 265.7955458129954, + "learning_rate": 0.0001, + "loss": 1.2222, + "loss/crossentropy": 2.3459861278533936, + "loss/hidden": 0.91015625, + "loss/logits": 0.16100311279296875, + "loss/reg": 0.015101352706551552, + "step": 1404 + }, + { + "epoch": 0.175625, + "grad_norm": 2.7814700603485107, + "grad_norm_var": 265.9493587458945, + "learning_rate": 0.0001, + "loss": 1.0905, + "loss/crossentropy": 2.5293545722961426, + "loss/hidden": 0.80078125, + "loss/logits": 0.1387513130903244, + "loss/reg": 0.01509346254169941, + "step": 1405 + }, + { + "epoch": 0.17575, + "grad_norm": 2.6544785499572754, + "grad_norm_var": 266.1042610002773, + "learning_rate": 0.0001, + "loss": 1.0535, + "loss/crossentropy": 2.5097312927246094, + "loss/hidden": 0.7734375, + "loss/logits": 0.1291828155517578, + "loss/reg": 0.015085602179169655, + "step": 1406 + }, + { + "epoch": 0.175875, + "grad_norm": 3.1223692893981934, + "grad_norm_var": 266.0186046129393, + "learning_rate": 0.0001, + "loss": 1.4849, + "loss/crossentropy": 2.2004942893981934, + "loss/hidden": 1.140625, + "loss/logits": 0.1934625506401062, + "loss/reg": 0.015078413300216198, + "step": 1407 + }, + { + "epoch": 0.176, + "grad_norm": 4.406020164489746, + "grad_norm_var": 221.05808913641562, + "learning_rate": 0.0001, + "loss": 1.2206, + "loss/crossentropy": 2.239046096801758, + "loss/hidden": 0.90625, + "loss/logits": 0.1636815071105957, + "loss/reg": 0.015070472843945026, + "step": 1408 + }, + { + "epoch": 0.176125, + "grad_norm": 2.3968393802642822, + "grad_norm_var": 221.88230400943462, + "learning_rate": 0.0001, + "loss": 1.0135, + "loss/crossentropy": 2.5016496181488037, + "loss/hidden": 0.73046875, + "loss/logits": 0.13240215182304382, + "loss/reg": 0.015062601305544376, + "step": 1409 + }, + { + "epoch": 0.17625, + "grad_norm": 4.053096771240234, + "grad_norm_var": 222.40718733745135, + "learning_rate": 0.0001, + "loss": 1.3875, + "loss/crossentropy": 2.5696771144866943, + "loss/hidden": 1.046875, + "loss/logits": 0.19007228314876556, + "loss/reg": 0.015055070631206036, + "step": 1410 + }, + { + "epoch": 0.176375, + "grad_norm": 3.213609218597412, + "grad_norm_var": 223.23151802105554, + "learning_rate": 0.0001, + "loss": 1.101, + "loss/crossentropy": 2.580714225769043, + "loss/hidden": 0.81640625, + "loss/logits": 0.13412834703922272, + "loss/reg": 0.01504768431186676, + "step": 1411 + }, + { + "epoch": 0.1765, + "grad_norm": 7.05836820602417, + "grad_norm_var": 222.05031160271446, + "learning_rate": 0.0001, + "loss": 1.3444, + "loss/crossentropy": 2.6128039360046387, + "loss/hidden": 1.046875, + "loss/logits": 0.14714105427265167, + "loss/reg": 0.015039726160466671, + "step": 1412 + }, + { + "epoch": 0.176625, + "grad_norm": 2.905400037765503, + "grad_norm_var": 222.53952156242244, + "learning_rate": 0.0001, + "loss": 1.0393, + "loss/crossentropy": 2.390307664871216, + "loss/hidden": 0.75390625, + "loss/logits": 0.13505280017852783, + "loss/reg": 0.015031633898615837, + "step": 1413 + }, + { + "epoch": 0.17675, + "grad_norm": 2.7498996257781982, + "grad_norm_var": 222.7879046702847, + "learning_rate": 0.0001, + "loss": 1.1323, + "loss/crossentropy": 2.4936277866363525, + "loss/hidden": 0.828125, + "loss/logits": 0.15391477942466736, + "loss/reg": 0.015023862943053246, + "step": 1414 + }, + { + "epoch": 0.176875, + "grad_norm": 3.2711455821990967, + "grad_norm_var": 222.41140935738122, + "learning_rate": 0.0001, + "loss": 1.159, + "loss/crossentropy": 2.586298704147339, + "loss/hidden": 0.8515625, + "loss/logits": 0.15724779665470123, + "loss/reg": 0.015016157180070877, + "step": 1415 + }, + { + "epoch": 0.177, + "grad_norm": 2.5050208568573, + "grad_norm_var": 222.63002493426723, + "learning_rate": 0.0001, + "loss": 0.9875, + "loss/crossentropy": 2.4593968391418457, + "loss/hidden": 0.71484375, + "loss/logits": 0.12256527692079544, + "loss/reg": 0.015008063055574894, + "step": 1416 + }, + { + "epoch": 0.177125, + "grad_norm": 5.0899248123168945, + "grad_norm_var": 221.63143029624337, + "learning_rate": 0.0001, + "loss": 1.3226, + "loss/crossentropy": 2.529581069946289, + "loss/hidden": 0.9921875, + "loss/logits": 0.18042267858982086, + "loss/reg": 0.014999698847532272, + "step": 1417 + }, + { + "epoch": 0.17725, + "grad_norm": 2.873887300491333, + "grad_norm_var": 221.83712878589319, + "learning_rate": 0.0001, + "loss": 1.1234, + "loss/crossentropy": 2.637040376663208, + "loss/hidden": 0.81640625, + "loss/logits": 0.1570313721895218, + "loss/reg": 0.014991391450166702, + "step": 1418 + }, + { + "epoch": 0.177375, + "grad_norm": 2.4938929080963135, + "grad_norm_var": 1.4893372742809399, + "learning_rate": 0.0001, + "loss": 1.0818, + "loss/crossentropy": 2.3521924018859863, + "loss/hidden": 0.7890625, + "loss/logits": 0.14290779829025269, + "loss/reg": 0.01498338021337986, + "step": 1419 + }, + { + "epoch": 0.1775, + "grad_norm": 7.30941104888916, + "grad_norm_var": 2.4229140389199517, + "learning_rate": 0.0001, + "loss": 1.2235, + "loss/crossentropy": 2.295588254928589, + "loss/hidden": 0.9140625, + "loss/logits": 0.15963850915431976, + "loss/reg": 0.014975651167333126, + "step": 1420 + }, + { + "epoch": 0.177625, + "grad_norm": 2.976419448852539, + "grad_norm_var": 2.4019258000462114, + "learning_rate": 0.0001, + "loss": 1.1336, + "loss/crossentropy": 2.398700714111328, + "loss/hidden": 0.8125, + "loss/logits": 0.17146307229995728, + "loss/reg": 0.014968100003898144, + "step": 1421 + }, + { + "epoch": 0.17775, + "grad_norm": 6.012148380279541, + "grad_norm_var": 2.641842426821879, + "learning_rate": 0.0001, + "loss": 1.4281, + "loss/crossentropy": 2.3960044384002686, + "loss/hidden": 1.0625, + "loss/logits": 0.21602004766464233, + "loss/reg": 0.01496051624417305, + "step": 1422 + }, + { + "epoch": 0.177875, + "grad_norm": 3.721160888671875, + "grad_norm_var": 2.601979835113735, + "learning_rate": 0.0001, + "loss": 1.4417, + "loss/crossentropy": 2.231612205505371, + "loss/hidden": 1.078125, + "loss/logits": 0.21401020884513855, + "loss/reg": 0.01495243888348341, + "step": 1423 + }, + { + "epoch": 0.178, + "grad_norm": 2.6747055053710938, + "grad_norm_var": 2.68168930149086, + "learning_rate": 0.0001, + "loss": 1.1718, + "loss/crossentropy": 2.2853996753692627, + "loss/hidden": 0.87109375, + "loss/logits": 0.15127936005592346, + "loss/reg": 0.014944901689887047, + "step": 1424 + }, + { + "epoch": 0.178125, + "grad_norm": 11.191994667053223, + "grad_norm_var": 5.833885032277132, + "learning_rate": 0.0001, + "loss": 1.2679, + "loss/crossentropy": 2.5116124153137207, + "loss/hidden": 0.9296875, + "loss/logits": 0.18879596889019012, + "loss/reg": 0.014937575906515121, + "step": 1425 + }, + { + "epoch": 0.17825, + "grad_norm": 3.66359543800354, + "grad_norm_var": 5.860409413897694, + "learning_rate": 0.0001, + "loss": 1.1213, + "loss/crossentropy": 2.5135722160339355, + "loss/hidden": 0.82421875, + "loss/logits": 0.1477985680103302, + "loss/reg": 0.014929663389921188, + "step": 1426 + }, + { + "epoch": 0.178375, + "grad_norm": 7.735219478607178, + "grad_norm_var": 6.448943732227483, + "learning_rate": 0.0001, + "loss": 1.1287, + "loss/crossentropy": 2.675562620162964, + "loss/hidden": 0.81640625, + "loss/logits": 0.16307833790779114, + "loss/reg": 0.01492208894342184, + "step": 1427 + }, + { + "epoch": 0.1785, + "grad_norm": 3.957063913345337, + "grad_norm_var": 6.0498597570917205, + "learning_rate": 0.0001, + "loss": 1.247, + "loss/crossentropy": 2.4751713275909424, + "loss/hidden": 0.91015625, + "loss/logits": 0.18771307170391083, + "loss/reg": 0.014914041385054588, + "step": 1428 + }, + { + "epoch": 0.178625, + "grad_norm": 4.195860862731934, + "grad_norm_var": 5.888917428574246, + "learning_rate": 0.0001, + "loss": 1.4162, + "loss/crossentropy": 2.572279930114746, + "loss/hidden": 1.03125, + "loss/logits": 0.23586444556713104, + "loss/reg": 0.014906428754329681, + "step": 1429 + }, + { + "epoch": 0.17875, + "grad_norm": 3.2409324645996094, + "grad_norm_var": 5.787681963969308, + "learning_rate": 0.0001, + "loss": 1.0805, + "loss/crossentropy": 2.401996374130249, + "loss/hidden": 0.80078125, + "loss/logits": 0.13070350885391235, + "loss/reg": 0.014898893423378468, + "step": 1430 + }, + { + "epoch": 0.178875, + "grad_norm": 3.5550079345703125, + "grad_norm_var": 5.744049750040046, + "learning_rate": 0.0001, + "loss": 1.159, + "loss/crossentropy": 2.4993062019348145, + "loss/hidden": 0.8359375, + "loss/logits": 0.1741950362920761, + "loss/reg": 0.014890996739268303, + "step": 1431 + }, + { + "epoch": 0.179, + "grad_norm": 2.403970718383789, + "grad_norm_var": 5.772574341640287, + "learning_rate": 0.0001, + "loss": 0.9385, + "loss/crossentropy": 2.338052749633789, + "loss/hidden": 0.67578125, + "loss/logits": 0.11390648782253265, + "loss/reg": 0.014883420430123806, + "step": 1432 + }, + { + "epoch": 0.179125, + "grad_norm": 3.128432512283325, + "grad_norm_var": 5.876657514659041, + "learning_rate": 0.0001, + "loss": 1.1205, + "loss/crossentropy": 2.323333740234375, + "loss/hidden": 0.828125, + "loss/logits": 0.1436302661895752, + "loss/reg": 0.014875980094075203, + "step": 1433 + }, + { + "epoch": 0.17925, + "grad_norm": 2.75893235206604, + "grad_norm_var": 5.901577514262766, + "learning_rate": 0.0001, + "loss": 1.0911, + "loss/crossentropy": 2.403975009918213, + "loss/hidden": 0.796875, + "loss/logits": 0.14556562900543213, + "loss/reg": 0.014869259670376778, + "step": 1434 + }, + { + "epoch": 0.179375, + "grad_norm": 4.4465861320495605, + "grad_norm_var": 5.633549820228222, + "learning_rate": 0.0001, + "loss": 1.4989, + "loss/crossentropy": 2.418936014175415, + "loss/hidden": 1.109375, + "loss/logits": 0.24089978635311127, + "loss/reg": 0.014861512929201126, + "step": 1435 + }, + { + "epoch": 0.1795, + "grad_norm": 2.976722478866577, + "grad_norm_var": 5.218912579761936, + "learning_rate": 0.0001, + "loss": 1.1243, + "loss/crossentropy": 2.423041582107544, + "loss/hidden": 0.83203125, + "loss/logits": 0.14372695982456207, + "loss/reg": 0.014853725209832191, + "step": 1436 + }, + { + "epoch": 0.179625, + "grad_norm": 2.8299055099487305, + "grad_norm_var": 5.245913751427944, + "learning_rate": 0.0001, + "loss": 0.9575, + "loss/crossentropy": 2.5777714252471924, + "loss/hidden": 0.6796875, + "loss/logits": 0.1293865293264389, + "loss/reg": 0.014846443198621273, + "step": 1437 + }, + { + "epoch": 0.17975, + "grad_norm": 3.8410592079162598, + "grad_norm_var": 5.039317138416171, + "learning_rate": 0.0001, + "loss": 1.1673, + "loss/crossentropy": 2.2890384197235107, + "loss/hidden": 0.859375, + "loss/logits": 0.15949448943138123, + "loss/reg": 0.014838694594800472, + "step": 1438 + }, + { + "epoch": 0.179875, + "grad_norm": 3.1295011043548584, + "grad_norm_var": 5.094637447706395, + "learning_rate": 0.0001, + "loss": 1.0505, + "loss/crossentropy": 2.4979283809661865, + "loss/hidden": 0.77734375, + "loss/logits": 0.12488513439893723, + "loss/reg": 0.014831180684268475, + "step": 1439 + }, + { + "epoch": 0.18, + "grad_norm": 3.2620816230773926, + "grad_norm_var": 5.0039422612885565, + "learning_rate": 0.0001, + "loss": 1.0724, + "loss/crossentropy": 2.5473530292510986, + "loss/hidden": 0.8046875, + "loss/logits": 0.1195274293422699, + "loss/reg": 0.014823324047029018, + "step": 1440 + }, + { + "epoch": 0.180125, + "grad_norm": 2.8635447025299072, + "grad_norm_var": 1.5135115386307687, + "learning_rate": 0.0001, + "loss": 1.0522, + "loss/crossentropy": 2.709258794784546, + "loss/hidden": 0.76953125, + "loss/logits": 0.13453327119350433, + "loss/reg": 0.01481538638472557, + "step": 1441 + }, + { + "epoch": 0.18025, + "grad_norm": 2.694793462753296, + "grad_norm_var": 1.5670935881051355, + "learning_rate": 0.0001, + "loss": 1.042, + "loss/crossentropy": 2.6498730182647705, + "loss/hidden": 0.76171875, + "loss/logits": 0.13220643997192383, + "loss/reg": 0.01480773463845253, + "step": 1442 + }, + { + "epoch": 0.180375, + "grad_norm": 3.662919282913208, + "grad_norm_var": 0.33856051311708674, + "learning_rate": 0.0001, + "loss": 1.195, + "loss/crossentropy": 2.589240550994873, + "loss/hidden": 0.86328125, + "loss/logits": 0.18375417590141296, + "loss/reg": 0.014799892902374268, + "step": 1443 + }, + { + "epoch": 0.1805, + "grad_norm": 3.0655100345611572, + "grad_norm_var": 0.3112265539838631, + "learning_rate": 0.0001, + "loss": 1.0221, + "loss/crossentropy": 2.322566509246826, + "loss/hidden": 0.7578125, + "loss/logits": 0.11638704687356949, + "loss/reg": 0.014792241156101227, + "step": 1444 + }, + { + "epoch": 0.180625, + "grad_norm": 3.0010573863983154, + "grad_norm_var": 0.2503215727538245, + "learning_rate": 0.0001, + "loss": 1.1143, + "loss/crossentropy": 2.2516565322875977, + "loss/hidden": 0.83203125, + "loss/logits": 0.13441051542758942, + "loss/reg": 0.014784677885472775, + "step": 1445 + }, + { + "epoch": 0.18075, + "grad_norm": 2.6158626079559326, + "grad_norm_var": 0.26956362632713443, + "learning_rate": 0.0001, + "loss": 1.1134, + "loss/crossentropy": 2.476480007171631, + "loss/hidden": 0.80078125, + "loss/logits": 0.1648055911064148, + "loss/reg": 0.014777293428778648, + "step": 1446 + }, + { + "epoch": 0.180875, + "grad_norm": 2.593005895614624, + "grad_norm_var": 0.2741393520658806, + "learning_rate": 0.0001, + "loss": 1.1448, + "loss/crossentropy": 2.5278983116149902, + "loss/hidden": 0.83203125, + "loss/logits": 0.1651032269001007, + "loss/reg": 0.014769522473216057, + "step": 1447 + }, + { + "epoch": 0.181, + "grad_norm": 3.3676040172576904, + "grad_norm_var": 0.24536603446710312, + "learning_rate": 0.0001, + "loss": 1.0625, + "loss/crossentropy": 2.5347743034362793, + "loss/hidden": 0.77734375, + "loss/logits": 0.13756276667118073, + "loss/reg": 0.01476323138922453, + "step": 1448 + }, + { + "epoch": 0.181125, + "grad_norm": 2.7923572063446045, + "grad_norm_var": 0.2529365869795574, + "learning_rate": 0.0001, + "loss": 1.3644, + "loss/crossentropy": 2.110182046890259, + "loss/hidden": 1.0234375, + "loss/logits": 0.1933690905570984, + "loss/reg": 0.014755993150174618, + "step": 1449 + }, + { + "epoch": 0.18125, + "grad_norm": 70.97599792480469, + "grad_norm_var": 287.8273579394089, + "learning_rate": 0.0001, + "loss": 1.1063, + "loss/crossentropy": 2.6005754470825195, + "loss/hidden": 0.8125, + "loss/logits": 0.14632144570350647, + "loss/reg": 0.014748108573257923, + "step": 1450 + }, + { + "epoch": 0.181375, + "grad_norm": 5.7473530769348145, + "grad_norm_var": 287.42393180966496, + "learning_rate": 0.0001, + "loss": 1.6866, + "loss/crossentropy": 2.3792290687561035, + "loss/hidden": 1.2265625, + "loss/logits": 0.31258517503738403, + "loss/reg": 0.014740433543920517, + "step": 1451 + }, + { + "epoch": 0.1815, + "grad_norm": 3.4155495166778564, + "grad_norm_var": 287.17343283264796, + "learning_rate": 0.0001, + "loss": 1.1977, + "loss/crossentropy": 2.4722485542297363, + "loss/hidden": 0.8828125, + "loss/logits": 0.1675756871700287, + "loss/reg": 0.014733717776834965, + "step": 1452 + }, + { + "epoch": 0.181625, + "grad_norm": 2.5721304416656494, + "grad_norm_var": 287.3377922083848, + "learning_rate": 0.0001, + "loss": 1.0864, + "loss/crossentropy": 2.65114426612854, + "loss/hidden": 0.79296875, + "loss/logits": 0.1461598426103592, + "loss/reg": 0.01472744531929493, + "step": 1453 + }, + { + "epoch": 0.18175, + "grad_norm": 3.40112566947937, + "grad_norm_var": 287.5630487447142, + "learning_rate": 0.0001, + "loss": 1.3854, + "loss/crossentropy": 2.49849796295166, + "loss/hidden": 1.0390625, + "loss/logits": 0.19910715520381927, + "loss/reg": 0.014719628728926182, + "step": 1454 + }, + { + "epoch": 0.181875, + "grad_norm": 3.4560482501983643, + "grad_norm_var": 287.381708208898, + "learning_rate": 0.0001, + "loss": 1.2084, + "loss/crossentropy": 2.630333423614502, + "loss/hidden": 0.90625, + "loss/logits": 0.1550079882144928, + "loss/reg": 0.014712607488036156, + "step": 1455 + }, + { + "epoch": 0.182, + "grad_norm": 2.8808276653289795, + "grad_norm_var": 287.60459257620465, + "learning_rate": 0.0001, + "loss": 1.1446, + "loss/crossentropy": 2.7043960094451904, + "loss/hidden": 0.8515625, + "loss/logits": 0.14598789811134338, + "loss/reg": 0.01470634713768959, + "step": 1456 + }, + { + "epoch": 0.182125, + "grad_norm": 2.3799662590026855, + "grad_norm_var": 287.91454947447465, + "learning_rate": 0.0001, + "loss": 1.0639, + "loss/crossentropy": 2.4705262184143066, + "loss/hidden": 0.77734375, + "loss/logits": 0.13952219486236572, + "loss/reg": 0.014699905179440975, + "step": 1457 + }, + { + "epoch": 0.18225, + "grad_norm": 3.0028903484344482, + "grad_norm_var": 287.7266240160942, + "learning_rate": 0.0001, + "loss": 1.1176, + "loss/crossentropy": 2.4257349967956543, + "loss/hidden": 0.8125, + "loss/logits": 0.1581314206123352, + "loss/reg": 0.014692124910652637, + "step": 1458 + }, + { + "epoch": 0.182375, + "grad_norm": 2.5650155544281006, + "grad_norm_var": 288.3538726561922, + "learning_rate": 0.0001, + "loss": 1.0762, + "loss/crossentropy": 2.4188382625579834, + "loss/hidden": 0.7890625, + "loss/logits": 0.14031586050987244, + "loss/reg": 0.014684327878057957, + "step": 1459 + }, + { + "epoch": 0.1825, + "grad_norm": 3.0308585166931152, + "grad_norm_var": 288.3738099925176, + "learning_rate": 0.0001, + "loss": 1.2435, + "loss/crossentropy": 2.5064949989318848, + "loss/hidden": 0.9296875, + "loss/logits": 0.16703946888446808, + "loss/reg": 0.014676612801849842, + "step": 1460 + }, + { + "epoch": 0.182625, + "grad_norm": 3.903977394104004, + "grad_norm_var": 287.89971053282926, + "learning_rate": 0.0001, + "loss": 1.0425, + "loss/crossentropy": 2.5762782096862793, + "loss/hidden": 0.75390625, + "loss/logits": 0.14189936220645905, + "loss/reg": 0.014669781550765038, + "step": 1461 + }, + { + "epoch": 0.18275, + "grad_norm": 3.6422464847564697, + "grad_norm_var": 287.3082663217944, + "learning_rate": 0.0001, + "loss": 1.0475, + "loss/crossentropy": 2.6891908645629883, + "loss/hidden": 0.7734375, + "loss/logits": 0.12741170823574066, + "loss/reg": 0.0146620599552989, + "step": 1462 + }, + { + "epoch": 0.182875, + "grad_norm": 2.7266409397125244, + "grad_norm_var": 287.22225368800906, + "learning_rate": 0.0001, + "loss": 1.1596, + "loss/crossentropy": 2.4353747367858887, + "loss/hidden": 0.84765625, + "loss/logits": 0.16537800431251526, + "loss/reg": 0.014655125327408314, + "step": 1463 + }, + { + "epoch": 0.183, + "grad_norm": 8.853667259216309, + "grad_norm_var": 286.08693801368264, + "learning_rate": 0.0001, + "loss": 1.2411, + "loss/crossentropy": 2.9862468242645264, + "loss/hidden": 0.90625, + "loss/logits": 0.1883150041103363, + "loss/reg": 0.014648628421127796, + "step": 1464 + }, + { + "epoch": 0.183125, + "grad_norm": 3.3163163661956787, + "grad_norm_var": 285.7518694340515, + "learning_rate": 0.0001, + "loss": 1.1675, + "loss/crossentropy": 2.642627477645874, + "loss/hidden": 0.8671875, + "loss/logits": 0.15386328101158142, + "loss/reg": 0.014640781097114086, + "step": 1465 + }, + { + "epoch": 0.18325, + "grad_norm": 3.663252830505371, + "grad_norm_var": 2.5336251924554736, + "learning_rate": 0.0001, + "loss": 1.2303, + "loss/crossentropy": 2.353497266769409, + "loss/hidden": 0.91015625, + "loss/logits": 0.17375797033309937, + "loss/reg": 0.014634395018219948, + "step": 1466 + }, + { + "epoch": 0.183375, + "grad_norm": 3.4295380115509033, + "grad_norm_var": 2.224270864584783, + "learning_rate": 0.0001, + "loss": 1.186, + "loss/crossentropy": 2.5201594829559326, + "loss/hidden": 0.8828125, + "loss/logits": 0.1568823754787445, + "loss/reg": 0.01462656632065773, + "step": 1467 + }, + { + "epoch": 0.1835, + "grad_norm": 2.4592854976654053, + "grad_norm_var": 2.294103952189973, + "learning_rate": 0.0001, + "loss": 1.0183, + "loss/crossentropy": 2.5786993503570557, + "loss/hidden": 0.74609375, + "loss/logits": 0.1260080635547638, + "loss/reg": 0.01461972575634718, + "step": 1468 + }, + { + "epoch": 0.183625, + "grad_norm": 4.478452205657959, + "grad_norm_var": 2.296768240317753, + "learning_rate": 0.0001, + "loss": 1.1604, + "loss/crossentropy": 2.472398042678833, + "loss/hidden": 0.87890625, + "loss/logits": 0.13542431592941284, + "loss/reg": 0.014611870981752872, + "step": 1469 + }, + { + "epoch": 0.18375, + "grad_norm": 2.8013250827789307, + "grad_norm_var": 2.3331091729009654, + "learning_rate": 0.0001, + "loss": 1.2052, + "loss/crossentropy": 2.448435068130493, + "loss/hidden": 0.8671875, + "loss/logits": 0.19197094440460205, + "loss/reg": 0.014604009687900543, + "step": 1470 + }, + { + "epoch": 0.183875, + "grad_norm": 2.5477471351623535, + "grad_norm_var": 2.394463361736054, + "learning_rate": 0.0001, + "loss": 1.0754, + "loss/crossentropy": 2.4608027935028076, + "loss/hidden": 0.796875, + "loss/logits": 0.13251473009586334, + "loss/reg": 0.014596136286854744, + "step": 1471 + }, + { + "epoch": 0.184, + "grad_norm": 2.9321022033691406, + "grad_norm_var": 2.3905305167023623, + "learning_rate": 0.0001, + "loss": 0.997, + "loss/crossentropy": 2.4788243770599365, + "loss/hidden": 0.7421875, + "loss/logits": 0.10894551128149033, + "loss/reg": 0.014588245190680027, + "step": 1472 + }, + { + "epoch": 0.184125, + "grad_norm": 3.2976138591766357, + "grad_norm_var": 2.3081604420680737, + "learning_rate": 0.0001, + "loss": 1.0611, + "loss/crossentropy": 2.440394878387451, + "loss/hidden": 0.77734375, + "loss/logits": 0.13798683881759644, + "loss/reg": 0.014581209979951382, + "step": 1473 + }, + { + "epoch": 0.18425, + "grad_norm": 2.3695755004882812, + "grad_norm_var": 2.378640708203821, + "learning_rate": 0.0001, + "loss": 1.0597, + "loss/crossentropy": 2.457941770553589, + "loss/hidden": 0.77734375, + "loss/logits": 0.1365838199853897, + "loss/reg": 0.014573454856872559, + "step": 1474 + }, + { + "epoch": 0.184375, + "grad_norm": 2.909522771835327, + "grad_norm_var": 2.343060112343133, + "learning_rate": 0.0001, + "loss": 1.0217, + "loss/crossentropy": 2.291848659515381, + "loss/hidden": 0.76953125, + "loss/logits": 0.10653392970561981, + "loss/reg": 0.014565936289727688, + "step": 1475 + }, + { + "epoch": 0.1845, + "grad_norm": 4.1714019775390625, + "grad_norm_var": 2.349577274287818, + "learning_rate": 0.0001, + "loss": 1.2516, + "loss/crossentropy": 2.4249794483184814, + "loss/hidden": 0.9375, + "loss/logits": 0.16853143274784088, + "loss/reg": 0.0145582165569067, + "step": 1476 + }, + { + "epoch": 0.184625, + "grad_norm": 2.30051326751709, + "grad_norm_var": 2.4439813338223115, + "learning_rate": 0.0001, + "loss": 1.0722, + "loss/crossentropy": 2.621089220046997, + "loss/hidden": 0.7890625, + "loss/logits": 0.13760125637054443, + "loss/reg": 0.014550295658409595, + "step": 1477 + }, + { + "epoch": 0.18475, + "grad_norm": 2.7384438514709473, + "grad_norm_var": 2.4771341504323514, + "learning_rate": 0.0001, + "loss": 1.1117, + "loss/crossentropy": 2.1965439319610596, + "loss/hidden": 0.83203125, + "loss/logits": 0.1342887282371521, + "loss/reg": 0.014541985467076302, + "step": 1478 + }, + { + "epoch": 0.184875, + "grad_norm": 37.360321044921875, + "grad_norm_var": 74.16407744545192, + "learning_rate": 0.0001, + "loss": 0.9989, + "loss/crossentropy": 2.9801506996154785, + "loss/hidden": 0.75, + "loss/logits": 0.10352309793233871, + "loss/reg": 0.014533448964357376, + "step": 1479 + }, + { + "epoch": 0.185, + "grad_norm": 3.2833774089813232, + "grad_norm_var": 73.68816936181202, + "learning_rate": 0.0001, + "loss": 1.3522, + "loss/crossentropy": 2.354642868041992, + "loss/hidden": 1.0078125, + "loss/logits": 0.19911861419677734, + "loss/reg": 0.014524821192026138, + "step": 1480 + }, + { + "epoch": 0.185125, + "grad_norm": 2.9734416007995605, + "grad_norm_var": 73.78408654274459, + "learning_rate": 0.0001, + "loss": 1.1487, + "loss/crossentropy": 2.5445892810821533, + "loss/hidden": 0.84375, + "loss/logits": 0.1597684621810913, + "loss/reg": 0.014516538009047508, + "step": 1481 + }, + { + "epoch": 0.18525, + "grad_norm": 3.2212953567504883, + "grad_norm_var": 73.88875146417945, + "learning_rate": 0.0001, + "loss": 1.1972, + "loss/crossentropy": 2.4549801349639893, + "loss/hidden": 0.875, + "loss/logits": 0.17715027928352356, + "loss/reg": 0.014508497901260853, + "step": 1482 + }, + { + "epoch": 0.185375, + "grad_norm": 3.029637098312378, + "grad_norm_var": 73.9933942207774, + "learning_rate": 0.0001, + "loss": 1.0726, + "loss/crossentropy": 2.6153388023376465, + "loss/hidden": 0.796875, + "loss/logits": 0.13069182634353638, + "loss/reg": 0.014500816352665424, + "step": 1483 + }, + { + "epoch": 0.1855, + "grad_norm": 3.7203376293182373, + "grad_norm_var": 73.63538575655532, + "learning_rate": 0.0001, + "loss": 1.1901, + "loss/crossentropy": 2.437302350997925, + "loss/hidden": 0.8828125, + "loss/logits": 0.16235879063606262, + "loss/reg": 0.014492850750684738, + "step": 1484 + }, + { + "epoch": 0.185625, + "grad_norm": 2.606203079223633, + "grad_norm_var": 74.04917997908026, + "learning_rate": 0.0001, + "loss": 0.9748, + "loss/crossentropy": 2.3537402153015137, + "loss/hidden": 0.71484375, + "loss/logits": 0.11513984948396683, + "loss/reg": 0.014484620653092861, + "step": 1485 + }, + { + "epoch": 0.18575, + "grad_norm": 3.775780200958252, + "grad_norm_var": 73.80448419578045, + "learning_rate": 0.0001, + "loss": 1.1872, + "loss/crossentropy": 2.6341981887817383, + "loss/hidden": 0.890625, + "loss/logits": 0.1517730951309204, + "loss/reg": 0.014477075077593327, + "step": 1486 + }, + { + "epoch": 0.185875, + "grad_norm": 3.1672568321228027, + "grad_norm_var": 73.60919906004487, + "learning_rate": 0.0001, + "loss": 1.0533, + "loss/crossentropy": 2.3641412258148193, + "loss/hidden": 0.78515625, + "loss/logits": 0.12350372225046158, + "loss/reg": 0.014468939043581486, + "step": 1487 + }, + { + "epoch": 0.186, + "grad_norm": 2.9723494052886963, + "grad_norm_var": 73.59690980017169, + "learning_rate": 0.0001, + "loss": 1.1189, + "loss/crossentropy": 2.4746789932250977, + "loss/hidden": 0.81640625, + "loss/logits": 0.1578400433063507, + "loss/reg": 0.014461321756243706, + "step": 1488 + }, + { + "epoch": 0.186125, + "grad_norm": 4.781861782073975, + "grad_norm_var": 73.34949321986252, + "learning_rate": 0.0001, + "loss": 1.0247, + "loss/crossentropy": 2.6176137924194336, + "loss/hidden": 0.76171875, + "loss/logits": 0.11844634264707565, + "loss/reg": 0.014453292824327946, + "step": 1489 + }, + { + "epoch": 0.18625, + "grad_norm": 3.498753786087036, + "grad_norm_var": 72.98251711179161, + "learning_rate": 0.0001, + "loss": 1.0433, + "loss/crossentropy": 2.575167417526245, + "loss/hidden": 0.76171875, + "loss/logits": 0.1371680647134781, + "loss/reg": 0.014445292763411999, + "step": 1490 + }, + { + "epoch": 0.186375, + "grad_norm": 3.3169898986816406, + "grad_norm_var": 72.85721374014727, + "learning_rate": 0.0001, + "loss": 1.2038, + "loss/crossentropy": 2.556975841522217, + "loss/hidden": 0.8984375, + "loss/logits": 0.1610018014907837, + "loss/reg": 0.01443762518465519, + "step": 1491 + }, + { + "epoch": 0.1865, + "grad_norm": 3.731511354446411, + "grad_norm_var": 72.9432662884783, + "learning_rate": 0.0001, + "loss": 0.967, + "loss/crossentropy": 2.7008392810821533, + "loss/hidden": 0.6953125, + "loss/logits": 0.12734538316726685, + "loss/reg": 0.014430060982704163, + "step": 1492 + }, + { + "epoch": 0.186625, + "grad_norm": 2.6620421409606934, + "grad_norm_var": 72.80179282549148, + "learning_rate": 0.0001, + "loss": 1.0512, + "loss/crossentropy": 2.7235186100006104, + "loss/hidden": 0.78125, + "loss/logits": 0.1257205456495285, + "loss/reg": 0.014422285370528698, + "step": 1493 + }, + { + "epoch": 0.18675, + "grad_norm": 3.3065989017486572, + "grad_norm_var": 72.61826294021522, + "learning_rate": 0.0001, + "loss": 1.1709, + "loss/crossentropy": 2.5472517013549805, + "loss/hidden": 0.87109375, + "loss/logits": 0.15567487478256226, + "loss/reg": 0.014414286240935326, + "step": 1494 + }, + { + "epoch": 0.186875, + "grad_norm": 3.0994396209716797, + "grad_norm_var": 0.27048224897859136, + "learning_rate": 0.0001, + "loss": 1.1325, + "loss/crossentropy": 2.8537662029266357, + "loss/hidden": 0.84765625, + "loss/logits": 0.1407473087310791, + "loss/reg": 0.014406588859856129, + "step": 1495 + }, + { + "epoch": 0.187, + "grad_norm": 2.676715612411499, + "grad_norm_var": 0.29658286686653407, + "learning_rate": 0.0001, + "loss": 1.1095, + "loss/crossentropy": 2.4631519317626953, + "loss/hidden": 0.83203125, + "loss/logits": 0.1335071474313736, + "loss/reg": 0.01439889520406723, + "step": 1496 + }, + { + "epoch": 0.187125, + "grad_norm": 3.475898265838623, + "grad_norm_var": 0.29157201854104237, + "learning_rate": 0.0001, + "loss": 1.3724, + "loss/crossentropy": 2.6677818298339844, + "loss/hidden": 1.03125, + "loss/logits": 0.1972627192735672, + "loss/reg": 0.014391305856406689, + "step": 1497 + }, + { + "epoch": 0.18725, + "grad_norm": 2.769845962524414, + "grad_norm_var": 0.30996036390038334, + "learning_rate": 0.0001, + "loss": 1.0487, + "loss/crossentropy": 2.2365801334381104, + "loss/hidden": 0.77734375, + "loss/logits": 0.12756550312042236, + "loss/reg": 0.014383896254003048, + "step": 1498 + }, + { + "epoch": 0.187375, + "grad_norm": 2.8853700160980225, + "grad_norm_var": 0.31621077264406544, + "learning_rate": 0.0001, + "loss": 1.1072, + "loss/crossentropy": 2.6836471557617188, + "loss/hidden": 0.8359375, + "loss/logits": 0.12754273414611816, + "loss/reg": 0.014376661740243435, + "step": 1499 + }, + { + "epoch": 0.1875, + "grad_norm": 2.84495210647583, + "grad_norm_var": 0.31246808986064234, + "learning_rate": 0.0001, + "loss": 1.2259, + "loss/crossentropy": 2.2098467350006104, + "loss/hidden": 0.9140625, + "loss/logits": 0.16809730231761932, + "loss/reg": 0.014369050972163677, + "step": 1500 + }, + { + "epoch": 0.187625, + "grad_norm": 6.990378379821777, + "grad_norm_var": 1.1530969883337083, + "learning_rate": 0.0001, + "loss": 1.3325, + "loss/crossentropy": 2.6205296516418457, + "loss/hidden": 0.98828125, + "loss/logits": 0.2006484866142273, + "loss/reg": 0.014361603185534477, + "step": 1501 + }, + { + "epoch": 0.18775, + "grad_norm": 3.3288161754608154, + "grad_norm_var": 1.1489830243296237, + "learning_rate": 0.0001, + "loss": 1.2901, + "loss/crossentropy": 2.6651721000671387, + "loss/hidden": 0.97265625, + "loss/logits": 0.17393462359905243, + "loss/reg": 0.014354297891259193, + "step": 1502 + }, + { + "epoch": 0.187875, + "grad_norm": 2.990226984024048, + "grad_norm_var": 1.1580711389422116, + "learning_rate": 0.0001, + "loss": 1.0356, + "loss/crossentropy": 2.6010048389434814, + "loss/hidden": 0.7578125, + "loss/logits": 0.13436806201934814, + "loss/reg": 0.014346664771437645, + "step": 1503 + }, + { + "epoch": 0.188, + "grad_norm": 2.692655086517334, + "grad_norm_var": 1.1810803489356902, + "learning_rate": 0.0001, + "loss": 1.2967, + "loss/crossentropy": 2.152822971343994, + "loss/hidden": 0.97265625, + "loss/logits": 0.1806175410747528, + "loss/reg": 0.014339223504066467, + "step": 1504 + }, + { + "epoch": 0.188125, + "grad_norm": 87.7072982788086, + "grad_norm_var": 445.7985967242888, + "learning_rate": 0.0001, + "loss": 1.5753, + "loss/crossentropy": 2.5603487491607666, + "loss/hidden": 1.2890625, + "loss/logits": 0.1428820788860321, + "loss/reg": 0.014331568963825703, + "step": 1505 + }, + { + "epoch": 0.18825, + "grad_norm": 2.2747116088867188, + "grad_norm_var": 446.72864180402945, + "learning_rate": 0.0001, + "loss": 0.9904, + "loss/crossentropy": 2.3886001110076904, + "loss/hidden": 0.73046875, + "loss/logits": 0.11668366193771362, + "loss/reg": 0.014323906973004341, + "step": 1506 + }, + { + "epoch": 0.188375, + "grad_norm": 2.267629623413086, + "grad_norm_var": 447.52923211089256, + "learning_rate": 0.0001, + "loss": 1.0243, + "loss/crossentropy": 2.423229932785034, + "loss/hidden": 0.7578125, + "loss/logits": 0.12335064262151718, + "loss/reg": 0.014316298067569733, + "step": 1507 + }, + { + "epoch": 0.1885, + "grad_norm": 3.617304801940918, + "grad_norm_var": 447.6023780363864, + "learning_rate": 0.0001, + "loss": 1.3432, + "loss/crossentropy": 2.423637866973877, + "loss/hidden": 0.98046875, + "loss/logits": 0.21964287757873535, + "loss/reg": 0.014309005811810493, + "step": 1508 + }, + { + "epoch": 0.188625, + "grad_norm": 4.585550308227539, + "grad_norm_var": 446.3429466687175, + "learning_rate": 0.0001, + "loss": 1.3114, + "loss/crossentropy": 2.488982677459717, + "loss/hidden": 0.984375, + "loss/logits": 0.18399442732334137, + "loss/reg": 0.014301459304988384, + "step": 1509 + }, + { + "epoch": 0.18875, + "grad_norm": 3.1016576290130615, + "grad_norm_var": 446.4900686609001, + "learning_rate": 0.0001, + "loss": 1.0954, + "loss/crossentropy": 2.5978121757507324, + "loss/hidden": 0.8046875, + "loss/logits": 0.1477963924407959, + "loss/reg": 0.014294042252004147, + "step": 1510 + }, + { + "epoch": 0.188875, + "grad_norm": 2.6194772720336914, + "grad_norm_var": 446.85530854590877, + "learning_rate": 0.0001, + "loss": 1.1325, + "loss/crossentropy": 2.6663565635681152, + "loss/hidden": 0.8359375, + "loss/logits": 0.15372540056705475, + "loss/reg": 0.014286375604569912, + "step": 1511 + }, + { + "epoch": 0.189, + "grad_norm": 7.802513122558594, + "grad_norm_var": 444.48216865196844, + "learning_rate": 0.0001, + "loss": 1.4337, + "loss/crossentropy": 2.6862032413482666, + "loss/hidden": 1.09375, + "loss/logits": 0.19720105826854706, + "loss/reg": 0.014278349466621876, + "step": 1512 + }, + { + "epoch": 0.189125, + "grad_norm": 3.310786247253418, + "grad_norm_var": 444.6026705038085, + "learning_rate": 0.0001, + "loss": 1.3428, + "loss/crossentropy": 2.1258671283721924, + "loss/hidden": 1.0234375, + "loss/logits": 0.17663809657096863, + "loss/reg": 0.014270216226577759, + "step": 1513 + }, + { + "epoch": 0.18925, + "grad_norm": 5.729246616363525, + "grad_norm_var": 442.7462351862821, + "learning_rate": 0.0001, + "loss": 1.0873, + "loss/crossentropy": 2.9488282203674316, + "loss/hidden": 0.80078125, + "loss/logits": 0.1438678652048111, + "loss/reg": 0.01426264550536871, + "step": 1514 + }, + { + "epoch": 0.189375, + "grad_norm": 3.574831247329712, + "grad_norm_var": 442.2095373355806, + "learning_rate": 0.0001, + "loss": 1.0687, + "loss/crossentropy": 2.5907766819000244, + "loss/hidden": 0.79296875, + "loss/logits": 0.13321569561958313, + "loss/reg": 0.014254805631935596, + "step": 1515 + }, + { + "epoch": 0.1895, + "grad_norm": 3.7363853454589844, + "grad_norm_var": 441.5169453192193, + "learning_rate": 0.0001, + "loss": 1.1045, + "loss/crossentropy": 2.552393913269043, + "loss/hidden": 0.828125, + "loss/logits": 0.1338808685541153, + "loss/reg": 0.014247224666178226, + "step": 1516 + }, + { + "epoch": 0.189625, + "grad_norm": 2.745471954345703, + "grad_norm_var": 443.8629711327847, + "learning_rate": 0.0001, + "loss": 1.1382, + "loss/crossentropy": 2.7987189292907715, + "loss/hidden": 0.84375, + "loss/logits": 0.15207618474960327, + "loss/reg": 0.01423969492316246, + "step": 1517 + }, + { + "epoch": 0.18975, + "grad_norm": 2.548691987991333, + "grad_norm_var": 444.4784529377907, + "learning_rate": 0.0001, + "loss": 1.1242, + "loss/crossentropy": 2.413057565689087, + "loss/hidden": 0.828125, + "loss/logits": 0.15380065143108368, + "loss/reg": 0.014231679029762745, + "step": 1518 + }, + { + "epoch": 0.189875, + "grad_norm": 3.604227304458618, + "grad_norm_var": 444.0238071702247, + "learning_rate": 0.0001, + "loss": 1.3671, + "loss/crossentropy": 2.107539415359497, + "loss/hidden": 1.015625, + "loss/logits": 0.20924213528633118, + "loss/reg": 0.014223325997591019, + "step": 1519 + }, + { + "epoch": 0.19, + "grad_norm": 2.764598846435547, + "grad_norm_var": 443.96487541121735, + "learning_rate": 0.0001, + "loss": 0.9063, + "loss/crossentropy": 2.645019054412842, + "loss/hidden": 0.6484375, + "loss/logits": 0.11573326587677002, + "loss/reg": 0.014215711504220963, + "step": 1520 + }, + { + "epoch": 0.190125, + "grad_norm": 3.417343854904175, + "grad_norm_var": 2.0384518833410272, + "learning_rate": 0.0001, + "loss": 1.1809, + "loss/crossentropy": 2.0993950366973877, + "loss/hidden": 0.91015625, + "loss/logits": 0.1286228746175766, + "loss/reg": 0.014208083041012287, + "step": 1521 + }, + { + "epoch": 0.19025, + "grad_norm": 2.5339038372039795, + "grad_norm_var": 1.996633160561107, + "learning_rate": 0.0001, + "loss": 1.1835, + "loss/crossentropy": 2.3847148418426514, + "loss/hidden": 0.890625, + "loss/logits": 0.15089885890483856, + "loss/reg": 0.014200469478964806, + "step": 1522 + }, + { + "epoch": 0.190375, + "grad_norm": 5.412919044494629, + "grad_norm_var": 2.046751372081492, + "learning_rate": 0.0001, + "loss": 1.2804, + "loss/crossentropy": 2.647505760192871, + "loss/hidden": 1.0078125, + "loss/logits": 0.1306157112121582, + "loss/reg": 0.014192642644047737, + "step": 1523 + }, + { + "epoch": 0.1905, + "grad_norm": 3.2071800231933594, + "grad_norm_var": 2.068296485893218, + "learning_rate": 0.0001, + "loss": 1.0836, + "loss/crossentropy": 2.7921669483184814, + "loss/hidden": 0.7890625, + "loss/logits": 0.1526501327753067, + "loss/reg": 0.01418515294790268, + "step": 1524 + }, + { + "epoch": 0.190625, + "grad_norm": 3.779355049133301, + "grad_norm_var": 2.0237706183651416, + "learning_rate": 0.0001, + "loss": 1.0808, + "loss/crossentropy": 2.626934289932251, + "loss/hidden": 0.79296875, + "loss/logits": 0.1460772454738617, + "loss/reg": 0.014177562668919563, + "step": 1525 + }, + { + "epoch": 0.19075, + "grad_norm": 2.429072618484497, + "grad_norm_var": 2.109561386098513, + "learning_rate": 0.0001, + "loss": 1.0907, + "loss/crossentropy": 2.4317626953125, + "loss/hidden": 0.80859375, + "loss/logits": 0.14040398597717285, + "loss/reg": 0.014170153997838497, + "step": 1526 + }, + { + "epoch": 0.190875, + "grad_norm": 9.4876070022583, + "grad_norm_var": 4.06735639009191, + "learning_rate": 0.0001, + "loss": 1.4633, + "loss/crossentropy": 2.2017152309417725, + "loss/hidden": 1.15625, + "loss/logits": 0.16541144251823425, + "loss/reg": 0.014162125997245312, + "step": 1527 + }, + { + "epoch": 0.191, + "grad_norm": 3.5674562454223633, + "grad_norm_var": 3.1147103692906586, + "learning_rate": 0.0001, + "loss": 1.2965, + "loss/crossentropy": 2.85056734085083, + "loss/hidden": 0.98828125, + "loss/logits": 0.1666402667760849, + "loss/reg": 0.014154158532619476, + "step": 1528 + }, + { + "epoch": 0.191125, + "grad_norm": 2.6774816513061523, + "grad_norm_var": 3.1866235930450406, + "learning_rate": 0.0001, + "loss": 1.1206, + "loss/crossentropy": 2.367767095565796, + "loss/hidden": 0.83984375, + "loss/logits": 0.13927701115608215, + "loss/reg": 0.014145908877253532, + "step": 1529 + }, + { + "epoch": 0.19125, + "grad_norm": 8.835766792297363, + "grad_norm_var": 4.578113572841516, + "learning_rate": 0.0001, + "loss": 1.3634, + "loss/crossentropy": 2.98734974861145, + "loss/hidden": 1.0625, + "loss/logits": 0.15954606235027313, + "loss/reg": 0.014136978425085545, + "step": 1530 + }, + { + "epoch": 0.191375, + "grad_norm": 2.618042469024658, + "grad_norm_var": 4.692138147417478, + "learning_rate": 0.0001, + "loss": 1.162, + "loss/crossentropy": 2.5602381229400635, + "loss/hidden": 0.86328125, + "loss/logits": 0.15742892026901245, + "loss/reg": 0.014127855189144611, + "step": 1531 + }, + { + "epoch": 0.1915, + "grad_norm": 2.5733799934387207, + "grad_norm_var": 4.811403170073528, + "learning_rate": 0.0001, + "loss": 1.1001, + "loss/crossentropy": 2.511143684387207, + "loss/hidden": 0.80078125, + "loss/logits": 0.15818098187446594, + "loss/reg": 0.014118584804236889, + "step": 1532 + }, + { + "epoch": 0.191625, + "grad_norm": 3.53314208984375, + "grad_norm_var": 4.730224432732357, + "learning_rate": 0.0001, + "loss": 1.1633, + "loss/crossentropy": 2.384641647338867, + "loss/hidden": 0.88671875, + "loss/logits": 0.13548484444618225, + "loss/reg": 0.014110967516899109, + "step": 1533 + }, + { + "epoch": 0.19175, + "grad_norm": 2.8533082008361816, + "grad_norm_var": 4.679641703787122, + "learning_rate": 0.0001, + "loss": 1.0351, + "loss/crossentropy": 2.5344479084014893, + "loss/hidden": 0.76171875, + "loss/logits": 0.13237014412879944, + "loss/reg": 0.014102863147854805, + "step": 1534 + }, + { + "epoch": 0.191875, + "grad_norm": 3.1153576374053955, + "grad_norm_var": 4.717503317774359, + "learning_rate": 0.0001, + "loss": 1.1134, + "loss/crossentropy": 2.35892915725708, + "loss/hidden": 0.82421875, + "loss/logits": 0.1481958031654358, + "loss/reg": 0.014095306396484375, + "step": 1535 + }, + { + "epoch": 0.192, + "grad_norm": 2.1978485584259033, + "grad_norm_var": 4.825294315312353, + "learning_rate": 0.0001, + "loss": 1.0504, + "loss/crossentropy": 1.9747709035873413, + "loss/hidden": 0.7890625, + "loss/logits": 0.12044215202331543, + "loss/reg": 0.014087379910051823, + "step": 1536 + }, + { + "epoch": 0.192125, + "grad_norm": 3.004746913909912, + "grad_norm_var": 4.861933406571506, + "learning_rate": 0.0001, + "loss": 1.1657, + "loss/crossentropy": 2.6981475353240967, + "loss/hidden": 0.8671875, + "loss/logits": 0.1577630490064621, + "loss/reg": 0.014079651795327663, + "step": 1537 + }, + { + "epoch": 0.19225, + "grad_norm": 2.8707988262176514, + "grad_norm_var": 4.809272805310865, + "learning_rate": 0.0001, + "loss": 1.0948, + "loss/crossentropy": 2.7565841674804688, + "loss/hidden": 0.80859375, + "loss/logits": 0.1454852670431137, + "loss/reg": 0.01407123077660799, + "step": 1538 + }, + { + "epoch": 0.192375, + "grad_norm": 3.2437527179718018, + "grad_norm_var": 4.661507493305176, + "learning_rate": 0.0001, + "loss": 1.268, + "loss/crossentropy": 2.3115715980529785, + "loss/hidden": 0.953125, + "loss/logits": 0.1742616444826126, + "loss/reg": 0.014062810689210892, + "step": 1539 + }, + { + "epoch": 0.1925, + "grad_norm": 3.202810049057007, + "grad_norm_var": 4.661824760391179, + "learning_rate": 0.0001, + "loss": 1.1887, + "loss/crossentropy": 2.351571798324585, + "loss/hidden": 0.89453125, + "loss/logits": 0.15359902381896973, + "loss/reg": 0.014054255560040474, + "step": 1540 + }, + { + "epoch": 0.192625, + "grad_norm": 2.8404643535614014, + "grad_norm_var": 4.713165856200402, + "learning_rate": 0.0001, + "loss": 1.0457, + "loss/crossentropy": 2.3900203704833984, + "loss/hidden": 0.7734375, + "loss/logits": 0.13183440268039703, + "loss/reg": 0.014045719988644123, + "step": 1541 + }, + { + "epoch": 0.19275, + "grad_norm": 2.3016951084136963, + "grad_norm_var": 4.735606807223379, + "learning_rate": 0.0001, + "loss": 1.1063, + "loss/crossentropy": 2.413910150527954, + "loss/hidden": 0.82421875, + "loss/logits": 0.14167185127735138, + "loss/reg": 0.014036578126251698, + "step": 1542 + }, + { + "epoch": 0.192875, + "grad_norm": 2.4764444828033447, + "grad_norm_var": 2.381355740482232, + "learning_rate": 0.0001, + "loss": 1.1498, + "loss/crossentropy": 2.508134603500366, + "loss/hidden": 0.85546875, + "loss/logits": 0.15407797694206238, + "loss/reg": 0.014029039070010185, + "step": 1543 + }, + { + "epoch": 0.193, + "grad_norm": 3.122025966644287, + "grad_norm_var": 2.374577491531939, + "learning_rate": 0.0001, + "loss": 1.0871, + "loss/crossentropy": 2.517812490463257, + "loss/hidden": 0.81640625, + "loss/logits": 0.1304890513420105, + "loss/reg": 0.014020491391420364, + "step": 1544 + }, + { + "epoch": 0.193125, + "grad_norm": 3.5309674739837646, + "grad_norm_var": 2.358743795236999, + "learning_rate": 0.0001, + "loss": 1.3379, + "loss/crossentropy": 2.173165798187256, + "loss/hidden": 1.015625, + "loss/logits": 0.18210922181606293, + "loss/reg": 0.014012150466442108, + "step": 1545 + }, + { + "epoch": 0.19325, + "grad_norm": 2.8506641387939453, + "grad_norm_var": 0.15605408960923697, + "learning_rate": 0.0001, + "loss": 1.0831, + "loss/crossentropy": 2.6542272567749023, + "loss/hidden": 0.796875, + "loss/logits": 0.14616048336029053, + "loss/reg": 0.014004606753587723, + "step": 1546 + }, + { + "epoch": 0.193375, + "grad_norm": 2.6514549255371094, + "grad_norm_var": 0.1548857183604106, + "learning_rate": 0.0001, + "loss": 1.2542, + "loss/crossentropy": 2.267397403717041, + "loss/hidden": 0.953125, + "loss/logits": 0.16108238697052002, + "loss/reg": 0.013996096327900887, + "step": 1547 + }, + { + "epoch": 0.1935, + "grad_norm": 3.0364744663238525, + "grad_norm_var": 0.1482419605375867, + "learning_rate": 0.0001, + "loss": 1.1003, + "loss/crossentropy": 2.34012770652771, + "loss/hidden": 0.81640625, + "loss/logits": 0.14404839277267456, + "loss/reg": 0.013987723737955093, + "step": 1548 + }, + { + "epoch": 0.193625, + "grad_norm": 3.133777141571045, + "grad_norm_var": 0.12593383250846937, + "learning_rate": 0.0001, + "loss": 1.0985, + "loss/crossentropy": 2.618865966796875, + "loss/hidden": 0.8203125, + "loss/logits": 0.1383739411830902, + "loss/reg": 0.013980153016746044, + "step": 1549 + }, + { + "epoch": 0.19375, + "grad_norm": 3.1407878398895264, + "grad_norm_var": 0.12923131391317852, + "learning_rate": 0.0001, + "loss": 1.101, + "loss/crossentropy": 2.688640832901001, + "loss/hidden": 0.8046875, + "loss/logits": 0.1566149890422821, + "loss/reg": 0.013972623273730278, + "step": 1550 + }, + { + "epoch": 0.193875, + "grad_norm": 3.174556016921997, + "grad_norm_var": 0.1309922878345463, + "learning_rate": 0.0001, + "loss": 1.1717, + "loss/crossentropy": 2.1879935264587402, + "loss/hidden": 0.875, + "loss/logits": 0.15705443918704987, + "loss/reg": 0.013964612036943436, + "step": 1551 + }, + { + "epoch": 0.194, + "grad_norm": 3.0509793758392334, + "grad_norm_var": 0.09391514491316547, + "learning_rate": 0.0001, + "loss": 0.9586, + "loss/crossentropy": 2.3660058975219727, + "loss/hidden": 0.70703125, + "loss/logits": 0.11203601211309433, + "loss/reg": 0.013956940732896328, + "step": 1552 + }, + { + "epoch": 0.194125, + "grad_norm": 3.3187499046325684, + "grad_norm_var": 0.10123814801312937, + "learning_rate": 0.0001, + "loss": 1.1062, + "loss/crossentropy": 2.394197463989258, + "loss/hidden": 0.81640625, + "loss/logits": 0.150349423289299, + "loss/reg": 0.013949030078947544, + "step": 1553 + }, + { + "epoch": 0.19425, + "grad_norm": 2.903859853744507, + "grad_norm_var": 0.10075169250019347, + "learning_rate": 0.0001, + "loss": 1.0472, + "loss/crossentropy": 2.6103944778442383, + "loss/hidden": 0.7578125, + "loss/logits": 0.1500079333782196, + "loss/reg": 0.013941409066319466, + "step": 1554 + }, + { + "epoch": 0.194375, + "grad_norm": 3.239140272140503, + "grad_norm_var": 0.1006023266548631, + "learning_rate": 0.0001, + "loss": 1.1446, + "loss/crossentropy": 2.4417059421539307, + "loss/hidden": 0.859375, + "loss/logits": 0.1459202617406845, + "loss/reg": 0.01393379457294941, + "step": 1555 + }, + { + "epoch": 0.1945, + "grad_norm": 2.5746965408325195, + "grad_norm_var": 0.10814357204767854, + "learning_rate": 0.0001, + "loss": 1.0554, + "loss/crossentropy": 2.532374620437622, + "loss/hidden": 0.76953125, + "loss/logits": 0.1466185450553894, + "loss/reg": 0.013926350511610508, + "step": 1556 + }, + { + "epoch": 0.194625, + "grad_norm": 3.628814697265625, + "grad_norm_var": 0.13450941960769924, + "learning_rate": 0.0001, + "loss": 1.3249, + "loss/crossentropy": 2.3864905834198, + "loss/hidden": 1.0234375, + "loss/logits": 0.16223978996276855, + "loss/reg": 0.013918645679950714, + "step": 1557 + }, + { + "epoch": 0.19475, + "grad_norm": 2.513291358947754, + "grad_norm_var": 0.11736836954879663, + "learning_rate": 0.0001, + "loss": 1.0366, + "loss/crossentropy": 2.7693920135498047, + "loss/hidden": 0.765625, + "loss/logits": 0.1318259835243225, + "loss/reg": 0.013910802081227303, + "step": 1558 + }, + { + "epoch": 0.194875, + "grad_norm": 3.2579643726348877, + "grad_norm_var": 0.09872798985575174, + "learning_rate": 0.0001, + "loss": 1.0427, + "loss/crossentropy": 2.501122236251831, + "loss/hidden": 0.77734375, + "loss/logits": 0.1263553649187088, + "loss/reg": 0.013903032056987286, + "step": 1559 + }, + { + "epoch": 0.195, + "grad_norm": 20.420337677001953, + "grad_norm_var": 18.91951441564413, + "learning_rate": 0.0001, + "loss": 1.3618, + "loss/crossentropy": 2.46478533744812, + "loss/hidden": 1.046875, + "loss/logits": 0.1759488880634308, + "loss/reg": 0.013895487412810326, + "step": 1560 + }, + { + "epoch": 0.195125, + "grad_norm": 2.696916341781616, + "grad_norm_var": 19.032016931453608, + "learning_rate": 0.0001, + "loss": 1.174, + "loss/crossentropy": 2.2702722549438477, + "loss/hidden": 0.83203125, + "loss/logits": 0.2030649483203888, + "loss/reg": 0.013887940905988216, + "step": 1561 + }, + { + "epoch": 0.19525, + "grad_norm": 4.7133378982543945, + "grad_norm_var": 18.93870030552219, + "learning_rate": 0.0001, + "loss": 1.2984, + "loss/crossentropy": 1.9574859142303467, + "loss/hidden": 1.015625, + "loss/logits": 0.144018292427063, + "loss/reg": 0.013880369253456593, + "step": 1562 + }, + { + "epoch": 0.195375, + "grad_norm": 2.689742088317871, + "grad_norm_var": 18.930805267251564, + "learning_rate": 0.0001, + "loss": 1.0472, + "loss/crossentropy": 2.5726308822631836, + "loss/hidden": 0.76953125, + "loss/logits": 0.1389167606830597, + "loss/reg": 0.013872841373085976, + "step": 1563 + }, + { + "epoch": 0.1955, + "grad_norm": 3.9613354206085205, + "grad_norm_var": 18.838524358177413, + "learning_rate": 0.0001, + "loss": 1.0574, + "loss/crossentropy": 2.2799181938171387, + "loss/hidden": 0.77734375, + "loss/logits": 0.14136341214179993, + "loss/reg": 0.013865368440747261, + "step": 1564 + }, + { + "epoch": 0.195625, + "grad_norm": 2.84377121925354, + "grad_norm_var": 18.88795320188418, + "learning_rate": 0.0001, + "loss": 1.2585, + "loss/crossentropy": 2.306931495666504, + "loss/hidden": 0.9296875, + "loss/logits": 0.19025813043117523, + "loss/reg": 0.01385766826570034, + "step": 1565 + }, + { + "epoch": 0.19575, + "grad_norm": 3.5409600734710693, + "grad_norm_var": 18.83835057402114, + "learning_rate": 0.0001, + "loss": 1.0235, + "loss/crossentropy": 2.611679792404175, + "loss/hidden": 0.734375, + "loss/logits": 0.15062148869037628, + "loss/reg": 0.013849391601979733, + "step": 1566 + }, + { + "epoch": 0.195875, + "grad_norm": 2.4071547985076904, + "grad_norm_var": 18.988576179472293, + "learning_rate": 0.0001, + "loss": 0.9748, + "loss/crossentropy": 2.4327828884124756, + "loss/hidden": 0.7265625, + "loss/logits": 0.10979950428009033, + "loss/reg": 0.013841108419001102, + "step": 1567 + }, + { + "epoch": 0.196, + "grad_norm": 2.5137085914611816, + "grad_norm_var": 19.091440757813977, + "learning_rate": 0.0001, + "loss": 1.0107, + "loss/crossentropy": 2.6318089962005615, + "loss/hidden": 0.75, + "loss/logits": 0.12240324914455414, + "loss/reg": 0.013833708129823208, + "step": 1568 + }, + { + "epoch": 0.196125, + "grad_norm": 3.096564769744873, + "grad_norm_var": 19.120676935364486, + "learning_rate": 0.0001, + "loss": 1.0895, + "loss/crossentropy": 2.374494791030884, + "loss/hidden": 0.796875, + "loss/logits": 0.15435993671417236, + "loss/reg": 0.013826224021613598, + "step": 1569 + }, + { + "epoch": 0.19625, + "grad_norm": 2.169402837753296, + "grad_norm_var": 19.28010469927802, + "learning_rate": 0.0001, + "loss": 0.9576, + "loss/crossentropy": 2.391505479812622, + "loss/hidden": 0.6953125, + "loss/logits": 0.12412133812904358, + "loss/reg": 0.013818818144500256, + "step": 1570 + }, + { + "epoch": 0.196375, + "grad_norm": 3.0437140464782715, + "grad_norm_var": 19.30600940844608, + "learning_rate": 0.0001, + "loss": 1.239, + "loss/crossentropy": 2.501621723175049, + "loss/hidden": 0.9140625, + "loss/logits": 0.18683794140815735, + "loss/reg": 0.013811350800096989, + "step": 1571 + }, + { + "epoch": 0.1965, + "grad_norm": 2.5897915363311768, + "grad_norm_var": 19.302894385600194, + "learning_rate": 0.0001, + "loss": 1.035, + "loss/crossentropy": 2.4975814819335938, + "loss/hidden": 0.7578125, + "loss/logits": 0.1391228437423706, + "loss/reg": 0.013803965412080288, + "step": 1572 + }, + { + "epoch": 0.196625, + "grad_norm": 2.516031503677368, + "grad_norm_var": 19.454711828122623, + "learning_rate": 0.0001, + "loss": 0.9646, + "loss/crossentropy": 2.3849472999572754, + "loss/hidden": 0.703125, + "loss/logits": 0.12352485954761505, + "loss/reg": 0.013796493411064148, + "step": 1573 + }, + { + "epoch": 0.19675, + "grad_norm": 2.0846610069274902, + "grad_norm_var": 19.55464017386997, + "learning_rate": 0.0001, + "loss": 1.0915, + "loss/crossentropy": 2.333425998687744, + "loss/hidden": 0.80078125, + "loss/logits": 0.15285080671310425, + "loss/reg": 0.013788769952952862, + "step": 1574 + }, + { + "epoch": 0.196875, + "grad_norm": 2.8610305786132812, + "grad_norm_var": 19.605563364937975, + "learning_rate": 0.0001, + "loss": 0.9948, + "loss/crossentropy": 2.2638580799102783, + "loss/hidden": 0.69140625, + "loss/logits": 0.16560885310173035, + "loss/reg": 0.013781173154711723, + "step": 1575 + }, + { + "epoch": 0.197, + "grad_norm": 2.6086127758026123, + "grad_norm_var": 0.45959099378937346, + "learning_rate": 0.0001, + "loss": 1.1686, + "loss/crossentropy": 2.385111093521118, + "loss/hidden": 0.87890625, + "loss/logits": 0.15191304683685303, + "loss/reg": 0.01377350464463234, + "step": 1576 + }, + { + "epoch": 0.197125, + "grad_norm": 4.653759002685547, + "grad_norm_var": 0.6469626890922541, + "learning_rate": 0.0001, + "loss": 1.3708, + "loss/crossentropy": 2.556640625, + "loss/hidden": 1.0390625, + "loss/logits": 0.1940462738275528, + "loss/reg": 0.013765547424554825, + "step": 1577 + }, + { + "epoch": 0.19725, + "grad_norm": 3.648402690887451, + "grad_norm_var": 0.4771692938880648, + "learning_rate": 0.0001, + "loss": 1.0714, + "loss/crossentropy": 2.9089317321777344, + "loss/hidden": 0.80078125, + "loss/logits": 0.13300660252571106, + "loss/reg": 0.013758075423538685, + "step": 1578 + }, + { + "epoch": 0.197375, + "grad_norm": 3.6522624492645264, + "grad_norm_var": 0.5014419172244267, + "learning_rate": 0.0001, + "loss": 1.1563, + "loss/crossentropy": 2.547022581100464, + "loss/hidden": 0.875, + "loss/logits": 0.14384308457374573, + "loss/reg": 0.013750174082815647, + "step": 1579 + }, + { + "epoch": 0.1975, + "grad_norm": 3.541172981262207, + "grad_norm_var": 0.4592891725510337, + "learning_rate": 0.0001, + "loss": 1.3161, + "loss/crossentropy": 2.342548370361328, + "loss/hidden": 1.015625, + "loss/logits": 0.16299855709075928, + "loss/reg": 0.013742712326347828, + "step": 1580 + }, + { + "epoch": 0.197625, + "grad_norm": 3.876798391342163, + "grad_norm_var": 0.506438619715117, + "learning_rate": 0.0001, + "loss": 1.6102, + "loss/crossentropy": 2.066164970397949, + "loss/hidden": 1.2265625, + "loss/logits": 0.24627353250980377, + "loss/reg": 0.01373511552810669, + "step": 1581 + }, + { + "epoch": 0.19775, + "grad_norm": 2.5165457725524902, + "grad_norm_var": 0.5050025113667628, + "learning_rate": 0.0001, + "loss": 0.9853, + "loss/crossentropy": 2.4709391593933105, + "loss/hidden": 0.73046875, + "loss/logits": 0.1175711378455162, + "loss/reg": 0.013726145029067993, + "step": 1582 + }, + { + "epoch": 0.197875, + "grad_norm": 2.9556968212127686, + "grad_norm_var": 0.48145601689114204, + "learning_rate": 0.0001, + "loss": 1.0748, + "loss/crossentropy": 2.4822633266448975, + "loss/hidden": 0.78515625, + "loss/logits": 0.1524786353111267, + "loss/reg": 0.013718714937567711, + "step": 1583 + }, + { + "epoch": 0.198, + "grad_norm": 3.161695957183838, + "grad_norm_var": 0.46391222848667796, + "learning_rate": 0.0001, + "loss": 1.0736, + "loss/crossentropy": 2.565737724304199, + "loss/hidden": 0.8046875, + "loss/logits": 0.13177385926246643, + "loss/reg": 0.013710074126720428, + "step": 1584 + }, + { + "epoch": 0.198125, + "grad_norm": 7.7486958503723145, + "grad_norm_var": 1.8386121671978246, + "learning_rate": 0.0001, + "loss": 1.5153, + "loss/crossentropy": 2.1480207443237305, + "loss/hidden": 1.1484375, + "loss/logits": 0.22985613346099854, + "loss/reg": 0.013702766969799995, + "step": 1585 + }, + { + "epoch": 0.19825, + "grad_norm": 3.0685408115386963, + "grad_norm_var": 1.7473924169074466, + "learning_rate": 0.0001, + "loss": 1.1056, + "loss/crossentropy": 2.5632662773132324, + "loss/hidden": 0.81640625, + "loss/logits": 0.15228858590126038, + "loss/reg": 0.013695274479687214, + "step": 1586 + }, + { + "epoch": 0.198375, + "grad_norm": 4.356388568878174, + "grad_norm_var": 1.7913349785216173, + "learning_rate": 0.0001, + "loss": 1.2621, + "loss/crossentropy": 2.306581497192383, + "loss/hidden": 0.92578125, + "loss/logits": 0.19942906498908997, + "loss/reg": 0.013687408529222012, + "step": 1587 + }, + { + "epoch": 0.1985, + "grad_norm": 2.662060260772705, + "grad_norm_var": 1.7829870936881813, + "learning_rate": 0.0001, + "loss": 1.2003, + "loss/crossentropy": 2.5029749870300293, + "loss/hidden": 0.8984375, + "loss/logits": 0.16509464383125305, + "loss/reg": 0.013679537922143936, + "step": 1588 + }, + { + "epoch": 0.198625, + "grad_norm": 4.810679912567139, + "grad_norm_var": 1.8127030143600467, + "learning_rate": 0.0001, + "loss": 1.3583, + "loss/crossentropy": 2.612169027328491, + "loss/hidden": 1.03125, + "loss/logits": 0.1903519481420517, + "loss/reg": 0.01367125939577818, + "step": 1589 + }, + { + "epoch": 0.19875, + "grad_norm": 2.754514217376709, + "grad_norm_var": 1.7020179846119938, + "learning_rate": 0.0001, + "loss": 1.0334, + "loss/crossentropy": 2.3626105785369873, + "loss/hidden": 0.75390625, + "loss/logits": 0.1428566873073578, + "loss/reg": 0.013663833029568195, + "step": 1590 + }, + { + "epoch": 0.198875, + "grad_norm": 3.2916183471679688, + "grad_norm_var": 1.6665986976673535, + "learning_rate": 0.0001, + "loss": 1.2137, + "loss/crossentropy": 2.2759227752685547, + "loss/hidden": 0.87109375, + "loss/logits": 0.20607063174247742, + "loss/reg": 0.013655973598361015, + "step": 1591 + }, + { + "epoch": 0.199, + "grad_norm": 2.276350975036621, + "grad_norm_var": 1.7221462363283602, + "learning_rate": 0.0001, + "loss": 0.9194, + "loss/crossentropy": 2.429999351501465, + "loss/hidden": 0.671875, + "loss/logits": 0.11101213842630386, + "loss/reg": 0.013647787272930145, + "step": 1592 + }, + { + "epoch": 0.199125, + "grad_norm": 2.7854268550872803, + "grad_norm_var": 1.6992207121709098, + "learning_rate": 0.0001, + "loss": 1.1081, + "loss/crossentropy": 2.6592743396759033, + "loss/hidden": 0.81640625, + "loss/logits": 0.15529003739356995, + "loss/reg": 0.013640275225043297, + "step": 1593 + }, + { + "epoch": 0.19925, + "grad_norm": 2.5592005252838135, + "grad_norm_var": 1.7618627623489904, + "learning_rate": 0.0001, + "loss": 1.1146, + "loss/crossentropy": 2.27114200592041, + "loss/hidden": 0.8515625, + "loss/logits": 0.1267244815826416, + "loss/reg": 0.013632478192448616, + "step": 1594 + }, + { + "epoch": 0.199375, + "grad_norm": 3.1051416397094727, + "grad_norm_var": 1.7695445919247692, + "learning_rate": 0.0001, + "loss": 1.2779, + "loss/crossentropy": 2.096179246902466, + "loss/hidden": 0.96875, + "loss/logits": 0.1729447841644287, + "loss/reg": 0.013624493032693863, + "step": 1595 + }, + { + "epoch": 0.1995, + "grad_norm": 2.740203857421875, + "grad_norm_var": 1.8017103679937707, + "learning_rate": 0.0001, + "loss": 0.9738, + "loss/crossentropy": 2.5383737087249756, + "loss/hidden": 0.7109375, + "loss/logits": 0.12667007744312286, + "loss/reg": 0.013617145828902721, + "step": 1596 + }, + { + "epoch": 0.199625, + "grad_norm": 2.787034034729004, + "grad_norm_var": 1.8091027588667723, + "learning_rate": 0.0001, + "loss": 1.0191, + "loss/crossentropy": 2.452460527420044, + "loss/hidden": 0.7578125, + "loss/logits": 0.12520015239715576, + "loss/reg": 0.013609787449240685, + "step": 1597 + }, + { + "epoch": 0.19975, + "grad_norm": 2.521711587905884, + "grad_norm_var": 1.8085312337868815, + "learning_rate": 0.0001, + "loss": 1.1116, + "loss/crossentropy": 2.4604299068450928, + "loss/hidden": 0.828125, + "loss/logits": 0.14743748307228088, + "loss/reg": 0.013602245599031448, + "step": 1598 + }, + { + "epoch": 0.199875, + "grad_norm": 2.541367530822754, + "grad_norm_var": 1.840991450339924, + "learning_rate": 0.0001, + "loss": 1.1753, + "loss/crossentropy": 2.3595387935638428, + "loss/hidden": 0.88671875, + "loss/logits": 0.1526535153388977, + "loss/reg": 0.013594499789178371, + "step": 1599 + }, + { + "epoch": 0.2, + "grad_norm": 3.170604944229126, + "grad_norm_var": 1.840804608226538, + "learning_rate": 0.0001, + "loss": 1.1393, + "loss/crossentropy": 2.679455041885376, + "loss/hidden": 0.83984375, + "loss/logits": 0.16357922554016113, + "loss/reg": 0.01358658354729414, + "step": 1600 + }, + { + "epoch": 0.200125, + "grad_norm": 3.085852861404419, + "grad_norm_var": 0.44862457908304804, + "learning_rate": 0.0001, + "loss": 1.162, + "loss/crossentropy": 2.183800220489502, + "loss/hidden": 0.87109375, + "loss/logits": 0.15514615178108215, + "loss/reg": 0.013579203747212887, + "step": 1601 + }, + { + "epoch": 0.20025, + "grad_norm": 2.978541851043701, + "grad_norm_var": 0.44869585537156215, + "learning_rate": 0.0001, + "loss": 1.1595, + "loss/crossentropy": 2.7346818447113037, + "loss/hidden": 0.8515625, + "loss/logits": 0.1722511649131775, + "loss/reg": 0.01357145607471466, + "step": 1602 + }, + { + "epoch": 0.200375, + "grad_norm": 2.433056116104126, + "grad_norm_var": 0.3388972014701608, + "learning_rate": 0.0001, + "loss": 1.0578, + "loss/crossentropy": 2.400810718536377, + "loss/hidden": 0.78515625, + "loss/logits": 0.13697531819343567, + "loss/reg": 0.013563701882958412, + "step": 1603 + }, + { + "epoch": 0.2005, + "grad_norm": 2.651841640472412, + "grad_norm_var": 0.3392367186207063, + "learning_rate": 0.0001, + "loss": 1.0811, + "loss/crossentropy": 2.490342617034912, + "loss/hidden": 0.78515625, + "loss/logits": 0.1603870689868927, + "loss/reg": 0.013556394726037979, + "step": 1604 + }, + { + "epoch": 0.200625, + "grad_norm": 2.8456075191497803, + "grad_norm_var": 0.08148981985714249, + "learning_rate": 0.0001, + "loss": 1.1882, + "loss/crossentropy": 2.4219372272491455, + "loss/hidden": 0.87890625, + "loss/logits": 0.17379163205623627, + "loss/reg": 0.0135487737134099, + "step": 1605 + }, + { + "epoch": 0.20075, + "grad_norm": 2.8339900970458984, + "grad_norm_var": 0.08158268879521771, + "learning_rate": 0.0001, + "loss": 0.9949, + "loss/crossentropy": 2.611527442932129, + "loss/hidden": 0.734375, + "loss/logits": 0.12512920796871185, + "loss/reg": 0.013541026972234249, + "step": 1606 + }, + { + "epoch": 0.200875, + "grad_norm": 3.8811702728271484, + "grad_norm_var": 0.14289600365006525, + "learning_rate": 0.0001, + "loss": 1.0861, + "loss/crossentropy": 2.5142064094543457, + "loss/hidden": 0.8125, + "loss/logits": 0.13831937313079834, + "loss/reg": 0.013533033430576324, + "step": 1607 + }, + { + "epoch": 0.201, + "grad_norm": 3.6074378490448, + "grad_norm_var": 0.15629189387123668, + "learning_rate": 0.0001, + "loss": 1.0655, + "loss/crossentropy": 2.4579451084136963, + "loss/hidden": 0.79296875, + "loss/logits": 0.13723407685756683, + "loss/reg": 0.013525336049497128, + "step": 1608 + }, + { + "epoch": 0.201125, + "grad_norm": 2.8290226459503174, + "grad_norm_var": 0.1556981224441491, + "learning_rate": 0.0001, + "loss": 1.0043, + "loss/crossentropy": 2.7764294147491455, + "loss/hidden": 0.73828125, + "loss/logits": 0.1308579295873642, + "loss/reg": 0.01351715624332428, + "step": 1609 + }, + { + "epoch": 0.20125, + "grad_norm": 2.59934401512146, + "grad_norm_var": 0.15391725674414733, + "learning_rate": 0.0001, + "loss": 1.0636, + "loss/crossentropy": 2.5147807598114014, + "loss/hidden": 0.79296875, + "loss/logits": 0.13549384474754333, + "loss/reg": 0.013509852811694145, + "step": 1610 + }, + { + "epoch": 0.201375, + "grad_norm": 2.7972164154052734, + "grad_norm_var": 0.15196475783560098, + "learning_rate": 0.0001, + "loss": 1.0975, + "loss/crossentropy": 2.507530689239502, + "loss/hidden": 0.81640625, + "loss/logits": 0.14603210985660553, + "loss/reg": 0.013502140529453754, + "step": 1611 + }, + { + "epoch": 0.2015, + "grad_norm": 3.2658374309539795, + "grad_norm_var": 0.15845418736495442, + "learning_rate": 0.0001, + "loss": 1.2549, + "loss/crossentropy": 2.164236307144165, + "loss/hidden": 0.9453125, + "loss/logits": 0.17467688024044037, + "loss/reg": 0.01349355187267065, + "step": 1612 + }, + { + "epoch": 0.201625, + "grad_norm": 2.7505931854248047, + "grad_norm_var": 0.15921652951525, + "learning_rate": 0.0001, + "loss": 1.0594, + "loss/crossentropy": 2.0924651622772217, + "loss/hidden": 0.80078125, + "loss/logits": 0.12380584329366684, + "loss/reg": 0.013486127369105816, + "step": 1613 + }, + { + "epoch": 0.20175, + "grad_norm": 3.203188180923462, + "grad_norm_var": 0.15163660104867607, + "learning_rate": 0.0001, + "loss": 1.1208, + "loss/crossentropy": 2.62121319770813, + "loss/hidden": 0.84765625, + "loss/logits": 0.1383514702320099, + "loss/reg": 0.013478783890604973, + "step": 1614 + }, + { + "epoch": 0.201875, + "grad_norm": 2.6951982975006104, + "grad_norm_var": 0.1443821198786272, + "learning_rate": 0.0001, + "loss": 1.4372, + "loss/crossentropy": 2.364288091659546, + "loss/hidden": 1.109375, + "loss/logits": 0.19316649436950684, + "loss/reg": 0.013470477424561977, + "step": 1615 + }, + { + "epoch": 0.202, + "grad_norm": 2.867556095123291, + "grad_norm_var": 0.1422903014164942, + "learning_rate": 0.0001, + "loss": 1.0695, + "loss/crossentropy": 2.412200689315796, + "loss/hidden": 0.8046875, + "loss/logits": 0.1301771104335785, + "loss/reg": 0.01346264686435461, + "step": 1616 + }, + { + "epoch": 0.202125, + "grad_norm": 3.35479736328125, + "grad_norm_var": 0.15140141291244522, + "learning_rate": 0.0001, + "loss": 1.1988, + "loss/crossentropy": 2.4715232849121094, + "loss/hidden": 0.90234375, + "loss/logits": 0.16192206740379333, + "loss/reg": 0.013454729691147804, + "step": 1617 + }, + { + "epoch": 0.20225, + "grad_norm": 22.905237197875977, + "grad_norm_var": 24.978816029077244, + "learning_rate": 0.0001, + "loss": 1.579, + "loss/crossentropy": 2.2932474613189697, + "loss/hidden": 1.21875, + "loss/logits": 0.22579804062843323, + "loss/reg": 0.013446752913296223, + "step": 1618 + }, + { + "epoch": 0.202375, + "grad_norm": 2.676539182662964, + "grad_norm_var": 24.92450698201823, + "learning_rate": 0.0001, + "loss": 1.0158, + "loss/crossentropy": 2.443284034729004, + "loss/hidden": 0.75, + "loss/logits": 0.13141301274299622, + "loss/reg": 0.01343945600092411, + "step": 1619 + }, + { + "epoch": 0.2025, + "grad_norm": 3.8005261421203613, + "grad_norm_var": 24.764457157993366, + "learning_rate": 0.0001, + "loss": 1.2057, + "loss/crossentropy": 2.7221662998199463, + "loss/hidden": 0.91015625, + "loss/logits": 0.1611778438091278, + "loss/reg": 0.013432328589260578, + "step": 1620 + }, + { + "epoch": 0.202625, + "grad_norm": 2.675564765930176, + "grad_norm_var": 24.79939933153828, + "learning_rate": 0.0001, + "loss": 1.0176, + "loss/crossentropy": 2.7226831912994385, + "loss/hidden": 0.75390625, + "loss/logits": 0.12948712706565857, + "loss/reg": 0.013424508273601532, + "step": 1621 + }, + { + "epoch": 0.20275, + "grad_norm": 5.532473564147949, + "grad_norm_var": 24.728322365826084, + "learning_rate": 0.0001, + "loss": 1.393, + "loss/crossentropy": 2.501004219055176, + "loss/hidden": 1.0546875, + "loss/logits": 0.20409145951271057, + "loss/reg": 0.013417219743132591, + "step": 1622 + }, + { + "epoch": 0.202875, + "grad_norm": 2.9599437713623047, + "grad_norm_var": 24.853088500483913, + "learning_rate": 0.0001, + "loss": 1.047, + "loss/crossentropy": 2.583350419998169, + "loss/hidden": 0.7890625, + "loss/logits": 0.1238275021314621, + "loss/reg": 0.013409611769020557, + "step": 1623 + }, + { + "epoch": 0.203, + "grad_norm": 2.609393358230591, + "grad_norm_var": 25.021814610211056, + "learning_rate": 0.0001, + "loss": 1.0136, + "loss/crossentropy": 2.800400495529175, + "loss/hidden": 0.74609375, + "loss/logits": 0.13346146047115326, + "loss/reg": 0.013402425684034824, + "step": 1624 + }, + { + "epoch": 0.203125, + "grad_norm": 3.3347935676574707, + "grad_norm_var": 24.93556049048077, + "learning_rate": 0.0001, + "loss": 1.3162, + "loss/crossentropy": 2.306166648864746, + "loss/hidden": 1.0, + "loss/logits": 0.1822924166917801, + "loss/reg": 0.013394915498793125, + "step": 1625 + }, + { + "epoch": 0.20325, + "grad_norm": 2.487501859664917, + "grad_norm_var": 24.9628476598421, + "learning_rate": 0.0001, + "loss": 1.163, + "loss/crossentropy": 2.2325022220611572, + "loss/hidden": 0.8828125, + "loss/logits": 0.1463523954153061, + "loss/reg": 0.013387140817940235, + "step": 1626 + }, + { + "epoch": 0.203375, + "grad_norm": 2.961826801300049, + "grad_norm_var": 24.930026653662896, + "learning_rate": 0.0001, + "loss": 1.1983, + "loss/crossentropy": 2.5841550827026367, + "loss/hidden": 0.85546875, + "loss/logits": 0.2090195268392563, + "loss/reg": 0.013379319570958614, + "step": 1627 + }, + { + "epoch": 0.2035, + "grad_norm": 2.5991106033325195, + "grad_norm_var": 25.056860448715973, + "learning_rate": 0.0001, + "loss": 1.0763, + "loss/crossentropy": 2.617375135421753, + "loss/hidden": 0.796875, + "loss/logits": 0.1457059681415558, + "loss/reg": 0.01337137445807457, + "step": 1628 + }, + { + "epoch": 0.203625, + "grad_norm": 3.130793571472168, + "grad_norm_var": 24.985404162175104, + "learning_rate": 0.0001, + "loss": 1.4514, + "loss/crossentropy": 2.28818678855896, + "loss/hidden": 1.09375, + "loss/logits": 0.2240520417690277, + "loss/reg": 0.013363334350287914, + "step": 1629 + }, + { + "epoch": 0.20375, + "grad_norm": 3.3310132026672363, + "grad_norm_var": 24.96667274126064, + "learning_rate": 0.0001, + "loss": 1.2145, + "loss/crossentropy": 2.4561591148376465, + "loss/hidden": 0.8828125, + "loss/logits": 0.1981697976589203, + "loss/reg": 0.013356066308915615, + "step": 1630 + }, + { + "epoch": 0.203875, + "grad_norm": 2.9992592334747314, + "grad_norm_var": 24.904546403803987, + "learning_rate": 0.0001, + "loss": 1.3327, + "loss/crossentropy": 2.304450035095215, + "loss/hidden": 1.03125, + "loss/logits": 0.16797912120819092, + "loss/reg": 0.013348528183996677, + "step": 1631 + }, + { + "epoch": 0.204, + "grad_norm": 3.1939244270324707, + "grad_norm_var": 24.844990519484032, + "learning_rate": 0.0001, + "loss": 1.0774, + "loss/crossentropy": 2.4736955165863037, + "loss/hidden": 0.7890625, + "loss/logits": 0.15497320890426636, + "loss/reg": 0.013341384008526802, + "step": 1632 + }, + { + "epoch": 0.204125, + "grad_norm": 2.8169233798980713, + "grad_norm_var": 24.938715041703063, + "learning_rate": 0.0001, + "loss": 0.8683, + "loss/crossentropy": 2.676666736602783, + "loss/hidden": 0.62109375, + "loss/logits": 0.11391356587409973, + "loss/reg": 0.013333736918866634, + "step": 1633 + }, + { + "epoch": 0.20425, + "grad_norm": 3.69463849067688, + "grad_norm_var": 0.5429387753821554, + "learning_rate": 0.0001, + "loss": 1.7185, + "loss/crossentropy": 1.9171524047851562, + "loss/hidden": 1.3125, + "loss/logits": 0.27270910143852234, + "loss/reg": 0.013325790874660015, + "step": 1634 + }, + { + "epoch": 0.204375, + "grad_norm": 2.5067968368530273, + "grad_norm_var": 0.5560268531523621, + "learning_rate": 0.0001, + "loss": 0.9318, + "loss/crossentropy": 2.6572604179382324, + "loss/hidden": 0.6875, + "loss/logits": 0.11111121624708176, + "loss/reg": 0.013317680917680264, + "step": 1635 + }, + { + "epoch": 0.2045, + "grad_norm": 2.7445805072784424, + "grad_norm_var": 0.5361896610226305, + "learning_rate": 0.0001, + "loss": 1.1343, + "loss/crossentropy": 2.34808087348938, + "loss/hidden": 0.8359375, + "loss/logits": 0.16526776552200317, + "loss/reg": 0.013309688307344913, + "step": 1636 + }, + { + "epoch": 0.204625, + "grad_norm": 2.5069262981414795, + "grad_norm_var": 0.5474804142573985, + "learning_rate": 0.0001, + "loss": 0.9285, + "loss/crossentropy": 2.678297758102417, + "loss/hidden": 0.6796875, + "loss/logits": 0.11579211056232452, + "loss/reg": 0.013302477076649666, + "step": 1637 + }, + { + "epoch": 0.20475, + "grad_norm": 3.034487247467041, + "grad_norm_var": 0.12334773017239489, + "learning_rate": 0.0001, + "loss": 1.1895, + "loss/crossentropy": 2.0104122161865234, + "loss/hidden": 0.90234375, + "loss/logits": 0.15417572855949402, + "loss/reg": 0.013295282609760761, + "step": 1638 + }, + { + "epoch": 0.204875, + "grad_norm": 2.7470128536224365, + "grad_norm_var": 0.12538795384078488, + "learning_rate": 0.0001, + "loss": 1.0896, + "loss/crossentropy": 2.2556824684143066, + "loss/hidden": 0.8203125, + "loss/logits": 0.13637785613536835, + "loss/reg": 0.013288102112710476, + "step": 1639 + }, + { + "epoch": 0.205, + "grad_norm": 2.6958811283111572, + "grad_norm_var": 0.12228878695745683, + "learning_rate": 0.0001, + "loss": 0.9741, + "loss/crossentropy": 2.2803361415863037, + "loss/hidden": 0.72265625, + "loss/logits": 0.11862225830554962, + "loss/reg": 0.013280958868563175, + "step": 1640 + }, + { + "epoch": 0.205125, + "grad_norm": 2.923374652862549, + "grad_norm_var": 0.11033848957714554, + "learning_rate": 0.0001, + "loss": 1.1012, + "loss/crossentropy": 2.5998592376708984, + "loss/hidden": 0.82421875, + "loss/logits": 0.14425665140151978, + "loss/reg": 0.013273877091705799, + "step": 1641 + }, + { + "epoch": 0.20525, + "grad_norm": 16.231281280517578, + "grad_norm_var": 11.163123044335308, + "learning_rate": 0.0001, + "loss": 1.573, + "loss/crossentropy": 2.28498911857605, + "loss/hidden": 1.1875, + "loss/logits": 0.25285494327545166, + "loss/reg": 0.013266587629914284, + "step": 1642 + }, + { + "epoch": 0.205375, + "grad_norm": 2.8149280548095703, + "grad_norm_var": 11.180053543888178, + "learning_rate": 0.0001, + "loss": 1.1024, + "loss/crossentropy": 2.148362636566162, + "loss/hidden": 0.8125, + "loss/logits": 0.15731269121170044, + "loss/reg": 0.013259139843285084, + "step": 1643 + }, + { + "epoch": 0.2055, + "grad_norm": 3.9090774059295654, + "grad_norm_var": 11.086604757622807, + "learning_rate": 0.0001, + "loss": 1.4303, + "loss/crossentropy": 2.6009087562561035, + "loss/hidden": 1.0703125, + "loss/logits": 0.22749975323677063, + "loss/reg": 0.01325154397636652, + "step": 1644 + }, + { + "epoch": 0.205625, + "grad_norm": 2.7664973735809326, + "grad_norm_var": 11.128864400454058, + "learning_rate": 0.0001, + "loss": 0.9888, + "loss/crossentropy": 2.576356887817383, + "loss/hidden": 0.734375, + "loss/logits": 0.12194088101387024, + "loss/reg": 0.013244031928479671, + "step": 1645 + }, + { + "epoch": 0.20575, + "grad_norm": 2.7838664054870605, + "grad_norm_var": 11.182320606560461, + "learning_rate": 0.0001, + "loss": 1.0593, + "loss/crossentropy": 2.458827018737793, + "loss/hidden": 0.80078125, + "loss/logits": 0.12612512707710266, + "loss/reg": 0.013236268423497677, + "step": 1646 + }, + { + "epoch": 0.205875, + "grad_norm": 3.0026252269744873, + "grad_norm_var": 11.181974019665732, + "learning_rate": 0.0001, + "loss": 1.0012, + "loss/crossentropy": 2.8010799884796143, + "loss/hidden": 0.73828125, + "loss/logits": 0.1305844783782959, + "loss/reg": 0.01322907768189907, + "step": 1647 + }, + { + "epoch": 0.206, + "grad_norm": 3.544288158416748, + "grad_norm_var": 11.162580503190616, + "learning_rate": 0.0001, + "loss": 1.2099, + "loss/crossentropy": 2.5440893173217773, + "loss/hidden": 0.8828125, + "loss/logits": 0.19490712881088257, + "loss/reg": 0.013221465051174164, + "step": 1648 + }, + { + "epoch": 0.206125, + "grad_norm": 3.2531473636627197, + "grad_norm_var": 11.117574070283142, + "learning_rate": 0.0001, + "loss": 1.3131, + "loss/crossentropy": 2.267230272293091, + "loss/hidden": 0.98828125, + "loss/logits": 0.19262857735157013, + "loss/reg": 0.013214114122092724, + "step": 1649 + }, + { + "epoch": 0.20625, + "grad_norm": 2.65549635887146, + "grad_norm_var": 11.202772982286575, + "learning_rate": 0.0001, + "loss": 1.1064, + "loss/crossentropy": 2.623307228088379, + "loss/hidden": 0.8125, + "loss/logits": 0.16180500388145447, + "loss/reg": 0.013206473551690578, + "step": 1650 + }, + { + "epoch": 0.206375, + "grad_norm": 3.4816501140594482, + "grad_norm_var": 11.09960005034767, + "learning_rate": 0.0001, + "loss": 1.1543, + "loss/crossentropy": 2.644336462020874, + "loss/hidden": 0.84375, + "loss/logits": 0.17855095863342285, + "loss/reg": 0.013198580592870712, + "step": 1651 + }, + { + "epoch": 0.2065, + "grad_norm": 2.8406617641448975, + "grad_norm_var": 11.086419925476456, + "learning_rate": 0.0001, + "loss": 0.9424, + "loss/crossentropy": 2.5468101501464844, + "loss/hidden": 0.69921875, + "loss/logits": 0.11126389354467392, + "loss/reg": 0.01318978238850832, + "step": 1652 + }, + { + "epoch": 0.206625, + "grad_norm": 2.7195427417755127, + "grad_norm_var": 11.051894988141786, + "learning_rate": 0.0001, + "loss": 1.2921, + "loss/crossentropy": 2.5399060249328613, + "loss/hidden": 0.96484375, + "loss/logits": 0.19546636939048767, + "loss/reg": 0.013182584196329117, + "step": 1653 + }, + { + "epoch": 0.20675, + "grad_norm": 2.4805049896240234, + "grad_norm_var": 11.13040761168501, + "learning_rate": 0.0001, + "loss": 0.9454, + "loss/crossentropy": 2.6860992908477783, + "loss/hidden": 0.69921875, + "loss/logits": 0.11445553600788116, + "loss/reg": 0.013174121268093586, + "step": 1654 + }, + { + "epoch": 0.206875, + "grad_norm": 2.9009270668029785, + "grad_norm_var": 11.110214998589447, + "learning_rate": 0.0001, + "loss": 1.1267, + "loss/crossentropy": 2.6687192916870117, + "loss/hidden": 0.8359375, + "loss/logits": 0.15907853841781616, + "loss/reg": 0.013165648095309734, + "step": 1655 + }, + { + "epoch": 0.207, + "grad_norm": 5.767265796661377, + "grad_norm_var": 11.242431274213367, + "learning_rate": 0.0001, + "loss": 1.3322, + "loss/crossentropy": 2.3070762157440186, + "loss/hidden": 1.0234375, + "loss/logits": 0.17719680070877075, + "loss/reg": 0.013158419169485569, + "step": 1656 + }, + { + "epoch": 0.207125, + "grad_norm": 2.9326095581054688, + "grad_norm_var": 11.241105151169359, + "learning_rate": 0.0001, + "loss": 1.2628, + "loss/crossentropy": 2.429621934890747, + "loss/hidden": 0.94921875, + "loss/logits": 0.18210232257843018, + "loss/reg": 0.013150298967957497, + "step": 1657 + }, + { + "epoch": 0.20725, + "grad_norm": 3.11118483543396, + "grad_norm_var": 0.6121424659486439, + "learning_rate": 0.0001, + "loss": 1.0707, + "loss/crossentropy": 2.6597509384155273, + "loss/hidden": 0.78125, + "loss/logits": 0.15797501802444458, + "loss/reg": 0.013143090531229973, + "step": 1658 + }, + { + "epoch": 0.207375, + "grad_norm": 3.273897886276245, + "grad_norm_var": 0.6026450391734357, + "learning_rate": 0.0001, + "loss": 1.5868, + "loss/crossentropy": 2.4786179065704346, + "loss/hidden": 1.1953125, + "loss/logits": 0.2601345479488373, + "loss/reg": 0.013135283254086971, + "step": 1659 + }, + { + "epoch": 0.2075, + "grad_norm": 2.313868761062622, + "grad_norm_var": 0.6138390088935656, + "learning_rate": 0.0001, + "loss": 1.0272, + "loss/crossentropy": 2.646815538406372, + "loss/hidden": 0.7734375, + "loss/logits": 0.1224367693066597, + "loss/reg": 0.013128082267940044, + "step": 1660 + }, + { + "epoch": 0.207625, + "grad_norm": 3.9266319274902344, + "grad_norm_var": 0.6441662093447725, + "learning_rate": 0.0001, + "loss": 1.3995, + "loss/crossentropy": 2.480496644973755, + "loss/hidden": 1.0859375, + "loss/logits": 0.1823263317346573, + "loss/reg": 0.013120830059051514, + "step": 1661 + }, + { + "epoch": 0.20775, + "grad_norm": 4.4009881019592285, + "grad_norm_var": 0.7207383535866235, + "learning_rate": 0.0001, + "loss": 1.5143, + "loss/crossentropy": 2.490128993988037, + "loss/hidden": 1.15625, + "loss/logits": 0.22691452503204346, + "loss/reg": 0.013114040717482567, + "step": 1662 + }, + { + "epoch": 0.207875, + "grad_norm": 2.803945302963257, + "grad_norm_var": 0.7307607399187871, + "learning_rate": 0.0001, + "loss": 1.1357, + "loss/crossentropy": 2.6528525352478027, + "loss/hidden": 0.84375, + "loss/logits": 0.1609269231557846, + "loss/reg": 0.013106826692819595, + "step": 1663 + }, + { + "epoch": 0.208, + "grad_norm": 3.9250094890594482, + "grad_norm_var": 0.753468894736533, + "learning_rate": 0.0001, + "loss": 1.5158, + "loss/crossentropy": 2.8934035301208496, + "loss/hidden": 1.1484375, + "loss/logits": 0.23635676503181458, + "loss/reg": 0.013099766336381435, + "step": 1664 + }, + { + "epoch": 0.208125, + "grad_norm": 2.537315607070923, + "grad_norm_var": 0.789891085089917, + "learning_rate": 0.0001, + "loss": 1.1371, + "loss/crossentropy": 2.4187686443328857, + "loss/hidden": 0.8515625, + "loss/logits": 0.15466302633285522, + "loss/reg": 0.013092422857880592, + "step": 1665 + }, + { + "epoch": 0.20825, + "grad_norm": 2.553142547607422, + "grad_norm_var": 0.7987201352076717, + "learning_rate": 0.0001, + "loss": 1.0203, + "loss/crossentropy": 2.137171745300293, + "loss/hidden": 0.76171875, + "loss/logits": 0.12768462300300598, + "loss/reg": 0.013086005114018917, + "step": 1666 + }, + { + "epoch": 0.208375, + "grad_norm": 2.1522605419158936, + "grad_norm_var": 0.8677726892771486, + "learning_rate": 0.0001, + "loss": 0.9736, + "loss/crossentropy": 2.4442384243011475, + "loss/hidden": 0.7109375, + "loss/logits": 0.13187186419963837, + "loss/reg": 0.013079563155770302, + "step": 1667 + }, + { + "epoch": 0.2085, + "grad_norm": 2.8897452354431152, + "grad_norm_var": 0.8658007433698373, + "learning_rate": 0.0001, + "loss": 0.9685, + "loss/crossentropy": 2.4599223136901855, + "loss/hidden": 0.7109375, + "loss/logits": 0.12683042883872986, + "loss/reg": 0.01307291816920042, + "step": 1668 + }, + { + "epoch": 0.208625, + "grad_norm": 3.2847912311553955, + "grad_norm_var": 0.8519672623501406, + "learning_rate": 0.0001, + "loss": 0.9332, + "loss/crossentropy": 2.2193775177001953, + "loss/hidden": 0.6796875, + "loss/logits": 0.12289424240589142, + "loss/reg": 0.013066742569208145, + "step": 1669 + }, + { + "epoch": 0.20875, + "grad_norm": 3.1088175773620605, + "grad_norm_var": 0.8160818976558214, + "learning_rate": 0.0001, + "loss": 1.2098, + "loss/crossentropy": 2.402890920639038, + "loss/hidden": 0.93359375, + "loss/logits": 0.14563772082328796, + "loss/reg": 0.013059607706964016, + "step": 1670 + }, + { + "epoch": 0.208875, + "grad_norm": 3.5626771450042725, + "grad_norm_var": 0.8133001054088592, + "learning_rate": 0.0001, + "loss": 1.1257, + "loss/crossentropy": 2.4826173782348633, + "loss/hidden": 0.8359375, + "loss/logits": 0.15928024053573608, + "loss/reg": 0.013052698224782944, + "step": 1671 + }, + { + "epoch": 0.209, + "grad_norm": 3.8422510623931885, + "grad_norm_var": 0.40753121137815923, + "learning_rate": 0.0001, + "loss": 1.0944, + "loss/crossentropy": 2.2613954544067383, + "loss/hidden": 0.81640625, + "loss/logits": 0.14750930666923523, + "loss/reg": 0.013046123087406158, + "step": 1672 + }, + { + "epoch": 0.209125, + "grad_norm": 3.1732969284057617, + "grad_norm_var": 0.40373591532357217, + "learning_rate": 0.0001, + "loss": 1.148, + "loss/crossentropy": 2.6913723945617676, + "loss/hidden": 0.84765625, + "loss/logits": 0.1699780523777008, + "loss/reg": 0.01303982362151146, + "step": 1673 + }, + { + "epoch": 0.20925, + "grad_norm": 3.5349199771881104, + "grad_norm_var": 0.41114120511671504, + "learning_rate": 0.0001, + "loss": 1.1601, + "loss/crossentropy": 2.636218547821045, + "loss/hidden": 0.859375, + "loss/logits": 0.1703943908214569, + "loss/reg": 0.013032618910074234, + "step": 1674 + }, + { + "epoch": 0.209375, + "grad_norm": 3.3266332149505615, + "grad_norm_var": 0.4117979013874437, + "learning_rate": 0.0001, + "loss": 1.2931, + "loss/crossentropy": 2.5062716007232666, + "loss/hidden": 0.96875, + "loss/logits": 0.1940596103668213, + "loss/reg": 0.013026205822825432, + "step": 1675 + }, + { + "epoch": 0.2095, + "grad_norm": 2.690976142883301, + "grad_norm_var": 0.3757021597893411, + "learning_rate": 0.0001, + "loss": 1.0795, + "loss/crossentropy": 2.6262998580932617, + "loss/hidden": 0.80078125, + "loss/logits": 0.14850963652133942, + "loss/reg": 0.013019601814448833, + "step": 1676 + }, + { + "epoch": 0.209625, + "grad_norm": 2.262477397918701, + "grad_norm_var": 0.3946797642913109, + "learning_rate": 0.0001, + "loss": 1.0682, + "loss/crossentropy": 2.522778272628784, + "loss/hidden": 0.7890625, + "loss/logits": 0.14901478588581085, + "loss/reg": 0.013013252057135105, + "step": 1677 + }, + { + "epoch": 0.20975, + "grad_norm": 4.060770034790039, + "grad_norm_var": 0.3441717651006473, + "learning_rate": 0.0001, + "loss": 1.1066, + "loss/crossentropy": 2.75931715965271, + "loss/hidden": 0.83203125, + "loss/logits": 0.1445423662662506, + "loss/reg": 0.0130059365183115, + "step": 1678 + }, + { + "epoch": 0.209875, + "grad_norm": 3.035757064819336, + "grad_norm_var": 0.3381691610716331, + "learning_rate": 0.0001, + "loss": 1.2199, + "loss/crossentropy": 2.2356534004211426, + "loss/hidden": 0.9140625, + "loss/logits": 0.1757938712835312, + "loss/reg": 0.012999509461224079, + "step": 1679 + }, + { + "epoch": 0.21, + "grad_norm": 3.4442572593688965, + "grad_norm_var": 0.3010964780836121, + "learning_rate": 0.0001, + "loss": 1.3513, + "loss/crossentropy": 2.327789545059204, + "loss/hidden": 1.03125, + "loss/logits": 0.19014956057071686, + "loss/reg": 0.012993012554943562, + "step": 1680 + }, + { + "epoch": 0.210125, + "grad_norm": 3.582969903945923, + "grad_norm_var": 0.29220290919850694, + "learning_rate": 0.0001, + "loss": 1.1276, + "loss/crossentropy": 2.5549631118774414, + "loss/hidden": 0.80859375, + "loss/logits": 0.18918761610984802, + "loss/reg": 0.0129857761785388, + "step": 1681 + }, + { + "epoch": 0.21025, + "grad_norm": 2.9530413150787354, + "grad_norm_var": 0.27002111859183725, + "learning_rate": 0.0001, + "loss": 1.051, + "loss/crossentropy": 2.4788119792938232, + "loss/hidden": 0.77734375, + "loss/logits": 0.14384132623672485, + "loss/reg": 0.012978510931134224, + "step": 1682 + }, + { + "epoch": 0.210375, + "grad_norm": 2.191362142562866, + "grad_norm_var": 0.2647501539546122, + "learning_rate": 0.0001, + "loss": 1.0957, + "loss/crossentropy": 2.532787561416626, + "loss/hidden": 0.80859375, + "loss/logits": 0.15740235149860382, + "loss/reg": 0.012971782125532627, + "step": 1683 + }, + { + "epoch": 0.2105, + "grad_norm": 5.248608112335205, + "grad_norm_var": 0.5199526136507975, + "learning_rate": 0.0001, + "loss": 1.3987, + "loss/crossentropy": 2.741248369216919, + "loss/hidden": 1.0625, + "loss/logits": 0.206559956073761, + "loss/reg": 0.012965119443833828, + "step": 1684 + }, + { + "epoch": 0.210625, + "grad_norm": 9.923733711242676, + "grad_norm_var": 3.2333504676999985, + "learning_rate": 0.0001, + "loss": 1.2908, + "loss/crossentropy": 2.4856460094451904, + "loss/hidden": 1.0078125, + "loss/logits": 0.1534312665462494, + "loss/reg": 0.012957965023815632, + "step": 1685 + }, + { + "epoch": 0.21075, + "grad_norm": 2.7674450874328613, + "grad_norm_var": 3.2696547533182656, + "learning_rate": 0.0001, + "loss": 1.094, + "loss/crossentropy": 2.563523054122925, + "loss/hidden": 0.828125, + "loss/logits": 0.13640597462654114, + "loss/reg": 0.012951286509633064, + "step": 1686 + }, + { + "epoch": 0.210875, + "grad_norm": 2.63311505317688, + "grad_norm_var": 3.3437877784147436, + "learning_rate": 0.0001, + "loss": 1.0298, + "loss/crossentropy": 2.604543685913086, + "loss/hidden": 0.76171875, + "loss/logits": 0.13861128687858582, + "loss/reg": 0.012944560497999191, + "step": 1687 + }, + { + "epoch": 0.211, + "grad_norm": 10.589808464050293, + "grad_norm_var": 6.347074021055989, + "learning_rate": 0.0001, + "loss": 1.3073, + "loss/crossentropy": 2.7445929050445557, + "loss/hidden": 1.0078125, + "loss/logits": 0.1701403260231018, + "loss/reg": 0.01293735858052969, + "step": 1688 + }, + { + "epoch": 0.211125, + "grad_norm": 2.85196590423584, + "grad_norm_var": 6.392746951466915, + "learning_rate": 0.0001, + "loss": 1.0699, + "loss/crossentropy": 2.5519180297851562, + "loss/hidden": 0.8203125, + "loss/logits": 0.12032581865787506, + "loss/reg": 0.012930169701576233, + "step": 1689 + }, + { + "epoch": 0.21125, + "grad_norm": 3.8103320598602295, + "grad_norm_var": 6.377889547085541, + "learning_rate": 0.0001, + "loss": 1.1586, + "loss/crossentropy": 2.479715585708618, + "loss/hidden": 0.87890625, + "loss/logits": 0.15049707889556885, + "loss/reg": 0.012922849506139755, + "step": 1690 + }, + { + "epoch": 0.211375, + "grad_norm": 2.617124080657959, + "grad_norm_var": 6.481173027892926, + "learning_rate": 0.0001, + "loss": 1.0754, + "loss/crossentropy": 2.60794734954834, + "loss/hidden": 0.80078125, + "loss/logits": 0.14545781910419464, + "loss/reg": 0.012915358878672123, + "step": 1691 + }, + { + "epoch": 0.2115, + "grad_norm": 3.9162254333496094, + "grad_norm_var": 6.354372430431797, + "learning_rate": 0.0001, + "loss": 1.1995, + "loss/crossentropy": 2.4997646808624268, + "loss/hidden": 0.921875, + "loss/logits": 0.1485539972782135, + "loss/reg": 0.012908329255878925, + "step": 1692 + }, + { + "epoch": 0.211625, + "grad_norm": 2.7217462062835693, + "grad_norm_var": 6.2539271325365196, + "learning_rate": 0.0001, + "loss": 1.1512, + "loss/crossentropy": 2.617048501968384, + "loss/hidden": 0.79296875, + "loss/logits": 0.22927230596542358, + "loss/reg": 0.012900839559733868, + "step": 1693 + }, + { + "epoch": 0.21175, + "grad_norm": 2.224349021911621, + "grad_norm_var": 6.485761495009887, + "learning_rate": 0.0001, + "loss": 0.9654, + "loss/crossentropy": 2.703951358795166, + "loss/hidden": 0.72265625, + "loss/logits": 0.11382674425840378, + "loss/reg": 0.012893814593553543, + "step": 1694 + }, + { + "epoch": 0.211875, + "grad_norm": 2.9117870330810547, + "grad_norm_var": 6.503189101884489, + "learning_rate": 0.0001, + "loss": 1.2238, + "loss/crossentropy": 2.5064597129821777, + "loss/hidden": 0.8984375, + "loss/logits": 0.1964527666568756, + "loss/reg": 0.012886927463114262, + "step": 1695 + }, + { + "epoch": 0.212, + "grad_norm": 2.943073034286499, + "grad_norm_var": 6.557645425580934, + "learning_rate": 0.0001, + "loss": 1.0268, + "loss/crossentropy": 2.681680679321289, + "loss/hidden": 0.75, + "loss/logits": 0.14798036217689514, + "loss/reg": 0.012880067341029644, + "step": 1696 + }, + { + "epoch": 0.212125, + "grad_norm": 2.8364195823669434, + "grad_norm_var": 6.633285254118307, + "learning_rate": 0.0001, + "loss": 1.2396, + "loss/crossentropy": 2.267443895339966, + "loss/hidden": 0.94140625, + "loss/logits": 0.16948428750038147, + "loss/reg": 0.012873291037976742, + "step": 1697 + }, + { + "epoch": 0.21225, + "grad_norm": 2.667727470397949, + "grad_norm_var": 6.676156819217446, + "learning_rate": 0.0001, + "loss": 1.1879, + "loss/crossentropy": 2.5819482803344727, + "loss/hidden": 0.8671875, + "loss/logits": 0.1920512467622757, + "loss/reg": 0.012866680510342121, + "step": 1698 + }, + { + "epoch": 0.212375, + "grad_norm": 3.2915964126586914, + "grad_norm_var": 6.496990351425703, + "learning_rate": 0.0001, + "loss": 1.0907, + "loss/crossentropy": 2.5162808895111084, + "loss/hidden": 0.828125, + "loss/logits": 0.13395392894744873, + "loss/reg": 0.012859686277806759, + "step": 1699 + }, + { + "epoch": 0.2125, + "grad_norm": 2.9825503826141357, + "grad_norm_var": 6.439824510110865, + "learning_rate": 0.0001, + "loss": 1.2178, + "loss/crossentropy": 2.2996246814727783, + "loss/hidden": 0.9140625, + "loss/logits": 0.1752283126115799, + "loss/reg": 0.01285255141556263, + "step": 1700 + }, + { + "epoch": 0.212625, + "grad_norm": 2.9510700702667236, + "grad_norm_var": 3.836942936103522, + "learning_rate": 0.0001, + "loss": 1.3336, + "loss/crossentropy": 2.3370251655578613, + "loss/hidden": 1.015625, + "loss/logits": 0.1895022988319397, + "loss/reg": 0.01284568477421999, + "step": 1701 + }, + { + "epoch": 0.21275, + "grad_norm": 2.5308520793914795, + "grad_norm_var": 3.8610195504163127, + "learning_rate": 0.0001, + "loss": 1.2202, + "loss/crossentropy": 2.513655185699463, + "loss/hidden": 0.91015625, + "loss/logits": 0.18170946836471558, + "loss/reg": 0.01283828355371952, + "step": 1702 + }, + { + "epoch": 0.212875, + "grad_norm": 4.751686096191406, + "grad_norm_var": 3.923506474684037, + "learning_rate": 0.0001, + "loss": 1.2352, + "loss/crossentropy": 2.173074245452881, + "loss/hidden": 0.953125, + "loss/logits": 0.15377816557884216, + "loss/reg": 0.012831181287765503, + "step": 1703 + }, + { + "epoch": 0.213, + "grad_norm": 3.511111259460449, + "grad_norm_var": 0.3989999503182124, + "learning_rate": 0.0001, + "loss": 1.2827, + "loss/crossentropy": 2.21486759185791, + "loss/hidden": 0.9765625, + "loss/logits": 0.17789113521575928, + "loss/reg": 0.01282414235174656, + "step": 1704 + }, + { + "epoch": 0.213125, + "grad_norm": 2.6096677780151367, + "grad_norm_var": 0.4105200098953181, + "learning_rate": 0.0001, + "loss": 1.0013, + "loss/crossentropy": 2.316843271255493, + "loss/hidden": 0.74609375, + "loss/logits": 0.12707841396331787, + "loss/reg": 0.012817141599953175, + "step": 1705 + }, + { + "epoch": 0.21325, + "grad_norm": 2.8105530738830566, + "grad_norm_var": 0.375613954977198, + "learning_rate": 0.0001, + "loss": 1.0661, + "loss/crossentropy": 2.369736909866333, + "loss/hidden": 0.8046875, + "loss/logits": 0.13333451747894287, + "loss/reg": 0.012809785082936287, + "step": 1706 + }, + { + "epoch": 0.213375, + "grad_norm": 3.1986374855041504, + "grad_norm_var": 0.36571755056712973, + "learning_rate": 0.0001, + "loss": 1.057, + "loss/crossentropy": 2.320181131362915, + "loss/hidden": 0.8046875, + "loss/logits": 0.12431928515434265, + "loss/reg": 0.012802771292626858, + "step": 1707 + }, + { + "epoch": 0.2135, + "grad_norm": 2.6183996200561523, + "grad_norm_var": 0.32173357495411, + "learning_rate": 0.0001, + "loss": 1.0431, + "loss/crossentropy": 2.4588027000427246, + "loss/hidden": 0.77734375, + "loss/logits": 0.13782772421836853, + "loss/reg": 0.01279559638351202, + "step": 1708 + }, + { + "epoch": 0.213625, + "grad_norm": 2.6844642162323, + "grad_norm_var": 0.3230673077730707, + "learning_rate": 0.0001, + "loss": 1.12, + "loss/crossentropy": 2.0865495204925537, + "loss/hidden": 0.8359375, + "loss/logits": 0.1561318039894104, + "loss/reg": 0.012788429856300354, + "step": 1709 + }, + { + "epoch": 0.21375, + "grad_norm": 15.15482234954834, + "grad_norm_var": 9.486914195081253, + "learning_rate": 0.0001, + "loss": 1.095, + "loss/crossentropy": 2.5281543731689453, + "loss/hidden": 0.82421875, + "loss/logits": 0.14299717545509338, + "loss/reg": 0.012780962511897087, + "step": 1710 + }, + { + "epoch": 0.213875, + "grad_norm": 3.1808626651763916, + "grad_norm_var": 9.460348003251534, + "learning_rate": 0.0001, + "loss": 1.1502, + "loss/crossentropy": 2.383219003677368, + "loss/hidden": 0.89453125, + "loss/logits": 0.12796291708946228, + "loss/reg": 0.012773919850587845, + "step": 1711 + }, + { + "epoch": 0.214, + "grad_norm": 4.163309574127197, + "grad_norm_var": 9.414766565786032, + "learning_rate": 0.0001, + "loss": 1.3702, + "loss/crossentropy": 2.455665111541748, + "loss/hidden": 1.03125, + "loss/logits": 0.21129587292671204, + "loss/reg": 0.012766973115503788, + "step": 1712 + }, + { + "epoch": 0.214125, + "grad_norm": 2.4835665225982666, + "grad_norm_var": 9.471244857981635, + "learning_rate": 0.0001, + "loss": 1.0306, + "loss/crossentropy": 2.1449975967407227, + "loss/hidden": 0.765625, + "loss/logits": 0.13736189901828766, + "loss/reg": 0.012759552337229252, + "step": 1713 + }, + { + "epoch": 0.21425, + "grad_norm": 3.456458806991577, + "grad_norm_var": 9.385853171996628, + "learning_rate": 0.0001, + "loss": 1.0432, + "loss/crossentropy": 2.746030330657959, + "loss/hidden": 0.78125, + "loss/logits": 0.1344452202320099, + "loss/reg": 0.01275256834924221, + "step": 1714 + }, + { + "epoch": 0.214375, + "grad_norm": 2.5408084392547607, + "grad_norm_var": 9.481860031377286, + "learning_rate": 0.0001, + "loss": 1.1865, + "loss/crossentropy": 2.4518415927886963, + "loss/hidden": 0.875, + "loss/logits": 0.18404340744018555, + "loss/reg": 0.01274561882019043, + "step": 1715 + }, + { + "epoch": 0.2145, + "grad_norm": 2.634857654571533, + "grad_norm_var": 9.529713299554377, + "learning_rate": 0.0001, + "loss": 1.0157, + "loss/crossentropy": 2.7000675201416016, + "loss/hidden": 0.7421875, + "loss/logits": 0.14611287415027618, + "loss/reg": 0.012738562189042568, + "step": 1716 + }, + { + "epoch": 0.214625, + "grad_norm": 2.735212564468384, + "grad_norm_var": 9.557923964972339, + "learning_rate": 0.0001, + "loss": 1.1027, + "loss/crossentropy": 2.5324275493621826, + "loss/hidden": 0.83203125, + "loss/logits": 0.14335589110851288, + "loss/reg": 0.012731647118926048, + "step": 1717 + }, + { + "epoch": 0.21475, + "grad_norm": 2.8685081005096436, + "grad_norm_var": 9.50716521368737, + "learning_rate": 0.0001, + "loss": 1.0448, + "loss/crossentropy": 2.559062957763672, + "loss/hidden": 0.77734375, + "loss/logits": 0.14023086428642273, + "loss/reg": 0.012724741362035275, + "step": 1718 + }, + { + "epoch": 0.214875, + "grad_norm": 2.7883400917053223, + "grad_norm_var": 9.508818411578824, + "learning_rate": 0.0001, + "loss": 1.1709, + "loss/crossentropy": 2.7375028133392334, + "loss/hidden": 0.86328125, + "loss/logits": 0.18043699860572815, + "loss/reg": 0.012717757374048233, + "step": 1719 + }, + { + "epoch": 0.215, + "grad_norm": 4.499538421630859, + "grad_norm_var": 9.54301307944679, + "learning_rate": 0.0001, + "loss": 1.394, + "loss/crossentropy": 2.3297677040100098, + "loss/hidden": 1.046875, + "loss/logits": 0.2200247049331665, + "loss/reg": 0.012710826471447945, + "step": 1720 + }, + { + "epoch": 0.215125, + "grad_norm": 2.6797218322753906, + "grad_norm_var": 9.53241861946436, + "learning_rate": 0.0001, + "loss": 1.2027, + "loss/crossentropy": 2.094529628753662, + "loss/hidden": 0.92578125, + "loss/logits": 0.1498618870973587, + "loss/reg": 0.012703881599009037, + "step": 1721 + }, + { + "epoch": 0.21525, + "grad_norm": 2.300706148147583, + "grad_norm_var": 9.614644455093018, + "learning_rate": 0.0001, + "loss": 1.056, + "loss/crossentropy": 2.341108560562134, + "loss/hidden": 0.78125, + "loss/logits": 0.14775308966636658, + "loss/reg": 0.012696735560894012, + "step": 1722 + }, + { + "epoch": 0.215375, + "grad_norm": 2.584787607192993, + "grad_norm_var": 9.683262071884398, + "learning_rate": 0.0001, + "loss": 1.2349, + "loss/crossentropy": 2.511127471923828, + "loss/hidden": 0.94140625, + "loss/logits": 0.16660267114639282, + "loss/reg": 0.012690168805420399, + "step": 1723 + }, + { + "epoch": 0.2155, + "grad_norm": 2.7051329612731934, + "grad_norm_var": 9.67109810339239, + "learning_rate": 0.0001, + "loss": 1.0733, + "loss/crossentropy": 2.2387161254882812, + "loss/hidden": 0.796875, + "loss/logits": 0.14958390593528748, + "loss/reg": 0.012683309614658356, + "step": 1724 + }, + { + "epoch": 0.215625, + "grad_norm": 2.6227893829345703, + "grad_norm_var": 9.679821099755788, + "learning_rate": 0.0001, + "loss": 1.0231, + "loss/crossentropy": 2.2900543212890625, + "loss/hidden": 0.76953125, + "loss/logits": 0.1268245130777359, + "loss/reg": 0.012676582671701908, + "step": 1725 + }, + { + "epoch": 0.21575, + "grad_norm": 3.2791683673858643, + "grad_norm_var": 0.3762032236898106, + "learning_rate": 0.0001, + "loss": 1.3265, + "loss/crossentropy": 2.2019684314727783, + "loss/hidden": 1.0078125, + "loss/logits": 0.19199709594249725, + "loss/reg": 0.01266949437558651, + "step": 1726 + }, + { + "epoch": 0.215875, + "grad_norm": 3.5843939781188965, + "grad_norm_var": 0.3977131857555946, + "learning_rate": 0.0001, + "loss": 1.1419, + "loss/crossentropy": 2.474095106124878, + "loss/hidden": 0.8515625, + "loss/logits": 0.16370782256126404, + "loss/reg": 0.012662646360695362, + "step": 1727 + }, + { + "epoch": 0.216, + "grad_norm": 4.510242938995361, + "grad_norm_var": 0.45925816137897757, + "learning_rate": 0.0001, + "loss": 1.0694, + "loss/crossentropy": 2.829896926879883, + "loss/hidden": 0.8046875, + "loss/logits": 0.13811606168746948, + "loss/reg": 0.012655883096158504, + "step": 1728 + }, + { + "epoch": 0.216125, + "grad_norm": 3.1752772331237793, + "grad_norm_var": 0.4399517134814609, + "learning_rate": 0.0001, + "loss": 1.2447, + "loss/crossentropy": 2.6229610443115234, + "loss/hidden": 0.92578125, + "loss/logits": 0.19241932034492493, + "loss/reg": 0.012648857198655605, + "step": 1729 + }, + { + "epoch": 0.21625, + "grad_norm": 3.531876802444458, + "grad_norm_var": 0.44429015337232514, + "learning_rate": 0.0001, + "loss": 1.2492, + "loss/crossentropy": 2.466808795928955, + "loss/hidden": 0.91015625, + "loss/logits": 0.2125817835330963, + "loss/reg": 0.01264151744544506, + "step": 1730 + }, + { + "epoch": 0.216375, + "grad_norm": 7.509683609008789, + "grad_norm_var": 1.6400556058435427, + "learning_rate": 0.0001, + "loss": 1.6584, + "loss/crossentropy": 2.631517171859741, + "loss/hidden": 1.2890625, + "loss/logits": 0.24304136633872986, + "loss/reg": 0.012634233571588993, + "step": 1731 + }, + { + "epoch": 0.2165, + "grad_norm": 2.4990897178649902, + "grad_norm_var": 1.6546175936024687, + "learning_rate": 0.0001, + "loss": 1.041, + "loss/crossentropy": 2.5932819843292236, + "loss/hidden": 0.77734375, + "loss/logits": 0.13742858171463013, + "loss/reg": 0.012627250514924526, + "step": 1732 + }, + { + "epoch": 0.216625, + "grad_norm": 2.389944553375244, + "grad_norm_var": 1.6911601234778078, + "learning_rate": 0.0001, + "loss": 0.9812, + "loss/crossentropy": 2.318802833557129, + "loss/hidden": 0.73046875, + "loss/logits": 0.12450764328241348, + "loss/reg": 0.012620464898645878, + "step": 1733 + }, + { + "epoch": 0.21675, + "grad_norm": 2.6159985065460205, + "grad_norm_var": 1.7112070581365304, + "learning_rate": 0.0001, + "loss": 1.141, + "loss/crossentropy": 2.716378688812256, + "loss/hidden": 0.84375, + "loss/logits": 0.17106443643569946, + "loss/reg": 0.012613578699529171, + "step": 1734 + }, + { + "epoch": 0.216875, + "grad_norm": 3.3391213417053223, + "grad_norm_var": 1.6904040902109745, + "learning_rate": 0.0001, + "loss": 1.1009, + "loss/crossentropy": 2.4364097118377686, + "loss/hidden": 0.828125, + "loss/logits": 0.14670366048812866, + "loss/reg": 0.012606512755155563, + "step": 1735 + }, + { + "epoch": 0.217, + "grad_norm": 3.596266984939575, + "grad_norm_var": 1.604664018069381, + "learning_rate": 0.0001, + "loss": 1.2259, + "loss/crossentropy": 2.486934185028076, + "loss/hidden": 0.94140625, + "loss/logits": 0.15848477184772491, + "loss/reg": 0.012599713169038296, + "step": 1736 + }, + { + "epoch": 0.217125, + "grad_norm": 2.6140332221984863, + "grad_norm_var": 1.6104343887098584, + "learning_rate": 0.0001, + "loss": 1.0859, + "loss/crossentropy": 2.3189902305603027, + "loss/hidden": 0.82421875, + "loss/logits": 0.13571876287460327, + "loss/reg": 0.012592630460858345, + "step": 1737 + }, + { + "epoch": 0.21725, + "grad_norm": 2.87418532371521, + "grad_norm_var": 1.554299756346355, + "learning_rate": 0.0001, + "loss": 1.2125, + "loss/crossentropy": 2.095885992050171, + "loss/hidden": 0.91796875, + "loss/logits": 0.16862446069717407, + "loss/reg": 0.012585851363837719, + "step": 1738 + }, + { + "epoch": 0.217375, + "grad_norm": 4.191568851470947, + "grad_norm_var": 1.5539712836144928, + "learning_rate": 0.0001, + "loss": 1.7198, + "loss/crossentropy": 2.424238681793213, + "loss/hidden": 1.3046875, + "loss/logits": 0.2893022298812866, + "loss/reg": 0.012578519992530346, + "step": 1739 + }, + { + "epoch": 0.2175, + "grad_norm": 2.666806936264038, + "grad_norm_var": 1.5578179682411237, + "learning_rate": 0.0001, + "loss": 1.1622, + "loss/crossentropy": 2.2649426460266113, + "loss/hidden": 0.875, + "loss/logits": 0.16145509481430054, + "loss/reg": 0.01257177721709013, + "step": 1740 + }, + { + "epoch": 0.217625, + "grad_norm": 2.6806042194366455, + "grad_norm_var": 1.5517463474106703, + "learning_rate": 0.0001, + "loss": 1.0853, + "loss/crossentropy": 2.4488718509674072, + "loss/hidden": 0.80078125, + "loss/logits": 0.15884113311767578, + "loss/reg": 0.012565212324261665, + "step": 1741 + }, + { + "epoch": 0.21775, + "grad_norm": 3.704366683959961, + "grad_norm_var": 1.5538631925025423, + "learning_rate": 0.0001, + "loss": 1.2354, + "loss/crossentropy": 2.6138522624969482, + "loss/hidden": 0.94140625, + "loss/logits": 0.16840386390686035, + "loss/reg": 0.012558743357658386, + "step": 1742 + }, + { + "epoch": 0.217875, + "grad_norm": 3.2077255249023438, + "grad_norm_var": 1.556870797982151, + "learning_rate": 0.0001, + "loss": 1.316, + "loss/crossentropy": 2.141016721725464, + "loss/hidden": 1.0390625, + "loss/logits": 0.15139830112457275, + "loss/reg": 0.012552441097795963, + "step": 1743 + }, + { + "epoch": 0.218, + "grad_norm": 2.951503038406372, + "grad_norm_var": 1.4871620619995438, + "learning_rate": 0.0001, + "loss": 1.0989, + "loss/crossentropy": 2.5490455627441406, + "loss/hidden": 0.8203125, + "loss/logits": 0.1530894637107849, + "loss/reg": 0.012546169571578503, + "step": 1744 + }, + { + "epoch": 0.218125, + "grad_norm": 3.119734525680542, + "grad_norm_var": 1.488624773240887, + "learning_rate": 0.0001, + "loss": 1.0844, + "loss/crossentropy": 2.4211771488189697, + "loss/hidden": 0.81640625, + "loss/logits": 0.14257250726222992, + "loss/reg": 0.012539266608655453, + "step": 1745 + }, + { + "epoch": 0.21825, + "grad_norm": 3.663892984390259, + "grad_norm_var": 1.4930337180673443, + "learning_rate": 0.0001, + "loss": 1.0072, + "loss/crossentropy": 2.431640148162842, + "loss/hidden": 0.76953125, + "loss/logits": 0.11231502890586853, + "loss/reg": 0.012532477267086506, + "step": 1746 + }, + { + "epoch": 0.218375, + "grad_norm": 2.77474045753479, + "grad_norm_var": 0.26911648905078756, + "learning_rate": 0.0001, + "loss": 1.0839, + "loss/crossentropy": 2.7117726802825928, + "loss/hidden": 0.80859375, + "loss/logits": 0.15006102621555328, + "loss/reg": 0.012525715865194798, + "step": 1747 + }, + { + "epoch": 0.2185, + "grad_norm": 2.6455776691436768, + "grad_norm_var": 0.25958807313815746, + "learning_rate": 0.0001, + "loss": 1.0041, + "loss/crossentropy": 2.7506678104400635, + "loss/hidden": 0.74609375, + "loss/logits": 0.1328078657388687, + "loss/reg": 0.012519202195107937, + "step": 1748 + }, + { + "epoch": 0.218625, + "grad_norm": 24.219947814941406, + "grad_norm_var": 28.07975632569756, + "learning_rate": 0.0001, + "loss": 1.4577, + "loss/crossentropy": 2.543936014175415, + "loss/hidden": 1.203125, + "loss/logits": 0.12944281101226807, + "loss/reg": 0.01251268945634365, + "step": 1749 + }, + { + "epoch": 0.21875, + "grad_norm": 37.90777587890625, + "grad_norm_var": 97.39228721131126, + "learning_rate": 0.0001, + "loss": 1.1124, + "loss/crossentropy": 2.2267796993255615, + "loss/hidden": 0.8515625, + "loss/logits": 0.1357845962047577, + "loss/reg": 0.012506171129643917, + "step": 1750 + }, + { + "epoch": 0.218875, + "grad_norm": 3.590038537979126, + "grad_norm_var": 97.28596098453178, + "learning_rate": 0.0001, + "loss": 1.0614, + "loss/crossentropy": 2.42260479927063, + "loss/hidden": 0.8046875, + "loss/logits": 0.1316903978586197, + "loss/reg": 0.012499329634010792, + "step": 1751 + }, + { + "epoch": 0.219, + "grad_norm": 2.845022439956665, + "grad_norm_var": 97.6271689383777, + "learning_rate": 0.0001, + "loss": 1.092, + "loss/crossentropy": 2.804971218109131, + "loss/hidden": 0.82421875, + "loss/logits": 0.14280739426612854, + "loss/reg": 0.012492484413087368, + "step": 1752 + }, + { + "epoch": 0.219125, + "grad_norm": 3.189939260482788, + "grad_norm_var": 97.34154979157398, + "learning_rate": 0.0001, + "loss": 1.1496, + "loss/crossentropy": 2.448812484741211, + "loss/hidden": 0.875, + "loss/logits": 0.14971600472927094, + "loss/reg": 0.012485613115131855, + "step": 1753 + }, + { + "epoch": 0.21925, + "grad_norm": 2.670297145843506, + "grad_norm_var": 97.44651079060357, + "learning_rate": 0.0001, + "loss": 1.1915, + "loss/crossentropy": 2.4728586673736572, + "loss/hidden": 0.87890625, + "loss/logits": 0.1877635270357132, + "loss/reg": 0.012478794902563095, + "step": 1754 + }, + { + "epoch": 0.219375, + "grad_norm": 3.3437023162841797, + "grad_norm_var": 97.76674601970879, + "learning_rate": 0.0001, + "loss": 1.1638, + "loss/crossentropy": 2.3619091510772705, + "loss/hidden": 0.87890625, + "loss/logits": 0.1601634919643402, + "loss/reg": 0.012471921741962433, + "step": 1755 + }, + { + "epoch": 0.2195, + "grad_norm": 3.5836129188537598, + "grad_norm_var": 97.34167870831958, + "learning_rate": 0.0001, + "loss": 1.3367, + "loss/crossentropy": 2.671821355819702, + "loss/hidden": 1.03125, + "loss/logits": 0.1808364987373352, + "loss/reg": 0.012465192936360836, + "step": 1756 + }, + { + "epoch": 0.219625, + "grad_norm": 2.6130692958831787, + "grad_norm_var": 97.37753712214686, + "learning_rate": 0.0001, + "loss": 0.9876, + "loss/crossentropy": 2.5102763175964355, + "loss/hidden": 0.73828125, + "loss/logits": 0.12473314255475998, + "loss/reg": 0.01245811302214861, + "step": 1757 + }, + { + "epoch": 0.21975, + "grad_norm": 3.395615339279175, + "grad_norm_var": 97.50380796911385, + "learning_rate": 0.0001, + "loss": 1.2064, + "loss/crossentropy": 2.485377073287964, + "loss/hidden": 0.9140625, + "loss/logits": 0.16786439716815948, + "loss/reg": 0.01245111133903265, + "step": 1758 + }, + { + "epoch": 0.219875, + "grad_norm": 4.825221061706543, + "grad_norm_var": 96.93408061336578, + "learning_rate": 0.0001, + "loss": 1.0737, + "loss/crossentropy": 2.5381832122802734, + "loss/hidden": 0.8125, + "loss/logits": 0.13672339916229248, + "loss/reg": 0.012444333173334599, + "step": 1759 + }, + { + "epoch": 0.22, + "grad_norm": 2.695854663848877, + "grad_norm_var": 97.06623592058641, + "learning_rate": 0.0001, + "loss": 1.2057, + "loss/crossentropy": 2.5472490787506104, + "loss/hidden": 0.90625, + "loss/logits": 0.17507830262184143, + "loss/reg": 0.012437481433153152, + "step": 1760 + }, + { + "epoch": 0.220125, + "grad_norm": 2.619779109954834, + "grad_norm_var": 97.32003810205347, + "learning_rate": 0.0001, + "loss": 1.0803, + "loss/crossentropy": 2.5513741970062256, + "loss/hidden": 0.81640625, + "loss/logits": 0.13957199454307556, + "loss/reg": 0.012430677190423012, + "step": 1761 + }, + { + "epoch": 0.22025, + "grad_norm": 2.6490063667297363, + "grad_norm_var": 97.79004434756365, + "learning_rate": 0.0001, + "loss": 1.1991, + "loss/crossentropy": 2.408766031265259, + "loss/hidden": 0.921875, + "loss/logits": 0.15302732586860657, + "loss/reg": 0.012424097396433353, + "step": 1762 + }, + { + "epoch": 0.220375, + "grad_norm": 3.600940465927124, + "grad_norm_var": 97.41152871154476, + "learning_rate": 0.0001, + "loss": 1.0448, + "loss/crossentropy": 2.567258358001709, + "loss/hidden": 0.79296875, + "loss/logits": 0.12761420011520386, + "loss/reg": 0.012417309917509556, + "step": 1763 + }, + { + "epoch": 0.2205, + "grad_norm": 2.783167600631714, + "grad_norm_var": 97.33925474643891, + "learning_rate": 0.0001, + "loss": 1.2647, + "loss/crossentropy": 2.4639530181884766, + "loss/hidden": 0.98828125, + "loss/logits": 0.15236002206802368, + "loss/reg": 0.012410394847393036, + "step": 1764 + }, + { + "epoch": 0.220625, + "grad_norm": 2.8482487201690674, + "grad_norm_var": 75.84316673308427, + "learning_rate": 0.0001, + "loss": 1.1159, + "loss/crossentropy": 2.614351511001587, + "loss/hidden": 0.84765625, + "loss/logits": 0.14418500661849976, + "loss/reg": 0.012403246946632862, + "step": 1765 + }, + { + "epoch": 0.22075, + "grad_norm": 4.04698371887207, + "grad_norm_var": 0.388008374816809, + "learning_rate": 0.0001, + "loss": 1.3027, + "loss/crossentropy": 2.6768503189086914, + "loss/hidden": 1.0, + "loss/logits": 0.17873653769493103, + "loss/reg": 0.012396049685776234, + "step": 1766 + }, + { + "epoch": 0.220875, + "grad_norm": 16.276508331298828, + "grad_norm_var": 11.096302421984861, + "learning_rate": 0.0001, + "loss": 1.1526, + "loss/crossentropy": 2.6857051849365234, + "loss/hidden": 0.87890625, + "loss/logits": 0.14977185428142548, + "loss/reg": 0.01238931342959404, + "step": 1767 + }, + { + "epoch": 0.221, + "grad_norm": 3.4267630577087402, + "grad_norm_var": 11.027930664929489, + "learning_rate": 0.0001, + "loss": 1.0696, + "loss/crossentropy": 2.5231096744537354, + "loss/hidden": 0.80078125, + "loss/logits": 0.1450020670890808, + "loss/reg": 0.01238264236599207, + "step": 1768 + }, + { + "epoch": 0.221125, + "grad_norm": 2.5508995056152344, + "grad_norm_var": 11.12550393762463, + "learning_rate": 0.0001, + "loss": 1.1015, + "loss/crossentropy": 2.5338897705078125, + "loss/hidden": 0.8125, + "loss/logits": 0.1652265191078186, + "loss/reg": 0.012375940568745136, + "step": 1769 + }, + { + "epoch": 0.22125, + "grad_norm": 2.664597511291504, + "grad_norm_var": 11.126513136887851, + "learning_rate": 0.0001, + "loss": 1.0946, + "loss/crossentropy": 2.4849798679351807, + "loss/hidden": 0.80859375, + "loss/logits": 0.16228261590003967, + "loss/reg": 0.012369180098176003, + "step": 1770 + }, + { + "epoch": 0.221375, + "grad_norm": 4.765702247619629, + "grad_norm_var": 11.129360295504242, + "learning_rate": 0.0001, + "loss": 1.1274, + "loss/crossentropy": 2.440256357192993, + "loss/hidden": 0.85546875, + "loss/logits": 0.14826975762844086, + "loss/reg": 0.01236215140670538, + "step": 1771 + }, + { + "epoch": 0.2215, + "grad_norm": 3.133784294128418, + "grad_norm_var": 11.172026082855703, + "learning_rate": 0.0001, + "loss": 1.0617, + "loss/crossentropy": 2.7009589672088623, + "loss/hidden": 0.7890625, + "loss/logits": 0.14906959235668182, + "loss/reg": 0.012355109676718712, + "step": 1772 + }, + { + "epoch": 0.221625, + "grad_norm": 3.329200506210327, + "grad_norm_var": 11.066300955859553, + "learning_rate": 0.0001, + "loss": 1.064, + "loss/crossentropy": 2.5220742225646973, + "loss/hidden": 0.8125, + "loss/logits": 0.12804913520812988, + "loss/reg": 0.01234830915927887, + "step": 1773 + }, + { + "epoch": 0.22175, + "grad_norm": 4.502491474151611, + "grad_norm_var": 11.038805635564632, + "learning_rate": 0.0001, + "loss": 1.1216, + "loss/crossentropy": 2.4427030086517334, + "loss/hidden": 0.8515625, + "loss/logits": 0.14662238955497742, + "loss/reg": 0.012341534718871117, + "step": 1774 + }, + { + "epoch": 0.221875, + "grad_norm": 4.261502265930176, + "grad_norm_var": 11.009414759143056, + "learning_rate": 0.0001, + "loss": 1.2303, + "loss/crossentropy": 2.463496208190918, + "loss/hidden": 0.921875, + "loss/logits": 0.18511801958084106, + "loss/reg": 0.012334545142948627, + "step": 1775 + }, + { + "epoch": 0.222, + "grad_norm": 2.7611560821533203, + "grad_norm_var": 10.997153332448207, + "learning_rate": 0.0001, + "loss": 1.103, + "loss/crossentropy": 2.643156051635742, + "loss/hidden": 0.80859375, + "loss/logits": 0.17114627361297607, + "loss/reg": 0.012327339500188828, + "step": 1776 + }, + { + "epoch": 0.222125, + "grad_norm": 2.4437193870544434, + "grad_norm_var": 11.034748998650539, + "learning_rate": 0.0001, + "loss": 1.0003, + "loss/crossentropy": 2.910154342651367, + "loss/hidden": 0.73828125, + "loss/logits": 0.13876628875732422, + "loss/reg": 0.012320267036557198, + "step": 1777 + }, + { + "epoch": 0.22225, + "grad_norm": 4.437688827514648, + "grad_norm_var": 10.88203350793716, + "learning_rate": 0.0001, + "loss": 1.3096, + "loss/crossentropy": 2.2305355072021484, + "loss/hidden": 1.0, + "loss/logits": 0.1865069717168808, + "loss/reg": 0.012313010171055794, + "step": 1778 + }, + { + "epoch": 0.222375, + "grad_norm": 3.549940586090088, + "grad_norm_var": 10.886538839719051, + "learning_rate": 0.0001, + "loss": 1.2606, + "loss/crossentropy": 2.4338810443878174, + "loss/hidden": 0.9609375, + "loss/logits": 0.1766429841518402, + "loss/reg": 0.012306897900998592, + "step": 1779 + }, + { + "epoch": 0.2225, + "grad_norm": 3.347196578979492, + "grad_norm_var": 10.797133407006523, + "learning_rate": 0.0001, + "loss": 1.1304, + "loss/crossentropy": 2.835353374481201, + "loss/hidden": 0.84765625, + "loss/logits": 0.1597260981798172, + "loss/reg": 0.012300064787268639, + "step": 1780 + }, + { + "epoch": 0.222625, + "grad_norm": 3.352013111114502, + "grad_norm_var": 10.717386787566932, + "learning_rate": 0.0001, + "loss": 0.9526, + "loss/crossentropy": 2.4358811378479004, + "loss/hidden": 0.7265625, + "loss/logits": 0.10308530926704407, + "loss/reg": 0.012293456122279167, + "step": 1781 + }, + { + "epoch": 0.22275, + "grad_norm": 3.733184576034546, + "grad_norm_var": 10.73425846404735, + "learning_rate": 0.0001, + "loss": 1.0594, + "loss/crossentropy": 2.7393431663513184, + "loss/hidden": 0.796875, + "loss/logits": 0.139676034450531, + "loss/reg": 0.012287287972867489, + "step": 1782 + }, + { + "epoch": 0.222875, + "grad_norm": 2.7327675819396973, + "grad_norm_var": 0.5414954532598804, + "learning_rate": 0.0001, + "loss": 0.9985, + "loss/crossentropy": 2.4664580821990967, + "loss/hidden": 0.75, + "loss/logits": 0.1256864368915558, + "loss/reg": 0.012280437164008617, + "step": 1783 + }, + { + "epoch": 0.223, + "grad_norm": 2.7125771045684814, + "grad_norm_var": 0.5743527285802199, + "learning_rate": 0.0001, + "loss": 1.0003, + "loss/crossentropy": 2.3793299198150635, + "loss/hidden": 0.75, + "loss/logits": 0.12756191194057465, + "loss/reg": 0.012273788452148438, + "step": 1784 + }, + { + "epoch": 0.223125, + "grad_norm": 4.580681800842285, + "grad_norm_var": 0.6041116655885861, + "learning_rate": 0.0001, + "loss": 1.5202, + "loss/crossentropy": 1.8176430463790894, + "loss/hidden": 1.171875, + "loss/logits": 0.22560694813728333, + "loss/reg": 0.012266921810805798, + "step": 1785 + }, + { + "epoch": 0.22325, + "grad_norm": 3.310520887374878, + "grad_norm_var": 0.5565812947330983, + "learning_rate": 0.0001, + "loss": 1.0981, + "loss/crossentropy": 2.3666412830352783, + "loss/hidden": 0.8359375, + "loss/logits": 0.13953973352909088, + "loss/reg": 0.012260017916560173, + "step": 1786 + }, + { + "epoch": 0.223375, + "grad_norm": 3.3059277534484863, + "grad_norm_var": 0.45501991794078456, + "learning_rate": 0.0001, + "loss": 1.198, + "loss/crossentropy": 2.3159430027008057, + "loss/hidden": 0.90625, + "loss/logits": 0.16920451819896698, + "loss/reg": 0.012253600172698498, + "step": 1787 + }, + { + "epoch": 0.2235, + "grad_norm": 2.687586784362793, + "grad_norm_var": 0.4873702987343137, + "learning_rate": 0.0001, + "loss": 1.0853, + "loss/crossentropy": 2.2798428535461426, + "loss/hidden": 0.8359375, + "loss/logits": 0.12685424089431763, + "loss/reg": 0.012246805243194103, + "step": 1788 + }, + { + "epoch": 0.223625, + "grad_norm": 2.5434353351593018, + "grad_norm_var": 0.5376211993019903, + "learning_rate": 0.0001, + "loss": 0.9544, + "loss/crossentropy": 2.6347262859344482, + "loss/hidden": 0.703125, + "loss/logits": 0.12888537347316742, + "loss/reg": 0.012239991687238216, + "step": 1789 + }, + { + "epoch": 0.22375, + "grad_norm": 2.1676807403564453, + "grad_norm_var": 0.5324380567161892, + "learning_rate": 0.0001, + "loss": 1.0395, + "loss/crossentropy": 2.4155385494232178, + "loss/hidden": 0.78125, + "loss/logits": 0.13590013980865479, + "loss/reg": 0.012233071029186249, + "step": 1790 + }, + { + "epoch": 0.223875, + "grad_norm": 2.7618322372436523, + "grad_norm_var": 0.4698401846206546, + "learning_rate": 0.0001, + "loss": 0.9624, + "loss/crossentropy": 2.396468162536621, + "loss/hidden": 0.7109375, + "loss/logits": 0.12916944921016693, + "loss/reg": 0.012226960621774197, + "step": 1791 + }, + { + "epoch": 0.224, + "grad_norm": 2.605301856994629, + "grad_norm_var": 0.4794749872916848, + "learning_rate": 0.0001, + "loss": 1.0749, + "loss/crossentropy": 2.55946946144104, + "loss/hidden": 0.80859375, + "loss/logits": 0.1441141664981842, + "loss/reg": 0.012221286073327065, + "step": 1792 + }, + { + "epoch": 0.224125, + "grad_norm": 2.7405450344085693, + "grad_norm_var": 0.4573457631061404, + "learning_rate": 0.0001, + "loss": 1.2526, + "loss/crossentropy": 2.6217191219329834, + "loss/hidden": 0.91015625, + "loss/logits": 0.22033345699310303, + "loss/reg": 0.012214362621307373, + "step": 1793 + }, + { + "epoch": 0.22425, + "grad_norm": 4.141777515411377, + "grad_norm_var": 0.412429371225325, + "learning_rate": 0.0001, + "loss": 1.3743, + "loss/crossentropy": 1.6800086498260498, + "loss/hidden": 1.109375, + "loss/logits": 0.14288680255413055, + "loss/reg": 0.012208379805088043, + "step": 1794 + }, + { + "epoch": 0.224375, + "grad_norm": 3.810920476913452, + "grad_norm_var": 0.43087940783878576, + "learning_rate": 0.0001, + "loss": 1.0222, + "loss/crossentropy": 2.799727201461792, + "loss/hidden": 0.7734375, + "loss/logits": 0.12675243616104126, + "loss/reg": 0.012202396057546139, + "step": 1795 + }, + { + "epoch": 0.2245, + "grad_norm": 5.86505126953125, + "grad_norm_var": 0.8904950250011618, + "learning_rate": 0.0001, + "loss": 1.3993, + "loss/crossentropy": 2.3400473594665527, + "loss/hidden": 1.109375, + "loss/logits": 0.16792380809783936, + "loss/reg": 0.012196486815810204, + "step": 1796 + }, + { + "epoch": 0.224625, + "grad_norm": 4.985130310058594, + "grad_norm_var": 1.0650859328511084, + "learning_rate": 0.0001, + "loss": 1.2963, + "loss/crossentropy": 2.8352878093719482, + "loss/hidden": 0.9609375, + "loss/logits": 0.21343833208084106, + "loss/reg": 0.012190500274300575, + "step": 1797 + }, + { + "epoch": 0.22475, + "grad_norm": 3.3390769958496094, + "grad_norm_var": 1.0582211532143861, + "learning_rate": 0.0001, + "loss": 1.2406, + "loss/crossentropy": 2.443668842315674, + "loss/hidden": 0.94921875, + "loss/logits": 0.16958631575107574, + "loss/reg": 0.012183710001409054, + "step": 1798 + }, + { + "epoch": 0.224875, + "grad_norm": 6.64145040512085, + "grad_norm_var": 1.6689068782404564, + "learning_rate": 0.0001, + "loss": 1.0831, + "loss/crossentropy": 2.9244539737701416, + "loss/hidden": 0.81640625, + "loss/logits": 0.1449125111103058, + "loss/reg": 0.012176886200904846, + "step": 1799 + }, + { + "epoch": 0.225, + "grad_norm": 5.070358753204346, + "grad_norm_var": 1.7255938342974626, + "learning_rate": 0.0001, + "loss": 1.6577, + "loss/crossentropy": 2.39843487739563, + "loss/hidden": 1.296875, + "loss/logits": 0.23911422491073608, + "loss/reg": 0.012170514091849327, + "step": 1800 + }, + { + "epoch": 0.225125, + "grad_norm": 4.553746223449707, + "grad_norm_var": 1.7227809488467594, + "learning_rate": 0.0001, + "loss": 1.2558, + "loss/crossentropy": 2.6131913661956787, + "loss/hidden": 0.953125, + "loss/logits": 0.181059330701828, + "loss/reg": 0.012163708917796612, + "step": 1801 + }, + { + "epoch": 0.22525, + "grad_norm": 4.960707187652588, + "grad_norm_var": 1.7889862701755266, + "learning_rate": 0.0001, + "loss": 1.1394, + "loss/crossentropy": 2.8065645694732666, + "loss/hidden": 0.87890625, + "loss/logits": 0.13889148831367493, + "loss/reg": 0.012157038785517216, + "step": 1802 + }, + { + "epoch": 0.225375, + "grad_norm": 3.1211495399475098, + "grad_norm_var": 1.8054184757113145, + "learning_rate": 0.0001, + "loss": 1.1695, + "loss/crossentropy": 2.45322585105896, + "loss/hidden": 0.8828125, + "loss/logits": 0.16519255936145782, + "loss/reg": 0.012150485999882221, + "step": 1803 + }, + { + "epoch": 0.2255, + "grad_norm": 3.1585381031036377, + "grad_norm_var": 1.74473550652442, + "learning_rate": 0.0001, + "loss": 1.1105, + "loss/crossentropy": 2.432692050933838, + "loss/hidden": 0.85546875, + "loss/logits": 0.1336173415184021, + "loss/reg": 0.012143667787313461, + "step": 1804 + }, + { + "epoch": 0.225625, + "grad_norm": 3.253127098083496, + "grad_norm_var": 1.647454221879684, + "learning_rate": 0.0001, + "loss": 1.1977, + "loss/crossentropy": 2.515928268432617, + "loss/hidden": 0.9140625, + "loss/logits": 0.1622786521911621, + "loss/reg": 0.012136753648519516, + "step": 1805 + }, + { + "epoch": 0.22575, + "grad_norm": 2.5755836963653564, + "grad_norm_var": 1.560998409452076, + "learning_rate": 0.0001, + "loss": 1.0782, + "loss/crossentropy": 2.3887364864349365, + "loss/hidden": 0.8125, + "loss/logits": 0.14443060755729675, + "loss/reg": 0.01213008351624012, + "step": 1806 + }, + { + "epoch": 0.225875, + "grad_norm": 3.9604837894439697, + "grad_norm_var": 1.4570643895132678, + "learning_rate": 0.0001, + "loss": 1.3124, + "loss/crossentropy": 2.5280768871307373, + "loss/hidden": 0.96484375, + "loss/logits": 0.2262798249721527, + "loss/reg": 0.012123593129217625, + "step": 1807 + }, + { + "epoch": 0.226, + "grad_norm": 3.0762956142425537, + "grad_norm_var": 1.380270170821843, + "learning_rate": 0.0001, + "loss": 1.0253, + "loss/crossentropy": 2.928957939147949, + "loss/hidden": 0.76953125, + "loss/logits": 0.13464616239070892, + "loss/reg": 0.012116815894842148, + "step": 1808 + }, + { + "epoch": 0.226125, + "grad_norm": 3.482069730758667, + "grad_norm_var": 1.2823655143187342, + "learning_rate": 0.0001, + "loss": 1.3007, + "loss/crossentropy": 2.523679733276367, + "loss/hidden": 0.99609375, + "loss/logits": 0.18347124755382538, + "loss/reg": 0.012109990231692791, + "step": 1809 + }, + { + "epoch": 0.22625, + "grad_norm": 2.7978577613830566, + "grad_norm_var": 1.3921909123204605, + "learning_rate": 0.0001, + "loss": 1.1764, + "loss/crossentropy": 2.5396063327789307, + "loss/hidden": 0.8984375, + "loss/logits": 0.15695957839488983, + "loss/reg": 0.012103389948606491, + "step": 1810 + }, + { + "epoch": 0.226375, + "grad_norm": 3.3859915733337402, + "grad_norm_var": 1.4164960881741988, + "learning_rate": 0.0001, + "loss": 1.1827, + "loss/crossentropy": 2.421473264694214, + "loss/hidden": 0.890625, + "loss/logits": 0.1710580289363861, + "loss/reg": 0.01209702342748642, + "step": 1811 + }, + { + "epoch": 0.2265, + "grad_norm": 2.7233433723449707, + "grad_norm_var": 1.2580651775254919, + "learning_rate": 0.0001, + "loss": 1.1054, + "loss/crossentropy": 2.460507392883301, + "loss/hidden": 0.8359375, + "loss/logits": 0.14855897426605225, + "loss/reg": 0.012090392410755157, + "step": 1812 + }, + { + "epoch": 0.226625, + "grad_norm": 2.803961992263794, + "grad_norm_var": 1.2159247798178134, + "learning_rate": 0.0001, + "loss": 1.0841, + "loss/crossentropy": 2.4394664764404297, + "loss/hidden": 0.83203125, + "loss/logits": 0.13119429349899292, + "loss/reg": 0.012084128335118294, + "step": 1813 + }, + { + "epoch": 0.22675, + "grad_norm": 2.345188856124878, + "grad_norm_var": 1.323038348759911, + "learning_rate": 0.0001, + "loss": 1.0238, + "loss/crossentropy": 2.4309263229370117, + "loss/hidden": 0.76953125, + "loss/logits": 0.13351188600063324, + "loss/reg": 0.01207789871841669, + "step": 1814 + }, + { + "epoch": 0.226875, + "grad_norm": 2.409219741821289, + "grad_norm_var": 0.7371698535500113, + "learning_rate": 0.0001, + "loss": 1.0862, + "loss/crossentropy": 2.6133697032928467, + "loss/hidden": 0.82421875, + "loss/logits": 0.14128842949867249, + "loss/reg": 0.012071667239069939, + "step": 1815 + }, + { + "epoch": 0.227, + "grad_norm": 2.506357192993164, + "grad_norm_var": 0.5615762297781826, + "learning_rate": 0.0001, + "loss": 1.1755, + "loss/crossentropy": 2.237166166305542, + "loss/hidden": 0.875, + "loss/logits": 0.17987734079360962, + "loss/reg": 0.01206506323069334, + "step": 1816 + }, + { + "epoch": 0.227125, + "grad_norm": 3.0118813514709473, + "grad_norm_var": 0.43074473519190304, + "learning_rate": 0.0001, + "loss": 1.2733, + "loss/crossentropy": 2.306960344314575, + "loss/hidden": 0.97265625, + "loss/logits": 0.18007943034172058, + "loss/reg": 0.012058539316058159, + "step": 1817 + }, + { + "epoch": 0.22725, + "grad_norm": 2.3795273303985596, + "grad_norm_var": 0.20616682699356942, + "learning_rate": 0.0001, + "loss": 1.1529, + "loss/crossentropy": 2.451083183288574, + "loss/hidden": 0.8828125, + "loss/logits": 0.1495814025402069, + "loss/reg": 0.012052006088197231, + "step": 1818 + }, + { + "epoch": 0.227375, + "grad_norm": 2.332099676132202, + "grad_norm_var": 0.22569619304637148, + "learning_rate": 0.0001, + "loss": 1.0095, + "loss/crossentropy": 2.538863182067871, + "loss/hidden": 0.765625, + "loss/logits": 0.1233830600976944, + "loss/reg": 0.012045500800013542, + "step": 1819 + }, + { + "epoch": 0.2275, + "grad_norm": 2.9021058082580566, + "grad_norm_var": 0.22054224463718033, + "learning_rate": 0.0001, + "loss": 1.4194, + "loss/crossentropy": 2.32419753074646, + "loss/hidden": 1.1171875, + "loss/logits": 0.18185210227966309, + "loss/reg": 0.012038588523864746, + "step": 1820 + }, + { + "epoch": 0.227625, + "grad_norm": 2.590693235397339, + "grad_norm_var": 0.21426742260109843, + "learning_rate": 0.0001, + "loss": 1.0937, + "loss/crossentropy": 2.387709856033325, + "loss/hidden": 0.82421875, + "loss/logits": 0.14921152591705322, + "loss/reg": 0.012031761929392815, + "step": 1821 + }, + { + "epoch": 0.22775, + "grad_norm": 3.300173044204712, + "grad_norm_var": 0.22248606839011675, + "learning_rate": 0.0001, + "loss": 1.2738, + "loss/crossentropy": 2.198507308959961, + "loss/hidden": 0.921875, + "loss/logits": 0.2317131608724594, + "loss/reg": 0.012025104835629463, + "step": 1822 + }, + { + "epoch": 0.227875, + "grad_norm": 2.4631481170654297, + "grad_norm_var": 0.14599126890724012, + "learning_rate": 0.0001, + "loss": 1.0501, + "loss/crossentropy": 2.370026111602783, + "loss/hidden": 0.7890625, + "loss/logits": 0.14089564979076385, + "loss/reg": 0.012018561363220215, + "step": 1823 + }, + { + "epoch": 0.228, + "grad_norm": 3.0455853939056396, + "grad_norm_var": 0.14484462892754285, + "learning_rate": 0.0001, + "loss": 1.1923, + "loss/crossentropy": 2.4721148014068604, + "loss/hidden": 0.890625, + "loss/logits": 0.18152591586112976, + "loss/reg": 0.01201186515390873, + "step": 1824 + }, + { + "epoch": 0.228125, + "grad_norm": 3.3358211517333984, + "grad_norm_var": 0.13249022272456326, + "learning_rate": 0.0001, + "loss": 1.1468, + "loss/crossentropy": 2.6852481365203857, + "loss/hidden": 0.86328125, + "loss/logits": 0.16346678137779236, + "loss/reg": 0.012004739604890347, + "step": 1825 + }, + { + "epoch": 0.22825, + "grad_norm": 2.6543102264404297, + "grad_norm_var": 0.13326040062104477, + "learning_rate": 0.0001, + "loss": 1.034, + "loss/crossentropy": 2.3123793601989746, + "loss/hidden": 0.7734375, + "loss/logits": 0.14057926833629608, + "loss/reg": 0.011998065747320652, + "step": 1826 + }, + { + "epoch": 0.228375, + "grad_norm": 2.4901700019836426, + "grad_norm_var": 0.10886572110197955, + "learning_rate": 0.0001, + "loss": 1.0816, + "loss/crossentropy": 2.3769681453704834, + "loss/hidden": 0.80859375, + "loss/logits": 0.1531093716621399, + "loss/reg": 0.011991064064204693, + "step": 1827 + }, + { + "epoch": 0.2285, + "grad_norm": 2.7322981357574463, + "grad_norm_var": 0.10889162038143449, + "learning_rate": 0.0001, + "loss": 1.2769, + "loss/crossentropy": 2.432474136352539, + "loss/hidden": 0.96484375, + "loss/logits": 0.19221791625022888, + "loss/reg": 0.011984668672084808, + "step": 1828 + }, + { + "epoch": 0.228625, + "grad_norm": 3.040367603302002, + "grad_norm_var": 0.11545954489478329, + "learning_rate": 0.0001, + "loss": 1.1185, + "loss/crossentropy": 2.5954103469848633, + "loss/hidden": 0.828125, + "loss/logits": 0.17058409750461578, + "loss/reg": 0.011977394111454487, + "step": 1829 + }, + { + "epoch": 0.22875, + "grad_norm": 4.54348611831665, + "grad_norm_var": 0.30728487463357607, + "learning_rate": 0.0001, + "loss": 1.4038, + "loss/crossentropy": 2.3670899868011475, + "loss/hidden": 1.1171875, + "loss/logits": 0.16688917577266693, + "loss/reg": 0.011970371939241886, + "step": 1830 + }, + { + "epoch": 0.228875, + "grad_norm": 2.4918017387390137, + "grad_norm_var": 0.3027632602969258, + "learning_rate": 0.0001, + "loss": 1.1499, + "loss/crossentropy": 2.4433906078338623, + "loss/hidden": 0.859375, + "loss/logits": 0.17090028524398804, + "loss/reg": 0.011963790282607079, + "step": 1831 + }, + { + "epoch": 0.229, + "grad_norm": 2.9950368404388428, + "grad_norm_var": 0.2944027102760967, + "learning_rate": 0.0001, + "loss": 1.1104, + "loss/crossentropy": 2.791016101837158, + "loss/hidden": 0.828125, + "loss/logits": 0.16274631023406982, + "loss/reg": 0.011956454254686832, + "step": 1832 + }, + { + "epoch": 0.229125, + "grad_norm": 3.249807596206665, + "grad_norm_var": 0.3016714416861295, + "learning_rate": 0.0001, + "loss": 1.1681, + "loss/crossentropy": 2.459493637084961, + "loss/hidden": 0.890625, + "loss/logits": 0.1580270528793335, + "loss/reg": 0.011949477717280388, + "step": 1833 + }, + { + "epoch": 0.22925, + "grad_norm": 2.811523199081421, + "grad_norm_var": 0.2828291293008836, + "learning_rate": 0.0001, + "loss": 1.1852, + "loss/crossentropy": 2.3925209045410156, + "loss/hidden": 0.90234375, + "loss/logits": 0.16343240439891815, + "loss/reg": 0.011942173354327679, + "step": 1834 + }, + { + "epoch": 0.229375, + "grad_norm": 2.8339016437530518, + "grad_norm_var": 0.25815168646451997, + "learning_rate": 0.0001, + "loss": 1.2253, + "loss/crossentropy": 2.3892555236816406, + "loss/hidden": 0.93359375, + "loss/logits": 0.17232704162597656, + "loss/reg": 0.011935535818338394, + "step": 1835 + }, + { + "epoch": 0.2295, + "grad_norm": 3.5241827964782715, + "grad_norm_var": 0.2769127015292412, + "learning_rate": 0.0001, + "loss": 1.0834, + "loss/crossentropy": 2.4555206298828125, + "loss/hidden": 0.80859375, + "loss/logits": 0.15549984574317932, + "loss/reg": 0.011928863823413849, + "step": 1836 + }, + { + "epoch": 0.229625, + "grad_norm": 3.1511709690093994, + "grad_norm_var": 0.26548067421464716, + "learning_rate": 0.0001, + "loss": 1.178, + "loss/crossentropy": 2.634005069732666, + "loss/hidden": 0.91015625, + "loss/logits": 0.14858701825141907, + "loss/reg": 0.011921834200620651, + "step": 1837 + }, + { + "epoch": 0.22975, + "grad_norm": 3.1957132816314697, + "grad_norm_var": 0.2625588163447295, + "learning_rate": 0.0001, + "loss": 1.4165, + "loss/crossentropy": 2.364680051803589, + "loss/hidden": 1.0703125, + "loss/logits": 0.22706648707389832, + "loss/reg": 0.01191527210175991, + "step": 1838 + }, + { + "epoch": 0.229875, + "grad_norm": 3.4130618572235107, + "grad_norm_var": 0.24654008934847388, + "learning_rate": 0.0001, + "loss": 1.1042, + "loss/crossentropy": 2.80603289604187, + "loss/hidden": 0.84765625, + "loss/logits": 0.1374989151954651, + "loss/reg": 0.011908802203834057, + "step": 1839 + }, + { + "epoch": 0.23, + "grad_norm": 10.125365257263184, + "grad_norm_var": 3.3332932374288853, + "learning_rate": 0.0001, + "loss": 1.2822, + "loss/crossentropy": 2.7263705730438232, + "loss/hidden": 0.96875, + "loss/logits": 0.1943938136100769, + "loss/reg": 0.011902069672942162, + "step": 1840 + }, + { + "epoch": 0.230125, + "grad_norm": 3.0267136096954346, + "grad_norm_var": 3.3475461515590967, + "learning_rate": 0.0001, + "loss": 1.4492, + "loss/crossentropy": 2.527991771697998, + "loss/hidden": 1.125, + "loss/logits": 0.20524314045906067, + "loss/reg": 0.01189569290727377, + "step": 1841 + }, + { + "epoch": 0.23025, + "grad_norm": 3.0950570106506348, + "grad_norm_var": 3.308964844199806, + "learning_rate": 0.0001, + "loss": 1.1675, + "loss/crossentropy": 2.9109840393066406, + "loss/hidden": 0.87890625, + "loss/logits": 0.16966286301612854, + "loss/reg": 0.011889121495187283, + "step": 1842 + }, + { + "epoch": 0.230375, + "grad_norm": 2.6885743141174316, + "grad_norm_var": 3.283521301133828, + "learning_rate": 0.0001, + "loss": 0.9502, + "loss/crossentropy": 2.463144540786743, + "loss/hidden": 0.7265625, + "loss/logits": 0.10477050393819809, + "loss/reg": 0.011882682330906391, + "step": 1843 + }, + { + "epoch": 0.2305, + "grad_norm": 7.153027534484863, + "grad_norm_var": 4.018621504186891, + "learning_rate": 0.0001, + "loss": 1.5744, + "loss/crossentropy": 2.408703088760376, + "loss/hidden": 1.1796875, + "loss/logits": 0.2759320139884949, + "loss/reg": 0.01187664270401001, + "step": 1844 + }, + { + "epoch": 0.230625, + "grad_norm": 2.739917516708374, + "grad_norm_var": 4.056043276959989, + "learning_rate": 0.0001, + "loss": 1.1633, + "loss/crossentropy": 2.417595863342285, + "loss/hidden": 0.87109375, + "loss/logits": 0.17353396117687225, + "loss/reg": 0.011870414949953556, + "step": 1845 + }, + { + "epoch": 0.23075, + "grad_norm": 3.544043779373169, + "grad_norm_var": 4.021382457840771, + "learning_rate": 0.0001, + "loss": 1.256, + "loss/crossentropy": 2.5209176540374756, + "loss/hidden": 0.96484375, + "loss/logits": 0.1725134551525116, + "loss/reg": 0.0118643743917346, + "step": 1846 + }, + { + "epoch": 0.230875, + "grad_norm": 2.5454280376434326, + "grad_norm_var": 4.012548475227705, + "learning_rate": 0.0001, + "loss": 1.125, + "loss/crossentropy": 2.788163185119629, + "loss/hidden": 0.84765625, + "loss/logits": 0.15880800783634186, + "loss/reg": 0.011857859790325165, + "step": 1847 + }, + { + "epoch": 0.231, + "grad_norm": 2.663525104522705, + "grad_norm_var": 4.053043390213641, + "learning_rate": 0.0001, + "loss": 1.1004, + "loss/crossentropy": 2.458189010620117, + "loss/hidden": 0.83203125, + "loss/logits": 0.14984336495399475, + "loss/reg": 0.011851528659462929, + "step": 1848 + }, + { + "epoch": 0.231125, + "grad_norm": 3.5077850818634033, + "grad_norm_var": 4.0405115731206624, + "learning_rate": 0.0001, + "loss": 1.0857, + "loss/crossentropy": 2.5606067180633545, + "loss/hidden": 0.8125, + "loss/logits": 0.15474531054496765, + "loss/reg": 0.011845485307276249, + "step": 1849 + }, + { + "epoch": 0.23125, + "grad_norm": 3.274979829788208, + "grad_norm_var": 3.995870290819255, + "learning_rate": 0.0001, + "loss": 1.0438, + "loss/crossentropy": 2.5572433471679688, + "loss/hidden": 0.796875, + "loss/logits": 0.12853728234767914, + "loss/reg": 0.011839361861348152, + "step": 1850 + }, + { + "epoch": 0.231375, + "grad_norm": 2.723388433456421, + "grad_norm_var": 4.010576716926485, + "learning_rate": 0.0001, + "loss": 1.0091, + "loss/crossentropy": 2.6289405822753906, + "loss/hidden": 0.7578125, + "loss/logits": 0.1329689919948578, + "loss/reg": 0.011833377182483673, + "step": 1851 + }, + { + "epoch": 0.2315, + "grad_norm": 5.253420829772949, + "grad_norm_var": 4.140042975539734, + "learning_rate": 0.0001, + "loss": 1.2997, + "loss/crossentropy": 2.522901773452759, + "loss/hidden": 0.984375, + "loss/logits": 0.19709725677967072, + "loss/reg": 0.011827518232166767, + "step": 1852 + }, + { + "epoch": 0.231625, + "grad_norm": 2.8319897651672363, + "grad_norm_var": 4.177483717989236, + "learning_rate": 0.0001, + "loss": 1.3684, + "loss/crossentropy": 2.2863683700561523, + "loss/hidden": 1.078125, + "loss/logits": 0.17204830050468445, + "loss/reg": 0.011821010150015354, + "step": 1853 + }, + { + "epoch": 0.23175, + "grad_norm": 3.758786201477051, + "grad_norm_var": 4.14732397532542, + "learning_rate": 0.0001, + "loss": 1.091, + "loss/crossentropy": 2.6834845542907715, + "loss/hidden": 0.81640625, + "loss/logits": 0.1564285159111023, + "loss/reg": 0.011815285310149193, + "step": 1854 + }, + { + "epoch": 0.231875, + "grad_norm": 2.970064163208008, + "grad_norm_var": 4.18814826964047, + "learning_rate": 0.0001, + "loss": 1.1222, + "loss/crossentropy": 2.350801944732666, + "loss/hidden": 0.85546875, + "loss/logits": 0.14867839217185974, + "loss/reg": 0.011809652671217918, + "step": 1855 + }, + { + "epoch": 0.232, + "grad_norm": 2.991593599319458, + "grad_norm_var": 1.4178378002755372, + "learning_rate": 0.0001, + "loss": 1.0011, + "loss/crossentropy": 2.438445568084717, + "loss/hidden": 0.7734375, + "loss/logits": 0.10962004959583282, + "loss/reg": 0.011803114786744118, + "step": 1856 + }, + { + "epoch": 0.232125, + "grad_norm": 2.449436664581299, + "grad_norm_var": 1.4691695267047424, + "learning_rate": 0.0001, + "loss": 0.9734, + "loss/crossentropy": 2.3218026161193848, + "loss/hidden": 0.7265625, + "loss/logits": 0.12885171175003052, + "loss/reg": 0.011797088198363781, + "step": 1857 + }, + { + "epoch": 0.23225, + "grad_norm": 3.538789987564087, + "grad_norm_var": 1.4642067121892361, + "learning_rate": 0.0001, + "loss": 1.2534, + "loss/crossentropy": 2.3883466720581055, + "loss/hidden": 0.95703125, + "loss/logits": 0.17844754457473755, + "loss/reg": 0.01179055217653513, + "step": 1858 + }, + { + "epoch": 0.232375, + "grad_norm": 2.6442153453826904, + "grad_norm_var": 1.4686242200827846, + "learning_rate": 0.0001, + "loss": 1.1711, + "loss/crossentropy": 2.5905346870422363, + "loss/hidden": 0.890625, + "loss/logits": 0.1626666784286499, + "loss/reg": 0.011784548871219158, + "step": 1859 + }, + { + "epoch": 0.2325, + "grad_norm": 2.63577938079834, + "grad_norm_var": 0.4906894012779368, + "learning_rate": 0.0001, + "loss": 0.9256, + "loss/crossentropy": 2.1434152126312256, + "loss/hidden": 0.703125, + "loss/logits": 0.10471002757549286, + "loss/reg": 0.011777986772358418, + "step": 1860 + }, + { + "epoch": 0.232625, + "grad_norm": 2.578632354736328, + "grad_norm_var": 0.5006945948105681, + "learning_rate": 0.0001, + "loss": 1.0467, + "loss/crossentropy": 2.7150838375091553, + "loss/hidden": 0.796875, + "loss/logits": 0.13211926817893982, + "loss/reg": 0.011771964840590954, + "step": 1861 + }, + { + "epoch": 0.23275, + "grad_norm": 3.5398929119110107, + "grad_norm_var": 0.5004607034463172, + "learning_rate": 0.0001, + "loss": 1.2583, + "loss/crossentropy": 2.7141544818878174, + "loss/hidden": 0.94140625, + "loss/logits": 0.19919423758983612, + "loss/reg": 0.01176582369953394, + "step": 1862 + }, + { + "epoch": 0.232875, + "grad_norm": 3.1188647747039795, + "grad_norm_var": 0.4771405434982313, + "learning_rate": 0.0001, + "loss": 1.0285, + "loss/crossentropy": 2.2451930046081543, + "loss/hidden": 0.7890625, + "loss/logits": 0.12186941504478455, + "loss/reg": 0.011759834364056587, + "step": 1863 + }, + { + "epoch": 0.233, + "grad_norm": 3.0794904232025146, + "grad_norm_var": 0.4606925715881855, + "learning_rate": 0.0001, + "loss": 1.0381, + "loss/crossentropy": 2.338972330093384, + "loss/hidden": 0.7890625, + "loss/logits": 0.1315452754497528, + "loss/reg": 0.011753806844353676, + "step": 1864 + }, + { + "epoch": 0.233125, + "grad_norm": 3.019987106323242, + "grad_norm_var": 0.45431474823360435, + "learning_rate": 0.0001, + "loss": 1.0534, + "loss/crossentropy": 2.5802109241485596, + "loss/hidden": 0.80078125, + "loss/logits": 0.1351454257965088, + "loss/reg": 0.011747847311198711, + "step": 1865 + }, + { + "epoch": 0.23325, + "grad_norm": 3.0198545455932617, + "grad_norm_var": 0.45415120043857066, + "learning_rate": 0.0001, + "loss": 1.3085, + "loss/crossentropy": 2.2313005924224854, + "loss/hidden": 0.96875, + "loss/logits": 0.22236411273479462, + "loss/reg": 0.01174143236130476, + "step": 1866 + }, + { + "epoch": 0.233375, + "grad_norm": 2.4949746131896973, + "grad_norm_var": 0.46993664201424584, + "learning_rate": 0.0001, + "loss": 1.0675, + "loss/crossentropy": 2.530468225479126, + "loss/hidden": 0.8046875, + "loss/logits": 0.14545810222625732, + "loss/reg": 0.01173495315015316, + "step": 1867 + }, + { + "epoch": 0.2335, + "grad_norm": 3.2556769847869873, + "grad_norm_var": 0.15119857978987525, + "learning_rate": 0.0001, + "loss": 1.2375, + "loss/crossentropy": 2.5190608501434326, + "loss/hidden": 0.9296875, + "loss/logits": 0.1905001997947693, + "loss/reg": 0.011728293262422085, + "step": 1868 + }, + { + "epoch": 0.233625, + "grad_norm": 5.818284034729004, + "grad_norm_var": 0.6434646637531138, + "learning_rate": 0.0001, + "loss": 1.7541, + "loss/crossentropy": 2.4520504474639893, + "loss/hidden": 1.328125, + "loss/logits": 0.30873966217041016, + "loss/reg": 0.011721369810402393, + "step": 1869 + }, + { + "epoch": 0.23375, + "grad_norm": 18.115583419799805, + "grad_norm_var": 14.629645381532592, + "learning_rate": 0.0001, + "loss": 1.626, + "loss/crossentropy": 2.765774965286255, + "loss/hidden": 1.265625, + "loss/logits": 0.24319738149642944, + "loss/reg": 0.011714870110154152, + "step": 1870 + }, + { + "epoch": 0.233875, + "grad_norm": 4.950761795043945, + "grad_norm_var": 14.58186333788439, + "learning_rate": 0.0001, + "loss": 1.3825, + "loss/crossentropy": 2.3341286182403564, + "loss/hidden": 1.0859375, + "loss/logits": 0.17949844896793365, + "loss/reg": 0.011708397418260574, + "step": 1871 + }, + { + "epoch": 0.234, + "grad_norm": 3.149343729019165, + "grad_norm_var": 14.557933702264693, + "learning_rate": 0.0001, + "loss": 1.2519, + "loss/crossentropy": 2.2633235454559326, + "loss/hidden": 0.98046875, + "loss/logits": 0.1543847918510437, + "loss/reg": 0.011701937764883041, + "step": 1872 + }, + { + "epoch": 0.234125, + "grad_norm": 3.2733466625213623, + "grad_norm_var": 14.4066140044379, + "learning_rate": 0.0001, + "loss": 0.9954, + "loss/crossentropy": 2.3169283866882324, + "loss/hidden": 0.75, + "loss/logits": 0.12840038537979126, + "loss/reg": 0.011695554479956627, + "step": 1873 + }, + { + "epoch": 0.23425, + "grad_norm": 2.9650564193725586, + "grad_norm_var": 14.482709435196355, + "learning_rate": 0.0001, + "loss": 1.0411, + "loss/crossentropy": 2.499809741973877, + "loss/hidden": 0.78125, + "loss/logits": 0.14294317364692688, + "loss/reg": 0.011689374223351479, + "step": 1874 + }, + { + "epoch": 0.234375, + "grad_norm": 3.3756163120269775, + "grad_norm_var": 14.361621179597174, + "learning_rate": 0.0001, + "loss": 1.2119, + "loss/crossentropy": 2.484644889831543, + "loss/hidden": 0.92578125, + "loss/logits": 0.16932502388954163, + "loss/reg": 0.011682825163006783, + "step": 1875 + }, + { + "epoch": 0.2345, + "grad_norm": 4.519603729248047, + "grad_norm_var": 14.171825990122963, + "learning_rate": 0.0001, + "loss": 1.2405, + "loss/crossentropy": 2.1961562633514404, + "loss/hidden": 0.95703125, + "loss/logits": 0.1666729897260666, + "loss/reg": 0.01167618203908205, + "step": 1876 + }, + { + "epoch": 0.234625, + "grad_norm": 7.144575595855713, + "grad_norm_var": 14.370738345950354, + "learning_rate": 0.0001, + "loss": 1.5206, + "loss/crossentropy": 3.1038763523101807, + "loss/hidden": 1.1171875, + "loss/logits": 0.28672921657562256, + "loss/reg": 0.01166961807757616, + "step": 1877 + }, + { + "epoch": 0.23475, + "grad_norm": 3.3410837650299072, + "grad_norm_var": 14.403365735245997, + "learning_rate": 0.0001, + "loss": 1.0188, + "loss/crossentropy": 2.6396772861480713, + "loss/hidden": 0.76953125, + "loss/logits": 0.1326141357421875, + "loss/reg": 0.011662835255265236, + "step": 1878 + }, + { + "epoch": 0.234875, + "grad_norm": 3.955864429473877, + "grad_norm_var": 14.274587966883617, + "learning_rate": 0.0001, + "loss": 1.2218, + "loss/crossentropy": 2.5625252723693848, + "loss/hidden": 0.9453125, + "loss/logits": 0.1599578559398651, + "loss/reg": 0.011656397022306919, + "step": 1879 + }, + { + "epoch": 0.235, + "grad_norm": 3.702489137649536, + "grad_norm_var": 14.162786868506032, + "learning_rate": 0.0001, + "loss": 1.1474, + "loss/crossentropy": 2.570443868637085, + "loss/hidden": 0.8828125, + "loss/logits": 0.14810335636138916, + "loss/reg": 0.011648759245872498, + "step": 1880 + }, + { + "epoch": 0.235125, + "grad_norm": 3.006218194961548, + "grad_norm_var": 14.165986485307208, + "learning_rate": 0.0001, + "loss": 1.1945, + "loss/crossentropy": 2.175870895385742, + "loss/hidden": 0.91015625, + "loss/logits": 0.1679525077342987, + "loss/reg": 0.01164237316697836, + "step": 1881 + }, + { + "epoch": 0.23525, + "grad_norm": 3.0096826553344727, + "grad_norm_var": 14.168346952953607, + "learning_rate": 0.0001, + "loss": 1.0846, + "loss/crossentropy": 2.7216742038726807, + "loss/hidden": 0.8203125, + "loss/logits": 0.14791440963745117, + "loss/reg": 0.01163501013070345, + "step": 1882 + }, + { + "epoch": 0.235375, + "grad_norm": 3.3620381355285645, + "grad_norm_var": 13.954069607905243, + "learning_rate": 0.0001, + "loss": 1.339, + "loss/crossentropy": 2.6900413036346436, + "loss/hidden": 1.0234375, + "loss/logits": 0.19929756224155426, + "loss/reg": 0.01162752602249384, + "step": 1883 + }, + { + "epoch": 0.2355, + "grad_norm": 2.5984811782836914, + "grad_norm_var": 14.117182111852275, + "learning_rate": 0.0001, + "loss": 1.0653, + "loss/crossentropy": 2.392869472503662, + "loss/hidden": 0.8046875, + "loss/logits": 0.14441151916980743, + "loss/reg": 0.011621098034083843, + "step": 1884 + }, + { + "epoch": 0.235625, + "grad_norm": 3.761037588119507, + "grad_norm_var": 14.093606633107498, + "learning_rate": 0.0001, + "loss": 1.0553, + "loss/crossentropy": 2.8546535968780518, + "loss/hidden": 0.77734375, + "loss/logits": 0.16185110807418823, + "loss/reg": 0.011613850481808186, + "step": 1885 + }, + { + "epoch": 0.23575, + "grad_norm": 5.239340782165527, + "grad_norm_var": 1.319651559267754, + "learning_rate": 0.0001, + "loss": 1.0582, + "loss/crossentropy": 2.560152769088745, + "loss/hidden": 0.80078125, + "loss/logits": 0.14137983322143555, + "loss/reg": 0.011606672778725624, + "step": 1886 + }, + { + "epoch": 0.235875, + "grad_norm": 2.783051013946533, + "grad_norm_var": 1.2907520410126019, + "learning_rate": 0.0001, + "loss": 1.0424, + "loss/crossentropy": 2.37328839302063, + "loss/hidden": 0.79296875, + "loss/logits": 0.13344892859458923, + "loss/reg": 0.011599456891417503, + "step": 1887 + }, + { + "epoch": 0.236, + "grad_norm": 3.554786443710327, + "grad_norm_var": 1.2713025846844552, + "learning_rate": 0.0001, + "loss": 1.1632, + "loss/crossentropy": 2.494732141494751, + "loss/hidden": 0.85546875, + "loss/logits": 0.1918359100818634, + "loss/reg": 0.011593029834330082, + "step": 1888 + }, + { + "epoch": 0.236125, + "grad_norm": 3.321483612060547, + "grad_norm_var": 1.2685516790340434, + "learning_rate": 0.0001, + "loss": 1.2471, + "loss/crossentropy": 2.4095706939697266, + "loss/hidden": 0.9765625, + "loss/logits": 0.15465524792671204, + "loss/reg": 0.011586642824113369, + "step": 1889 + }, + { + "epoch": 0.23625, + "grad_norm": 2.9616763591766357, + "grad_norm_var": 1.2688960186311133, + "learning_rate": 0.0001, + "loss": 1.4451, + "loss/crossentropy": 2.239219903945923, + "loss/hidden": 1.140625, + "loss/logits": 0.1887097954750061, + "loss/reg": 0.011579960584640503, + "step": 1890 + }, + { + "epoch": 0.236375, + "grad_norm": 3.4970099925994873, + "grad_norm_var": 1.2641245233408538, + "learning_rate": 0.0001, + "loss": 1.1386, + "loss/crossentropy": 2.5096256732940674, + "loss/hidden": 0.87109375, + "loss/logits": 0.15174484252929688, + "loss/reg": 0.011572892777621746, + "step": 1891 + }, + { + "epoch": 0.2365, + "grad_norm": 2.6197335720062256, + "grad_norm_var": 1.2909410184264962, + "learning_rate": 0.0001, + "loss": 1.0721, + "loss/crossentropy": 2.704296112060547, + "loss/hidden": 0.8125, + "loss/logits": 0.14390236139297485, + "loss/reg": 0.01156578678637743, + "step": 1892 + }, + { + "epoch": 0.236625, + "grad_norm": 5.325662136077881, + "grad_norm_var": 0.6420011074287828, + "learning_rate": 0.0001, + "loss": 1.5009, + "loss/crossentropy": 2.394350528717041, + "loss/hidden": 1.2109375, + "loss/logits": 0.1743205487728119, + "loss/reg": 0.011559294536709785, + "step": 1893 + }, + { + "epoch": 0.23675, + "grad_norm": 3.4110569953918457, + "grad_norm_var": 0.6408013583584702, + "learning_rate": 0.0001, + "loss": 1.3052, + "loss/crossentropy": 2.3642685413360596, + "loss/hidden": 1.0, + "loss/logits": 0.18962880969047546, + "loss/reg": 0.011552795767784119, + "step": 1894 + }, + { + "epoch": 0.236875, + "grad_norm": 4.094106674194336, + "grad_norm_var": 0.6502721450147265, + "learning_rate": 0.0001, + "loss": 1.3815, + "loss/crossentropy": 2.5077645778656006, + "loss/hidden": 1.0859375, + "loss/logits": 0.18015095591545105, + "loss/reg": 0.01154585275799036, + "step": 1895 + }, + { + "epoch": 0.237, + "grad_norm": 3.3601646423339844, + "grad_norm_var": 0.6490610636632368, + "learning_rate": 0.0001, + "loss": 1.1372, + "loss/crossentropy": 2.4785971641540527, + "loss/hidden": 0.8671875, + "loss/logits": 0.15463027358055115, + "loss/reg": 0.011538945138454437, + "step": 1896 + }, + { + "epoch": 0.237125, + "grad_norm": 2.699127197265625, + "grad_norm_var": 0.6749314875548348, + "learning_rate": 0.0001, + "loss": 1.0438, + "loss/crossentropy": 2.562880754470825, + "loss/hidden": 0.80078125, + "loss/logits": 0.12774690985679626, + "loss/reg": 0.011531817726790905, + "step": 1897 + }, + { + "epoch": 0.23725, + "grad_norm": 3.408806324005127, + "grad_norm_var": 0.6601303555653694, + "learning_rate": 0.0001, + "loss": 1.2514, + "loss/crossentropy": 2.7521603107452393, + "loss/hidden": 0.95703125, + "loss/logits": 0.17915445566177368, + "loss/reg": 0.01152490172535181, + "step": 1898 + }, + { + "epoch": 0.237375, + "grad_norm": 2.782313346862793, + "grad_norm_var": 0.6917876208904978, + "learning_rate": 0.0001, + "loss": 1.2561, + "loss/crossentropy": 2.768892765045166, + "loss/hidden": 0.96484375, + "loss/logits": 0.17611053586006165, + "loss/reg": 0.011517412029206753, + "step": 1899 + }, + { + "epoch": 0.2375, + "grad_norm": 2.913452625274658, + "grad_norm_var": 0.6616557378995289, + "learning_rate": 0.0001, + "loss": 1.1729, + "loss/crossentropy": 2.365873336791992, + "loss/hidden": 0.90234375, + "loss/logits": 0.15543469786643982, + "loss/reg": 0.011509752832353115, + "step": 1900 + }, + { + "epoch": 0.237625, + "grad_norm": 2.3335838317871094, + "grad_norm_var": 0.736146354285043, + "learning_rate": 0.0001, + "loss": 0.9944, + "loss/crossentropy": 2.6142027378082275, + "loss/hidden": 0.75390625, + "loss/logits": 0.12545104324817657, + "loss/reg": 0.011503461748361588, + "step": 1901 + }, + { + "epoch": 0.23775, + "grad_norm": 3.5080623626708984, + "grad_norm_var": 0.4975255652152091, + "learning_rate": 0.0001, + "loss": 1.204, + "loss/crossentropy": 2.880540609359741, + "loss/hidden": 0.9140625, + "loss/logits": 0.17492809891700745, + "loss/reg": 0.01149703934788704, + "step": 1902 + }, + { + "epoch": 0.237875, + "grad_norm": 2.856538772583008, + "grad_norm_var": 0.49293619178501635, + "learning_rate": 0.0001, + "loss": 1.1438, + "loss/crossentropy": 2.411937713623047, + "loss/hidden": 0.875, + "loss/logits": 0.153866246342659, + "loss/reg": 0.01148985605686903, + "step": 1903 + }, + { + "epoch": 0.238, + "grad_norm": 2.922060012817383, + "grad_norm_var": 0.49565918281477345, + "learning_rate": 0.0001, + "loss": 1.0083, + "loss/crossentropy": 2.714773416519165, + "loss/hidden": 0.76171875, + "loss/logits": 0.1317380964756012, + "loss/reg": 0.011483217589557171, + "step": 1904 + }, + { + "epoch": 0.238125, + "grad_norm": 3.1696555614471436, + "grad_norm_var": 0.49567159607848527, + "learning_rate": 0.0001, + "loss": 1.2106, + "loss/crossentropy": 2.5127952098846436, + "loss/hidden": 0.95703125, + "loss/logits": 0.1388104110956192, + "loss/reg": 0.011476212181150913, + "step": 1905 + }, + { + "epoch": 0.23825, + "grad_norm": 2.5425992012023926, + "grad_norm_var": 0.5222804369498891, + "learning_rate": 0.0001, + "loss": 1.0714, + "loss/crossentropy": 2.6032824516296387, + "loss/hidden": 0.80859375, + "loss/logits": 0.14816129207611084, + "loss/reg": 0.011469176970422268, + "step": 1906 + }, + { + "epoch": 0.238375, + "grad_norm": 2.748033285140991, + "grad_norm_var": 0.5292028458853089, + "learning_rate": 0.0001, + "loss": 1.1389, + "loss/crossentropy": 2.697582721710205, + "loss/hidden": 0.859375, + "loss/logits": 0.16493722796440125, + "loss/reg": 0.011462134309113026, + "step": 1907 + }, + { + "epoch": 0.2385, + "grad_norm": 4.973997592926025, + "grad_norm_var": 0.7033744522376982, + "learning_rate": 0.0001, + "loss": 1.1519, + "loss/crossentropy": 2.451634168624878, + "loss/hidden": 0.8828125, + "loss/logits": 0.15450610220432281, + "loss/reg": 0.011455683968961239, + "step": 1908 + }, + { + "epoch": 0.238625, + "grad_norm": 2.706803321838379, + "grad_norm_var": 0.43014165554197537, + "learning_rate": 0.0001, + "loss": 1.0876, + "loss/crossentropy": 2.553027629852295, + "loss/hidden": 0.8203125, + "loss/logits": 0.15277239680290222, + "loss/reg": 0.011448659934103489, + "step": 1909 + }, + { + "epoch": 0.23875, + "grad_norm": 2.650634288787842, + "grad_norm_var": 0.44000573292169826, + "learning_rate": 0.0001, + "loss": 1.2477, + "loss/crossentropy": 2.413283586502075, + "loss/hidden": 0.94921875, + "loss/logits": 0.1840301901102066, + "loss/reg": 0.011441958136856556, + "step": 1910 + }, + { + "epoch": 0.238875, + "grad_norm": 3.073920965194702, + "grad_norm_var": 0.37042588009566063, + "learning_rate": 0.0001, + "loss": 1.24, + "loss/crossentropy": 2.647343158721924, + "loss/hidden": 0.92578125, + "loss/logits": 0.19982083141803741, + "loss/reg": 0.011434913612902164, + "step": 1911 + }, + { + "epoch": 0.239, + "grad_norm": 4.049685478210449, + "grad_norm_var": 0.42951946606552244, + "learning_rate": 0.0001, + "loss": 1.3637, + "loss/crossentropy": 2.2610855102539062, + "loss/hidden": 1.046875, + "loss/logits": 0.20253784954547882, + "loss/reg": 0.01142791099846363, + "step": 1912 + }, + { + "epoch": 0.239125, + "grad_norm": 2.926657199859619, + "grad_norm_var": 0.42108803087631347, + "learning_rate": 0.0001, + "loss": 1.2487, + "loss/crossentropy": 2.795015335083008, + "loss/hidden": 0.9609375, + "loss/logits": 0.17352721095085144, + "loss/reg": 0.011420897208154202, + "step": 1913 + }, + { + "epoch": 0.23925, + "grad_norm": 2.98282790184021, + "grad_norm_var": 0.4147719819065732, + "learning_rate": 0.0001, + "loss": 1.1135, + "loss/crossentropy": 2.5007472038269043, + "loss/hidden": 0.8359375, + "loss/logits": 0.1634053736925125, + "loss/reg": 0.011413791216909885, + "step": 1914 + }, + { + "epoch": 0.239375, + "grad_norm": 2.743699073791504, + "grad_norm_var": 0.4163530495107797, + "learning_rate": 0.0001, + "loss": 1.1331, + "loss/crossentropy": 2.5021727085113525, + "loss/hidden": 0.875, + "loss/logits": 0.14406922459602356, + "loss/reg": 0.011406795121729374, + "step": 1915 + }, + { + "epoch": 0.2395, + "grad_norm": 2.8699593544006348, + "grad_norm_var": 0.41737266552279284, + "learning_rate": 0.0001, + "loss": 1.1846, + "loss/crossentropy": 2.409477949142456, + "loss/hidden": 0.91015625, + "loss/logits": 0.1604822278022766, + "loss/reg": 0.011399611830711365, + "step": 1916 + }, + { + "epoch": 0.239625, + "grad_norm": 2.5404951572418213, + "grad_norm_var": 0.3998377204796666, + "learning_rate": 0.0001, + "loss": 1.0893, + "loss/crossentropy": 2.4783318042755127, + "loss/hidden": 0.8203125, + "loss/logits": 0.1551009714603424, + "loss/reg": 0.011393279768526554, + "step": 1917 + }, + { + "epoch": 0.23975, + "grad_norm": 2.9057133197784424, + "grad_norm_var": 0.38806304932068847, + "learning_rate": 0.0001, + "loss": 1.055, + "loss/crossentropy": 2.5464046001434326, + "loss/hidden": 0.796875, + "loss/logits": 0.1442166268825531, + "loss/reg": 0.011386997997760773, + "step": 1918 + }, + { + "epoch": 0.239875, + "grad_norm": 3.536914587020874, + "grad_norm_var": 0.4002199957694375, + "learning_rate": 0.0001, + "loss": 1.388, + "loss/crossentropy": 2.1978702545166016, + "loss/hidden": 1.0625, + "loss/logits": 0.21171514689922333, + "loss/reg": 0.011380769312381744, + "step": 1919 + }, + { + "epoch": 0.24, + "grad_norm": 2.766721725463867, + "grad_norm_var": 0.4050817388615691, + "learning_rate": 0.0001, + "loss": 1.1325, + "loss/crossentropy": 2.6648097038269043, + "loss/hidden": 0.859375, + "loss/logits": 0.1593654751777649, + "loss/reg": 0.011374091729521751, + "step": 1920 + }, + { + "epoch": 0.240125, + "grad_norm": 3.2392373085021973, + "grad_norm_var": 0.4062692871011743, + "learning_rate": 0.0001, + "loss": 1.2079, + "loss/crossentropy": 2.6759071350097656, + "loss/hidden": 0.89453125, + "loss/logits": 0.19964686036109924, + "loss/reg": 0.01136783231049776, + "step": 1921 + }, + { + "epoch": 0.24025, + "grad_norm": 3.552058696746826, + "grad_norm_var": 0.39781198223739217, + "learning_rate": 0.0001, + "loss": 1.2685, + "loss/crossentropy": 2.4821066856384277, + "loss/hidden": 0.984375, + "loss/logits": 0.170503631234169, + "loss/reg": 0.011361614800989628, + "step": 1922 + }, + { + "epoch": 0.240375, + "grad_norm": 3.387380361557007, + "grad_norm_var": 0.38980030472856225, + "learning_rate": 0.0001, + "loss": 1.1798, + "loss/crossentropy": 2.5588643550872803, + "loss/hidden": 0.91015625, + "loss/logits": 0.1560502052307129, + "loss/reg": 0.011355074122548103, + "step": 1923 + }, + { + "epoch": 0.2405, + "grad_norm": 2.437252998352051, + "grad_norm_var": 0.18576844254103358, + "learning_rate": 0.0001, + "loss": 1.2037, + "loss/crossentropy": 2.2251975536346436, + "loss/hidden": 0.9453125, + "loss/logits": 0.1448952853679657, + "loss/reg": 0.011348758824169636, + "step": 1924 + }, + { + "epoch": 0.240625, + "grad_norm": 2.870936393737793, + "grad_norm_var": 0.18052971078875166, + "learning_rate": 0.0001, + "loss": 1.0809, + "loss/crossentropy": 2.538954973220825, + "loss/hidden": 0.8203125, + "loss/logits": 0.1471145749092102, + "loss/reg": 0.011342531070113182, + "step": 1925 + }, + { + "epoch": 0.24075, + "grad_norm": 2.4371042251586914, + "grad_norm_var": 0.19427645895656553, + "learning_rate": 0.0001, + "loss": 1.0965, + "loss/crossentropy": 2.3479714393615723, + "loss/hidden": 0.828125, + "loss/logits": 0.15495869517326355, + "loss/reg": 0.01133667305111885, + "step": 1926 + }, + { + "epoch": 0.240875, + "grad_norm": 2.7412068843841553, + "grad_norm_var": 0.19880465575710388, + "learning_rate": 0.0001, + "loss": 1.0415, + "loss/crossentropy": 2.402764081954956, + "loss/hidden": 0.79296875, + "loss/logits": 0.13523459434509277, + "loss/reg": 0.01133043598383665, + "step": 1927 + }, + { + "epoch": 0.241, + "grad_norm": 2.7866973876953125, + "grad_norm_var": 0.12160759981975294, + "learning_rate": 0.0001, + "loss": 1.3153, + "loss/crossentropy": 2.327364683151245, + "loss/hidden": 0.9921875, + "loss/logits": 0.20987921953201294, + "loss/reg": 0.011324429884552956, + "step": 1928 + }, + { + "epoch": 0.241125, + "grad_norm": 2.642634153366089, + "grad_norm_var": 0.1264088206080712, + "learning_rate": 0.0001, + "loss": 1.1514, + "loss/crossentropy": 2.319239616394043, + "loss/hidden": 0.8828125, + "loss/logits": 0.15535815060138702, + "loss/reg": 0.011318519711494446, + "step": 1929 + }, + { + "epoch": 0.24125, + "grad_norm": 2.6170005798339844, + "grad_norm_var": 0.13085757964727993, + "learning_rate": 0.0001, + "loss": 1.1138, + "loss/crossentropy": 2.5678954124450684, + "loss/hidden": 0.84375, + "loss/logits": 0.15691521763801575, + "loss/reg": 0.011312729679048061, + "step": 1930 + }, + { + "epoch": 0.241375, + "grad_norm": 3.271930456161499, + "grad_norm_var": 0.13871901991367938, + "learning_rate": 0.0001, + "loss": 1.0944, + "loss/crossentropy": 2.6121537685394287, + "loss/hidden": 0.82421875, + "loss/logits": 0.15707923471927643, + "loss/reg": 0.01130701508373022, + "step": 1931 + }, + { + "epoch": 0.2415, + "grad_norm": 4.04671573638916, + "grad_norm_var": 0.21855977270830043, + "learning_rate": 0.0001, + "loss": 1.0816, + "loss/crossentropy": 2.6852362155914307, + "loss/hidden": 0.82421875, + "loss/logits": 0.1443198323249817, + "loss/reg": 0.011301231570541859, + "step": 1932 + }, + { + "epoch": 0.241625, + "grad_norm": 2.44777250289917, + "grad_norm_var": 0.22460799214468258, + "learning_rate": 0.0001, + "loss": 1.1546, + "loss/crossentropy": 2.544693946838379, + "loss/hidden": 0.87890625, + "loss/logits": 0.1627037525177002, + "loss/reg": 0.011295530013740063, + "step": 1933 + }, + { + "epoch": 0.24175, + "grad_norm": 3.0974671840667725, + "grad_norm_var": 0.22499515882713012, + "learning_rate": 0.0001, + "loss": 1.2202, + "loss/crossentropy": 2.6100382804870605, + "loss/hidden": 0.9296875, + "loss/logits": 0.1776452660560608, + "loss/reg": 0.011289400048553944, + "step": 1934 + }, + { + "epoch": 0.241875, + "grad_norm": 2.6123459339141846, + "grad_norm_var": 0.21130123911614854, + "learning_rate": 0.0001, + "loss": 1.087, + "loss/crossentropy": 2.5938217639923096, + "loss/hidden": 0.83203125, + "loss/logits": 0.14210623502731323, + "loss/reg": 0.011283115483820438, + "step": 1935 + }, + { + "epoch": 0.242, + "grad_norm": 2.2436466217041016, + "grad_norm_var": 0.24011386438526852, + "learning_rate": 0.0001, + "loss": 1.0124, + "loss/crossentropy": 2.5183980464935303, + "loss/hidden": 0.7734375, + "loss/logits": 0.12614327669143677, + "loss/reg": 0.011277486570179462, + "step": 1936 + }, + { + "epoch": 0.242125, + "grad_norm": 5.555136680603027, + "grad_norm_var": 0.6794719115699573, + "learning_rate": 0.0001, + "loss": 1.1126, + "loss/crossentropy": 2.543240547180176, + "loss/hidden": 0.8671875, + "loss/logits": 0.13274727761745453, + "loss/reg": 0.011271213181316853, + "step": 1937 + }, + { + "epoch": 0.24225, + "grad_norm": 2.610562562942505, + "grad_norm_var": 0.6714344269587417, + "learning_rate": 0.0001, + "loss": 0.8577, + "loss/crossentropy": 2.4972715377807617, + "loss/hidden": 0.6484375, + "loss/logits": 0.09660603851079941, + "loss/reg": 0.01126556284725666, + "step": 1938 + }, + { + "epoch": 0.242375, + "grad_norm": 3.4906604290008545, + "grad_norm_var": 0.6776027391572443, + "learning_rate": 0.0001, + "loss": 1.2131, + "loss/crossentropy": 2.576024293899536, + "loss/hidden": 0.91796875, + "loss/logits": 0.18251577019691467, + "loss/reg": 0.01125932577997446, + "step": 1939 + }, + { + "epoch": 0.2425, + "grad_norm": 2.422001361846924, + "grad_norm_var": 0.6787500956269599, + "learning_rate": 0.0001, + "loss": 0.9879, + "loss/crossentropy": 2.3907670974731445, + "loss/hidden": 0.74609375, + "loss/logits": 0.1292981505393982, + "loss/reg": 0.011253675445914268, + "step": 1940 + }, + { + "epoch": 0.242625, + "grad_norm": 3.6441028118133545, + "grad_norm_var": 0.7034908497749977, + "learning_rate": 0.0001, + "loss": 1.2193, + "loss/crossentropy": 2.268623113632202, + "loss/hidden": 0.921875, + "loss/logits": 0.18494915962219238, + "loss/reg": 0.011247408576309681, + "step": 1941 + }, + { + "epoch": 0.24275, + "grad_norm": 3.2040059566497803, + "grad_norm_var": 0.6784287892697456, + "learning_rate": 0.0001, + "loss": 1.3474, + "loss/crossentropy": 2.369084596633911, + "loss/hidden": 1.0234375, + "loss/logits": 0.21159708499908447, + "loss/reg": 0.011241290718317032, + "step": 1942 + }, + { + "epoch": 0.242875, + "grad_norm": 3.0895087718963623, + "grad_norm_var": 0.6698306293757182, + "learning_rate": 0.0001, + "loss": 1.4156, + "loss/crossentropy": 2.0874648094177246, + "loss/hidden": 1.09375, + "loss/logits": 0.20947730541229248, + "loss/reg": 0.011235379613935947, + "step": 1943 + }, + { + "epoch": 0.243, + "grad_norm": 2.4965028762817383, + "grad_norm_var": 0.687657011627713, + "learning_rate": 0.0001, + "loss": 1.1098, + "loss/crossentropy": 2.210761547088623, + "loss/hidden": 0.8515625, + "loss/logits": 0.14598006010055542, + "loss/reg": 0.011229581199586391, + "step": 1944 + }, + { + "epoch": 0.243125, + "grad_norm": 2.9449803829193115, + "grad_norm_var": 0.6752047525480802, + "learning_rate": 0.0001, + "loss": 1.1668, + "loss/crossentropy": 2.4518120288848877, + "loss/hidden": 0.8671875, + "loss/logits": 0.18732941150665283, + "loss/reg": 0.011223935522139072, + "step": 1945 + }, + { + "epoch": 0.24325, + "grad_norm": 3.066868305206299, + "grad_norm_var": 0.6581535524958798, + "learning_rate": 0.0001, + "loss": 1.29, + "loss/crossentropy": 2.822751522064209, + "loss/hidden": 0.96484375, + "loss/logits": 0.21297214925289154, + "loss/reg": 0.01121827308088541, + "step": 1946 + }, + { + "epoch": 0.243375, + "grad_norm": 2.5983641147613525, + "grad_norm_var": 0.6746843795057048, + "learning_rate": 0.0001, + "loss": 1.1594, + "loss/crossentropy": 2.397672414779663, + "loss/hidden": 0.890625, + "loss/logits": 0.156642884016037, + "loss/reg": 0.01121209841221571, + "step": 1947 + }, + { + "epoch": 0.2435, + "grad_norm": 2.9095447063446045, + "grad_norm_var": 0.6116848502456292, + "learning_rate": 0.0001, + "loss": 1.0915, + "loss/crossentropy": 2.527217149734497, + "loss/hidden": 0.81640625, + "loss/logits": 0.16299590468406677, + "loss/reg": 0.01120635587722063, + "step": 1948 + }, + { + "epoch": 0.243625, + "grad_norm": 5.4240403175354, + "grad_norm_var": 0.9354258383958637, + "learning_rate": 0.0001, + "loss": 1.4684, + "loss/crossentropy": 2.551121473312378, + "loss/hidden": 1.1015625, + "loss/logits": 0.25484058260917664, + "loss/reg": 0.011200698092579842, + "step": 1949 + }, + { + "epoch": 0.24375, + "grad_norm": 3.1100833415985107, + "grad_norm_var": 0.9352412595018517, + "learning_rate": 0.0001, + "loss": 1.2134, + "loss/crossentropy": 2.401557683944702, + "loss/hidden": 0.9140625, + "loss/logits": 0.18738284707069397, + "loss/reg": 0.011194508522748947, + "step": 1950 + }, + { + "epoch": 0.243875, + "grad_norm": 3.456061363220215, + "grad_norm_var": 0.9120604979018725, + "learning_rate": 0.0001, + "loss": 1.1817, + "loss/crossentropy": 2.688356637954712, + "loss/hidden": 0.875, + "loss/logits": 0.19481763243675232, + "loss/reg": 0.011188787408173084, + "step": 1951 + }, + { + "epoch": 0.244, + "grad_norm": 3.2770729064941406, + "grad_norm_var": 0.8378516417593855, + "learning_rate": 0.0001, + "loss": 1.4059, + "loss/crossentropy": 2.156545877456665, + "loss/hidden": 1.09375, + "loss/logits": 0.2003558874130249, + "loss/reg": 0.01118260808289051, + "step": 1952 + }, + { + "epoch": 0.244125, + "grad_norm": 3.4741783142089844, + "grad_norm_var": 0.4914500706308291, + "learning_rate": 0.0001, + "loss": 1.259, + "loss/crossentropy": 2.633277177810669, + "loss/hidden": 0.95703125, + "loss/logits": 0.19024121761322021, + "loss/reg": 0.011176753789186478, + "step": 1953 + }, + { + "epoch": 0.24425, + "grad_norm": 3.2239580154418945, + "grad_norm_var": 0.46666341661654465, + "learning_rate": 0.0001, + "loss": 1.1213, + "loss/crossentropy": 2.5291545391082764, + "loss/hidden": 0.8671875, + "loss/logits": 0.14238449931144714, + "loss/reg": 0.011171228252351284, + "step": 1954 + }, + { + "epoch": 0.244375, + "grad_norm": 2.7568724155426025, + "grad_norm_var": 0.4757426809575984, + "learning_rate": 0.0001, + "loss": 1.0901, + "loss/crossentropy": 2.6687417030334473, + "loss/hidden": 0.82421875, + "loss/logits": 0.15422269701957703, + "loss/reg": 0.01116592064499855, + "step": 1955 + }, + { + "epoch": 0.2445, + "grad_norm": 3.1235272884368896, + "grad_norm_var": 0.4343252933982605, + "learning_rate": 0.0001, + "loss": 1.2777, + "loss/crossentropy": 2.0929863452911377, + "loss/hidden": 1.0078125, + "loss/logits": 0.15828515589237213, + "loss/reg": 0.011160776950418949, + "step": 1956 + }, + { + "epoch": 0.244625, + "grad_norm": 2.8191616535186768, + "grad_norm_var": 0.43213291318467645, + "learning_rate": 0.0001, + "loss": 1.1685, + "loss/crossentropy": 2.562255620956421, + "loss/hidden": 0.87109375, + "loss/logits": 0.1858426034450531, + "loss/reg": 0.011155403219163418, + "step": 1957 + }, + { + "epoch": 0.24475, + "grad_norm": 7.025396823883057, + "grad_norm_var": 1.35403696610659, + "learning_rate": 0.0001, + "loss": 1.313, + "loss/crossentropy": 2.0798208713531494, + "loss/hidden": 1.0546875, + "loss/logits": 0.1467897593975067, + "loss/reg": 0.01114996150135994, + "step": 1958 + }, + { + "epoch": 0.244875, + "grad_norm": 3.6586244106292725, + "grad_norm_var": 1.348840874134669, + "learning_rate": 0.0001, + "loss": 1.0528, + "loss/crossentropy": 2.80499267578125, + "loss/hidden": 0.79296875, + "loss/logits": 0.14840048551559448, + "loss/reg": 0.011143793351948261, + "step": 1959 + }, + { + "epoch": 0.245, + "grad_norm": 3.549360752105713, + "grad_norm_var": 1.2828200422537663, + "learning_rate": 0.0001, + "loss": 1.1735, + "loss/crossentropy": 2.612431764602661, + "loss/hidden": 0.89453125, + "loss/logits": 0.16759154200553894, + "loss/reg": 0.01113795768469572, + "step": 1960 + }, + { + "epoch": 0.245125, + "grad_norm": 3.2019731998443604, + "grad_norm_var": 1.2670343380172766, + "learning_rate": 0.0001, + "loss": 1.2087, + "loss/crossentropy": 2.242535352706909, + "loss/hidden": 0.93359375, + "loss/logits": 0.1637311577796936, + "loss/reg": 0.011132647283375263, + "step": 1961 + }, + { + "epoch": 0.24525, + "grad_norm": 2.65508770942688, + "grad_norm_var": 1.3037293062655784, + "learning_rate": 0.0001, + "loss": 1.066, + "loss/crossentropy": 2.53322172164917, + "loss/hidden": 0.80859375, + "loss/logits": 0.1461862325668335, + "loss/reg": 0.011126426979899406, + "step": 1962 + }, + { + "epoch": 0.245375, + "grad_norm": 2.4140734672546387, + "grad_norm_var": 1.3284114469095243, + "learning_rate": 0.0001, + "loss": 1.1738, + "loss/crossentropy": 2.441091299057007, + "loss/hidden": 0.91015625, + "loss/logits": 0.1524820178747177, + "loss/reg": 0.011120946146547794, + "step": 1963 + }, + { + "epoch": 0.2455, + "grad_norm": 4.649234771728516, + "grad_norm_var": 1.3794622764879967, + "learning_rate": 0.0001, + "loss": 1.2865, + "loss/crossentropy": 2.4274308681488037, + "loss/hidden": 1.015625, + "loss/logits": 0.15970450639724731, + "loss/reg": 0.011115475557744503, + "step": 1964 + }, + { + "epoch": 0.245625, + "grad_norm": 2.690500259399414, + "grad_norm_var": 1.1866477483632742, + "learning_rate": 0.0001, + "loss": 1.0634, + "loss/crossentropy": 2.594061851501465, + "loss/hidden": 0.8125, + "loss/logits": 0.13979078829288483, + "loss/reg": 0.011109288781881332, + "step": 1965 + }, + { + "epoch": 0.24575, + "grad_norm": 3.1549224853515625, + "grad_norm_var": 1.1847841066358076, + "learning_rate": 0.0001, + "loss": 1.037, + "loss/crossentropy": 2.6761364936828613, + "loss/hidden": 0.796875, + "loss/logits": 0.12911373376846313, + "loss/reg": 0.011103102937340736, + "step": 1966 + }, + { + "epoch": 0.245875, + "grad_norm": 2.8074398040771484, + "grad_norm_var": 1.210175941928484, + "learning_rate": 0.0001, + "loss": 1.0844, + "loss/crossentropy": 2.2973666191101074, + "loss/hidden": 0.81640625, + "loss/logits": 0.15705451369285583, + "loss/reg": 0.011097206734120846, + "step": 1967 + }, + { + "epoch": 0.246, + "grad_norm": 2.813947916030884, + "grad_norm_var": 1.2314860795345413, + "learning_rate": 0.0001, + "loss": 1.0939, + "loss/crossentropy": 2.4095160961151123, + "loss/hidden": 0.8359375, + "loss/logits": 0.14708581566810608, + "loss/reg": 0.01109134592115879, + "step": 1968 + }, + { + "epoch": 0.246125, + "grad_norm": 3.996295928955078, + "grad_norm_var": 1.2553489249469367, + "learning_rate": 0.0001, + "loss": 1.203, + "loss/crossentropy": 2.5568830966949463, + "loss/hidden": 0.921875, + "loss/logits": 0.17023617029190063, + "loss/reg": 0.011085476726293564, + "step": 1969 + }, + { + "epoch": 0.24625, + "grad_norm": 3.1737494468688965, + "grad_norm_var": 1.2567437243872042, + "learning_rate": 0.0001, + "loss": 1.19, + "loss/crossentropy": 2.820736885070801, + "loss/hidden": 0.8984375, + "loss/logits": 0.18073034286499023, + "loss/reg": 0.011080092750489712, + "step": 1970 + }, + { + "epoch": 0.246375, + "grad_norm": 3.1411452293395996, + "grad_norm_var": 1.2327325542410335, + "learning_rate": 0.0001, + "loss": 1.0159, + "loss/crossentropy": 2.550328493118286, + "loss/hidden": 0.78515625, + "loss/logits": 0.1200067549943924, + "loss/reg": 0.011074939742684364, + "step": 1971 + }, + { + "epoch": 0.2465, + "grad_norm": 2.7992095947265625, + "grad_norm_var": 1.2525440065906799, + "learning_rate": 0.0001, + "loss": 1.0909, + "loss/crossentropy": 2.4028799533843994, + "loss/hidden": 0.84375, + "loss/logits": 0.13644427061080933, + "loss/reg": 0.011069980449974537, + "step": 1972 + }, + { + "epoch": 0.246625, + "grad_norm": 3.7964675426483154, + "grad_norm_var": 1.2353292289717575, + "learning_rate": 0.0001, + "loss": 1.1583, + "loss/crossentropy": 2.6295735836029053, + "loss/hidden": 0.890625, + "loss/logits": 0.1569831818342209, + "loss/reg": 0.01106490008533001, + "step": 1973 + }, + { + "epoch": 0.24675, + "grad_norm": 2.4510080814361572, + "grad_norm_var": 0.3749246635942266, + "learning_rate": 0.0001, + "loss": 1.0464, + "loss/crossentropy": 2.5560388565063477, + "loss/hidden": 0.79296875, + "loss/logits": 0.14279651641845703, + "loss/reg": 0.011059917509555817, + "step": 1974 + }, + { + "epoch": 0.246875, + "grad_norm": 2.3039627075195312, + "grad_norm_var": 0.4039935905054485, + "learning_rate": 0.0001, + "loss": 0.9695, + "loss/crossentropy": 2.500410556793213, + "loss/hidden": 0.7421875, + "loss/logits": 0.11676067113876343, + "loss/reg": 0.011055227369070053, + "step": 1975 + }, + { + "epoch": 0.247, + "grad_norm": 2.6313462257385254, + "grad_norm_var": 0.4016504793812447, + "learning_rate": 0.0001, + "loss": 1.1421, + "loss/crossentropy": 2.510258197784424, + "loss/hidden": 0.87109375, + "loss/logits": 0.16046884655952454, + "loss/reg": 0.011049150489270687, + "step": 1976 + }, + { + "epoch": 0.247125, + "grad_norm": 3.447317361831665, + "grad_norm_var": 0.41062862008729606, + "learning_rate": 0.0001, + "loss": 1.2887, + "loss/crossentropy": 2.2219176292419434, + "loss/hidden": 1.0, + "loss/logits": 0.17826291918754578, + "loss/reg": 0.011043058708310127, + "step": 1977 + }, + { + "epoch": 0.24725, + "grad_norm": 2.774780035018921, + "grad_norm_var": 0.40509622860432326, + "learning_rate": 0.0001, + "loss": 1.1503, + "loss/crossentropy": 2.6890835762023926, + "loss/hidden": 0.8671875, + "loss/logits": 0.1727658063173294, + "loss/reg": 0.011036898009479046, + "step": 1978 + }, + { + "epoch": 0.247375, + "grad_norm": 2.6805171966552734, + "grad_norm_var": 0.38639654731341744, + "learning_rate": 0.0001, + "loss": 1.063, + "loss/crossentropy": 2.730665683746338, + "loss/hidden": 0.80859375, + "loss/logits": 0.14413277804851532, + "loss/reg": 0.01103129331022501, + "step": 1979 + }, + { + "epoch": 0.2475, + "grad_norm": 2.5546305179595947, + "grad_norm_var": 0.2229060548882482, + "learning_rate": 0.0001, + "loss": 1.0903, + "loss/crossentropy": 2.460444688796997, + "loss/hidden": 0.84375, + "loss/logits": 0.13627831637859344, + "loss/reg": 0.011025946587324142, + "step": 1980 + }, + { + "epoch": 0.247625, + "grad_norm": 3.147676706314087, + "grad_norm_var": 0.22008522732602553, + "learning_rate": 0.0001, + "loss": 1.0293, + "loss/crossentropy": 2.780017614364624, + "loss/hidden": 0.78515625, + "loss/logits": 0.13396324217319489, + "loss/reg": 0.011021067388355732, + "step": 1981 + }, + { + "epoch": 0.24775, + "grad_norm": 2.545849561691284, + "grad_norm_var": 0.22903709663108504, + "learning_rate": 0.0001, + "loss": 1.0831, + "loss/crossentropy": 2.723945379257202, + "loss/hidden": 0.84375, + "loss/logits": 0.1291472613811493, + "loss/reg": 0.011016342788934708, + "step": 1982 + }, + { + "epoch": 0.247875, + "grad_norm": 3.5800068378448486, + "grad_norm_var": 0.2525227852681437, + "learning_rate": 0.0001, + "loss": 1.7206, + "loss/crossentropy": 1.9947932958602905, + "loss/hidden": 1.3203125, + "loss/logits": 0.29013922810554504, + "loss/reg": 0.01101152878254652, + "step": 1983 + }, + { + "epoch": 0.248, + "grad_norm": 2.3426930904388428, + "grad_norm_var": 0.27745670304271336, + "learning_rate": 0.0001, + "loss": 1.0741, + "loss/crossentropy": 2.498370885848999, + "loss/hidden": 0.82421875, + "loss/logits": 0.13984939455986023, + "loss/reg": 0.011006931774318218, + "step": 1984 + }, + { + "epoch": 0.248125, + "grad_norm": 2.4072375297546387, + "grad_norm_var": 0.21579937260172882, + "learning_rate": 0.0001, + "loss": 1.1224, + "loss/crossentropy": 2.3923802375793457, + "loss/hidden": 0.83984375, + "loss/logits": 0.17257341742515564, + "loss/reg": 0.011000873520970345, + "step": 1985 + }, + { + "epoch": 0.24825, + "grad_norm": 4.206304550170898, + "grad_norm_var": 0.32547872452598553, + "learning_rate": 0.0001, + "loss": 1.3701, + "loss/crossentropy": 2.5781798362731934, + "loss/hidden": 1.0234375, + "loss/logits": 0.23670879006385803, + "loss/reg": 0.010995452292263508, + "step": 1986 + }, + { + "epoch": 0.248375, + "grad_norm": 5.367579460144043, + "grad_norm_var": 0.6992678587503592, + "learning_rate": 0.0001, + "loss": 1.381, + "loss/crossentropy": 2.560682535171509, + "loss/hidden": 1.109375, + "loss/logits": 0.16169926524162292, + "loss/reg": 0.010989362373948097, + "step": 1987 + }, + { + "epoch": 0.2485, + "grad_norm": 2.6005303859710693, + "grad_norm_var": 0.708770234220439, + "learning_rate": 0.0001, + "loss": 1.0953, + "loss/crossentropy": 2.4954795837402344, + "loss/hidden": 0.8359375, + "loss/logits": 0.149558424949646, + "loss/reg": 0.010983383283019066, + "step": 1988 + }, + { + "epoch": 0.248625, + "grad_norm": 3.366856098175049, + "grad_norm_var": 0.6776825224044613, + "learning_rate": 0.0001, + "loss": 1.1398, + "loss/crossentropy": 2.572629451751709, + "loss/hidden": 0.8515625, + "loss/logits": 0.17850361764431, + "loss/reg": 0.01097831316292286, + "step": 1989 + }, + { + "epoch": 0.24875, + "grad_norm": 2.9385433197021484, + "grad_norm_var": 0.6551923075237018, + "learning_rate": 0.0001, + "loss": 1.114, + "loss/crossentropy": 2.4854507446289062, + "loss/hidden": 0.83203125, + "loss/logits": 0.17227290570735931, + "loss/reg": 0.010973170399665833, + "step": 1990 + }, + { + "epoch": 0.248875, + "grad_norm": 2.1237387657165527, + "grad_norm_var": 0.675293446442898, + "learning_rate": 0.0001, + "loss": 1.0324, + "loss/crossentropy": 2.4779257774353027, + "loss/hidden": 0.78125, + "loss/logits": 0.14143508672714233, + "loss/reg": 0.01096703764051199, + "step": 1991 + }, + { + "epoch": 0.249, + "grad_norm": 17.35088539123535, + "grad_norm_var": 13.405545245645559, + "learning_rate": 0.0001, + "loss": 1.5438, + "loss/crossentropy": 2.1613028049468994, + "loss/hidden": 1.2265625, + "loss/logits": 0.20763814449310303, + "loss/reg": 0.010961621068418026, + "step": 1992 + }, + { + "epoch": 0.249125, + "grad_norm": 3.643308162689209, + "grad_norm_var": 13.394425808799774, + "learning_rate": 0.0001, + "loss": 1.5124, + "loss/crossentropy": 2.740511894226074, + "loss/hidden": 1.125, + "loss/logits": 0.27779725193977356, + "loss/reg": 0.010955520905554295, + "step": 1993 + }, + { + "epoch": 0.24925, + "grad_norm": 2.6261227130889893, + "grad_norm_var": 13.419635101303081, + "learning_rate": 0.0001, + "loss": 1.1103, + "loss/crossentropy": 2.4874839782714844, + "loss/hidden": 0.859375, + "loss/logits": 0.14138638973236084, + "loss/reg": 0.010949719697237015, + "step": 1994 + }, + { + "epoch": 0.249375, + "grad_norm": 2.8772454261779785, + "grad_norm_var": 13.388291796772698, + "learning_rate": 0.0001, + "loss": 1.1465, + "loss/crossentropy": 2.4892168045043945, + "loss/hidden": 0.875, + "loss/logits": 0.1620655208826065, + "loss/reg": 0.010943672619760036, + "step": 1995 + }, + { + "epoch": 0.2495, + "grad_norm": 4.019129753112793, + "grad_norm_var": 13.244021829599607, + "learning_rate": 0.0001, + "loss": 1.1853, + "loss/crossentropy": 2.284893751144409, + "loss/hidden": 0.90234375, + "loss/logits": 0.17355592548847198, + "loss/reg": 0.010938170365989208, + "step": 1996 + }, + { + "epoch": 0.249625, + "grad_norm": 3.0385830402374268, + "grad_norm_var": 13.258203172483755, + "learning_rate": 0.0001, + "loss": 1.1565, + "loss/crossentropy": 2.5010361671447754, + "loss/hidden": 0.8671875, + "loss/logits": 0.18000578880310059, + "loss/reg": 0.010932603850960732, + "step": 1997 + }, + { + "epoch": 0.24975, + "grad_norm": 2.5925393104553223, + "grad_norm_var": 13.248884346858516, + "learning_rate": 0.0001, + "loss": 1.083, + "loss/crossentropy": 2.6594510078430176, + "loss/hidden": 0.8359375, + "loss/logits": 0.1377822607755661, + "loss/reg": 0.010927069000899792, + "step": 1998 + }, + { + "epoch": 0.249875, + "grad_norm": 2.673877239227295, + "grad_norm_var": 13.359108718093754, + "learning_rate": 0.0001, + "loss": 0.9031, + "loss/crossentropy": 2.683049440383911, + "loss/hidden": 0.67578125, + "loss/logits": 0.11807288229465485, + "loss/reg": 0.010921536944806576, + "step": 1999 + }, + { + "epoch": 0.25, + "grad_norm": 2.3662562370300293, + "grad_norm_var": 13.353902173571798, + "learning_rate": 0.0001, + "loss": 1.0005, + "loss/crossentropy": 2.5584921836853027, + "loss/hidden": 0.765625, + "loss/logits": 0.1257750689983368, + "loss/reg": 0.010914883576333523, + "step": 2000 + }, + { + "epoch": 0.250125, + "grad_norm": 3.3031492233276367, + "grad_norm_var": 13.212321254571746, + "learning_rate": 0.0001, + "loss": 1.208, + "loss/crossentropy": 2.3520901203155518, + "loss/hidden": 0.9296875, + "loss/logits": 0.169231578707695, + "loss/reg": 0.010908186435699463, + "step": 2001 + }, + { + "epoch": 0.25025, + "grad_norm": 2.7960500717163086, + "grad_norm_var": 13.310694553026417, + "learning_rate": 0.0001, + "loss": 1.196, + "loss/crossentropy": 2.5724170207977295, + "loss/hidden": 0.9140625, + "loss/logits": 0.17287981510162354, + "loss/reg": 0.010901669040322304, + "step": 2002 + }, + { + "epoch": 0.250375, + "grad_norm": 2.519015073776245, + "grad_norm_var": 13.29092922248683, + "learning_rate": 0.0001, + "loss": 1.0908, + "loss/crossentropy": 2.4606783390045166, + "loss/hidden": 0.83203125, + "loss/logits": 0.14983445405960083, + "loss/reg": 0.010896313935518265, + "step": 2003 + }, + { + "epoch": 0.2505, + "grad_norm": 4.033186912536621, + "grad_norm_var": 13.189659268332564, + "learning_rate": 0.0001, + "loss": 1.1695, + "loss/crossentropy": 2.404235363006592, + "loss/hidden": 0.9140625, + "loss/logits": 0.1464817225933075, + "loss/reg": 0.010891149751842022, + "step": 2004 + }, + { + "epoch": 0.250625, + "grad_norm": 2.4134268760681152, + "grad_norm_var": 13.313203898618182, + "learning_rate": 0.0001, + "loss": 1.0413, + "loss/crossentropy": 2.5215063095092773, + "loss/hidden": 0.78125, + "loss/logits": 0.15116176009178162, + "loss/reg": 0.010884839110076427, + "step": 2005 + }, + { + "epoch": 0.25075, + "grad_norm": 4.4074788093566895, + "grad_norm_var": 13.273036491395748, + "learning_rate": 0.0001, + "loss": 1.4021, + "loss/crossentropy": 2.1157116889953613, + "loss/hidden": 1.0546875, + "loss/logits": 0.23859882354736328, + "loss/reg": 0.010878421366214752, + "step": 2006 + }, + { + "epoch": 0.250875, + "grad_norm": 5.314060211181641, + "grad_norm_var": 13.143382840425524, + "learning_rate": 0.0001, + "loss": 1.2215, + "loss/crossentropy": 2.8061904907226562, + "loss/hidden": 0.9375, + "loss/logits": 0.17523032426834106, + "loss/reg": 0.010873752646148205, + "step": 2007 + }, + { + "epoch": 0.251, + "grad_norm": 3.8481509685516357, + "grad_norm_var": 0.7243167107963312, + "learning_rate": 0.0001, + "loss": 1.3794, + "loss/crossentropy": 2.1161513328552246, + "loss/hidden": 1.078125, + "loss/logits": 0.1925767958164215, + "loss/reg": 0.010869134217500687, + "step": 2008 + }, + { + "epoch": 0.251125, + "grad_norm": 3.040430784225464, + "grad_norm_var": 0.7177866935255468, + "learning_rate": 0.0001, + "loss": 1.2114, + "loss/crossentropy": 2.319920301437378, + "loss/hidden": 0.9296875, + "loss/logits": 0.17311570048332214, + "loss/reg": 0.010863146744668484, + "step": 2009 + }, + { + "epoch": 0.25125, + "grad_norm": 3.2466933727264404, + "grad_norm_var": 0.6909136087633553, + "learning_rate": 0.0001, + "loss": 1.2099, + "loss/crossentropy": 2.736701250076294, + "loss/hidden": 0.9140625, + "loss/logits": 0.18722936511039734, + "loss/reg": 0.010857407003641129, + "step": 2010 + }, + { + "epoch": 0.251375, + "grad_norm": 3.346977710723877, + "grad_norm_var": 0.6794429250422503, + "learning_rate": 0.0001, + "loss": 1.1144, + "loss/crossentropy": 2.411923408508301, + "loss/hidden": 0.84375, + "loss/logits": 0.1621374785900116, + "loss/reg": 0.010851394385099411, + "step": 2011 + }, + { + "epoch": 0.2515, + "grad_norm": 2.475320339202881, + "grad_norm_var": 0.6824211926172303, + "learning_rate": 0.0001, + "loss": 1.0638, + "loss/crossentropy": 2.391957998275757, + "loss/hidden": 0.8046875, + "loss/logits": 0.15069535374641418, + "loss/reg": 0.010845213197171688, + "step": 2012 + }, + { + "epoch": 0.251625, + "grad_norm": 3.6020660400390625, + "grad_norm_var": 0.6891278375376554, + "learning_rate": 0.0001, + "loss": 1.0331, + "loss/crossentropy": 2.573591947555542, + "loss/hidden": 0.78515625, + "loss/logits": 0.13958533108234406, + "loss/reg": 0.010839528404176235, + "step": 2013 + }, + { + "epoch": 0.25175, + "grad_norm": 3.1850457191467285, + "grad_norm_var": 0.6592346442265646, + "learning_rate": 0.0001, + "loss": 1.1971, + "loss/crossentropy": 2.504432439804077, + "loss/hidden": 0.92578125, + "loss/logits": 0.16294726729393005, + "loss/reg": 0.01083376631140709, + "step": 2014 + }, + { + "epoch": 0.251875, + "grad_norm": 4.146329879760742, + "grad_norm_var": 0.6746247811027994, + "learning_rate": 0.0001, + "loss": 1.218, + "loss/crossentropy": 2.2574198246002197, + "loss/hidden": 0.9453125, + "loss/logits": 0.16444240510463715, + "loss/reg": 0.010828280821442604, + "step": 2015 + }, + { + "epoch": 0.252, + "grad_norm": 2.453033447265625, + "grad_norm_var": 0.6633924045555346, + "learning_rate": 0.0001, + "loss": 0.9995, + "loss/crossentropy": 2.3319449424743652, + "loss/hidden": 0.7734375, + "loss/logits": 0.11781074106693268, + "loss/reg": 0.01082212757319212, + "step": 2016 + }, + { + "epoch": 0.252125, + "grad_norm": 3.5673329830169678, + "grad_norm_var": 0.6649364492837339, + "learning_rate": 0.0001, + "loss": 1.1255, + "loss/crossentropy": 2.299909830093384, + "loss/hidden": 0.8671875, + "loss/logits": 0.15017305314540863, + "loss/reg": 0.010815980844199657, + "step": 2017 + }, + { + "epoch": 0.25225, + "grad_norm": 3.0831964015960693, + "grad_norm_var": 0.6469797521622468, + "learning_rate": 0.0001, + "loss": 1.0967, + "loss/crossentropy": 2.3248322010040283, + "loss/hidden": 0.84765625, + "loss/logits": 0.14093933999538422, + "loss/reg": 0.01080994587391615, + "step": 2018 + }, + { + "epoch": 0.252375, + "grad_norm": 3.40370512008667, + "grad_norm_var": 0.5899000738053907, + "learning_rate": 0.0001, + "loss": 0.994, + "loss/crossentropy": 2.258739471435547, + "loss/hidden": 0.765625, + "loss/logits": 0.1203707754611969, + "loss/reg": 0.010804006829857826, + "step": 2019 + }, + { + "epoch": 0.2525, + "grad_norm": 24.29651641845703, + "grad_norm_var": 27.766322176629185, + "learning_rate": 0.0001, + "loss": 1.2966, + "loss/crossentropy": 2.150182008743286, + "loss/hidden": 1.0234375, + "loss/logits": 0.16513843834400177, + "loss/reg": 0.010797837749123573, + "step": 2020 + }, + { + "epoch": 0.252625, + "grad_norm": 6.50246000289917, + "grad_norm_var": 27.54322498539218, + "learning_rate": 0.0001, + "loss": 1.207, + "loss/crossentropy": 2.768232583999634, + "loss/hidden": 0.90625, + "loss/logits": 0.19279730319976807, + "loss/reg": 0.010791877284646034, + "step": 2021 + }, + { + "epoch": 0.25275, + "grad_norm": 2.8173868656158447, + "grad_norm_var": 27.825795280013214, + "learning_rate": 0.0001, + "loss": 1.1749, + "loss/crossentropy": 2.713151454925537, + "loss/hidden": 0.90625, + "loss/logits": 0.1608140617609024, + "loss/reg": 0.01078594010323286, + "step": 2022 + }, + { + "epoch": 0.252875, + "grad_norm": 2.4808120727539062, + "grad_norm_var": 28.169399901683587, + "learning_rate": 0.0001, + "loss": 1.105, + "loss/crossentropy": 2.4170539379119873, + "loss/hidden": 0.8515625, + "loss/logits": 0.14562010765075684, + "loss/reg": 0.010779998265206814, + "step": 2023 + }, + { + "epoch": 0.253, + "grad_norm": 5.388950824737549, + "grad_norm_var": 28.138981383314007, + "learning_rate": 0.0001, + "loss": 1.3457, + "loss/crossentropy": 2.666219711303711, + "loss/hidden": 1.078125, + "loss/logits": 0.15986664593219757, + "loss/reg": 0.01077408716082573, + "step": 2024 + }, + { + "epoch": 0.253125, + "grad_norm": 3.080457925796509, + "grad_norm_var": 28.129611976156532, + "learning_rate": 0.0001, + "loss": 1.1759, + "loss/crossentropy": 2.418120861053467, + "loss/hidden": 0.89453125, + "loss/logits": 0.1737123727798462, + "loss/reg": 0.010768269188702106, + "step": 2025 + }, + { + "epoch": 0.25325, + "grad_norm": 2.7178633213043213, + "grad_norm_var": 28.257833064724025, + "learning_rate": 0.0001, + "loss": 1.0728, + "loss/crossentropy": 2.368070363998413, + "loss/hidden": 0.82421875, + "loss/logits": 0.1410026252269745, + "loss/reg": 0.01076251920312643, + "step": 2026 + }, + { + "epoch": 0.253375, + "grad_norm": 2.7056283950805664, + "grad_norm_var": 28.406444024574483, + "learning_rate": 0.0001, + "loss": 1.0374, + "loss/crossentropy": 2.5718796253204346, + "loss/hidden": 0.796875, + "loss/logits": 0.13298764824867249, + "loss/reg": 0.010756811127066612, + "step": 2027 + }, + { + "epoch": 0.2535, + "grad_norm": 3.140803337097168, + "grad_norm_var": 28.232809207386662, + "learning_rate": 0.0001, + "loss": 1.2094, + "loss/crossentropy": 2.7838029861450195, + "loss/hidden": 0.92578125, + "loss/logits": 0.176119863986969, + "loss/reg": 0.010750662535429, + "step": 2028 + }, + { + "epoch": 0.253625, + "grad_norm": 3.3981008529663086, + "grad_norm_var": 28.26759933011211, + "learning_rate": 0.0001, + "loss": 1.4477, + "loss/crossentropy": 1.9992707967758179, + "loss/hidden": 1.140625, + "loss/logits": 0.19963175058364868, + "loss/reg": 0.010744369588792324, + "step": 2029 + }, + { + "epoch": 0.25375, + "grad_norm": 3.8234119415283203, + "grad_norm_var": 28.15791128049958, + "learning_rate": 0.0001, + "loss": 1.1762, + "loss/crossentropy": 2.8387129306793213, + "loss/hidden": 0.89453125, + "loss/logits": 0.1743220090866089, + "loss/reg": 0.010738285258412361, + "step": 2030 + }, + { + "epoch": 0.253875, + "grad_norm": 2.742777109146118, + "grad_norm_var": 28.405771184857475, + "learning_rate": 0.0001, + "loss": 1.1515, + "loss/crossentropy": 2.638526201248169, + "loss/hidden": 0.8515625, + "loss/logits": 0.19261294603347778, + "loss/reg": 0.010731999762356281, + "step": 2031 + }, + { + "epoch": 0.254, + "grad_norm": 3.0282509326934814, + "grad_norm_var": 28.25218921528566, + "learning_rate": 0.0001, + "loss": 1.0424, + "loss/crossentropy": 2.668217182159424, + "loss/hidden": 0.79296875, + "loss/logits": 0.1421516239643097, + "loss/reg": 0.01072592195123434, + "step": 2032 + }, + { + "epoch": 0.254125, + "grad_norm": 4.297961711883545, + "grad_norm_var": 28.16925913255387, + "learning_rate": 0.0001, + "loss": 1.2974, + "loss/crossentropy": 2.6681506633758545, + "loss/hidden": 1.0234375, + "loss/logits": 0.1667848825454712, + "loss/reg": 0.010719675570726395, + "step": 2033 + }, + { + "epoch": 0.25425, + "grad_norm": 2.805267810821533, + "grad_norm_var": 28.23795753375522, + "learning_rate": 0.0001, + "loss": 1.227, + "loss/crossentropy": 2.2722957134246826, + "loss/hidden": 0.9375, + "loss/logits": 0.18238624930381775, + "loss/reg": 0.010713436640799046, + "step": 2034 + }, + { + "epoch": 0.254375, + "grad_norm": 4.684373378753662, + "grad_norm_var": 28.10384957392755, + "learning_rate": 0.0001, + "loss": 1.1652, + "loss/crossentropy": 2.4375510215759277, + "loss/hidden": 0.8828125, + "loss/logits": 0.1753455549478531, + "loss/reg": 0.010707871057093143, + "step": 2035 + }, + { + "epoch": 0.2545, + "grad_norm": 2.7251429557800293, + "grad_norm_var": 1.3107766388586657, + "learning_rate": 0.0001, + "loss": 1.1256, + "loss/crossentropy": 2.449026346206665, + "loss/hidden": 0.86328125, + "loss/logits": 0.1552899032831192, + "loss/reg": 0.01070234552025795, + "step": 2036 + }, + { + "epoch": 0.254625, + "grad_norm": 3.2145721912384033, + "grad_norm_var": 0.6794870541858852, + "learning_rate": 0.0001, + "loss": 1.0623, + "loss/crossentropy": 2.2772269248962402, + "loss/hidden": 0.82421875, + "loss/logits": 0.1311383992433548, + "loss/reg": 0.01069684885442257, + "step": 2037 + }, + { + "epoch": 0.25475, + "grad_norm": 2.4735593795776367, + "grad_norm_var": 0.709721747436839, + "learning_rate": 0.0001, + "loss": 1.1294, + "loss/crossentropy": 2.367722749710083, + "loss/hidden": 0.86328125, + "loss/logits": 0.15924035012722015, + "loss/reg": 0.010690837167203426, + "step": 2038 + }, + { + "epoch": 0.254875, + "grad_norm": 6.48469877243042, + "grad_norm_var": 1.2774131324666462, + "learning_rate": 0.0001, + "loss": 1.2165, + "loss/crossentropy": 3.019282102584839, + "loss/hidden": 0.953125, + "loss/logits": 0.15649846196174622, + "loss/reg": 0.010685018263757229, + "step": 2039 + }, + { + "epoch": 0.255, + "grad_norm": 3.2317538261413574, + "grad_norm_var": 1.0377410880342928, + "learning_rate": 0.0001, + "loss": 1.0649, + "loss/crossentropy": 2.4050068855285645, + "loss/hidden": 0.81640625, + "loss/logits": 0.14170151948928833, + "loss/reg": 0.010679157450795174, + "step": 2040 + }, + { + "epoch": 0.255125, + "grad_norm": 2.378143072128296, + "grad_norm_var": 1.0993964804936218, + "learning_rate": 0.0001, + "loss": 1.0464, + "loss/crossentropy": 2.6661128997802734, + "loss/hidden": 0.80859375, + "loss/logits": 0.13103529810905457, + "loss/reg": 0.01067331898957491, + "step": 2041 + }, + { + "epoch": 0.25525, + "grad_norm": 2.7865211963653564, + "grad_norm_var": 1.0937599196755132, + "learning_rate": 0.0001, + "loss": 1.1499, + "loss/crossentropy": 2.1714141368865967, + "loss/hidden": 0.87890625, + "loss/logits": 0.16433167457580566, + "loss/reg": 0.010667411610484123, + "step": 2042 + }, + { + "epoch": 0.255375, + "grad_norm": 3.5598180294036865, + "grad_norm_var": 1.0636889545782036, + "learning_rate": 0.0001, + "loss": 0.9551, + "loss/crossentropy": 2.5189130306243896, + "loss/hidden": 0.734375, + "loss/logits": 0.11414233595132828, + "loss/reg": 0.010661386884748936, + "step": 2043 + }, + { + "epoch": 0.2555, + "grad_norm": 2.490208148956299, + "grad_norm_var": 1.1146618244138533, + "learning_rate": 0.0001, + "loss": 0.9578, + "loss/crossentropy": 2.6771488189697266, + "loss/hidden": 0.7421875, + "loss/logits": 0.1090717688202858, + "loss/reg": 0.010655377060174942, + "step": 2044 + }, + { + "epoch": 0.255625, + "grad_norm": 2.7985575199127197, + "grad_norm_var": 1.1359032582995399, + "learning_rate": 0.0001, + "loss": 1.37, + "loss/crossentropy": 2.0045809745788574, + "loss/hidden": 1.078125, + "loss/logits": 0.18535012006759644, + "loss/reg": 0.010649108327925205, + "step": 2045 + }, + { + "epoch": 0.25575, + "grad_norm": 4.170068740844727, + "grad_norm_var": 1.165512079520861, + "learning_rate": 0.0001, + "loss": 1.4862, + "loss/crossentropy": 2.2638754844665527, + "loss/hidden": 1.1796875, + "loss/logits": 0.2001211941242218, + "loss/reg": 0.01064310409128666, + "step": 2046 + }, + { + "epoch": 0.255875, + "grad_norm": 2.69113826751709, + "grad_norm_var": 1.1699764864590407, + "learning_rate": 0.0001, + "loss": 1.0754, + "loss/crossentropy": 2.516594171524048, + "loss/hidden": 0.8125, + "loss/logits": 0.1565219610929489, + "loss/reg": 0.010636772960424423, + "step": 2047 + }, + { + "epoch": 0.256, + "grad_norm": 3.8640964031219482, + "grad_norm_var": 1.1762510392154117, + "learning_rate": 0.0001, + "loss": 1.2097, + "loss/crossentropy": 2.5563271045684814, + "loss/hidden": 0.94921875, + "loss/logits": 0.15413978695869446, + "loss/reg": 0.010630756616592407, + "step": 2048 + }, + { + "epoch": 0.256125, + "grad_norm": 3.5007386207580566, + "grad_norm_var": 1.1222236767626848, + "learning_rate": 0.0001, + "loss": 1.1677, + "loss/crossentropy": 2.202484607696533, + "loss/hidden": 0.875, + "loss/logits": 0.18647682666778564, + "loss/reg": 0.010624704882502556, + "step": 2049 + }, + { + "epoch": 0.25625, + "grad_norm": 3.732865571975708, + "grad_norm_var": 1.106629288681254, + "learning_rate": 0.0001, + "loss": 1.1005, + "loss/crossentropy": 2.5272040367126465, + "loss/hidden": 0.85546875, + "loss/logits": 0.13880380988121033, + "loss/reg": 0.010618180967867374, + "step": 2050 + }, + { + "epoch": 0.256375, + "grad_norm": 2.9021427631378174, + "grad_norm_var": 1.0056809489854572, + "learning_rate": 0.0001, + "loss": 1.4065, + "loss/crossentropy": 2.332169771194458, + "loss/hidden": 1.0703125, + "loss/logits": 0.23006314039230347, + "loss/reg": 0.010611588135361671, + "step": 2051 + }, + { + "epoch": 0.2565, + "grad_norm": 2.165713310241699, + "grad_norm_var": 1.0690711365888386, + "learning_rate": 0.0001, + "loss": 0.9693, + "loss/crossentropy": 2.5843775272369385, + "loss/hidden": 0.74609375, + "loss/logits": 0.11719683557748795, + "loss/reg": 0.010605687275528908, + "step": 2052 + }, + { + "epoch": 0.256625, + "grad_norm": 6.2252373695373535, + "grad_norm_var": 1.6102017754294553, + "learning_rate": 0.0001, + "loss": 1.1227, + "loss/crossentropy": 2.7939460277557373, + "loss/hidden": 0.859375, + "loss/logits": 0.15731649100780487, + "loss/reg": 0.010599389672279358, + "step": 2053 + }, + { + "epoch": 0.25675, + "grad_norm": 2.5095605850219727, + "grad_norm_var": 1.6055191280749035, + "learning_rate": 0.0001, + "loss": 1.1263, + "loss/crossentropy": 2.7344400882720947, + "loss/hidden": 0.8671875, + "loss/logits": 0.15321476757526398, + "loss/reg": 0.010593513958156109, + "step": 2054 + }, + { + "epoch": 0.256875, + "grad_norm": 3.13476300239563, + "grad_norm_var": 0.9595565999441523, + "learning_rate": 0.0001, + "loss": 1.091, + "loss/crossentropy": 2.842742443084717, + "loss/hidden": 0.84375, + "loss/logits": 0.14142204821109772, + "loss/reg": 0.010587593540549278, + "step": 2055 + }, + { + "epoch": 0.257, + "grad_norm": 3.0338258743286133, + "grad_norm_var": 0.9627196945380205, + "learning_rate": 0.0001, + "loss": 1.1247, + "loss/crossentropy": 2.101410150527954, + "loss/hidden": 0.890625, + "loss/logits": 0.12829464673995972, + "loss/reg": 0.010581732727587223, + "step": 2056 + }, + { + "epoch": 0.257125, + "grad_norm": 2.944615125656128, + "grad_norm_var": 0.9171915381915218, + "learning_rate": 0.0001, + "loss": 1.1247, + "loss/crossentropy": 2.3841867446899414, + "loss/hidden": 0.85546875, + "loss/logits": 0.1634388417005539, + "loss/reg": 0.010575841180980206, + "step": 2057 + }, + { + "epoch": 0.25725, + "grad_norm": 5.067098140716553, + "grad_norm_var": 1.0916327868550408, + "learning_rate": 0.0001, + "loss": 1.0682, + "loss/crossentropy": 2.4407575130462646, + "loss/hidden": 0.80859375, + "loss/logits": 0.15389126539230347, + "loss/reg": 0.01057005301117897, + "step": 2058 + }, + { + "epoch": 0.257375, + "grad_norm": 8.683815956115723, + "grad_norm_var": 2.8251079920494844, + "learning_rate": 0.0001, + "loss": 1.9453, + "loss/crossentropy": 1.8485121726989746, + "loss/hidden": 1.6015625, + "loss/logits": 0.23812265694141388, + "loss/reg": 0.010564405471086502, + "step": 2059 + }, + { + "epoch": 0.2575, + "grad_norm": 3.6597707271575928, + "grad_norm_var": 2.7149800725151727, + "learning_rate": 0.0001, + "loss": 1.2665, + "loss/crossentropy": 2.4700844287872314, + "loss/hidden": 0.96484375, + "loss/logits": 0.19607657194137573, + "loss/reg": 0.010558774694800377, + "step": 2060 + }, + { + "epoch": 0.257625, + "grad_norm": 2.560063362121582, + "grad_norm_var": 2.750944581783611, + "learning_rate": 0.0001, + "loss": 1.0961, + "loss/crossentropy": 2.39172101020813, + "loss/hidden": 0.8359375, + "loss/logits": 0.15458577871322632, + "loss/reg": 0.010553127154707909, + "step": 2061 + }, + { + "epoch": 0.25775, + "grad_norm": 3.3876848220825195, + "grad_norm_var": 2.7508943355095568, + "learning_rate": 0.0001, + "loss": 1.2258, + "loss/crossentropy": 2.5666110515594482, + "loss/hidden": 0.953125, + "loss/logits": 0.16717703640460968, + "loss/reg": 0.010547193698585033, + "step": 2062 + }, + { + "epoch": 0.257875, + "grad_norm": 2.643869161605835, + "grad_norm_var": 2.7577323773001448, + "learning_rate": 0.0001, + "loss": 1.0521, + "loss/crossentropy": 2.709843873977661, + "loss/hidden": 0.80859375, + "loss/logits": 0.13805679976940155, + "loss/reg": 0.010541180148720741, + "step": 2063 + }, + { + "epoch": 0.258, + "grad_norm": 2.5117688179016113, + "grad_norm_var": 2.851637725733224, + "learning_rate": 0.0001, + "loss": 1.0321, + "loss/crossentropy": 2.448089838027954, + "loss/hidden": 0.80078125, + "loss/logits": 0.1259264349937439, + "loss/reg": 0.010535228997468948, + "step": 2064 + }, + { + "epoch": 0.258125, + "grad_norm": 2.239600419998169, + "grad_norm_var": 2.978910235283494, + "learning_rate": 0.0001, + "loss": 0.9866, + "loss/crossentropy": 2.721714973449707, + "loss/hidden": 0.75, + "loss/logits": 0.13132566213607788, + "loss/reg": 0.010529426857829094, + "step": 2065 + }, + { + "epoch": 0.25825, + "grad_norm": 4.718794345855713, + "grad_norm_var": 3.058753376089699, + "learning_rate": 0.0001, + "loss": 1.4274, + "loss/crossentropy": 2.514378786087036, + "loss/hidden": 1.1484375, + "loss/logits": 0.1737312525510788, + "loss/reg": 0.010523487813770771, + "step": 2066 + }, + { + "epoch": 0.258375, + "grad_norm": 3.8948166370391846, + "grad_norm_var": 3.0214537750653903, + "learning_rate": 0.0001, + "loss": 1.3045, + "loss/crossentropy": 2.377101421356201, + "loss/hidden": 1.03125, + "loss/logits": 0.16806933283805847, + "loss/reg": 0.010517789050936699, + "step": 2067 + }, + { + "epoch": 0.2585, + "grad_norm": 3.5205633640289307, + "grad_norm_var": 2.856972615558527, + "learning_rate": 0.0001, + "loss": 1.1546, + "loss/crossentropy": 2.632283926010132, + "loss/hidden": 0.89453125, + "loss/logits": 0.15493813157081604, + "loss/reg": 0.01051197201013565, + "step": 2068 + }, + { + "epoch": 0.258625, + "grad_norm": 3.328749179840088, + "grad_norm_var": 2.443154032499194, + "learning_rate": 0.0001, + "loss": 1.2502, + "loss/crossentropy": 2.3981831073760986, + "loss/hidden": 0.94921875, + "loss/logits": 0.19590550661087036, + "loss/reg": 0.010506249964237213, + "step": 2069 + }, + { + "epoch": 0.25875, + "grad_norm": 4.440552234649658, + "grad_norm_var": 2.3915973151495904, + "learning_rate": 0.0001, + "loss": 1.1416, + "loss/crossentropy": 2.713144540786743, + "loss/hidden": 0.875, + "loss/logits": 0.16163308918476105, + "loss/reg": 0.010500641539692879, + "step": 2070 + }, + { + "epoch": 0.258875, + "grad_norm": 3.0892152786254883, + "grad_norm_var": 2.3953761634768465, + "learning_rate": 0.0001, + "loss": 1.3043, + "loss/crossentropy": 2.198448419570923, + "loss/hidden": 1.015625, + "loss/logits": 0.18372184038162231, + "loss/reg": 0.01049516536295414, + "step": 2071 + }, + { + "epoch": 0.259, + "grad_norm": 3.3777263164520264, + "grad_norm_var": 2.3707175384468635, + "learning_rate": 0.0001, + "loss": 1.3567, + "loss/crossentropy": 2.277832269668579, + "loss/hidden": 1.0390625, + "loss/logits": 0.2126983106136322, + "loss/reg": 0.010489530861377716, + "step": 2072 + }, + { + "epoch": 0.259125, + "grad_norm": 2.8383560180664062, + "grad_norm_var": 2.382894659670538, + "learning_rate": 0.0001, + "loss": 1.2239, + "loss/crossentropy": 2.4116485118865967, + "loss/hidden": 0.94140625, + "loss/logits": 0.17760911583900452, + "loss/reg": 0.010484064929187298, + "step": 2073 + }, + { + "epoch": 0.25925, + "grad_norm": 2.530372381210327, + "grad_norm_var": 2.3388046267718794, + "learning_rate": 0.0001, + "loss": 1.2141, + "loss/crossentropy": 2.6379024982452393, + "loss/hidden": 0.92578125, + "loss/logits": 0.18353202939033508, + "loss/reg": 0.010478835552930832, + "step": 2074 + }, + { + "epoch": 0.259375, + "grad_norm": 4.30379581451416, + "grad_norm_var": 0.562517198204377, + "learning_rate": 0.0001, + "loss": 1.4503, + "loss/crossentropy": 2.8567347526550293, + "loss/hidden": 1.15625, + "loss/logits": 0.1892782598733902, + "loss/reg": 0.010473009198904037, + "step": 2075 + }, + { + "epoch": 0.2595, + "grad_norm": 3.061861515045166, + "grad_norm_var": 0.5574035115931859, + "learning_rate": 0.0001, + "loss": 1.1355, + "loss/crossentropy": 2.5755980014801025, + "loss/hidden": 0.875, + "loss/logits": 0.15577895939350128, + "loss/reg": 0.010467196814715862, + "step": 2076 + }, + { + "epoch": 0.259625, + "grad_norm": 3.24180269241333, + "grad_norm_var": 0.5211933196448816, + "learning_rate": 0.0001, + "loss": 1.132, + "loss/crossentropy": 2.3648555278778076, + "loss/hidden": 0.87109375, + "loss/logits": 0.1563071310520172, + "loss/reg": 0.010461376048624516, + "step": 2077 + }, + { + "epoch": 0.25975, + "grad_norm": 2.80733585357666, + "grad_norm_var": 0.5370522697899598, + "learning_rate": 0.0001, + "loss": 1.1601, + "loss/crossentropy": 2.73288893699646, + "loss/hidden": 0.89453125, + "loss/logits": 0.1610286682844162, + "loss/reg": 0.010455910116434097, + "step": 2078 + }, + { + "epoch": 0.259875, + "grad_norm": 3.0824356079101562, + "grad_norm_var": 0.5116226336194678, + "learning_rate": 0.0001, + "loss": 1.094, + "loss/crossentropy": 2.986253261566162, + "loss/hidden": 0.8359375, + "loss/logits": 0.15357401967048645, + "loss/reg": 0.01045006513595581, + "step": 2079 + }, + { + "epoch": 0.26, + "grad_norm": 2.9852945804595947, + "grad_norm_var": 0.47512957294942454, + "learning_rate": 0.0001, + "loss": 1.2167, + "loss/crossentropy": 2.410843849182129, + "loss/hidden": 0.921875, + "loss/logits": 0.1904173642396927, + "loss/reg": 0.010444463230669498, + "step": 2080 + }, + { + "epoch": 0.260125, + "grad_norm": 4.459211826324463, + "grad_norm_var": 0.4569920188555971, + "learning_rate": 0.0001, + "loss": 1.3403, + "loss/crossentropy": 2.5617294311523438, + "loss/hidden": 1.0546875, + "loss/logits": 0.18125054240226746, + "loss/reg": 0.010439139790832996, + "step": 2081 + }, + { + "epoch": 0.26025, + "grad_norm": 2.68888521194458, + "grad_norm_var": 0.37925483208550353, + "learning_rate": 0.0001, + "loss": 1.1682, + "loss/crossentropy": 2.416043281555176, + "loss/hidden": 0.91015625, + "loss/logits": 0.15370242297649384, + "loss/reg": 0.010433608666062355, + "step": 2082 + }, + { + "epoch": 0.260375, + "grad_norm": 2.870277166366577, + "grad_norm_var": 0.37087029432782564, + "learning_rate": 0.0001, + "loss": 1.0909, + "loss/crossentropy": 2.4830379486083984, + "loss/hidden": 0.84765625, + "loss/logits": 0.13893413543701172, + "loss/reg": 0.010428385809063911, + "step": 2083 + }, + { + "epoch": 0.2605, + "grad_norm": 2.501788377761841, + "grad_norm_var": 0.40430507679438177, + "learning_rate": 0.0001, + "loss": 1.0521, + "loss/crossentropy": 2.444650173187256, + "loss/hidden": 0.81640625, + "loss/logits": 0.13147732615470886, + "loss/reg": 0.010423212312161922, + "step": 2084 + }, + { + "epoch": 0.260625, + "grad_norm": 2.904280185699463, + "grad_norm_var": 0.40972126802901104, + "learning_rate": 0.0001, + "loss": 1.1784, + "loss/crossentropy": 2.4828052520751953, + "loss/hidden": 0.91015625, + "loss/logits": 0.1640183925628662, + "loss/reg": 0.010418218560516834, + "step": 2085 + }, + { + "epoch": 0.26075, + "grad_norm": 3.0363059043884277, + "grad_norm_var": 0.3004966806243959, + "learning_rate": 0.0001, + "loss": 0.8733, + "loss/crossentropy": 2.7187061309814453, + "loss/hidden": 0.6640625, + "loss/logits": 0.10506261885166168, + "loss/reg": 0.01041326206177473, + "step": 2086 + }, + { + "epoch": 0.260875, + "grad_norm": 2.588026285171509, + "grad_norm_var": 0.3176641483043492, + "learning_rate": 0.0001, + "loss": 1.1362, + "loss/crossentropy": 2.567997932434082, + "loss/hidden": 0.8828125, + "loss/logits": 0.14933381974697113, + "loss/reg": 0.010407461784780025, + "step": 2087 + }, + { + "epoch": 0.261, + "grad_norm": 2.252103090286255, + "grad_norm_var": 0.3521486370625695, + "learning_rate": 0.0001, + "loss": 1.0071, + "loss/crossentropy": 2.480170488357544, + "loss/hidden": 0.7734375, + "loss/logits": 0.1295960247516632, + "loss/reg": 0.010402267798781395, + "step": 2088 + }, + { + "epoch": 0.261125, + "grad_norm": 2.6625747680664062, + "grad_norm_var": 0.3580912087379744, + "learning_rate": 0.0001, + "loss": 1.054, + "loss/crossentropy": 2.3518621921539307, + "loss/hidden": 0.81640625, + "loss/logits": 0.1336396187543869, + "loss/reg": 0.010397165082395077, + "step": 2089 + }, + { + "epoch": 0.26125, + "grad_norm": 2.207487106323242, + "grad_norm_var": 0.3847616204929198, + "learning_rate": 0.0001, + "loss": 0.9686, + "loss/crossentropy": 2.557934522628784, + "loss/hidden": 0.75, + "loss/logits": 0.1146397665143013, + "loss/reg": 0.01039210706949234, + "step": 2090 + }, + { + "epoch": 0.261375, + "grad_norm": 3.29636287689209, + "grad_norm_var": 0.27015336290526737, + "learning_rate": 0.0001, + "loss": 1.1815, + "loss/crossentropy": 2.6995208263397217, + "loss/hidden": 0.890625, + "loss/logits": 0.18703284859657288, + "loss/reg": 0.010386320762336254, + "step": 2091 + }, + { + "epoch": 0.2615, + "grad_norm": 2.889937162399292, + "grad_norm_var": 0.2686428376242816, + "learning_rate": 0.0001, + "loss": 1.1666, + "loss/crossentropy": 2.6725549697875977, + "loss/hidden": 0.8984375, + "loss/logits": 0.16434305906295776, + "loss/reg": 0.010380519554018974, + "step": 2092 + }, + { + "epoch": 0.261625, + "grad_norm": 3.2912068367004395, + "grad_norm_var": 0.27101640434824503, + "learning_rate": 0.0001, + "loss": 1.1243, + "loss/crossentropy": 2.643287181854248, + "loss/hidden": 0.83984375, + "loss/logits": 0.18071629106998444, + "loss/reg": 0.010375326499342918, + "step": 2093 + }, + { + "epoch": 0.26175, + "grad_norm": 2.835505247116089, + "grad_norm_var": 0.27068896596296643, + "learning_rate": 0.0001, + "loss": 1.1214, + "loss/crossentropy": 2.316237688064575, + "loss/hidden": 0.8515625, + "loss/logits": 0.16614554822444916, + "loss/reg": 0.0103701027110219, + "step": 2094 + }, + { + "epoch": 0.261875, + "grad_norm": 4.332640647888184, + "grad_norm_var": 0.3972078958413436, + "learning_rate": 0.0001, + "loss": 1.1081, + "loss/crossentropy": 2.437239170074463, + "loss/hidden": 0.859375, + "loss/logits": 0.14509451389312744, + "loss/reg": 0.010364314541220665, + "step": 2095 + }, + { + "epoch": 0.262, + "grad_norm": 2.5772182941436768, + "grad_norm_var": 0.40774220199327993, + "learning_rate": 0.0001, + "loss": 1.0466, + "loss/crossentropy": 2.5926930904388428, + "loss/hidden": 0.8125, + "loss/logits": 0.13053925335407257, + "loss/reg": 0.010358490981161594, + "step": 2096 + }, + { + "epoch": 0.262125, + "grad_norm": 3.8109514713287354, + "grad_norm_var": 0.3046060022410747, + "learning_rate": 0.0001, + "loss": 1.2457, + "loss/crossentropy": 2.1364307403564453, + "loss/hidden": 0.97265625, + "loss/logits": 0.16946956515312195, + "loss/reg": 0.010352512821555138, + "step": 2097 + }, + { + "epoch": 0.26225, + "grad_norm": 2.7484521865844727, + "grad_norm_var": 0.3029795086588024, + "learning_rate": 0.0001, + "loss": 1.202, + "loss/crossentropy": 2.534072160720825, + "loss/hidden": 0.92578125, + "loss/logits": 0.17271853983402252, + "loss/reg": 0.01034675445407629, + "step": 2098 + }, + { + "epoch": 0.262375, + "grad_norm": 2.7413110733032227, + "grad_norm_var": 0.3049655098036091, + "learning_rate": 0.0001, + "loss": 1.0681, + "loss/crossentropy": 2.4099173545837402, + "loss/hidden": 0.828125, + "loss/logits": 0.13661763072013855, + "loss/reg": 0.010340685024857521, + "step": 2099 + }, + { + "epoch": 0.2625, + "grad_norm": 3.3578720092773438, + "grad_norm_var": 0.3033467253309368, + "learning_rate": 0.0001, + "loss": 1.3632, + "loss/crossentropy": 2.1501314640045166, + "loss/hidden": 1.0703125, + "loss/logits": 0.18956157565116882, + "loss/reg": 0.010334583930671215, + "step": 2100 + }, + { + "epoch": 0.262625, + "grad_norm": 4.2792534828186035, + "grad_norm_var": 0.409317608030176, + "learning_rate": 0.0001, + "loss": 1.4494, + "loss/crossentropy": 2.171112060546875, + "loss/hidden": 1.140625, + "loss/logits": 0.20548683404922485, + "loss/reg": 0.010328834876418114, + "step": 2101 + }, + { + "epoch": 0.26275, + "grad_norm": 3.419863700866699, + "grad_norm_var": 0.41746939224728125, + "learning_rate": 0.0001, + "loss": 1.0112, + "loss/crossentropy": 2.3898708820343018, + "loss/hidden": 0.7734375, + "loss/logits": 0.13451659679412842, + "loss/reg": 0.010323096998035908, + "step": 2102 + }, + { + "epoch": 0.262875, + "grad_norm": 2.8384969234466553, + "grad_norm_var": 0.4049378955981638, + "learning_rate": 0.0001, + "loss": 1.2786, + "loss/crossentropy": 2.628030300140381, + "loss/hidden": 0.96875, + "loss/logits": 0.20668946206569672, + "loss/reg": 0.010317344218492508, + "step": 2103 + }, + { + "epoch": 0.263, + "grad_norm": 2.6042320728302, + "grad_norm_var": 0.3730507967085714, + "learning_rate": 0.0001, + "loss": 1.0236, + "loss/crossentropy": 2.5990564823150635, + "loss/hidden": 0.7734375, + "loss/logits": 0.14705008268356323, + "loss/reg": 0.010311637073755264, + "step": 2104 + }, + { + "epoch": 0.263125, + "grad_norm": 3.4519386291503906, + "grad_norm_var": 0.36402612721446986, + "learning_rate": 0.0001, + "loss": 1.1696, + "loss/crossentropy": 2.5916223526000977, + "loss/hidden": 0.91015625, + "loss/logits": 0.15633687376976013, + "loss/reg": 0.010306085459887981, + "step": 2105 + }, + { + "epoch": 0.26325, + "grad_norm": 3.552064895629883, + "grad_norm_var": 0.3048803620922399, + "learning_rate": 0.0001, + "loss": 1.2476, + "loss/crossentropy": 2.4516284465789795, + "loss/hidden": 0.9609375, + "loss/logits": 0.18366728723049164, + "loss/reg": 0.010300572961568832, + "step": 2106 + }, + { + "epoch": 0.263375, + "grad_norm": 3.1537628173828125, + "grad_norm_var": 0.3053022228908541, + "learning_rate": 0.0001, + "loss": 1.1179, + "loss/crossentropy": 2.616114377975464, + "loss/hidden": 0.859375, + "loss/logits": 0.15561805665493011, + "loss/reg": 0.010294746607542038, + "step": 2107 + }, + { + "epoch": 0.2635, + "grad_norm": 2.4106662273406982, + "grad_norm_var": 0.3422070628643458, + "learning_rate": 0.0001, + "loss": 1.0311, + "loss/crossentropy": 2.3661184310913086, + "loss/hidden": 0.78515625, + "loss/logits": 0.14306451380252838, + "loss/reg": 0.010288943536579609, + "step": 2108 + }, + { + "epoch": 0.263625, + "grad_norm": 2.5815038681030273, + "grad_norm_var": 0.36627131268263136, + "learning_rate": 0.0001, + "loss": 1.0964, + "loss/crossentropy": 2.4047305583953857, + "loss/hidden": 0.8359375, + "loss/logits": 0.1576739251613617, + "loss/reg": 0.010283387266099453, + "step": 2109 + }, + { + "epoch": 0.26375, + "grad_norm": 3.790802240371704, + "grad_norm_var": 0.3808959372849169, + "learning_rate": 0.0001, + "loss": 1.0866, + "loss/crossentropy": 2.565654754638672, + "loss/hidden": 0.8203125, + "loss/logits": 0.1634831726551056, + "loss/reg": 0.010277556255459785, + "step": 2110 + }, + { + "epoch": 0.263875, + "grad_norm": 3.265228033065796, + "grad_norm_var": 0.2949191849167124, + "learning_rate": 0.0001, + "loss": 1.3535, + "loss/crossentropy": 1.9508463144302368, + "loss/hidden": 1.0625, + "loss/logits": 0.1883111149072647, + "loss/reg": 0.010271648876369, + "step": 2111 + }, + { + "epoch": 0.264, + "grad_norm": 3.213825225830078, + "grad_norm_var": 0.27065611473443124, + "learning_rate": 0.0001, + "loss": 1.2105, + "loss/crossentropy": 2.624788761138916, + "loss/hidden": 0.9375, + "loss/logits": 0.1703457236289978, + "loss/reg": 0.010265778750181198, + "step": 2112 + }, + { + "epoch": 0.264125, + "grad_norm": 3.240231990814209, + "grad_norm_var": 0.24461892856342024, + "learning_rate": 0.0001, + "loss": 1.2478, + "loss/crossentropy": 2.359703302383423, + "loss/hidden": 0.9296875, + "loss/logits": 0.2155158668756485, + "loss/reg": 0.010260026901960373, + "step": 2113 + }, + { + "epoch": 0.26425, + "grad_norm": 3.8198609352111816, + "grad_norm_var": 0.25677312532707763, + "learning_rate": 0.0001, + "loss": 1.4105, + "loss/crossentropy": 2.3813629150390625, + "loss/hidden": 1.09375, + "loss/logits": 0.21416184306144714, + "loss/reg": 0.01025412417948246, + "step": 2114 + }, + { + "epoch": 0.264375, + "grad_norm": 2.8523075580596924, + "grad_norm_var": 0.2502729276193037, + "learning_rate": 0.0001, + "loss": 1.159, + "loss/crossentropy": 2.517249345779419, + "loss/hidden": 0.88671875, + "loss/logits": 0.16980549693107605, + "loss/reg": 0.010248334147036076, + "step": 2115 + }, + { + "epoch": 0.2645, + "grad_norm": 2.7850544452667236, + "grad_norm_var": 0.261739256387839, + "learning_rate": 0.0001, + "loss": 1.2115, + "loss/crossentropy": 2.249230146408081, + "loss/hidden": 0.91796875, + "loss/logits": 0.191123366355896, + "loss/reg": 0.010242954827845097, + "step": 2116 + }, + { + "epoch": 0.264625, + "grad_norm": 8.806661605834961, + "grad_norm_var": 2.1920949210864893, + "learning_rate": 0.0001, + "loss": 1.3274, + "loss/crossentropy": 2.445176362991333, + "loss/hidden": 1.046875, + "loss/logits": 0.17813825607299805, + "loss/reg": 0.010237051174044609, + "step": 2117 + }, + { + "epoch": 0.26475, + "grad_norm": 2.5015766620635986, + "grad_norm_var": 2.2529760871735096, + "learning_rate": 0.0001, + "loss": 1.0075, + "loss/crossentropy": 2.423354148864746, + "loss/hidden": 0.78125, + "loss/logits": 0.12395646423101425, + "loss/reg": 0.010231039486825466, + "step": 2118 + }, + { + "epoch": 0.264875, + "grad_norm": 2.3766047954559326, + "grad_norm_var": 2.3026928252546432, + "learning_rate": 0.0001, + "loss": 1.0822, + "loss/crossentropy": 2.1317436695098877, + "loss/hidden": 0.8359375, + "loss/logits": 0.14402800798416138, + "loss/reg": 0.010225295089185238, + "step": 2119 + }, + { + "epoch": 0.265, + "grad_norm": 3.0615155696868896, + "grad_norm_var": 2.2672191242513082, + "learning_rate": 0.0001, + "loss": 1.1377, + "loss/crossentropy": 2.498379707336426, + "loss/hidden": 0.8828125, + "loss/logits": 0.15265986323356628, + "loss/reg": 0.010219316929578781, + "step": 2120 + }, + { + "epoch": 0.265125, + "grad_norm": 4.3087568283081055, + "grad_norm_var": 2.3157260948528307, + "learning_rate": 0.0001, + "loss": 1.3121, + "loss/crossentropy": 2.7038280963897705, + "loss/hidden": 1.0078125, + "loss/logits": 0.20214135944843292, + "loss/reg": 0.01021356601268053, + "step": 2121 + }, + { + "epoch": 0.26525, + "grad_norm": 2.2284393310546875, + "grad_norm_var": 2.4129527581361763, + "learning_rate": 0.0001, + "loss": 0.9797, + "loss/crossentropy": 2.5127954483032227, + "loss/hidden": 0.74609375, + "loss/logits": 0.13156917691230774, + "loss/reg": 0.010207895189523697, + "step": 2122 + }, + { + "epoch": 0.265375, + "grad_norm": 3.118272066116333, + "grad_norm_var": 2.414195754766751, + "learning_rate": 0.0001, + "loss": 1.1355, + "loss/crossentropy": 2.2942655086517334, + "loss/hidden": 0.89453125, + "loss/logits": 0.1389252245426178, + "loss/reg": 0.010201762430369854, + "step": 2123 + }, + { + "epoch": 0.2655, + "grad_norm": 4.872684478759766, + "grad_norm_var": 2.469067763743077, + "learning_rate": 0.0001, + "loss": 1.2681, + "loss/crossentropy": 2.6232783794403076, + "loss/hidden": 0.96875, + "loss/logits": 0.19743511080741882, + "loss/reg": 0.010195438750088215, + "step": 2124 + }, + { + "epoch": 0.265625, + "grad_norm": 2.6287519931793213, + "grad_norm_var": 2.4630968202050685, + "learning_rate": 0.0001, + "loss": 1.1772, + "loss/crossentropy": 2.4108433723449707, + "loss/hidden": 0.8984375, + "loss/logits": 0.17682504653930664, + "loss/reg": 0.010189146734774113, + "step": 2125 + }, + { + "epoch": 0.26575, + "grad_norm": 3.1955409049987793, + "grad_norm_var": 2.4664808706206105, + "learning_rate": 0.0001, + "loss": 1.0298, + "loss/crossentropy": 2.5196099281311035, + "loss/hidden": 0.7890625, + "loss/logits": 0.13892781734466553, + "loss/reg": 0.010182846337556839, + "step": 2126 + }, + { + "epoch": 0.265875, + "grad_norm": 2.9813921451568604, + "grad_norm_var": 2.481052137168754, + "learning_rate": 0.0001, + "loss": 1.1784, + "loss/crossentropy": 2.384112596511841, + "loss/hidden": 0.9296875, + "loss/logits": 0.1469021588563919, + "loss/reg": 0.010176514275372028, + "step": 2127 + }, + { + "epoch": 0.266, + "grad_norm": 2.5263257026672363, + "grad_norm_var": 2.5367769489401892, + "learning_rate": 0.0001, + "loss": 1.0448, + "loss/crossentropy": 2.6461563110351562, + "loss/hidden": 0.796875, + "loss/logits": 0.14621557295322418, + "loss/reg": 0.010170318186283112, + "step": 2128 + }, + { + "epoch": 0.266125, + "grad_norm": 4.333083629608154, + "grad_norm_var": 2.5799092725466193, + "learning_rate": 0.0001, + "loss": 1.058, + "loss/crossentropy": 2.575655698776245, + "loss/hidden": 0.84375, + "loss/logits": 0.1125684529542923, + "loss/reg": 0.010164611041545868, + "step": 2129 + }, + { + "epoch": 0.26625, + "grad_norm": 2.7422056198120117, + "grad_norm_var": 2.6100968096655595, + "learning_rate": 0.0001, + "loss": 1.056, + "loss/crossentropy": 2.628509521484375, + "loss/hidden": 0.80859375, + "loss/logits": 0.14578229188919067, + "loss/reg": 0.010158922523260117, + "step": 2130 + }, + { + "epoch": 0.266375, + "grad_norm": 3.169044017791748, + "grad_norm_var": 2.5908109141428777, + "learning_rate": 0.0001, + "loss": 1.1435, + "loss/crossentropy": 2.6887242794036865, + "loss/hidden": 0.8828125, + "loss/logits": 0.15920519828796387, + "loss/reg": 0.010152962058782578, + "step": 2131 + }, + { + "epoch": 0.2665, + "grad_norm": 3.110137462615967, + "grad_norm_var": 2.5674132914151944, + "learning_rate": 0.0001, + "loss": 0.9299, + "loss/crossentropy": 2.4567489624023438, + "loss/hidden": 0.71875, + "loss/logits": 0.1097012311220169, + "loss/reg": 0.010147429071366787, + "step": 2132 + }, + { + "epoch": 0.266625, + "grad_norm": 3.8575761318206787, + "grad_norm_var": 0.5948953990241305, + "learning_rate": 0.0001, + "loss": 1.2373, + "loss/crossentropy": 2.6997244358062744, + "loss/hidden": 0.94921875, + "loss/logits": 0.18663081526756287, + "loss/reg": 0.010141723789274693, + "step": 2133 + }, + { + "epoch": 0.26675, + "grad_norm": 2.6897120475769043, + "grad_norm_var": 0.5798827199153114, + "learning_rate": 0.0001, + "loss": 1.1483, + "loss/crossentropy": 2.0659902095794678, + "loss/hidden": 0.89453125, + "loss/logits": 0.15243153274059296, + "loss/reg": 0.010135877877473831, + "step": 2134 + }, + { + "epoch": 0.266875, + "grad_norm": 3.7331113815307617, + "grad_norm_var": 0.5459636502716911, + "learning_rate": 0.0001, + "loss": 1.5496, + "loss/crossentropy": 2.4835729598999023, + "loss/hidden": 1.2109375, + "loss/logits": 0.23733875155448914, + "loss/reg": 0.010130085982382298, + "step": 2135 + }, + { + "epoch": 0.267, + "grad_norm": 2.9091737270355225, + "grad_norm_var": 0.5519492425382242, + "learning_rate": 0.0001, + "loss": 1.087, + "loss/crossentropy": 2.6955347061157227, + "loss/hidden": 0.83203125, + "loss/logits": 0.15369224548339844, + "loss/reg": 0.01012417208403349, + "step": 2136 + }, + { + "epoch": 0.267125, + "grad_norm": 2.3608083724975586, + "grad_norm_var": 0.5206799887997702, + "learning_rate": 0.0001, + "loss": 1.0926, + "loss/crossentropy": 2.638737678527832, + "loss/hidden": 0.82421875, + "loss/logits": 0.1672324538230896, + "loss/reg": 0.010118553414940834, + "step": 2137 + }, + { + "epoch": 0.26725, + "grad_norm": 16.352521896362305, + "grad_norm_var": 11.246671836879353, + "learning_rate": 0.0001, + "loss": 1.5778, + "loss/crossentropy": 2.2380735874176025, + "loss/hidden": 1.1875, + "loss/logits": 0.28916794061660767, + "loss/reg": 0.010112768970429897, + "step": 2138 + }, + { + "epoch": 0.267375, + "grad_norm": 2.879335403442383, + "grad_norm_var": 11.27948583207982, + "learning_rate": 0.0001, + "loss": 1.0294, + "loss/crossentropy": 2.3748319149017334, + "loss/hidden": 0.80078125, + "loss/logits": 0.12752759456634521, + "loss/reg": 0.01010711956769228, + "step": 2139 + }, + { + "epoch": 0.2675, + "grad_norm": 3.0504684448242188, + "grad_norm_var": 11.280170279339817, + "learning_rate": 0.0001, + "loss": 1.1666, + "loss/crossentropy": 2.357994318008423, + "loss/hidden": 0.90625, + "loss/logits": 0.15932440757751465, + "loss/reg": 0.010101627558469772, + "step": 2140 + }, + { + "epoch": 0.267625, + "grad_norm": 3.313279867172241, + "grad_norm_var": 11.19274923633117, + "learning_rate": 0.0001, + "loss": 1.0976, + "loss/crossentropy": 2.3743796348571777, + "loss/hidden": 0.859375, + "loss/logits": 0.1372767686843872, + "loss/reg": 0.010095990262925625, + "step": 2141 + }, + { + "epoch": 0.26775, + "grad_norm": 3.429500102996826, + "grad_norm_var": 11.172628027263166, + "learning_rate": 0.0001, + "loss": 1.2376, + "loss/crossentropy": 2.158029079437256, + "loss/hidden": 0.9609375, + "loss/logits": 0.17577707767486572, + "loss/reg": 0.010090584866702557, + "step": 2142 + }, + { + "epoch": 0.267875, + "grad_norm": 2.7998569011688232, + "grad_norm_var": 11.19849213401565, + "learning_rate": 0.0001, + "loss": 1.2822, + "loss/crossentropy": 2.461344003677368, + "loss/hidden": 0.98828125, + "loss/logits": 0.19309166073799133, + "loss/reg": 0.010085242800414562, + "step": 2143 + }, + { + "epoch": 0.268, + "grad_norm": 3.6905696392059326, + "grad_norm_var": 11.061663395731365, + "learning_rate": 0.0001, + "loss": 1.1683, + "loss/crossentropy": 2.435070037841797, + "loss/hidden": 0.88671875, + "loss/logits": 0.18073806166648865, + "loss/reg": 0.010079882107675076, + "step": 2144 + }, + { + "epoch": 0.268125, + "grad_norm": 3.5029520988464355, + "grad_norm_var": 11.070774317930445, + "learning_rate": 0.0001, + "loss": 1.0589, + "loss/crossentropy": 2.6161611080169678, + "loss/hidden": 0.80859375, + "loss/logits": 0.1496104747056961, + "loss/reg": 0.010074068792164326, + "step": 2145 + }, + { + "epoch": 0.26825, + "grad_norm": 3.993612051010132, + "grad_norm_var": 10.96305518196281, + "learning_rate": 0.0001, + "loss": 1.0577, + "loss/crossentropy": 2.4619667530059814, + "loss/hidden": 0.83203125, + "loss/logits": 0.12501287460327148, + "loss/reg": 0.010068584233522415, + "step": 2146 + }, + { + "epoch": 0.268375, + "grad_norm": 2.4622223377227783, + "grad_norm_var": 11.077549207999636, + "learning_rate": 0.0001, + "loss": 1.0911, + "loss/crossentropy": 2.5504589080810547, + "loss/hidden": 0.84375, + "loss/logits": 0.14670145511627197, + "loss/reg": 0.010062742047011852, + "step": 2147 + }, + { + "epoch": 0.2685, + "grad_norm": 4.003227233886719, + "grad_norm_var": 11.020432655068516, + "learning_rate": 0.0001, + "loss": 1.1054, + "loss/crossentropy": 2.41068959236145, + "loss/hidden": 0.87109375, + "loss/logits": 0.13369348645210266, + "loss/reg": 0.010056855157017708, + "step": 2148 + }, + { + "epoch": 0.268625, + "grad_norm": 2.8384718894958496, + "grad_norm_var": 11.113425843681208, + "learning_rate": 0.0001, + "loss": 1.1193, + "loss/crossentropy": 2.443211078643799, + "loss/hidden": 0.87109375, + "loss/logits": 0.14774015545845032, + "loss/reg": 0.010050834156572819, + "step": 2149 + }, + { + "epoch": 0.26875, + "grad_norm": 2.593756675720215, + "grad_norm_var": 11.130772252730903, + "learning_rate": 0.0001, + "loss": 1.1712, + "loss/crossentropy": 2.3977110385894775, + "loss/hidden": 0.921875, + "loss/logits": 0.1488749086856842, + "loss/reg": 0.010044893249869347, + "step": 2150 + }, + { + "epoch": 0.268875, + "grad_norm": 2.3382418155670166, + "grad_norm_var": 11.300999898854352, + "learning_rate": 0.0001, + "loss": 1.0507, + "loss/crossentropy": 2.5140347480773926, + "loss/hidden": 0.82421875, + "loss/logits": 0.12612497806549072, + "loss/reg": 0.010039133951067924, + "step": 2151 + }, + { + "epoch": 0.269, + "grad_norm": 3.880267858505249, + "grad_norm_var": 11.230692579842113, + "learning_rate": 0.0001, + "loss": 1.5146, + "loss/crossentropy": 2.5326569080352783, + "loss/hidden": 1.15625, + "loss/logits": 0.25804269313812256, + "loss/reg": 0.010033266618847847, + "step": 2152 + }, + { + "epoch": 0.269125, + "grad_norm": 3.2311160564422607, + "grad_norm_var": 11.091524209255498, + "learning_rate": 0.0001, + "loss": 1.7227, + "loss/crossentropy": 1.8929760456085205, + "loss/hidden": 1.3203125, + "loss/logits": 0.3020824193954468, + "loss/reg": 0.01002716924995184, + "step": 2153 + }, + { + "epoch": 0.26925, + "grad_norm": 2.860647678375244, + "grad_norm_var": 0.2876930460319841, + "learning_rate": 0.0001, + "loss": 1.1474, + "loss/crossentropy": 2.279733896255493, + "loss/hidden": 0.89453125, + "loss/logits": 0.15265312790870667, + "loss/reg": 0.010021227411925793, + "step": 2154 + }, + { + "epoch": 0.269375, + "grad_norm": 4.402594089508057, + "grad_norm_var": 0.3718058743510625, + "learning_rate": 0.0001, + "loss": 1.3112, + "loss/crossentropy": 2.4982807636260986, + "loss/hidden": 0.99609375, + "loss/logits": 0.21492905914783478, + "loss/reg": 0.010015525855123997, + "step": 2155 + }, + { + "epoch": 0.2695, + "grad_norm": 11.074371337890625, + "grad_norm_var": 4.156144743562905, + "learning_rate": 0.0001, + "loss": 1.4799, + "loss/crossentropy": 2.526057004928589, + "loss/hidden": 1.1953125, + "loss/logits": 0.18447992205619812, + "loss/reg": 0.010009690187871456, + "step": 2156 + }, + { + "epoch": 0.269625, + "grad_norm": 2.971534490585327, + "grad_norm_var": 4.184524703810582, + "learning_rate": 0.0001, + "loss": 1.1036, + "loss/crossentropy": 2.3096463680267334, + "loss/hidden": 0.86328125, + "loss/logits": 0.14024464786052704, + "loss/reg": 0.0100040128454566, + "step": 2157 + }, + { + "epoch": 0.26975, + "grad_norm": 14.015320777893066, + "grad_norm_var": 10.72944779198163, + "learning_rate": 0.0001, + "loss": 2.1587, + "loss/crossentropy": 2.4940409660339355, + "loss/hidden": 1.6796875, + "loss/logits": 0.37902015447616577, + "loss/reg": 0.009998547844588757, + "step": 2158 + }, + { + "epoch": 0.269875, + "grad_norm": 3.77718448638916, + "grad_norm_var": 10.578523200834447, + "learning_rate": 0.0001, + "loss": 1.0648, + "loss/crossentropy": 2.706181526184082, + "loss/hidden": 0.8046875, + "loss/logits": 0.16015514731407166, + "loss/reg": 0.009993069805204868, + "step": 2159 + }, + { + "epoch": 0.27, + "grad_norm": 3.687567710876465, + "grad_norm_var": 10.578838640730357, + "learning_rate": 0.0001, + "loss": 1.3443, + "loss/crossentropy": 2.6082820892333984, + "loss/hidden": 1.015625, + "loss/logits": 0.22882798314094543, + "loss/reg": 0.009987478144466877, + "step": 2160 + }, + { + "epoch": 0.270125, + "grad_norm": 3.5592525005340576, + "grad_norm_var": 10.571724333369337, + "learning_rate": 0.0001, + "loss": 1.2565, + "loss/crossentropy": 2.419528007507324, + "loss/hidden": 0.98828125, + "loss/logits": 0.16835004091262817, + "loss/reg": 0.009982030838727951, + "step": 2161 + }, + { + "epoch": 0.27025, + "grad_norm": 3.6662449836730957, + "grad_norm_var": 10.599678341413187, + "learning_rate": 0.0001, + "loss": 1.412, + "loss/crossentropy": 2.7670695781707764, + "loss/hidden": 1.078125, + "loss/logits": 0.23413586616516113, + "loss/reg": 0.009976562112569809, + "step": 2162 + }, + { + "epoch": 0.270375, + "grad_norm": 3.237701654434204, + "grad_norm_var": 10.430686084884783, + "learning_rate": 0.0001, + "loss": 1.4157, + "loss/crossentropy": 2.5717592239379883, + "loss/hidden": 1.078125, + "loss/logits": 0.23786383867263794, + "loss/reg": 0.009971106424927711, + "step": 2163 + }, + { + "epoch": 0.2705, + "grad_norm": 16.27867889404297, + "grad_norm_var": 19.02145858413533, + "learning_rate": 0.0001, + "loss": 1.5732, + "loss/crossentropy": 2.3264029026031494, + "loss/hidden": 1.265625, + "loss/logits": 0.2078859508037567, + "loss/reg": 0.009965726174414158, + "step": 2164 + }, + { + "epoch": 0.270625, + "grad_norm": 45.029998779296875, + "grad_norm_var": 116.56793438744144, + "learning_rate": 0.0001, + "loss": 1.3735, + "loss/crossentropy": 2.3806426525115967, + "loss/hidden": 1.078125, + "loss/logits": 0.19577783346176147, + "loss/reg": 0.009960070252418518, + "step": 2165 + }, + { + "epoch": 0.27075, + "grad_norm": 4.214069843292236, + "grad_norm_var": 115.58289167538311, + "learning_rate": 0.0001, + "loss": 1.1063, + "loss/crossentropy": 3.007105588912964, + "loss/hidden": 0.859375, + "loss/logits": 0.1473969966173172, + "loss/reg": 0.00995447114109993, + "step": 2166 + }, + { + "epoch": 0.270875, + "grad_norm": 4.425316333770752, + "grad_norm_var": 114.27568968306282, + "learning_rate": 0.0001, + "loss": 1.3143, + "loss/crossentropy": 2.483200788497925, + "loss/hidden": 1.046875, + "loss/logits": 0.16794663667678833, + "loss/reg": 0.009949086233973503, + "step": 2167 + }, + { + "epoch": 0.271, + "grad_norm": 2.866948366165161, + "grad_norm_var": 114.91600186175832, + "learning_rate": 0.0001, + "loss": 1.208, + "loss/crossentropy": 2.5307364463806152, + "loss/hidden": 0.94921875, + "loss/logits": 0.15934261679649353, + "loss/reg": 0.009943562559783459, + "step": 2168 + }, + { + "epoch": 0.271125, + "grad_norm": 5.033145904541016, + "grad_norm_var": 113.95363582210308, + "learning_rate": 0.0001, + "loss": 1.3658, + "loss/crossentropy": 2.376593589782715, + "loss/hidden": 1.0703125, + "loss/logits": 0.19611963629722595, + "loss/reg": 0.009937995113432407, + "step": 2169 + }, + { + "epoch": 0.27125, + "grad_norm": 2.8074445724487305, + "grad_norm_var": 113.99164466997617, + "learning_rate": 0.0001, + "loss": 1.199, + "loss/crossentropy": 2.34513783454895, + "loss/hidden": 0.92578125, + "loss/logits": 0.1739283800125122, + "loss/reg": 0.009932412765920162, + "step": 2170 + }, + { + "epoch": 0.271375, + "grad_norm": 2.721811532974243, + "grad_norm_var": 115.01708644455293, + "learning_rate": 0.0001, + "loss": 1.2708, + "loss/crossentropy": 2.452144145965576, + "loss/hidden": 0.984375, + "loss/logits": 0.18710508942604065, + "loss/reg": 0.009927044622600079, + "step": 2171 + }, + { + "epoch": 0.2715, + "grad_norm": 3.104896068572998, + "grad_norm_var": 115.81056162345506, + "learning_rate": 0.0001, + "loss": 1.0945, + "loss/crossentropy": 2.0558090209960938, + "loss/hidden": 0.85546875, + "loss/logits": 0.13982000946998596, + "loss/reg": 0.009921545162796974, + "step": 2172 + }, + { + "epoch": 0.271625, + "grad_norm": 3.726212978363037, + "grad_norm_var": 115.3816999987786, + "learning_rate": 0.0001, + "loss": 1.3773, + "loss/crossentropy": 2.3786098957061768, + "loss/hidden": 1.0703125, + "loss/logits": 0.20779243111610413, + "loss/reg": 0.009916161186993122, + "step": 2173 + }, + { + "epoch": 0.27175, + "grad_norm": 2.967250347137451, + "grad_norm_var": 113.61098811113767, + "learning_rate": 0.0001, + "loss": 1.0372, + "loss/crossentropy": 2.4904751777648926, + "loss/hidden": 0.8046875, + "loss/logits": 0.1333538293838501, + "loss/reg": 0.009910937398672104, + "step": 2174 + }, + { + "epoch": 0.271875, + "grad_norm": 2.885894775390625, + "grad_norm_var": 114.03697591377224, + "learning_rate": 0.0001, + "loss": 1.1075, + "loss/crossentropy": 2.7325031757354736, + "loss/hidden": 0.86328125, + "loss/logits": 0.14511612057685852, + "loss/reg": 0.009905933402478695, + "step": 2175 + }, + { + "epoch": 0.272, + "grad_norm": 2.9962995052337646, + "grad_norm_var": 114.36184814133671, + "learning_rate": 0.0001, + "loss": 1.1445, + "loss/crossentropy": 2.445965051651001, + "loss/hidden": 0.90234375, + "loss/logits": 0.14316099882125854, + "loss/reg": 0.009900378063321114, + "step": 2176 + }, + { + "epoch": 0.272125, + "grad_norm": 2.7793033123016357, + "grad_norm_var": 114.74157125194176, + "learning_rate": 0.0001, + "loss": 1.1314, + "loss/crossentropy": 2.7145345211029053, + "loss/hidden": 0.875, + "loss/logits": 0.1574961245059967, + "loss/reg": 0.009895196184515953, + "step": 2177 + }, + { + "epoch": 0.27225, + "grad_norm": 3.2374234199523926, + "grad_norm_var": 114.93203041920228, + "learning_rate": 0.0001, + "loss": 1.1662, + "loss/crossentropy": 2.6908035278320312, + "loss/hidden": 0.90625, + "loss/logits": 0.16106431186199188, + "loss/reg": 0.009889421984553337, + "step": 2178 + }, + { + "epoch": 0.272375, + "grad_norm": 3.201021671295166, + "grad_norm_var": 114.94938746965903, + "learning_rate": 0.0001, + "loss": 1.1241, + "loss/crossentropy": 2.5376927852630615, + "loss/hidden": 0.8515625, + "loss/logits": 0.17366735637187958, + "loss/reg": 0.009883632883429527, + "step": 2179 + }, + { + "epoch": 0.2725, + "grad_norm": 3.1757233142852783, + "grad_norm_var": 109.06277776060648, + "learning_rate": 0.0001, + "loss": 1.1366, + "loss/crossentropy": 2.5199360847473145, + "loss/hidden": 0.90234375, + "loss/logits": 0.13546113669872284, + "loss/reg": 0.009877925738692284, + "step": 2180 + }, + { + "epoch": 0.272625, + "grad_norm": 5.264944553375244, + "grad_norm_var": 0.6800363519314504, + "learning_rate": 0.0001, + "loss": 1.3355, + "loss/crossentropy": 2.3270413875579834, + "loss/hidden": 1.0546875, + "loss/logits": 0.18204890191555023, + "loss/reg": 0.009872402995824814, + "step": 2181 + }, + { + "epoch": 0.27275, + "grad_norm": 5.18062162399292, + "grad_norm_var": 0.8352206651070369, + "learning_rate": 0.0001, + "loss": 1.2897, + "loss/crossentropy": 2.2362985610961914, + "loss/hidden": 1.0390625, + "loss/logits": 0.15199674665927887, + "loss/reg": 0.009866653010249138, + "step": 2182 + }, + { + "epoch": 0.272875, + "grad_norm": 4.996032238006592, + "grad_norm_var": 0.9242103621291087, + "learning_rate": 0.0001, + "loss": 1.42, + "loss/crossentropy": 2.92541241645813, + "loss/hidden": 1.1484375, + "loss/logits": 0.17291401326656342, + "loss/reg": 0.009860844351351261, + "step": 2183 + }, + { + "epoch": 0.273, + "grad_norm": 2.9425084590911865, + "grad_norm_var": 0.9175943835932039, + "learning_rate": 0.0001, + "loss": 1.0082, + "loss/crossentropy": 2.8947973251342773, + "loss/hidden": 0.78125, + "loss/logits": 0.12839564681053162, + "loss/reg": 0.009855199605226517, + "step": 2184 + }, + { + "epoch": 0.273125, + "grad_norm": 2.51678466796875, + "grad_norm_var": 0.820356084884228, + "learning_rate": 0.0001, + "loss": 1.1101, + "loss/crossentropy": 2.5720982551574707, + "loss/hidden": 0.84765625, + "loss/logits": 0.16394191980361938, + "loss/reg": 0.009849678725004196, + "step": 2185 + }, + { + "epoch": 0.27325, + "grad_norm": 4.033634662628174, + "grad_norm_var": 0.8163849231283145, + "learning_rate": 0.0001, + "loss": 1.2044, + "loss/crossentropy": 2.419597864151001, + "loss/hidden": 0.90234375, + "loss/logits": 0.20366601645946503, + "loss/reg": 0.009843993000686169, + "step": 2186 + }, + { + "epoch": 0.273375, + "grad_norm": 2.59316086769104, + "grad_norm_var": 0.8304788807443456, + "learning_rate": 0.0001, + "loss": 1.1844, + "loss/crossentropy": 2.229067802429199, + "loss/hidden": 0.92578125, + "loss/logits": 0.16026070713996887, + "loss/reg": 0.009838265366852283, + "step": 2187 + }, + { + "epoch": 0.2735, + "grad_norm": 3.500840425491333, + "grad_norm_var": 0.8207327345143501, + "learning_rate": 0.0001, + "loss": 1.2558, + "loss/crossentropy": 2.5278289318084717, + "loss/hidden": 0.9609375, + "loss/logits": 0.196561798453331, + "loss/reg": 0.009832593612372875, + "step": 2188 + }, + { + "epoch": 0.273625, + "grad_norm": 2.86061954498291, + "grad_norm_var": 0.8414362861989796, + "learning_rate": 0.0001, + "loss": 1.2531, + "loss/crossentropy": 2.2744202613830566, + "loss/hidden": 0.98046875, + "loss/logits": 0.1744043231010437, + "loss/reg": 0.009827111847698689, + "step": 2189 + }, + { + "epoch": 0.27375, + "grad_norm": 3.6910080909729004, + "grad_norm_var": 0.8279992728085735, + "learning_rate": 0.0001, + "loss": 1.2958, + "loss/crossentropy": 2.42425799369812, + "loss/hidden": 1.0078125, + "loss/logits": 0.18975931406021118, + "loss/reg": 0.009821229614317417, + "step": 2190 + }, + { + "epoch": 0.273875, + "grad_norm": 2.8260607719421387, + "grad_norm_var": 0.8330503894498368, + "learning_rate": 0.0001, + "loss": 1.2422, + "loss/crossentropy": 2.42783260345459, + "loss/hidden": 0.94921875, + "loss/logits": 0.19478347897529602, + "loss/reg": 0.009815327823162079, + "step": 2191 + }, + { + "epoch": 0.274, + "grad_norm": 3.3954732418060303, + "grad_norm_var": 0.8168792226328644, + "learning_rate": 0.0001, + "loss": 1.2222, + "loss/crossentropy": 2.5824122428894043, + "loss/hidden": 0.96484375, + "loss/logits": 0.15926843881607056, + "loss/reg": 0.009809765964746475, + "step": 2192 + }, + { + "epoch": 0.274125, + "grad_norm": 2.861605644226074, + "grad_norm_var": 0.8092600565605511, + "learning_rate": 0.0001, + "loss": 1.2175, + "loss/crossentropy": 2.2833588123321533, + "loss/hidden": 0.9296875, + "loss/logits": 0.1897801160812378, + "loss/reg": 0.00980426650494337, + "step": 2193 + }, + { + "epoch": 0.27425, + "grad_norm": 3.0484890937805176, + "grad_norm_var": 0.8185425510343727, + "learning_rate": 0.0001, + "loss": 1.0603, + "loss/crossentropy": 2.440812349319458, + "loss/hidden": 0.81640625, + "loss/logits": 0.1459067314863205, + "loss/reg": 0.009798455983400345, + "step": 2194 + }, + { + "epoch": 0.274375, + "grad_norm": 4.596663951873779, + "grad_norm_var": 0.8836159421560142, + "learning_rate": 0.0001, + "loss": 1.758, + "loss/crossentropy": 2.2382004261016846, + "loss/hidden": 1.3515625, + "loss/logits": 0.3084772229194641, + "loss/reg": 0.009792608208954334, + "step": 2195 + }, + { + "epoch": 0.2745, + "grad_norm": 3.210632085800171, + "grad_norm_var": 0.8817510043573423, + "learning_rate": 0.0001, + "loss": 1.1809, + "loss/crossentropy": 2.5081589221954346, + "loss/hidden": 0.91015625, + "loss/logits": 0.17289654910564423, + "loss/reg": 0.009786740876734257, + "step": 2196 + }, + { + "epoch": 0.274625, + "grad_norm": 3.0279722213745117, + "grad_norm_var": 0.6964040437792636, + "learning_rate": 0.0001, + "loss": 1.0846, + "loss/crossentropy": 2.3824667930603027, + "loss/hidden": 0.83203125, + "loss/logits": 0.1547224521636963, + "loss/reg": 0.009780737571418285, + "step": 2197 + }, + { + "epoch": 0.27475, + "grad_norm": 3.2791800498962402, + "grad_norm_var": 0.4849157834275405, + "learning_rate": 0.0001, + "loss": 1.0237, + "loss/crossentropy": 2.7365756034851074, + "loss/hidden": 0.796875, + "loss/logits": 0.12910836935043335, + "loss/reg": 0.009775198064744473, + "step": 2198 + }, + { + "epoch": 0.274875, + "grad_norm": 4.019720554351807, + "grad_norm_var": 0.3284332614057763, + "learning_rate": 0.0001, + "loss": 1.3273, + "loss/crossentropy": 2.228800058364868, + "loss/hidden": 1.0390625, + "loss/logits": 0.19058409333229065, + "loss/reg": 0.009769170545041561, + "step": 2199 + }, + { + "epoch": 0.275, + "grad_norm": 2.937382698059082, + "grad_norm_var": 0.3286623257800547, + "learning_rate": 0.0001, + "loss": 1.1792, + "loss/crossentropy": 2.404200792312622, + "loss/hidden": 0.91796875, + "loss/logits": 0.16355863213539124, + "loss/reg": 0.009763635694980621, + "step": 2200 + }, + { + "epoch": 0.275125, + "grad_norm": 2.616483688354492, + "grad_norm_var": 0.3192051000051066, + "learning_rate": 0.0001, + "loss": 1.0374, + "loss/crossentropy": 2.4788200855255127, + "loss/hidden": 0.80859375, + "loss/logits": 0.1312486231327057, + "loss/reg": 0.009757671505212784, + "step": 2201 + }, + { + "epoch": 0.27525, + "grad_norm": 3.2147293090820312, + "grad_norm_var": 0.27895974488360104, + "learning_rate": 0.0001, + "loss": 1.0739, + "loss/crossentropy": 2.8450942039489746, + "loss/hidden": 0.82421875, + "loss/logits": 0.15215209126472473, + "loss/reg": 0.009752185083925724, + "step": 2202 + }, + { + "epoch": 0.275375, + "grad_norm": 3.9556267261505127, + "grad_norm_var": 0.2792895249966477, + "learning_rate": 0.0001, + "loss": 1.2343, + "loss/crossentropy": 3.072803497314453, + "loss/hidden": 0.94921875, + "loss/logits": 0.18756534159183502, + "loss/reg": 0.009746596217155457, + "step": 2203 + }, + { + "epoch": 0.2755, + "grad_norm": 2.5916085243225098, + "grad_norm_var": 0.3084476869953408, + "learning_rate": 0.0001, + "loss": 0.9465, + "loss/crossentropy": 2.6937105655670166, + "loss/hidden": 0.7265625, + "loss/logits": 0.12250223755836487, + "loss/reg": 0.00974097941070795, + "step": 2204 + }, + { + "epoch": 0.275625, + "grad_norm": 3.011049509048462, + "grad_norm_var": 0.30188503095406055, + "learning_rate": 0.0001, + "loss": 1.1624, + "loss/crossentropy": 2.2983462810516357, + "loss/hidden": 0.92578125, + "loss/logits": 0.1392519772052765, + "loss/reg": 0.009735219180583954, + "step": 2205 + }, + { + "epoch": 0.27575, + "grad_norm": 2.877084732055664, + "grad_norm_var": 0.29735406813803517, + "learning_rate": 0.0001, + "loss": 1.0721, + "loss/crossentropy": 2.390702247619629, + "loss/hidden": 0.828125, + "loss/logits": 0.14667537808418274, + "loss/reg": 0.00972927175462246, + "step": 2206 + }, + { + "epoch": 0.275875, + "grad_norm": 2.475576162338257, + "grad_norm_var": 0.32329409132086817, + "learning_rate": 0.0001, + "loss": 0.878, + "loss/crossentropy": 2.287673234939575, + "loss/hidden": 0.6796875, + "loss/logits": 0.10103243589401245, + "loss/reg": 0.00972361396998167, + "step": 2207 + }, + { + "epoch": 0.276, + "grad_norm": 3.575303554534912, + "grad_norm_var": 0.33012317894978577, + "learning_rate": 0.0001, + "loss": 1.3548, + "loss/crossentropy": 2.0907974243164062, + "loss/hidden": 1.09375, + "loss/logits": 0.1638820469379425, + "loss/reg": 0.00971762090921402, + "step": 2208 + }, + { + "epoch": 0.276125, + "grad_norm": 3.1657371520996094, + "grad_norm_var": 0.3219308090800155, + "learning_rate": 0.0001, + "loss": 1.1951, + "loss/crossentropy": 2.649160146713257, + "loss/hidden": 0.9296875, + "loss/logits": 0.16832080483436584, + "loss/reg": 0.009711584076285362, + "step": 2209 + }, + { + "epoch": 0.27625, + "grad_norm": 2.5019640922546387, + "grad_norm_var": 0.3534760136677505, + "learning_rate": 0.0001, + "loss": 1.0012, + "loss/crossentropy": 2.5844788551330566, + "loss/hidden": 0.7734375, + "loss/logits": 0.13065876066684723, + "loss/reg": 0.009706114418804646, + "step": 2210 + }, + { + "epoch": 0.276375, + "grad_norm": 2.4066948890686035, + "grad_norm_var": 0.24278876780725037, + "learning_rate": 0.0001, + "loss": 1.108, + "loss/crossentropy": 2.536252975463867, + "loss/hidden": 0.87109375, + "loss/logits": 0.13992956280708313, + "loss/reg": 0.009700177237391472, + "step": 2211 + }, + { + "epoch": 0.2765, + "grad_norm": 2.7826576232910156, + "grad_norm_var": 0.24530825719429927, + "learning_rate": 0.0001, + "loss": 1.1227, + "loss/crossentropy": 2.509038209915161, + "loss/hidden": 0.8671875, + "loss/logits": 0.15858623385429382, + "loss/reg": 0.0096947206184268, + "step": 2212 + }, + { + "epoch": 0.276625, + "grad_norm": 9.388289451599121, + "grad_norm_var": 2.7741260396865735, + "learning_rate": 0.0001, + "loss": 1.3788, + "loss/crossentropy": 2.5622992515563965, + "loss/hidden": 1.109375, + "loss/logits": 0.17249298095703125, + "loss/reg": 0.009689229540526867, + "step": 2213 + }, + { + "epoch": 0.27675, + "grad_norm": 3.0452990531921387, + "grad_norm_var": 2.7820903023287977, + "learning_rate": 0.0001, + "loss": 1.099, + "loss/crossentropy": 2.8121752738952637, + "loss/hidden": 0.85546875, + "loss/logits": 0.1466730833053589, + "loss/reg": 0.009683320298790932, + "step": 2214 + }, + { + "epoch": 0.276875, + "grad_norm": 2.647901773452759, + "grad_norm_var": 2.7882442780294356, + "learning_rate": 0.0001, + "loss": 1.1681, + "loss/crossentropy": 2.3218188285827637, + "loss/hidden": 0.9140625, + "loss/logits": 0.15728545188903809, + "loss/reg": 0.00967735517770052, + "step": 2215 + }, + { + "epoch": 0.277, + "grad_norm": 65.86066436767578, + "grad_norm_var": 246.99840150713922, + "learning_rate": 0.0001, + "loss": 1.0933, + "loss/crossentropy": 2.5657155513763428, + "loss/hidden": 0.84765625, + "loss/logits": 0.1489214450120926, + "loss/reg": 0.009671718813478947, + "step": 2216 + }, + { + "epoch": 0.277125, + "grad_norm": 3.6494710445404053, + "grad_norm_var": 246.42590677453992, + "learning_rate": 0.0001, + "loss": 1.1849, + "loss/crossentropy": 2.6815273761749268, + "loss/hidden": 0.921875, + "loss/logits": 0.1663939505815506, + "loss/reg": 0.00966588407754898, + "step": 2217 + }, + { + "epoch": 0.27725, + "grad_norm": 4.285943508148193, + "grad_norm_var": 245.91101086485037, + "learning_rate": 0.0001, + "loss": 1.4573, + "loss/crossentropy": 2.479628086090088, + "loss/hidden": 1.1484375, + "loss/logits": 0.21224689483642578, + "loss/reg": 0.009660634212195873, + "step": 2218 + }, + { + "epoch": 0.277375, + "grad_norm": 2.62919282913208, + "grad_norm_var": 246.628159496688, + "learning_rate": 0.0001, + "loss": 1.2003, + "loss/crossentropy": 2.2475430965423584, + "loss/hidden": 0.9375, + "loss/logits": 0.16628330945968628, + "loss/reg": 0.009655403904616833, + "step": 2219 + }, + { + "epoch": 0.2775, + "grad_norm": 77.71385192871094, + "grad_norm_var": 552.1178478607354, + "learning_rate": 0.0001, + "loss": 1.6729, + "loss/crossentropy": 1.5805362462997437, + "loss/hidden": 1.3828125, + "loss/logits": 0.19361108541488647, + "loss/reg": 0.009649988263845444, + "step": 2220 + }, + { + "epoch": 0.277625, + "grad_norm": 3.253176689147949, + "grad_norm_var": 551.8312824019707, + "learning_rate": 0.0001, + "loss": 1.0561, + "loss/crossentropy": 2.3721370697021484, + "loss/hidden": 0.78515625, + "loss/logits": 0.1744880974292755, + "loss/reg": 0.009644413366913795, + "step": 2221 + }, + { + "epoch": 0.27775, + "grad_norm": 3.896437406539917, + "grad_norm_var": 550.654097338038, + "learning_rate": 0.0001, + "loss": 1.3626, + "loss/crossentropy": 2.062020778656006, + "loss/hidden": 1.0859375, + "loss/logits": 0.1802404820919037, + "loss/reg": 0.00963887944817543, + "step": 2222 + }, + { + "epoch": 0.277875, + "grad_norm": 2.8896896839141846, + "grad_norm_var": 550.1345122376442, + "learning_rate": 0.0001, + "loss": 1.3564, + "loss/crossentropy": 2.3877456188201904, + "loss/hidden": 1.0546875, + "loss/logits": 0.20542627573013306, + "loss/reg": 0.009633551351726055, + "step": 2223 + }, + { + "epoch": 0.278, + "grad_norm": 2.7406136989593506, + "grad_norm_var": 551.1274286295331, + "learning_rate": 0.0001, + "loss": 1.0243, + "loss/crossentropy": 2.8025410175323486, + "loss/hidden": 0.78125, + "loss/logits": 0.14674940705299377, + "loss/reg": 0.009628547355532646, + "step": 2224 + }, + { + "epoch": 0.278125, + "grad_norm": 2.649326801300049, + "grad_norm_var": 551.7560672934434, + "learning_rate": 0.0001, + "loss": 1.2126, + "loss/crossentropy": 2.535365343093872, + "loss/hidden": 0.9296875, + "loss/logits": 0.18663521111011505, + "loss/reg": 0.00962311215698719, + "step": 2225 + }, + { + "epoch": 0.27825, + "grad_norm": 3.289268970489502, + "grad_norm_var": 550.7955227818352, + "learning_rate": 0.0001, + "loss": 1.4752, + "loss/crossentropy": 2.6070683002471924, + "loss/hidden": 1.125, + "loss/logits": 0.2540661692619324, + "loss/reg": 0.009617606177926064, + "step": 2226 + }, + { + "epoch": 0.278375, + "grad_norm": 3.790322780609131, + "grad_norm_var": 549.1323541791328, + "learning_rate": 0.0001, + "loss": 1.2542, + "loss/crossentropy": 2.6407763957977295, + "loss/hidden": 0.9609375, + "loss/logits": 0.19717758893966675, + "loss/reg": 0.00961229857057333, + "step": 2227 + }, + { + "epoch": 0.2785, + "grad_norm": 2.914515256881714, + "grad_norm_var": 548.9686302328081, + "learning_rate": 0.0001, + "loss": 1.2613, + "loss/crossentropy": 2.4076671600341797, + "loss/hidden": 0.9921875, + "loss/logits": 0.1730874478816986, + "loss/reg": 0.009607002139091492, + "step": 2228 + }, + { + "epoch": 0.278625, + "grad_norm": 3.0857045650482178, + "grad_norm_var": 553.7848933675326, + "learning_rate": 0.0001, + "loss": 1.1807, + "loss/crossentropy": 2.538856029510498, + "loss/hidden": 0.9140625, + "loss/logits": 0.17063549160957336, + "loss/reg": 0.009601949714124203, + "step": 2229 + }, + { + "epoch": 0.27875, + "grad_norm": 3.1248600482940674, + "grad_norm_var": 553.6927220289352, + "learning_rate": 0.0001, + "loss": 1.2267, + "loss/crossentropy": 2.1752779483795166, + "loss/hidden": 0.96875, + "loss/logits": 0.1619725227355957, + "loss/reg": 0.009596589021384716, + "step": 2230 + }, + { + "epoch": 0.278875, + "grad_norm": 2.442943572998047, + "grad_norm_var": 553.9448064383325, + "learning_rate": 0.0001, + "loss": 1.0652, + "loss/crossentropy": 2.233424186706543, + "loss/hidden": 0.82421875, + "loss/logits": 0.14511600136756897, + "loss/reg": 0.009590846486389637, + "step": 2231 + }, + { + "epoch": 0.279, + "grad_norm": 4.077425003051758, + "grad_norm_var": 346.8780987365817, + "learning_rate": 0.0001, + "loss": 1.3518, + "loss/crossentropy": 2.628734827041626, + "loss/hidden": 1.015625, + "loss/logits": 0.24029171466827393, + "loss/reg": 0.009584841318428516, + "step": 2232 + }, + { + "epoch": 0.279125, + "grad_norm": 2.6896305084228516, + "grad_norm_var": 347.4799188414807, + "learning_rate": 0.0001, + "loss": 1.1287, + "loss/crossentropy": 2.644796848297119, + "loss/hidden": 0.87109375, + "loss/logits": 0.16185884177684784, + "loss/reg": 0.009578878991305828, + "step": 2233 + }, + { + "epoch": 0.27925, + "grad_norm": 2.7318971157073975, + "grad_norm_var": 348.3677087024686, + "learning_rate": 0.0001, + "loss": 1.253, + "loss/crossentropy": 2.3669991493225098, + "loss/hidden": 0.97265625, + "loss/logits": 0.18456155061721802, + "loss/reg": 0.00957344938069582, + "step": 2234 + }, + { + "epoch": 0.279375, + "grad_norm": 2.8467166423797607, + "grad_norm_var": 348.22229341156833, + "learning_rate": 0.0001, + "loss": 1.2366, + "loss/crossentropy": 2.107529878616333, + "loss/hidden": 0.953125, + "loss/logits": 0.18779894709587097, + "loss/reg": 0.00956790242344141, + "step": 2235 + }, + { + "epoch": 0.2795, + "grad_norm": 3.4069461822509766, + "grad_norm_var": 0.22852860371247724, + "learning_rate": 0.0001, + "loss": 1.2015, + "loss/crossentropy": 2.4742836952209473, + "loss/hidden": 0.94921875, + "loss/logits": 0.15662327408790588, + "loss/reg": 0.009561954066157341, + "step": 2236 + }, + { + "epoch": 0.279625, + "grad_norm": 3.3008339405059814, + "grad_norm_var": 0.22955275069126604, + "learning_rate": 0.0001, + "loss": 1.265, + "loss/crossentropy": 2.6688270568847656, + "loss/hidden": 0.96484375, + "loss/logits": 0.2045484185218811, + "loss/reg": 0.009556453675031662, + "step": 2237 + }, + { + "epoch": 0.27975, + "grad_norm": 3.0573244094848633, + "grad_norm_var": 0.18639074409896542, + "learning_rate": 0.0001, + "loss": 1.246, + "loss/crossentropy": 2.685687303543091, + "loss/hidden": 0.93359375, + "loss/logits": 0.2168709933757782, + "loss/reg": 0.009550639428198338, + "step": 2238 + }, + { + "epoch": 0.279875, + "grad_norm": 14.836824417114258, + "grad_norm_var": 8.82820392873877, + "learning_rate": 0.0001, + "loss": 1.1296, + "loss/crossentropy": 2.719700336456299, + "loss/hidden": 0.87890625, + "loss/logits": 0.1552865207195282, + "loss/reg": 0.009545081295073032, + "step": 2239 + }, + { + "epoch": 0.28, + "grad_norm": 3.2265188694000244, + "grad_norm_var": 8.773575853883326, + "learning_rate": 0.0001, + "loss": 1.2559, + "loss/crossentropy": 2.5745716094970703, + "loss/hidden": 0.96875, + "loss/logits": 0.1917484998703003, + "loss/reg": 0.009539241902530193, + "step": 2240 + }, + { + "epoch": 0.280125, + "grad_norm": 3.270475149154663, + "grad_norm_var": 8.69891787207773, + "learning_rate": 0.0001, + "loss": 1.3129, + "loss/crossentropy": 2.425095796585083, + "loss/hidden": 1.046875, + "loss/logits": 0.17071107029914856, + "loss/reg": 0.009533590637147427, + "step": 2241 + }, + { + "epoch": 0.28025, + "grad_norm": 5.0778584480285645, + "grad_norm_var": 8.757799984199899, + "learning_rate": 0.0001, + "loss": 1.4698, + "loss/crossentropy": 2.3729984760284424, + "loss/hidden": 1.1953125, + "loss/logits": 0.1791631579399109, + "loss/reg": 0.009527940303087234, + "step": 2242 + }, + { + "epoch": 0.280375, + "grad_norm": 3.647056818008423, + "grad_norm_var": 8.76294577181251, + "learning_rate": 0.0001, + "loss": 1.372, + "loss/crossentropy": 2.638150215148926, + "loss/hidden": 1.0234375, + "loss/logits": 0.2533681094646454, + "loss/reg": 0.009522613137960434, + "step": 2243 + }, + { + "epoch": 0.2805, + "grad_norm": 6.8505330085754395, + "grad_norm_var": 9.170154567025804, + "learning_rate": 0.0001, + "loss": 1.9322, + "loss/crossentropy": 2.417832374572754, + "loss/hidden": 1.546875, + "loss/logits": 0.2901902198791504, + "loss/reg": 0.009517074562609196, + "step": 2244 + }, + { + "epoch": 0.280625, + "grad_norm": 2.580599784851074, + "grad_norm_var": 9.263138302154985, + "learning_rate": 0.0001, + "loss": 1.1183, + "loss/crossentropy": 2.525139331817627, + "loss/hidden": 0.875, + "loss/logits": 0.14818444848060608, + "loss/reg": 0.009511714801192284, + "step": 2245 + }, + { + "epoch": 0.28075, + "grad_norm": 3.272780418395996, + "grad_norm_var": 9.243340047683272, + "learning_rate": 0.0001, + "loss": 1.2171, + "loss/crossentropy": 2.4416310787200928, + "loss/hidden": 0.94140625, + "loss/logits": 0.1806139051914215, + "loss/reg": 0.009506085887551308, + "step": 2246 + }, + { + "epoch": 0.280875, + "grad_norm": 4.936016082763672, + "grad_norm_var": 9.045323124462561, + "learning_rate": 0.0001, + "loss": 1.3245, + "loss/crossentropy": 2.7093281745910645, + "loss/hidden": 1.046875, + "loss/logits": 0.18263736367225647, + "loss/reg": 0.0095004728063941, + "step": 2247 + }, + { + "epoch": 0.281, + "grad_norm": 2.9791793823242188, + "grad_norm_var": 9.162537771293175, + "learning_rate": 0.0001, + "loss": 1.1564, + "loss/crossentropy": 2.5206501483917236, + "loss/hidden": 0.91015625, + "loss/logits": 0.15134036540985107, + "loss/reg": 0.009495021775364876, + "step": 2248 + }, + { + "epoch": 0.281125, + "grad_norm": 3.1535937786102295, + "grad_norm_var": 9.07671470191016, + "learning_rate": 0.0001, + "loss": 1.1834, + "loss/crossentropy": 2.747591972351074, + "loss/hidden": 0.9140625, + "loss/logits": 0.17446273565292358, + "loss/reg": 0.009489394724369049, + "step": 2249 + }, + { + "epoch": 0.28125, + "grad_norm": 2.6284468173980713, + "grad_norm_var": 9.09933641815783, + "learning_rate": 0.0001, + "loss": 1.2361, + "loss/crossentropy": 2.50215220451355, + "loss/hidden": 0.96875, + "loss/logits": 0.1725444793701172, + "loss/reg": 0.009484088979661465, + "step": 2250 + }, + { + "epoch": 0.281375, + "grad_norm": 3.2230780124664307, + "grad_norm_var": 9.034409290751688, + "learning_rate": 0.0001, + "loss": 1.1003, + "loss/crossentropy": 2.3967666625976562, + "loss/hidden": 0.859375, + "loss/logits": 0.14610040187835693, + "loss/reg": 0.009478544816374779, + "step": 2251 + }, + { + "epoch": 0.2815, + "grad_norm": 3.7582857608795166, + "grad_norm_var": 8.998391480314327, + "learning_rate": 0.0001, + "loss": 1.0153, + "loss/crossentropy": 2.6266894340515137, + "loss/hidden": 0.7890625, + "loss/logits": 0.13150277733802795, + "loss/reg": 0.009473329409956932, + "step": 2252 + }, + { + "epoch": 0.281625, + "grad_norm": 3.822023868560791, + "grad_norm_var": 8.941594210319238, + "learning_rate": 0.0001, + "loss": 1.1948, + "loss/crossentropy": 2.263728380203247, + "loss/hidden": 0.92578125, + "loss/logits": 0.17433184385299683, + "loss/reg": 0.00946798361837864, + "step": 2253 + }, + { + "epoch": 0.28175, + "grad_norm": 2.628258466720581, + "grad_norm_var": 9.029629241931024, + "learning_rate": 0.0001, + "loss": 1.1164, + "loss/crossentropy": 2.6066477298736572, + "loss/hidden": 0.8671875, + "loss/logits": 0.15456253290176392, + "loss/reg": 0.009462515823543072, + "step": 2254 + }, + { + "epoch": 0.281875, + "grad_norm": 3.048149585723877, + "grad_norm_var": 1.2606370718673503, + "learning_rate": 0.0001, + "loss": 1.0535, + "loss/crossentropy": 2.533827304840088, + "loss/hidden": 0.82421875, + "loss/logits": 0.1347401738166809, + "loss/reg": 0.009457193315029144, + "step": 2255 + }, + { + "epoch": 0.282, + "grad_norm": 2.7999813556671143, + "grad_norm_var": 1.295035842600486, + "learning_rate": 0.0001, + "loss": 1.2237, + "loss/crossentropy": 2.6519150733947754, + "loss/hidden": 0.9609375, + "loss/logits": 0.16826486587524414, + "loss/reg": 0.00945141352713108, + "step": 2256 + }, + { + "epoch": 0.282125, + "grad_norm": 3.052527666091919, + "grad_norm_var": 1.3077191519410272, + "learning_rate": 0.0001, + "loss": 1.1536, + "loss/crossentropy": 2.525834083557129, + "loss/hidden": 0.90234375, + "loss/logits": 0.15679647028446198, + "loss/reg": 0.009445516392588615, + "step": 2257 + }, + { + "epoch": 0.28225, + "grad_norm": 3.316465377807617, + "grad_norm_var": 1.1524682363577834, + "learning_rate": 0.0001, + "loss": 1.0477, + "loss/crossentropy": 2.6249217987060547, + "loss/hidden": 0.8203125, + "loss/logits": 0.13295969367027283, + "loss/reg": 0.009439031593501568, + "step": 2258 + }, + { + "epoch": 0.282375, + "grad_norm": 2.6918253898620605, + "grad_norm_var": 1.1883555074379386, + "learning_rate": 0.0001, + "loss": 0.9782, + "loss/crossentropy": 2.490191698074341, + "loss/hidden": 0.76171875, + "loss/logits": 0.1221388503909111, + "loss/reg": 0.00943272840231657, + "step": 2259 + }, + { + "epoch": 0.2825, + "grad_norm": 2.965463638305664, + "grad_norm_var": 0.3553719285356292, + "learning_rate": 0.0001, + "loss": 1.2215, + "loss/crossentropy": 2.363909959793091, + "loss/hidden": 0.9453125, + "loss/logits": 0.18187889456748962, + "loss/reg": 0.009426291100680828, + "step": 2260 + }, + { + "epoch": 0.282625, + "grad_norm": 2.6161577701568604, + "grad_norm_var": 0.35261606794378225, + "learning_rate": 0.0001, + "loss": 1.0278, + "loss/crossentropy": 2.4729695320129395, + "loss/hidden": 0.78515625, + "loss/logits": 0.14840959012508392, + "loss/reg": 0.009419837966561317, + "step": 2261 + }, + { + "epoch": 0.28275, + "grad_norm": 3.319518804550171, + "grad_norm_var": 0.35332602060928836, + "learning_rate": 0.0001, + "loss": 1.234, + "loss/crossentropy": 2.5813112258911133, + "loss/hidden": 0.953125, + "loss/logits": 0.18668454885482788, + "loss/reg": 0.009414486587047577, + "step": 2262 + }, + { + "epoch": 0.282875, + "grad_norm": 3.478592872619629, + "grad_norm_var": 0.14556291533590007, + "learning_rate": 0.0001, + "loss": 1.2646, + "loss/crossentropy": 2.3299901485443115, + "loss/hidden": 0.98828125, + "loss/logits": 0.18219327926635742, + "loss/reg": 0.009408492594957352, + "step": 2263 + }, + { + "epoch": 0.283, + "grad_norm": 3.198110342025757, + "grad_norm_var": 0.14524784406937669, + "learning_rate": 0.0001, + "loss": 1.2581, + "loss/crossentropy": 2.495682716369629, + "loss/hidden": 0.984375, + "loss/logits": 0.17970743775367737, + "loss/reg": 0.00940236821770668, + "step": 2264 + }, + { + "epoch": 0.283125, + "grad_norm": 5.070627689361572, + "grad_norm_var": 0.387030156112602, + "learning_rate": 0.0001, + "loss": 1.3101, + "loss/crossentropy": 2.746734142303467, + "loss/hidden": 1.03125, + "loss/logits": 0.1849251687526703, + "loss/reg": 0.00939682312309742, + "step": 2265 + }, + { + "epoch": 0.28325, + "grad_norm": 2.801924705505371, + "grad_norm_var": 0.37508724412493755, + "learning_rate": 0.0001, + "loss": 1.1359, + "loss/crossentropy": 2.410548686981201, + "loss/hidden": 0.89453125, + "loss/logits": 0.147461399435997, + "loss/reg": 0.009391373954713345, + "step": 2266 + }, + { + "epoch": 0.283375, + "grad_norm": 2.435459852218628, + "grad_norm_var": 0.41531404950410056, + "learning_rate": 0.0001, + "loss": 1.1756, + "loss/crossentropy": 2.6951451301574707, + "loss/hidden": 0.90625, + "loss/logits": 0.17552579939365387, + "loss/reg": 0.009386060759425163, + "step": 2267 + }, + { + "epoch": 0.2835, + "grad_norm": 2.5269439220428467, + "grad_norm_var": 0.4164003471062709, + "learning_rate": 0.0001, + "loss": 1.1339, + "loss/crossentropy": 2.522493839263916, + "loss/hidden": 0.8828125, + "loss/logits": 0.1572396159172058, + "loss/reg": 0.009380444884300232, + "step": 2268 + }, + { + "epoch": 0.283625, + "grad_norm": 2.9823505878448486, + "grad_norm_var": 0.380834578958081, + "learning_rate": 0.0001, + "loss": 1.1291, + "loss/crossentropy": 2.4628095626831055, + "loss/hidden": 0.890625, + "loss/logits": 0.14473502337932587, + "loss/reg": 0.009375154972076416, + "step": 2269 + }, + { + "epoch": 0.28375, + "grad_norm": 2.6929304599761963, + "grad_norm_var": 0.37738800223646624, + "learning_rate": 0.0001, + "loss": 1.1499, + "loss/crossentropy": 2.2941513061523438, + "loss/hidden": 0.8984375, + "loss/logits": 0.1577795147895813, + "loss/reg": 0.009369867853820324, + "step": 2270 + }, + { + "epoch": 0.283875, + "grad_norm": 2.5207014083862305, + "grad_norm_var": 0.39577176003084535, + "learning_rate": 0.0001, + "loss": 1.3322, + "loss/crossentropy": 2.496821641921997, + "loss/hidden": 1.0390625, + "loss/logits": 0.19946375489234924, + "loss/reg": 0.009364121593534946, + "step": 2271 + }, + { + "epoch": 0.284, + "grad_norm": 3.484135150909424, + "grad_norm_var": 0.4041028907198466, + "learning_rate": 0.0001, + "loss": 1.0196, + "loss/crossentropy": 2.5315027236938477, + "loss/hidden": 0.796875, + "loss/logits": 0.1291487216949463, + "loss/reg": 0.009358393959701061, + "step": 2272 + }, + { + "epoch": 0.284125, + "grad_norm": 2.876593589782715, + "grad_norm_var": 0.4064967649293083, + "learning_rate": 0.0001, + "loss": 1.1535, + "loss/crossentropy": 2.9269776344299316, + "loss/hidden": 0.90625, + "loss/logits": 0.1536765992641449, + "loss/reg": 0.009353107772767544, + "step": 2273 + }, + { + "epoch": 0.28425, + "grad_norm": 4.194181442260742, + "grad_norm_var": 0.4845294896823378, + "learning_rate": 0.0001, + "loss": 1.5176, + "loss/crossentropy": 2.372791051864624, + "loss/hidden": 1.1875, + "loss/logits": 0.23657436668872833, + "loss/reg": 0.009347806684672832, + "step": 2274 + }, + { + "epoch": 0.284375, + "grad_norm": 2.4793362617492676, + "grad_norm_var": 0.4993682781130661, + "learning_rate": 0.0001, + "loss": 1.1005, + "loss/crossentropy": 2.4985427856445312, + "loss/hidden": 0.85546875, + "loss/logits": 0.15159931778907776, + "loss/reg": 0.009342130273580551, + "step": 2275 + }, + { + "epoch": 0.2845, + "grad_norm": 2.7460553646087646, + "grad_norm_var": 0.5063914863759843, + "learning_rate": 0.0001, + "loss": 1.1138, + "loss/crossentropy": 2.446516513824463, + "loss/hidden": 0.8515625, + "loss/logits": 0.1689249575138092, + "loss/reg": 0.009336147457361221, + "step": 2276 + }, + { + "epoch": 0.284625, + "grad_norm": 3.3482463359832764, + "grad_norm_var": 0.4937359222321741, + "learning_rate": 0.0001, + "loss": 1.2087, + "loss/crossentropy": 2.635927200317383, + "loss/hidden": 0.953125, + "loss/logits": 0.1622520536184311, + "loss/reg": 0.009330171160399914, + "step": 2277 + }, + { + "epoch": 0.28475, + "grad_norm": 3.5546188354492188, + "grad_norm_var": 0.5029828811824186, + "learning_rate": 0.0001, + "loss": 1.2414, + "loss/crossentropy": 2.255173444747925, + "loss/hidden": 0.99609375, + "loss/logits": 0.1520824432373047, + "loss/reg": 0.009324093349277973, + "step": 2278 + }, + { + "epoch": 0.284875, + "grad_norm": 3.0981099605560303, + "grad_norm_var": 0.4953318286175384, + "learning_rate": 0.0001, + "loss": 1.0613, + "loss/crossentropy": 2.4925482273101807, + "loss/hidden": 0.8203125, + "loss/logits": 0.14785414934158325, + "loss/reg": 0.0093180937692523, + "step": 2279 + }, + { + "epoch": 0.285, + "grad_norm": 2.3994228839874268, + "grad_norm_var": 0.5274837667782587, + "learning_rate": 0.0001, + "loss": 1.009, + "loss/crossentropy": 2.375962257385254, + "loss/hidden": 0.796875, + "loss/logits": 0.11902016401290894, + "loss/reg": 0.009312797337770462, + "step": 2280 + }, + { + "epoch": 0.285125, + "grad_norm": 2.9039511680603027, + "grad_norm_var": 0.24458206520008996, + "learning_rate": 0.0001, + "loss": 1.1169, + "loss/crossentropy": 2.5168046951293945, + "loss/hidden": 0.875, + "loss/logits": 0.148838073015213, + "loss/reg": 0.009307482279837132, + "step": 2281 + }, + { + "epoch": 0.28525, + "grad_norm": 4.613414764404297, + "grad_norm_var": 0.4162510726319965, + "learning_rate": 0.0001, + "loss": 1.3079, + "loss/crossentropy": 2.583071231842041, + "loss/hidden": 1.015625, + "loss/logits": 0.19928890466690063, + "loss/reg": 0.009302211925387383, + "step": 2282 + }, + { + "epoch": 0.285375, + "grad_norm": 3.8388094902038574, + "grad_norm_var": 0.42368915236828925, + "learning_rate": 0.0001, + "loss": 1.4623, + "loss/crossentropy": 2.191469430923462, + "loss/hidden": 1.1484375, + "loss/logits": 0.2209167778491974, + "loss/reg": 0.009296446107327938, + "step": 2283 + }, + { + "epoch": 0.2855, + "grad_norm": 4.862847805023193, + "grad_norm_var": 0.5733929545817915, + "learning_rate": 0.0001, + "loss": 1.5286, + "loss/crossentropy": 2.5529236793518066, + "loss/hidden": 1.1875, + "loss/logits": 0.2482062578201294, + "loss/reg": 0.009290698915719986, + "step": 2284 + }, + { + "epoch": 0.285625, + "grad_norm": 4.359282493591309, + "grad_norm_var": 0.6359159119445505, + "learning_rate": 0.0001, + "loss": 1.34, + "loss/crossentropy": 2.640444755554199, + "loss/hidden": 1.046875, + "loss/logits": 0.20026032626628876, + "loss/reg": 0.009285423904657364, + "step": 2285 + }, + { + "epoch": 0.28575, + "grad_norm": 2.9475839138031006, + "grad_norm_var": 0.6168681537154913, + "learning_rate": 0.0001, + "loss": 1.4011, + "loss/crossentropy": 2.3639228343963623, + "loss/hidden": 1.09375, + "loss/logits": 0.2145346701145172, + "loss/reg": 0.009280139580368996, + "step": 2286 + }, + { + "epoch": 0.285875, + "grad_norm": 2.9800379276275635, + "grad_norm_var": 0.5768636005852424, + "learning_rate": 0.0001, + "loss": 1.1572, + "loss/crossentropy": 2.615760326385498, + "loss/hidden": 0.90234375, + "loss/logits": 0.16209115087985992, + "loss/reg": 0.00927441194653511, + "step": 2287 + }, + { + "epoch": 0.286, + "grad_norm": 3.3330628871917725, + "grad_norm_var": 0.5769561410980484, + "learning_rate": 0.0001, + "loss": 1.4261, + "loss/crossentropy": 2.0002458095550537, + "loss/hidden": 1.125, + "loss/logits": 0.2083829641342163, + "loss/reg": 0.009269144386053085, + "step": 2288 + }, + { + "epoch": 0.286125, + "grad_norm": 4.286744594573975, + "grad_norm_var": 0.6012351204574483, + "learning_rate": 0.0001, + "loss": 1.2582, + "loss/crossentropy": 2.5433619022369385, + "loss/hidden": 0.97265625, + "loss/logits": 0.1929214745759964, + "loss/reg": 0.009263594634830952, + "step": 2289 + }, + { + "epoch": 0.28625, + "grad_norm": 4.047984600067139, + "grad_norm_var": 0.5889732006761048, + "learning_rate": 0.0001, + "loss": 1.5295, + "loss/crossentropy": 2.8939483165740967, + "loss/hidden": 1.1953125, + "loss/logits": 0.24164989590644836, + "loss/reg": 0.009258082136511803, + "step": 2290 + }, + { + "epoch": 0.286375, + "grad_norm": 2.7053489685058594, + "grad_norm_var": 0.5617856918356208, + "learning_rate": 0.0001, + "loss": 1.1657, + "loss/crossentropy": 2.4860267639160156, + "loss/hidden": 0.91796875, + "loss/logits": 0.1552383005619049, + "loss/reg": 0.00925289373844862, + "step": 2291 + }, + { + "epoch": 0.2865, + "grad_norm": 3.4142940044403076, + "grad_norm_var": 0.5223771736132996, + "learning_rate": 0.0001, + "loss": 1.2344, + "loss/crossentropy": 2.649752140045166, + "loss/hidden": 0.9765625, + "loss/logits": 0.1653738170862198, + "loss/reg": 0.009247452020645142, + "step": 2292 + }, + { + "epoch": 0.286625, + "grad_norm": 4.158377647399902, + "grad_norm_var": 0.542321023894605, + "learning_rate": 0.0001, + "loss": 1.1851, + "loss/crossentropy": 2.5413994789123535, + "loss/hidden": 0.93359375, + "loss/logits": 0.15909796953201294, + "loss/reg": 0.009242062456905842, + "step": 2293 + }, + { + "epoch": 0.28675, + "grad_norm": 2.9762167930603027, + "grad_norm_var": 0.566266896866469, + "learning_rate": 0.0001, + "loss": 1.11, + "loss/crossentropy": 2.4784295558929443, + "loss/hidden": 0.86328125, + "loss/logits": 0.15438120067119598, + "loss/reg": 0.009236682206392288, + "step": 2294 + }, + { + "epoch": 0.286875, + "grad_norm": 3.2820780277252197, + "grad_norm_var": 0.5571053330677274, + "learning_rate": 0.0001, + "loss": 1.1988, + "loss/crossentropy": 2.5515854358673096, + "loss/hidden": 0.92578125, + "loss/logits": 0.18073450028896332, + "loss/reg": 0.009231574833393097, + "step": 2295 + }, + { + "epoch": 0.287, + "grad_norm": 4.7166948318481445, + "grad_norm_var": 0.5312455008496932, + "learning_rate": 0.0001, + "loss": 1.1485, + "loss/crossentropy": 2.499162197113037, + "loss/hidden": 0.890625, + "loss/logits": 0.16561537981033325, + "loss/reg": 0.009226474910974503, + "step": 2296 + }, + { + "epoch": 0.287125, + "grad_norm": 2.99236798286438, + "grad_norm_var": 0.5221824935750637, + "learning_rate": 0.0001, + "loss": 1.2098, + "loss/crossentropy": 2.617089033126831, + "loss/hidden": 0.9375, + "loss/logits": 0.1800779104232788, + "loss/reg": 0.009221197105944157, + "step": 2297 + }, + { + "epoch": 0.28725, + "grad_norm": 2.887284994125366, + "grad_norm_var": 0.5027129548775328, + "learning_rate": 0.0001, + "loss": 1.1578, + "loss/crossentropy": 2.3613736629486084, + "loss/hidden": 0.91796875, + "loss/logits": 0.14769989252090454, + "loss/reg": 0.009216159582138062, + "step": 2298 + }, + { + "epoch": 0.287375, + "grad_norm": 3.471057415008545, + "grad_norm_var": 0.5000351242653587, + "learning_rate": 0.0001, + "loss": 1.2699, + "loss/crossentropy": 2.3855464458465576, + "loss/hidden": 0.9765625, + "loss/logits": 0.2012341171503067, + "loss/reg": 0.009210838936269283, + "step": 2299 + }, + { + "epoch": 0.2875, + "grad_norm": 3.6935126781463623, + "grad_norm_var": 0.3868601807061765, + "learning_rate": 0.0001, + "loss": 1.2622, + "loss/crossentropy": 2.687220335006714, + "loss/hidden": 0.98828125, + "loss/logits": 0.18187205493450165, + "loss/reg": 0.009205520153045654, + "step": 2300 + }, + { + "epoch": 0.287625, + "grad_norm": 2.3923606872558594, + "grad_norm_var": 0.4074362056161732, + "learning_rate": 0.0001, + "loss": 1.0037, + "loss/crossentropy": 2.498629570007324, + "loss/hidden": 0.77734375, + "loss/logits": 0.1343335658311844, + "loss/reg": 0.00920000858604908, + "step": 2301 + }, + { + "epoch": 0.28775, + "grad_norm": 2.793649435043335, + "grad_norm_var": 0.41805534218280227, + "learning_rate": 0.0001, + "loss": 1.1498, + "loss/crossentropy": 2.5481371879577637, + "loss/hidden": 0.91015625, + "loss/logits": 0.14769962430000305, + "loss/reg": 0.009194904007017612, + "step": 2302 + }, + { + "epoch": 0.287875, + "grad_norm": 3.619518280029297, + "grad_norm_var": 0.40923923162265863, + "learning_rate": 0.0001, + "loss": 1.1985, + "loss/crossentropy": 2.7551188468933105, + "loss/hidden": 0.91796875, + "loss/logits": 0.1886293888092041, + "loss/reg": 0.0091897277161479, + "step": 2303 + }, + { + "epoch": 0.288, + "grad_norm": 3.1799814701080322, + "grad_norm_var": 0.41254280292479445, + "learning_rate": 0.0001, + "loss": 1.5252, + "loss/crossentropy": 2.099945545196533, + "loss/hidden": 1.2109375, + "loss/logits": 0.22244003415107727, + "loss/reg": 0.009184639900922775, + "step": 2304 + }, + { + "epoch": 0.288125, + "grad_norm": 3.200214385986328, + "grad_norm_var": 0.35983282726327254, + "learning_rate": 0.0001, + "loss": 1.3542, + "loss/crossentropy": 2.3131332397460938, + "loss/hidden": 1.078125, + "loss/logits": 0.18429729342460632, + "loss/reg": 0.009179775603115559, + "step": 2305 + }, + { + "epoch": 0.28825, + "grad_norm": 3.254546642303467, + "grad_norm_var": 0.3248817085765095, + "learning_rate": 0.0001, + "loss": 1.1998, + "loss/crossentropy": 2.800431251525879, + "loss/hidden": 0.92578125, + "loss/logits": 0.18223266303539276, + "loss/reg": 0.009174962528049946, + "step": 2306 + }, + { + "epoch": 0.288375, + "grad_norm": 4.686166763305664, + "grad_norm_var": 0.4140880478204449, + "learning_rate": 0.0001, + "loss": 1.3774, + "loss/crossentropy": 2.7989413738250732, + "loss/hidden": 1.0625, + "loss/logits": 0.22316935658454895, + "loss/reg": 0.009169746190309525, + "step": 2307 + }, + { + "epoch": 0.2885, + "grad_norm": 2.91051983833313, + "grad_norm_var": 0.4303260502802457, + "learning_rate": 0.0001, + "loss": 1.1877, + "loss/crossentropy": 2.447007417678833, + "loss/hidden": 0.921875, + "loss/logits": 0.17420323193073273, + "loss/reg": 0.009164923802018166, + "step": 2308 + }, + { + "epoch": 0.288625, + "grad_norm": 2.880136728286743, + "grad_norm_var": 0.401217441114458, + "learning_rate": 0.0001, + "loss": 1.1445, + "loss/crossentropy": 2.327925682067871, + "loss/hidden": 0.890625, + "loss/logits": 0.16227543354034424, + "loss/reg": 0.009159999899566174, + "step": 2309 + }, + { + "epoch": 0.28875, + "grad_norm": 4.102278709411621, + "grad_norm_var": 0.4305759970387262, + "learning_rate": 0.0001, + "loss": 1.0832, + "loss/crossentropy": 2.476874828338623, + "loss/hidden": 0.85546875, + "loss/logits": 0.1361844539642334, + "loss/reg": 0.009154681116342545, + "step": 2310 + }, + { + "epoch": 0.288875, + "grad_norm": 2.6885852813720703, + "grad_norm_var": 0.4602521973384566, + "learning_rate": 0.0001, + "loss": 1.0572, + "loss/crossentropy": 2.399263858795166, + "loss/hidden": 0.82421875, + "loss/logits": 0.14146915078163147, + "loss/reg": 0.009149202145636082, + "step": 2311 + }, + { + "epoch": 0.289, + "grad_norm": 3.9530670642852783, + "grad_norm_var": 0.356710426073163, + "learning_rate": 0.0001, + "loss": 1.1544, + "loss/crossentropy": 2.483734130859375, + "loss/hidden": 0.90234375, + "loss/logits": 0.16064409911632538, + "loss/reg": 0.009144189767539501, + "step": 2312 + }, + { + "epoch": 0.289125, + "grad_norm": 5.744041442871094, + "grad_norm_var": 0.719247768583498, + "learning_rate": 0.0001, + "loss": 1.372, + "loss/crossentropy": 2.3567168712615967, + "loss/hidden": 1.0703125, + "loss/logits": 0.21033620834350586, + "loss/reg": 0.009139393456280231, + "step": 2313 + }, + { + "epoch": 0.28925, + "grad_norm": 2.933044195175171, + "grad_norm_var": 0.7158474145933134, + "learning_rate": 0.0001, + "loss": 1.0481, + "loss/crossentropy": 2.365586757659912, + "loss/hidden": 0.828125, + "loss/logits": 0.12862184643745422, + "loss/reg": 0.009134124033153057, + "step": 2314 + }, + { + "epoch": 0.289375, + "grad_norm": 2.4427220821380615, + "grad_norm_var": 0.7816461139146356, + "learning_rate": 0.0001, + "loss": 0.9712, + "loss/crossentropy": 2.5909483432769775, + "loss/hidden": 0.75390625, + "loss/logits": 0.12598982453346252, + "loss/reg": 0.009128816425800323, + "step": 2315 + }, + { + "epoch": 0.2895, + "grad_norm": 2.968493938446045, + "grad_norm_var": 0.7865749325737698, + "learning_rate": 0.0001, + "loss": 1.2151, + "loss/crossentropy": 2.575425148010254, + "loss/hidden": 0.96484375, + "loss/logits": 0.1590544581413269, + "loss/reg": 0.00912347063422203, + "step": 2316 + }, + { + "epoch": 0.289625, + "grad_norm": 2.5986244678497314, + "grad_norm_var": 0.762640465759848, + "learning_rate": 0.0001, + "loss": 0.9679, + "loss/crossentropy": 2.4131813049316406, + "loss/hidden": 0.76171875, + "loss/logits": 0.11498948186635971, + "loss/reg": 0.00911857932806015, + "step": 2317 + }, + { + "epoch": 0.28975, + "grad_norm": 3.7702338695526123, + "grad_norm_var": 0.7469108114767844, + "learning_rate": 0.0001, + "loss": 1.1515, + "loss/crossentropy": 2.4600918292999268, + "loss/hidden": 0.921875, + "loss/logits": 0.1385062336921692, + "loss/reg": 0.009113702923059464, + "step": 2318 + }, + { + "epoch": 0.289875, + "grad_norm": 5.30330753326416, + "grad_norm_var": 0.9659231980461366, + "learning_rate": 0.0001, + "loss": 1.5569, + "loss/crossentropy": 2.3851242065429688, + "loss/hidden": 1.1328125, + "loss/logits": 0.3329842686653137, + "loss/reg": 0.009108653292059898, + "step": 2319 + }, + { + "epoch": 0.29, + "grad_norm": 3.2748043537139893, + "grad_norm_var": 0.9619524192596212, + "learning_rate": 0.0001, + "loss": 1.2181, + "loss/crossentropy": 2.603100538253784, + "loss/hidden": 0.94921875, + "loss/logits": 0.17784467339515686, + "loss/reg": 0.009103918448090553, + "step": 2320 + }, + { + "epoch": 0.290125, + "grad_norm": 4.37143087387085, + "grad_norm_var": 0.9939341109011214, + "learning_rate": 0.0001, + "loss": 1.2935, + "loss/crossentropy": 2.355426788330078, + "loss/hidden": 1.0078125, + "loss/logits": 0.19474349915981293, + "loss/reg": 0.009098876267671585, + "step": 2321 + }, + { + "epoch": 0.29025, + "grad_norm": 3.4387564659118652, + "grad_norm_var": 0.9871372537564033, + "learning_rate": 0.0001, + "loss": 1.1712, + "loss/crossentropy": 2.520423650741577, + "loss/hidden": 0.8984375, + "loss/logits": 0.1817954033613205, + "loss/reg": 0.009094218723475933, + "step": 2322 + }, + { + "epoch": 0.290375, + "grad_norm": 3.149522066116333, + "grad_norm_var": 0.9181467808823581, + "learning_rate": 0.0001, + "loss": 1.1428, + "loss/crossentropy": 2.331247329711914, + "loss/hidden": 0.90234375, + "loss/logits": 0.14953988790512085, + "loss/reg": 0.009089522995054722, + "step": 2323 + }, + { + "epoch": 0.2905, + "grad_norm": 3.0558767318725586, + "grad_norm_var": 0.9074011819268397, + "learning_rate": 0.0001, + "loss": 1.1503, + "loss/crossentropy": 2.3818721771240234, + "loss/hidden": 0.90625, + "loss/logits": 0.15320459008216858, + "loss/reg": 0.009084295481443405, + "step": 2324 + }, + { + "epoch": 0.290625, + "grad_norm": 3.1917977333068848, + "grad_norm_var": 0.8859607731530803, + "learning_rate": 0.0001, + "loss": 1.163, + "loss/crossentropy": 2.487668514251709, + "loss/hidden": 0.921875, + "loss/logits": 0.15031218528747559, + "loss/reg": 0.009079117327928543, + "step": 2325 + }, + { + "epoch": 0.29075, + "grad_norm": 4.293625831604004, + "grad_norm_var": 0.902041865877396, + "learning_rate": 0.0001, + "loss": 1.3669, + "loss/crossentropy": 2.3357431888580322, + "loss/hidden": 1.09375, + "loss/logits": 0.18243734538555145, + "loss/reg": 0.009074253961443901, + "step": 2326 + }, + { + "epoch": 0.290875, + "grad_norm": 3.2660837173461914, + "grad_norm_var": 0.8547383377829303, + "learning_rate": 0.0001, + "loss": 1.0667, + "loss/crossentropy": 2.54026460647583, + "loss/hidden": 0.8359375, + "loss/logits": 0.14005467295646667, + "loss/reg": 0.009069058112800121, + "step": 2327 + }, + { + "epoch": 0.291, + "grad_norm": 3.146923303604126, + "grad_norm_var": 0.8584495384425639, + "learning_rate": 0.0001, + "loss": 1.1888, + "loss/crossentropy": 2.6916675567626953, + "loss/hidden": 0.91796875, + "loss/logits": 0.1802411675453186, + "loss/reg": 0.009063874371349812, + "step": 2328 + }, + { + "epoch": 0.291125, + "grad_norm": 5.241123676300049, + "grad_norm_var": 0.7277601070818565, + "learning_rate": 0.0001, + "loss": 1.4183, + "loss/crossentropy": 2.718595266342163, + "loss/hidden": 1.0546875, + "loss/logits": 0.27306067943573, + "loss/reg": 0.009058966301381588, + "step": 2329 + }, + { + "epoch": 0.29125, + "grad_norm": 4.592714786529541, + "grad_norm_var": 0.7682818734868221, + "learning_rate": 0.0001, + "loss": 1.1957, + "loss/crossentropy": 2.4723165035247803, + "loss/hidden": 0.953125, + "loss/logits": 0.15208083391189575, + "loss/reg": 0.009053805842995644, + "step": 2330 + }, + { + "epoch": 0.291375, + "grad_norm": 2.8367936611175537, + "grad_norm_var": 0.7155191330011945, + "learning_rate": 0.0001, + "loss": 1.1185, + "loss/crossentropy": 2.621317148208618, + "loss/hidden": 0.87109375, + "loss/logits": 0.15695253014564514, + "loss/reg": 0.009048878215253353, + "step": 2331 + }, + { + "epoch": 0.2915, + "grad_norm": 4.0428924560546875, + "grad_norm_var": 0.6891406696964054, + "learning_rate": 0.0001, + "loss": 1.2482, + "loss/crossentropy": 1.921502947807312, + "loss/hidden": 1.0078125, + "loss/logits": 0.1499861478805542, + "loss/reg": 0.009043702855706215, + "step": 2332 + }, + { + "epoch": 0.291625, + "grad_norm": 3.5794973373413086, + "grad_norm_var": 0.6021701583706902, + "learning_rate": 0.0001, + "loss": 1.4262, + "loss/crossentropy": 2.220508575439453, + "loss/hidden": 1.109375, + "loss/logits": 0.2264639139175415, + "loss/reg": 0.009038531221449375, + "step": 2333 + }, + { + "epoch": 0.29175, + "grad_norm": 2.2717108726501465, + "grad_norm_var": 0.7454110365554537, + "learning_rate": 0.0001, + "loss": 1.0648, + "loss/crossentropy": 2.454874277114868, + "loss/hidden": 0.828125, + "loss/logits": 0.1463043987751007, + "loss/reg": 0.00903335865586996, + "step": 2334 + }, + { + "epoch": 0.291875, + "grad_norm": 2.4929463863372803, + "grad_norm_var": 0.6349087948373833, + "learning_rate": 0.0001, + "loss": 1.1014, + "loss/crossentropy": 2.291163206100464, + "loss/hidden": 0.8515625, + "loss/logits": 0.15958374738693237, + "loss/reg": 0.009028360247612, + "step": 2335 + }, + { + "epoch": 0.292, + "grad_norm": 2.856205701828003, + "grad_norm_var": 0.6592890982725443, + "learning_rate": 0.0001, + "loss": 1.2619, + "loss/crossentropy": 2.4762227535247803, + "loss/hidden": 1.0, + "loss/logits": 0.1716938018798828, + "loss/reg": 0.009023264981806278, + "step": 2336 + }, + { + "epoch": 0.292125, + "grad_norm": 3.6141278743743896, + "grad_norm_var": 0.6060556206100122, + "learning_rate": 0.0001, + "loss": 1.1967, + "loss/crossentropy": 2.5191256999969482, + "loss/hidden": 0.9609375, + "loss/logits": 0.14553788304328918, + "loss/reg": 0.009018037468194962, + "step": 2337 + }, + { + "epoch": 0.29225, + "grad_norm": 3.189410924911499, + "grad_norm_var": 0.6100463683687468, + "learning_rate": 0.0001, + "loss": 1.2385, + "loss/crossentropy": 2.3111839294433594, + "loss/hidden": 0.98046875, + "loss/logits": 0.16792058944702148, + "loss/reg": 0.009012858383357525, + "step": 2338 + }, + { + "epoch": 0.292375, + "grad_norm": 3.147862434387207, + "grad_norm_var": 0.6101077933754292, + "learning_rate": 0.0001, + "loss": 1.0817, + "loss/crossentropy": 2.478745460510254, + "loss/hidden": 0.85546875, + "loss/logits": 0.13619570434093475, + "loss/reg": 0.009007743559777737, + "step": 2339 + }, + { + "epoch": 0.2925, + "grad_norm": 2.918719530105591, + "grad_norm_var": 0.6180563329417734, + "learning_rate": 0.0001, + "loss": 1.1284, + "loss/crossentropy": 2.4649863243103027, + "loss/hidden": 0.8828125, + "loss/logits": 0.15553182363510132, + "loss/reg": 0.009002809412777424, + "step": 2340 + }, + { + "epoch": 0.292625, + "grad_norm": 2.9358303546905518, + "grad_norm_var": 0.6298594747547933, + "learning_rate": 0.0001, + "loss": 1.2582, + "loss/crossentropy": 2.5478312969207764, + "loss/hidden": 0.97265625, + "loss/logits": 0.19560518860816956, + "loss/reg": 0.008997694589197636, + "step": 2341 + }, + { + "epoch": 0.29275, + "grad_norm": 2.726518154144287, + "grad_norm_var": 0.5969732385771968, + "learning_rate": 0.0001, + "loss": 1.1871, + "loss/crossentropy": 2.858321189880371, + "loss/hidden": 0.921875, + "loss/logits": 0.17527028918266296, + "loss/reg": 0.008992422372102737, + "step": 2342 + }, + { + "epoch": 0.292875, + "grad_norm": 3.0482592582702637, + "grad_norm_var": 0.6010314990953247, + "learning_rate": 0.0001, + "loss": 1.1788, + "loss/crossentropy": 2.6980865001678467, + "loss/hidden": 0.91796875, + "loss/logits": 0.17095425724983215, + "loss/reg": 0.00898739229887724, + "step": 2343 + }, + { + "epoch": 0.293, + "grad_norm": 2.3009376525878906, + "grad_norm_var": 0.6619118429629942, + "learning_rate": 0.0001, + "loss": 1.0527, + "loss/crossentropy": 2.2809457778930664, + "loss/hidden": 0.81640625, + "loss/logits": 0.14645175635814667, + "loss/reg": 0.008982405066490173, + "step": 2344 + }, + { + "epoch": 0.293125, + "grad_norm": 2.642392635345459, + "grad_norm_var": 0.38965264636329733, + "learning_rate": 0.0001, + "loss": 1.1133, + "loss/crossentropy": 2.4677882194519043, + "loss/hidden": 0.8671875, + "loss/logits": 0.1563827246427536, + "loss/reg": 0.008977506309747696, + "step": 2345 + }, + { + "epoch": 0.29325, + "grad_norm": 3.3558998107910156, + "grad_norm_var": 0.23494250932546165, + "learning_rate": 0.0001, + "loss": 1.1909, + "loss/crossentropy": 2.322314500808716, + "loss/hidden": 0.921875, + "loss/logits": 0.1792723536491394, + "loss/reg": 0.008972177281975746, + "step": 2346 + }, + { + "epoch": 0.293375, + "grad_norm": 2.436591386795044, + "grad_norm_var": 0.25352798139956934, + "learning_rate": 0.0001, + "loss": 1.031, + "loss/crossentropy": 2.71860671043396, + "loss/hidden": 0.8046875, + "loss/logits": 0.13668766617774963, + "loss/reg": 0.008967115543782711, + "step": 2347 + }, + { + "epoch": 0.2935, + "grad_norm": 2.4685699939727783, + "grad_norm_var": 0.18374537654962828, + "learning_rate": 0.0001, + "loss": 1.1872, + "loss/crossentropy": 2.3281967639923096, + "loss/hidden": 0.921875, + "loss/logits": 0.17572277784347534, + "loss/reg": 0.008961874060332775, + "step": 2348 + }, + { + "epoch": 0.293625, + "grad_norm": 5.025516033172607, + "grad_norm_var": 0.45043481318182993, + "learning_rate": 0.0001, + "loss": 1.1278, + "loss/crossentropy": 2.5042476654052734, + "loss/hidden": 0.890625, + "loss/logits": 0.14763936400413513, + "loss/reg": 0.008956575766205788, + "step": 2349 + }, + { + "epoch": 0.29375, + "grad_norm": 2.8866448402404785, + "grad_norm_var": 0.41726875813518044, + "learning_rate": 0.0001, + "loss": 1.1281, + "loss/crossentropy": 2.734729051589966, + "loss/hidden": 0.875, + "loss/logits": 0.16356751322746277, + "loss/reg": 0.008951269090175629, + "step": 2350 + }, + { + "epoch": 0.293875, + "grad_norm": 3.3262367248535156, + "grad_norm_var": 0.40400823919407547, + "learning_rate": 0.0001, + "loss": 1.3049, + "loss/crossentropy": 2.235628843307495, + "loss/hidden": 1.0390625, + "loss/logits": 0.17634913325309753, + "loss/reg": 0.008946053683757782, + "step": 2351 + }, + { + "epoch": 0.294, + "grad_norm": 2.7679591178894043, + "grad_norm_var": 0.4068338076394326, + "learning_rate": 0.0001, + "loss": 1.1585, + "loss/crossentropy": 2.5536861419677734, + "loss/hidden": 0.8984375, + "loss/logits": 0.17063967883586884, + "loss/reg": 0.008940762840211391, + "step": 2352 + }, + { + "epoch": 0.294125, + "grad_norm": 2.263996124267578, + "grad_norm_var": 0.41911346812872025, + "learning_rate": 0.0001, + "loss": 0.997, + "loss/crossentropy": 2.571012020111084, + "loss/hidden": 0.76171875, + "loss/logits": 0.14592288434505463, + "loss/reg": 0.008935695514082909, + "step": 2353 + }, + { + "epoch": 0.29425, + "grad_norm": 2.7407052516937256, + "grad_norm_var": 0.41827611875495807, + "learning_rate": 0.0001, + "loss": 1.1862, + "loss/crossentropy": 2.538874626159668, + "loss/hidden": 0.91796875, + "loss/logits": 0.17889195680618286, + "loss/reg": 0.008930731564760208, + "step": 2354 + }, + { + "epoch": 0.294375, + "grad_norm": 3.768303632736206, + "grad_norm_var": 0.4597757172396683, + "learning_rate": 0.0001, + "loss": 1.4307, + "loss/crossentropy": 2.420802354812622, + "loss/hidden": 1.09375, + "loss/logits": 0.2476983219385147, + "loss/reg": 0.008925938047468662, + "step": 2355 + }, + { + "epoch": 0.2945, + "grad_norm": 3.269245147705078, + "grad_norm_var": 0.4647864053235705, + "learning_rate": 0.0001, + "loss": 1.2852, + "loss/crossentropy": 2.3985133171081543, + "loss/hidden": 0.98828125, + "loss/logits": 0.20766450464725494, + "loss/reg": 0.008921150118112564, + "step": 2356 + }, + { + "epoch": 0.294625, + "grad_norm": 5.935306072235107, + "grad_norm_var": 1.0023361322098927, + "learning_rate": 0.0001, + "loss": 1.1544, + "loss/crossentropy": 2.304121971130371, + "loss/hidden": 0.91796875, + "loss/logits": 0.1472860872745514, + "loss/reg": 0.008916064165532589, + "step": 2357 + }, + { + "epoch": 0.29475, + "grad_norm": 3.042843818664551, + "grad_norm_var": 0.9892446021119061, + "learning_rate": 0.0001, + "loss": 1.1037, + "loss/crossentropy": 2.3063137531280518, + "loss/hidden": 0.85546875, + "loss/logits": 0.15909211337566376, + "loss/reg": 0.008910427801311016, + "step": 2358 + }, + { + "epoch": 0.294875, + "grad_norm": 3.300213575363159, + "grad_norm_var": 0.9879478730468897, + "learning_rate": 0.0001, + "loss": 1.107, + "loss/crossentropy": 2.5074150562286377, + "loss/hidden": 0.84375, + "loss/logits": 0.174173504114151, + "loss/reg": 0.008904799818992615, + "step": 2359 + }, + { + "epoch": 0.295, + "grad_norm": 3.961124897003174, + "grad_norm_var": 0.9566124607729002, + "learning_rate": 0.0001, + "loss": 1.5107, + "loss/crossentropy": 2.059396505355835, + "loss/hidden": 1.21875, + "loss/logits": 0.20293757319450378, + "loss/reg": 0.008899876847863197, + "step": 2360 + }, + { + "epoch": 0.295125, + "grad_norm": 3.080355405807495, + "grad_norm_var": 0.9287706341665739, + "learning_rate": 0.0001, + "loss": 1.1939, + "loss/crossentropy": 2.342959403991699, + "loss/hidden": 0.92578125, + "loss/logits": 0.17919732630252838, + "loss/reg": 0.00889492779970169, + "step": 2361 + }, + { + "epoch": 0.29525, + "grad_norm": 2.267551898956299, + "grad_norm_var": 1.0022134776908758, + "learning_rate": 0.0001, + "loss": 1.0108, + "loss/crossentropy": 2.606346368789673, + "loss/hidden": 0.7890625, + "loss/logits": 0.1328810602426529, + "loss/reg": 0.008889498189091682, + "step": 2362 + }, + { + "epoch": 0.295375, + "grad_norm": 3.204789876937866, + "grad_norm_var": 0.9523176218416741, + "learning_rate": 0.0001, + "loss": 1.2298, + "loss/crossentropy": 2.4286961555480957, + "loss/hidden": 0.95703125, + "loss/logits": 0.18392913043498993, + "loss/reg": 0.008884013630449772, + "step": 2363 + }, + { + "epoch": 0.2955, + "grad_norm": 3.0771589279174805, + "grad_norm_var": 0.9054165863050909, + "learning_rate": 0.0001, + "loss": 1.1545, + "loss/crossentropy": 2.372408390045166, + "loss/hidden": 0.91796875, + "loss/logits": 0.1477205753326416, + "loss/reg": 0.00887845829129219, + "step": 2364 + }, + { + "epoch": 0.295625, + "grad_norm": 4.011463165283203, + "grad_norm_var": 0.745830787947784, + "learning_rate": 0.0001, + "loss": 1.384, + "loss/crossentropy": 2.7621638774871826, + "loss/hidden": 1.0859375, + "loss/logits": 0.20936976373195648, + "loss/reg": 0.008873477578163147, + "step": 2365 + }, + { + "epoch": 0.29575, + "grad_norm": 3.22518253326416, + "grad_norm_var": 0.7340424869988905, + "learning_rate": 0.0001, + "loss": 1.1156, + "loss/crossentropy": 2.4410455226898193, + "loss/hidden": 0.88671875, + "loss/logits": 0.1401636302471161, + "loss/reg": 0.008867956697940826, + "step": 2366 + }, + { + "epoch": 0.295875, + "grad_norm": 3.216606855392456, + "grad_norm_var": 0.734814347600376, + "learning_rate": 0.0001, + "loss": 1.0835, + "loss/crossentropy": 2.5422208309173584, + "loss/hidden": 0.83203125, + "loss/logits": 0.16287311911582947, + "loss/reg": 0.008862389251589775, + "step": 2367 + }, + { + "epoch": 0.296, + "grad_norm": 3.1624083518981934, + "grad_norm_var": 0.7154630259459499, + "learning_rate": 0.0001, + "loss": 1.15, + "loss/crossentropy": 2.380965232849121, + "loss/hidden": 0.90234375, + "loss/logits": 0.15905311703681946, + "loss/reg": 0.008856795728206635, + "step": 2368 + }, + { + "epoch": 0.296125, + "grad_norm": 6.016289710998535, + "grad_norm_var": 1.054385328008227, + "learning_rate": 0.0001, + "loss": 1.1631, + "loss/crossentropy": 2.290355682373047, + "loss/hidden": 0.94921875, + "loss/logits": 0.12541013956069946, + "loss/reg": 0.008851776830852032, + "step": 2369 + }, + { + "epoch": 0.29625, + "grad_norm": 4.912112236022949, + "grad_norm_var": 1.1060881077813365, + "learning_rate": 0.0001, + "loss": 1.6878, + "loss/crossentropy": 2.5137951374053955, + "loss/hidden": 1.3359375, + "loss/logits": 0.26338279247283936, + "loss/reg": 0.00884647760540247, + "step": 2370 + }, + { + "epoch": 0.296375, + "grad_norm": 3.976901054382324, + "grad_norm_var": 1.1102711513024133, + "learning_rate": 0.0001, + "loss": 1.2745, + "loss/crossentropy": 2.1904850006103516, + "loss/hidden": 1.0, + "loss/logits": 0.18603858351707458, + "loss/reg": 0.008841556496918201, + "step": 2371 + }, + { + "epoch": 0.2965, + "grad_norm": 35.77780532836914, + "grad_norm_var": 65.16908526388644, + "learning_rate": 0.0001, + "loss": 1.2545, + "loss/crossentropy": 2.708794593811035, + "loss/hidden": 0.984375, + "loss/logits": 0.18171827495098114, + "loss/reg": 0.00883670337498188, + "step": 2372 + }, + { + "epoch": 0.296625, + "grad_norm": 2.7593460083007812, + "grad_norm_var": 65.72548480490148, + "learning_rate": 0.0001, + "loss": 1.1809, + "loss/crossentropy": 2.297833204269409, + "loss/hidden": 0.93359375, + "loss/logits": 0.1589377224445343, + "loss/reg": 0.008831853047013283, + "step": 2373 + }, + { + "epoch": 0.29675, + "grad_norm": 2.9837350845336914, + "grad_norm_var": 65.74555713013, + "learning_rate": 0.0001, + "loss": 1.2632, + "loss/crossentropy": 2.3337066173553467, + "loss/hidden": 1.0, + "loss/logits": 0.17494305968284607, + "loss/reg": 0.008827249519526958, + "step": 2374 + }, + { + "epoch": 0.296875, + "grad_norm": 3.603537082672119, + "grad_norm_var": 65.65998274040095, + "learning_rate": 0.0001, + "loss": 1.1356, + "loss/crossentropy": 2.829282760620117, + "loss/hidden": 0.890625, + "loss/logits": 0.15676194429397583, + "loss/reg": 0.008822724223136902, + "step": 2375 + }, + { + "epoch": 0.297, + "grad_norm": 3.626814126968384, + "grad_norm_var": 65.7390074011569, + "learning_rate": 0.0001, + "loss": 1.134, + "loss/crossentropy": 2.5433101654052734, + "loss/hidden": 0.890625, + "loss/logits": 0.15515559911727905, + "loss/reg": 0.008817736990749836, + "step": 2376 + }, + { + "epoch": 0.297125, + "grad_norm": 2.4402549266815186, + "grad_norm_var": 65.97593592476306, + "learning_rate": 0.0001, + "loss": 1.0986, + "loss/crossentropy": 2.476583242416382, + "loss/hidden": 0.85546875, + "loss/logits": 0.15500102937221527, + "loss/reg": 0.008813051506876945, + "step": 2377 + }, + { + "epoch": 0.29725, + "grad_norm": 2.6527156829833984, + "grad_norm_var": 65.81836414195753, + "learning_rate": 0.0001, + "loss": 1.0888, + "loss/crossentropy": 2.5548741817474365, + "loss/hidden": 0.859375, + "loss/logits": 0.14137858152389526, + "loss/reg": 0.008808017708361149, + "step": 2378 + }, + { + "epoch": 0.297375, + "grad_norm": 3.9048824310302734, + "grad_norm_var": 65.63097393400726, + "learning_rate": 0.0001, + "loss": 1.2494, + "loss/crossentropy": 2.3402023315429688, + "loss/hidden": 1.015625, + "loss/logits": 0.1457948386669159, + "loss/reg": 0.008802973665297031, + "step": 2379 + }, + { + "epoch": 0.2975, + "grad_norm": 3.9777634143829346, + "grad_norm_var": 65.38061986856799, + "learning_rate": 0.0001, + "loss": 1.1562, + "loss/crossentropy": 2.555475950241089, + "loss/hidden": 0.875, + "loss/logits": 0.19325734674930573, + "loss/reg": 0.00879788026213646, + "step": 2380 + }, + { + "epoch": 0.297625, + "grad_norm": 3.3086934089660645, + "grad_norm_var": 65.5641316783922, + "learning_rate": 0.0001, + "loss": 1.1131, + "loss/crossentropy": 2.415076732635498, + "loss/hidden": 0.859375, + "loss/logits": 0.16574707627296448, + "loss/reg": 0.008793032728135586, + "step": 2381 + }, + { + "epoch": 0.29775, + "grad_norm": 3.237091541290283, + "grad_norm_var": 65.56037509989369, + "learning_rate": 0.0001, + "loss": 1.1574, + "loss/crossentropy": 2.5702967643737793, + "loss/hidden": 0.9140625, + "loss/logits": 0.1554325520992279, + "loss/reg": 0.008788003586232662, + "step": 2382 + }, + { + "epoch": 0.297875, + "grad_norm": 2.826282262802124, + "grad_norm_var": 65.69379676940554, + "learning_rate": 0.0001, + "loss": 1.1833, + "loss/crossentropy": 2.4402315616607666, + "loss/hidden": 0.91796875, + "loss/logits": 0.17750728130340576, + "loss/reg": 0.008783168159425259, + "step": 2383 + }, + { + "epoch": 0.298, + "grad_norm": 3.38002347946167, + "grad_norm_var": 65.62681485931844, + "learning_rate": 0.0001, + "loss": 1.2473, + "loss/crossentropy": 2.584322452545166, + "loss/hidden": 0.984375, + "loss/logits": 0.1751251518726349, + "loss/reg": 0.008778119459748268, + "step": 2384 + }, + { + "epoch": 0.298125, + "grad_norm": 3.2768285274505615, + "grad_norm_var": 65.93887535864025, + "learning_rate": 0.0001, + "loss": 1.2257, + "loss/crossentropy": 2.2270596027374268, + "loss/hidden": 0.97265625, + "loss/logits": 0.16530007123947144, + "loss/reg": 0.008773105219006538, + "step": 2385 + }, + { + "epoch": 0.29825, + "grad_norm": 2.898982048034668, + "grad_norm_var": 66.3272327862756, + "learning_rate": 0.0001, + "loss": 1.0157, + "loss/crossentropy": 2.732396125793457, + "loss/hidden": 0.8046875, + "loss/logits": 0.12332381308078766, + "loss/reg": 0.008768080733716488, + "step": 2386 + }, + { + "epoch": 0.298375, + "grad_norm": 2.806828022003174, + "grad_norm_var": 66.61757458451252, + "learning_rate": 0.0001, + "loss": 1.2338, + "loss/crossentropy": 2.242908000946045, + "loss/hidden": 0.98828125, + "loss/logits": 0.1578962206840515, + "loss/reg": 0.008763030171394348, + "step": 2387 + }, + { + "epoch": 0.2985, + "grad_norm": 4.9925737380981445, + "grad_norm_var": 0.4051949046023232, + "learning_rate": 0.0001, + "loss": 1.4744, + "loss/crossentropy": 2.644155979156494, + "loss/hidden": 1.1875, + "loss/logits": 0.1993318796157837, + "loss/reg": 0.008757824078202248, + "step": 2388 + }, + { + "epoch": 0.298625, + "grad_norm": 3.5517148971557617, + "grad_norm_var": 0.38813223773458744, + "learning_rate": 0.0001, + "loss": 1.0211, + "loss/crossentropy": 2.775987148284912, + "loss/hidden": 0.796875, + "loss/logits": 0.13671258091926575, + "loss/reg": 0.008752482011914253, + "step": 2389 + }, + { + "epoch": 0.29875, + "grad_norm": 2.667902708053589, + "grad_norm_var": 0.4094448753401328, + "learning_rate": 0.0001, + "loss": 1.159, + "loss/crossentropy": 2.4226431846618652, + "loss/hidden": 0.8984375, + "loss/logits": 0.1731385439634323, + "loss/reg": 0.008747127838432789, + "step": 2390 + }, + { + "epoch": 0.298875, + "grad_norm": 2.527306318283081, + "grad_norm_var": 0.441445033967585, + "learning_rate": 0.0001, + "loss": 1.1519, + "loss/crossentropy": 2.367466449737549, + "loss/hidden": 0.90625, + "loss/logits": 0.15824103355407715, + "loss/reg": 0.008741697296500206, + "step": 2391 + }, + { + "epoch": 0.299, + "grad_norm": 3.333625316619873, + "grad_norm_var": 0.4322744485526821, + "learning_rate": 0.0001, + "loss": 1.2069, + "loss/crossentropy": 2.685943603515625, + "loss/hidden": 0.95703125, + "loss/logits": 0.1624826192855835, + "loss/reg": 0.008736291900277138, + "step": 2392 + }, + { + "epoch": 0.299125, + "grad_norm": 3.2942020893096924, + "grad_norm_var": 0.3871946762074041, + "learning_rate": 0.0001, + "loss": 1.4577, + "loss/crossentropy": 2.4284005165100098, + "loss/hidden": 1.15625, + "loss/logits": 0.21419063210487366, + "loss/reg": 0.008730761706829071, + "step": 2393 + }, + { + "epoch": 0.29925, + "grad_norm": 2.5786232948303223, + "grad_norm_var": 0.39383190806880275, + "learning_rate": 0.0001, + "loss": 1.0937, + "loss/crossentropy": 2.839616298675537, + "loss/hidden": 0.84765625, + "loss/logits": 0.15874697268009186, + "loss/reg": 0.008725869469344616, + "step": 2394 + }, + { + "epoch": 0.299375, + "grad_norm": 2.647444486618042, + "grad_norm_var": 0.3887601283599108, + "learning_rate": 0.0001, + "loss": 1.2557, + "loss/crossentropy": 2.545048952102661, + "loss/hidden": 0.96875, + "loss/logits": 0.19971713423728943, + "loss/reg": 0.008720939978957176, + "step": 2395 + }, + { + "epoch": 0.2995, + "grad_norm": 2.833951950073242, + "grad_norm_var": 0.3529231512408728, + "learning_rate": 0.0001, + "loss": 0.9191, + "loss/crossentropy": 2.49770450592041, + "loss/hidden": 0.73046875, + "loss/logits": 0.10150116682052612, + "loss/reg": 0.00871585588902235, + "step": 2396 + }, + { + "epoch": 0.299625, + "grad_norm": 3.6440744400024414, + "grad_norm_var": 0.3677145116829621, + "learning_rate": 0.0001, + "loss": 1.1543, + "loss/crossentropy": 2.559692621231079, + "loss/hidden": 0.90625, + "loss/logits": 0.16094306111335754, + "loss/reg": 0.008710961788892746, + "step": 2397 + }, + { + "epoch": 0.29975, + "grad_norm": 3.6816117763519287, + "grad_norm_var": 0.3848652555242568, + "learning_rate": 0.0001, + "loss": 1.1773, + "loss/crossentropy": 2.8105576038360596, + "loss/hidden": 0.91015625, + "loss/logits": 0.18004927039146423, + "loss/reg": 0.008705983869731426, + "step": 2398 + }, + { + "epoch": 0.299875, + "grad_norm": 2.779917001724243, + "grad_norm_var": 0.3872102553410227, + "learning_rate": 0.0001, + "loss": 1.046, + "loss/crossentropy": 2.4095442295074463, + "loss/hidden": 0.81640625, + "loss/logits": 0.14253917336463928, + "loss/reg": 0.008701103739440441, + "step": 2399 + }, + { + "epoch": 0.3, + "grad_norm": 3.391683340072632, + "grad_norm_var": 0.3875282017187819, + "learning_rate": 0.0001, + "loss": 1.2043, + "loss/crossentropy": 2.5792393684387207, + "loss/hidden": 0.93359375, + "loss/logits": 0.18373356759548187, + "loss/reg": 0.008696138858795166, + "step": 2400 + }, + { + "epoch": 0.300125, + "grad_norm": 2.7068686485290527, + "grad_norm_var": 0.40060266625553304, + "learning_rate": 0.0001, + "loss": 1.0836, + "loss/crossentropy": 2.502349376678467, + "loss/hidden": 0.84375, + "loss/logits": 0.15291652083396912, + "loss/reg": 0.00869122426956892, + "step": 2401 + }, + { + "epoch": 0.30025, + "grad_norm": 2.9308359622955322, + "grad_norm_var": 0.39961660366601087, + "learning_rate": 0.0001, + "loss": 1.0122, + "loss/crossentropy": 2.70316219329834, + "loss/hidden": 0.8046875, + "loss/logits": 0.1206178367137909, + "loss/reg": 0.00868629477918148, + "step": 2402 + }, + { + "epoch": 0.300375, + "grad_norm": 2.991497039794922, + "grad_norm_var": 0.39334570856519047, + "learning_rate": 0.0001, + "loss": 1.2081, + "loss/crossentropy": 2.3863425254821777, + "loss/hidden": 0.9453125, + "loss/logits": 0.17599254846572876, + "loss/reg": 0.008681208826601505, + "step": 2403 + }, + { + "epoch": 0.3005, + "grad_norm": 4.145664215087891, + "grad_norm_var": 0.2311941149002564, + "learning_rate": 0.0001, + "loss": 1.4019, + "loss/crossentropy": 2.680297613143921, + "loss/hidden": 1.109375, + "loss/logits": 0.20576120913028717, + "loss/reg": 0.008676383644342422, + "step": 2404 + }, + { + "epoch": 0.300625, + "grad_norm": 2.720083236694336, + "grad_norm_var": 0.22507276936436635, + "learning_rate": 0.0001, + "loss": 1.098, + "loss/crossentropy": 2.5008459091186523, + "loss/hidden": 0.85546875, + "loss/logits": 0.15578803420066833, + "loss/reg": 0.008671391755342484, + "step": 2405 + }, + { + "epoch": 0.30075, + "grad_norm": 2.6801791191101074, + "grad_norm_var": 0.22444904835533422, + "learning_rate": 0.0001, + "loss": 1.1841, + "loss/crossentropy": 2.62465238571167, + "loss/hidden": 0.921875, + "loss/logits": 0.17554675042629242, + "loss/reg": 0.00866637658327818, + "step": 2406 + }, + { + "epoch": 0.300875, + "grad_norm": 3.1040406227111816, + "grad_norm_var": 0.2046230383360495, + "learning_rate": 0.0001, + "loss": 1.2496, + "loss/crossentropy": 2.5489933490753174, + "loss/hidden": 0.953125, + "loss/logits": 0.20985206961631775, + "loss/reg": 0.008661621250212193, + "step": 2407 + }, + { + "epoch": 0.301, + "grad_norm": 4.717700958251953, + "grad_norm_var": 0.36903126894011395, + "learning_rate": 0.0001, + "loss": 1.3718, + "loss/crossentropy": 2.5189945697784424, + "loss/hidden": 1.0625, + "loss/logits": 0.2227438986301422, + "loss/reg": 0.008656736463308334, + "step": 2408 + }, + { + "epoch": 0.301125, + "grad_norm": 2.323028564453125, + "grad_norm_var": 0.4129359698772568, + "learning_rate": 0.0001, + "loss": 1.1975, + "loss/crossentropy": 2.590888023376465, + "loss/hidden": 0.9296875, + "loss/logits": 0.1812998652458191, + "loss/reg": 0.00865168496966362, + "step": 2409 + }, + { + "epoch": 0.30125, + "grad_norm": 2.4610395431518555, + "grad_norm_var": 0.4222457712768609, + "learning_rate": 0.0001, + "loss": 1.1327, + "loss/crossentropy": 2.4569079875946045, + "loss/hidden": 0.859375, + "loss/logits": 0.18681609630584717, + "loss/reg": 0.00864657573401928, + "step": 2410 + }, + { + "epoch": 0.301375, + "grad_norm": 3.316208839416504, + "grad_norm_var": 0.4089553254888178, + "learning_rate": 0.0001, + "loss": 1.4334, + "loss/crossentropy": 2.490478992462158, + "loss/hidden": 1.0859375, + "loss/logits": 0.26109689474105835, + "loss/reg": 0.008641259744763374, + "step": 2411 + }, + { + "epoch": 0.3015, + "grad_norm": 24.40239906311035, + "grad_norm_var": 28.569834118645204, + "learning_rate": 0.0001, + "loss": 1.243, + "loss/crossentropy": 2.595289468765259, + "loss/hidden": 0.98046875, + "loss/logits": 0.1762104332447052, + "loss/reg": 0.008636434562504292, + "step": 2412 + }, + { + "epoch": 0.301625, + "grad_norm": 3.0518271923065186, + "grad_norm_var": 28.6593300595834, + "learning_rate": 0.0001, + "loss": 1.2233, + "loss/crossentropy": 2.619058132171631, + "loss/hidden": 0.953125, + "loss/logits": 0.18382637202739716, + "loss/reg": 0.008631410077214241, + "step": 2413 + }, + { + "epoch": 0.30175, + "grad_norm": 2.953193187713623, + "grad_norm_var": 28.768361794208296, + "learning_rate": 0.0001, + "loss": 1.1549, + "loss/crossentropy": 2.505188226699829, + "loss/hidden": 0.91796875, + "loss/logits": 0.15070617198944092, + "loss/reg": 0.00862650852650404, + "step": 2414 + }, + { + "epoch": 0.301875, + "grad_norm": 2.481987714767456, + "grad_norm_var": 28.838951084280588, + "learning_rate": 0.0001, + "loss": 1.098, + "loss/crossentropy": 2.713684320449829, + "loss/hidden": 0.8671875, + "loss/logits": 0.14459127187728882, + "loss/reg": 0.008621657267212868, + "step": 2415 + }, + { + "epoch": 0.302, + "grad_norm": 3.0760812759399414, + "grad_norm_var": 28.887549381853713, + "learning_rate": 0.0001, + "loss": 1.0383, + "loss/crossentropy": 2.6677870750427246, + "loss/hidden": 0.8125, + "loss/logits": 0.13965703547000885, + "loss/reg": 0.008616678416728973, + "step": 2416 + }, + { + "epoch": 0.302125, + "grad_norm": 3.9381103515625, + "grad_norm_var": 28.70780426316846, + "learning_rate": 0.0001, + "loss": 1.4546, + "loss/crossentropy": 2.422744035720825, + "loss/hidden": 1.171875, + "loss/logits": 0.19660133123397827, + "loss/reg": 0.00861173402518034, + "step": 2417 + }, + { + "epoch": 0.30225, + "grad_norm": 2.5658814907073975, + "grad_norm_var": 28.790337682961194, + "learning_rate": 0.0001, + "loss": 1.0233, + "loss/crossentropy": 2.684520721435547, + "loss/hidden": 0.7890625, + "loss/logits": 0.14820876717567444, + "loss/reg": 0.008607473224401474, + "step": 2418 + }, + { + "epoch": 0.302375, + "grad_norm": 3.002297878265381, + "grad_norm_var": 28.78826896565235, + "learning_rate": 0.0001, + "loss": 1.2427, + "loss/crossentropy": 2.604464054107666, + "loss/hidden": 0.98046875, + "loss/logits": 0.17621472477912903, + "loss/reg": 0.008603207767009735, + "step": 2419 + }, + { + "epoch": 0.3025, + "grad_norm": 2.300240993499756, + "grad_norm_var": 29.07199924850674, + "learning_rate": 0.0001, + "loss": 0.9842, + "loss/crossentropy": 2.5724611282348633, + "loss/hidden": 0.76171875, + "loss/logits": 0.13653458654880524, + "loss/reg": 0.008598416112363338, + "step": 2420 + }, + { + "epoch": 0.302625, + "grad_norm": 3.1282362937927246, + "grad_norm_var": 28.995430346148794, + "learning_rate": 0.0001, + "loss": 1.0806, + "loss/crossentropy": 2.722224235534668, + "loss/hidden": 0.85546875, + "loss/logits": 0.13924378156661987, + "loss/reg": 0.00859353318810463, + "step": 2421 + }, + { + "epoch": 0.30275, + "grad_norm": 3.070507764816284, + "grad_norm_var": 28.918366062590824, + "learning_rate": 0.0001, + "loss": 1.162, + "loss/crossentropy": 2.472001314163208, + "loss/hidden": 0.9140625, + "loss/logits": 0.1620345413684845, + "loss/reg": 0.008588820695877075, + "step": 2422 + }, + { + "epoch": 0.302875, + "grad_norm": 6.976761341094971, + "grad_norm_var": 29.202923047719196, + "learning_rate": 0.0001, + "loss": 1.3661, + "loss/crossentropy": 2.518024206161499, + "loss/hidden": 1.0703125, + "loss/logits": 0.20996594429016113, + "loss/reg": 0.008584471419453621, + "step": 2423 + }, + { + "epoch": 0.303, + "grad_norm": 2.7494466304779053, + "grad_norm_var": 29.416875484658267, + "learning_rate": 0.0001, + "loss": 1.3766, + "loss/crossentropy": 2.06559157371521, + "loss/hidden": 1.0703125, + "loss/logits": 0.22048631310462952, + "loss/reg": 0.008580142632126808, + "step": 2424 + }, + { + "epoch": 0.303125, + "grad_norm": 2.6227638721466064, + "grad_norm_var": 29.335994968462636, + "learning_rate": 0.0001, + "loss": 1.1898, + "loss/crossentropy": 2.5547640323638916, + "loss/hidden": 0.92578125, + "loss/logits": 0.17826434969902039, + "loss/reg": 0.00857546366751194, + "step": 2425 + }, + { + "epoch": 0.30325, + "grad_norm": 3.027575969696045, + "grad_norm_var": 29.20157793375158, + "learning_rate": 0.0001, + "loss": 1.2628, + "loss/crossentropy": 2.7030017375946045, + "loss/hidden": 0.984375, + "loss/logits": 0.1927398443222046, + "loss/reg": 0.008570637553930283, + "step": 2426 + }, + { + "epoch": 0.303375, + "grad_norm": 2.526233673095703, + "grad_norm_var": 29.36963851575393, + "learning_rate": 0.0001, + "loss": 1.0245, + "loss/crossentropy": 2.325460910797119, + "loss/hidden": 0.8046875, + "loss/logits": 0.13416746258735657, + "loss/reg": 0.008566015399992466, + "step": 2427 + }, + { + "epoch": 0.3035, + "grad_norm": 2.5091707706451416, + "grad_norm_var": 1.2066223739803374, + "learning_rate": 0.0001, + "loss": 1.1117, + "loss/crossentropy": 2.417747735977173, + "loss/hidden": 0.87890625, + "loss/logits": 0.1472015380859375, + "loss/reg": 0.00856158696115017, + "step": 2428 + }, + { + "epoch": 0.303625, + "grad_norm": 2.7826859951019287, + "grad_norm_var": 1.2137313805283818, + "learning_rate": 0.0001, + "loss": 1.3235, + "loss/crossentropy": 2.36360502243042, + "loss/hidden": 1.046875, + "loss/logits": 0.19105210900306702, + "loss/reg": 0.008557192981243134, + "step": 2429 + }, + { + "epoch": 0.30375, + "grad_norm": 2.525635242462158, + "grad_norm_var": 1.2339219806930863, + "learning_rate": 0.0001, + "loss": 1.1475, + "loss/crossentropy": 2.5855934619903564, + "loss/hidden": 0.89453125, + "loss/logits": 0.16740617156028748, + "loss/reg": 0.008552337065339088, + "step": 2430 + }, + { + "epoch": 0.303875, + "grad_norm": 2.8171823024749756, + "grad_norm_var": 1.2142073590231213, + "learning_rate": 0.0001, + "loss": 0.9686, + "loss/crossentropy": 2.630697011947632, + "loss/hidden": 0.765625, + "loss/logits": 0.1175055131316185, + "loss/reg": 0.008547664619982243, + "step": 2431 + }, + { + "epoch": 0.304, + "grad_norm": 2.2007381916046143, + "grad_norm_var": 1.2650252891804779, + "learning_rate": 0.0001, + "loss": 0.9862, + "loss/crossentropy": 2.338225841522217, + "loss/hidden": 0.78125, + "loss/logits": 0.11953779309988022, + "loss/reg": 0.008542955853044987, + "step": 2432 + }, + { + "epoch": 0.304125, + "grad_norm": 2.927119016647339, + "grad_norm_var": 1.208714235715221, + "learning_rate": 0.0001, + "loss": 1.2864, + "loss/crossentropy": 2.4550628662109375, + "loss/hidden": 1.015625, + "loss/logits": 0.18543177843093872, + "loss/reg": 0.008538227528333664, + "step": 2433 + }, + { + "epoch": 0.30425, + "grad_norm": 5.459600448608398, + "grad_norm_var": 1.5710203551513502, + "learning_rate": 0.0001, + "loss": 1.4985, + "loss/crossentropy": 2.7933189868927, + "loss/hidden": 1.203125, + "loss/logits": 0.20999671518802643, + "loss/reg": 0.008533895947039127, + "step": 2434 + }, + { + "epoch": 0.304375, + "grad_norm": 3.3298285007476807, + "grad_norm_var": 1.5706574767047052, + "learning_rate": 0.0001, + "loss": 1.1879, + "loss/crossentropy": 2.5396249294281006, + "loss/hidden": 0.91796875, + "loss/logits": 0.1846163272857666, + "loss/reg": 0.00852968730032444, + "step": 2435 + }, + { + "epoch": 0.3045, + "grad_norm": 3.0491790771484375, + "grad_norm_var": 1.517402764578537, + "learning_rate": 0.0001, + "loss": 1.0259, + "loss/crossentropy": 2.7251358032226562, + "loss/hidden": 0.82421875, + "loss/logits": 0.11642439663410187, + "loss/reg": 0.008525633253157139, + "step": 2436 + }, + { + "epoch": 0.304625, + "grad_norm": 3.4224965572357178, + "grad_norm_var": 1.518766336197973, + "learning_rate": 0.0001, + "loss": 1.2202, + "loss/crossentropy": 2.2464852333068848, + "loss/hidden": 0.98828125, + "loss/logits": 0.1466951072216034, + "loss/reg": 0.008520832285284996, + "step": 2437 + }, + { + "epoch": 0.30475, + "grad_norm": 3.422363758087158, + "grad_norm_var": 1.5180922939771901, + "learning_rate": 0.0001, + "loss": 1.1578, + "loss/crossentropy": 2.6844217777252197, + "loss/hidden": 0.8984375, + "loss/logits": 0.1742226779460907, + "loss/reg": 0.00851602852344513, + "step": 2438 + }, + { + "epoch": 0.304875, + "grad_norm": 3.5305891036987305, + "grad_norm_var": 0.5579568795111525, + "learning_rate": 0.0001, + "loss": 1.5565, + "loss/crossentropy": 2.3324625492095947, + "loss/hidden": 1.234375, + "loss/logits": 0.23700308799743652, + "loss/reg": 0.008511193096637726, + "step": 2439 + }, + { + "epoch": 0.305, + "grad_norm": 3.407106876373291, + "grad_norm_var": 0.5580719087930409, + "learning_rate": 0.0001, + "loss": 1.2137, + "loss/crossentropy": 2.3598814010620117, + "loss/hidden": 0.96484375, + "loss/logits": 0.16381308436393738, + "loss/reg": 0.00850673858076334, + "step": 2440 + }, + { + "epoch": 0.305125, + "grad_norm": 3.357808828353882, + "grad_norm_var": 0.545311465691474, + "learning_rate": 0.0001, + "loss": 1.0932, + "loss/crossentropy": 2.851924180984497, + "loss/hidden": 0.8671875, + "loss/logits": 0.14103174209594727, + "loss/reg": 0.008502088487148285, + "step": 2441 + }, + { + "epoch": 0.30525, + "grad_norm": 10.613187789916992, + "grad_norm_var": 4.024451685723701, + "learning_rate": 0.0001, + "loss": 1.6528, + "loss/crossentropy": 2.9296388626098633, + "loss/hidden": 1.3203125, + "loss/logits": 0.24747411906719208, + "loss/reg": 0.008497673086822033, + "step": 2442 + }, + { + "epoch": 0.305375, + "grad_norm": 2.7130720615386963, + "grad_norm_var": 3.999446637959761, + "learning_rate": 0.0001, + "loss": 1.0823, + "loss/crossentropy": 2.4702353477478027, + "loss/hidden": 0.859375, + "loss/logits": 0.1380041539669037, + "loss/reg": 0.00849311612546444, + "step": 2443 + }, + { + "epoch": 0.3055, + "grad_norm": 3.434694528579712, + "grad_norm_var": 3.9147642682923105, + "learning_rate": 0.0001, + "loss": 1.2748, + "loss/crossentropy": 2.3088977336883545, + "loss/hidden": 1.015625, + "loss/logits": 0.17431774735450745, + "loss/reg": 0.0084883077070117, + "step": 2444 + }, + { + "epoch": 0.305625, + "grad_norm": 2.297743320465088, + "grad_norm_var": 3.9879396221243004, + "learning_rate": 0.0001, + "loss": 1.186, + "loss/crossentropy": 2.1842119693756104, + "loss/hidden": 0.94140625, + "loss/logits": 0.15974247455596924, + "loss/reg": 0.008483455516397953, + "step": 2445 + }, + { + "epoch": 0.30575, + "grad_norm": 2.785144090652466, + "grad_norm_var": 3.953010022320063, + "learning_rate": 0.0001, + "loss": 1.131, + "loss/crossentropy": 2.2691195011138916, + "loss/hidden": 0.88671875, + "loss/logits": 0.1594940721988678, + "loss/reg": 0.008478758856654167, + "step": 2446 + }, + { + "epoch": 0.305875, + "grad_norm": 15.622485160827637, + "grad_norm_var": 12.740311243628318, + "learning_rate": 0.0001, + "loss": 1.673, + "loss/crossentropy": 2.8026766777038574, + "loss/hidden": 1.375, + "loss/logits": 0.21322891116142273, + "loss/reg": 0.008473950438201427, + "step": 2447 + }, + { + "epoch": 0.306, + "grad_norm": 3.7892706394195557, + "grad_norm_var": 12.416682799270726, + "learning_rate": 0.0001, + "loss": 1.3284, + "loss/crossentropy": 2.1243934631347656, + "loss/hidden": 1.0390625, + "loss/logits": 0.20460397005081177, + "loss/reg": 0.008469190448522568, + "step": 2448 + }, + { + "epoch": 0.306125, + "grad_norm": 3.279370069503784, + "grad_norm_var": 12.347154598211334, + "learning_rate": 0.0001, + "loss": 1.2037, + "loss/crossentropy": 2.4942173957824707, + "loss/hidden": 0.9453125, + "loss/logits": 0.17377984523773193, + "loss/reg": 0.008464318700134754, + "step": 2449 + }, + { + "epoch": 0.30625, + "grad_norm": 4.799704551696777, + "grad_norm_var": 12.298264851634466, + "learning_rate": 0.0001, + "loss": 1.2891, + "loss/crossentropy": 2.4870104789733887, + "loss/hidden": 1.015625, + "loss/logits": 0.18885855376720428, + "loss/reg": 0.008459558710455894, + "step": 2450 + }, + { + "epoch": 0.306375, + "grad_norm": 3.2744569778442383, + "grad_norm_var": 12.307489782106911, + "learning_rate": 0.0001, + "loss": 1.249, + "loss/crossentropy": 2.401167154312134, + "loss/hidden": 0.9921875, + "loss/logits": 0.1722332090139389, + "loss/reg": 0.008454885333776474, + "step": 2451 + }, + { + "epoch": 0.3065, + "grad_norm": 3.1636385917663574, + "grad_norm_var": 12.28540542749778, + "learning_rate": 0.0001, + "loss": 1.1119, + "loss/crossentropy": 2.1348047256469727, + "loss/hidden": 0.89453125, + "loss/logits": 0.13289546966552734, + "loss/reg": 0.008450067602097988, + "step": 2452 + }, + { + "epoch": 0.306625, + "grad_norm": 3.113227605819702, + "grad_norm_var": 12.338168527888682, + "learning_rate": 0.0001, + "loss": 1.061, + "loss/crossentropy": 2.690253973007202, + "loss/hidden": 0.8359375, + "loss/logits": 0.14056900143623352, + "loss/reg": 0.00844533834606409, + "step": 2453 + }, + { + "epoch": 0.30675, + "grad_norm": 2.675001382827759, + "grad_norm_var": 12.484223449570326, + "learning_rate": 0.0001, + "loss": 1.0686, + "loss/crossentropy": 2.663367509841919, + "loss/hidden": 0.84375, + "loss/logits": 0.14045220613479614, + "loss/reg": 0.008440541103482246, + "step": 2454 + }, + { + "epoch": 0.306875, + "grad_norm": 4.320045471191406, + "grad_norm_var": 12.422079068207328, + "learning_rate": 0.0001, + "loss": 1.2872, + "loss/crossentropy": 2.6254730224609375, + "loss/hidden": 1.0078125, + "loss/logits": 0.19500669836997986, + "loss/reg": 0.008435754105448723, + "step": 2455 + }, + { + "epoch": 0.307, + "grad_norm": 3.218492031097412, + "grad_norm_var": 12.452802633505488, + "learning_rate": 0.0001, + "loss": 1.0992, + "loss/crossentropy": 2.502739429473877, + "loss/hidden": 0.8671875, + "loss/logits": 0.14767178893089294, + "loss/reg": 0.00843086652457714, + "step": 2456 + }, + { + "epoch": 0.307125, + "grad_norm": 3.3030409812927246, + "grad_norm_var": 12.46153954765156, + "learning_rate": 0.0001, + "loss": 1.1603, + "loss/crossentropy": 2.509340286254883, + "loss/hidden": 0.91015625, + "loss/logits": 0.16591142117977142, + "loss/reg": 0.008426223881542683, + "step": 2457 + }, + { + "epoch": 0.30725, + "grad_norm": 3.762617826461792, + "grad_norm_var": 9.833823344772357, + "learning_rate": 0.0001, + "loss": 1.2481, + "loss/crossentropy": 2.324880838394165, + "loss/hidden": 0.98828125, + "loss/logits": 0.17556428909301758, + "loss/reg": 0.008421618491411209, + "step": 2458 + }, + { + "epoch": 0.307375, + "grad_norm": 2.9094738960266113, + "grad_norm_var": 9.799993393027869, + "learning_rate": 0.0001, + "loss": 1.0512, + "loss/crossentropy": 2.8471007347106934, + "loss/hidden": 0.82421875, + "loss/logits": 0.14284679293632507, + "loss/reg": 0.008416818454861641, + "step": 2459 + }, + { + "epoch": 0.3075, + "grad_norm": 3.027622699737549, + "grad_norm_var": 9.84696382847447, + "learning_rate": 0.0001, + "loss": 1.1143, + "loss/crossentropy": 2.5665125846862793, + "loss/hidden": 0.86328125, + "loss/logits": 0.16688036918640137, + "loss/reg": 0.008411974646151066, + "step": 2460 + }, + { + "epoch": 0.307625, + "grad_norm": 2.6896543502807617, + "grad_norm_var": 9.763231679513295, + "learning_rate": 0.0001, + "loss": 1.1067, + "loss/crossentropy": 2.5124568939208984, + "loss/hidden": 0.8671875, + "loss/logits": 0.15543505549430847, + "loss/reg": 0.008407117798924446, + "step": 2461 + }, + { + "epoch": 0.30775, + "grad_norm": 3.0365183353424072, + "grad_norm_var": 9.722832415426788, + "learning_rate": 0.0001, + "loss": 1.3809, + "loss/crossentropy": 2.5087716579437256, + "loss/hidden": 1.0390625, + "loss/logits": 0.2578057646751404, + "loss/reg": 0.008402649313211441, + "step": 2462 + }, + { + "epoch": 0.307875, + "grad_norm": 2.8391647338867188, + "grad_norm_var": 0.337719229834813, + "learning_rate": 0.0001, + "loss": 1.0769, + "loss/crossentropy": 2.5051848888397217, + "loss/hidden": 0.8359375, + "loss/logits": 0.1569894552230835, + "loss/reg": 0.008398093283176422, + "step": 2463 + }, + { + "epoch": 0.308, + "grad_norm": 3.7724037170410156, + "grad_norm_var": 0.3366930844852843, + "learning_rate": 0.0001, + "loss": 1.2091, + "loss/crossentropy": 2.5084896087646484, + "loss/hidden": 0.93359375, + "loss/logits": 0.19161269068717957, + "loss/reg": 0.008393323048949242, + "step": 2464 + }, + { + "epoch": 0.308125, + "grad_norm": 2.4853949546813965, + "grad_norm_var": 0.3808204049566598, + "learning_rate": 0.0001, + "loss": 1.1699, + "loss/crossentropy": 2.612621784210205, + "loss/hidden": 0.921875, + "loss/logits": 0.1641857922077179, + "loss/reg": 0.008388564921915531, + "step": 2465 + }, + { + "epoch": 0.30825, + "grad_norm": 5.955503463745117, + "grad_norm_var": 0.69937116946071, + "learning_rate": 0.0001, + "loss": 1.5109, + "loss/crossentropy": 2.940636396408081, + "loss/hidden": 1.1875, + "loss/logits": 0.2395998239517212, + "loss/reg": 0.008383556269109249, + "step": 2466 + }, + { + "epoch": 0.308375, + "grad_norm": 2.6847071647644043, + "grad_norm_var": 0.726785045572106, + "learning_rate": 0.0001, + "loss": 1.1414, + "loss/crossentropy": 2.4248883724212646, + "loss/hidden": 0.89453125, + "loss/logits": 0.16309913992881775, + "loss/reg": 0.008378535509109497, + "step": 2467 + }, + { + "epoch": 0.3085, + "grad_norm": 2.2300915718078613, + "grad_norm_var": 0.7994452847802819, + "learning_rate": 0.0001, + "loss": 0.978, + "loss/crossentropy": 2.4430646896362305, + "loss/hidden": 0.76953125, + "loss/logits": 0.12473122030496597, + "loss/reg": 0.008373593911528587, + "step": 2468 + }, + { + "epoch": 0.308625, + "grad_norm": 14.154952049255371, + "grad_norm_var": 8.215952123785092, + "learning_rate": 0.0001, + "loss": 1.6183, + "loss/crossentropy": 2.22820782661438, + "loss/hidden": 1.296875, + "loss/logits": 0.2377779334783554, + "loss/reg": 0.008368860930204391, + "step": 2469 + }, + { + "epoch": 0.30875, + "grad_norm": 3.0958447456359863, + "grad_norm_var": 8.155952705499327, + "learning_rate": 0.0001, + "loss": 1.295, + "loss/crossentropy": 2.549400806427002, + "loss/hidden": 1.03125, + "loss/logits": 0.1801406890153885, + "loss/reg": 0.008364192210137844, + "step": 2470 + }, + { + "epoch": 0.308875, + "grad_norm": 3.0508296489715576, + "grad_norm_var": 8.197032135491966, + "learning_rate": 0.0001, + "loss": 1.1827, + "loss/crossentropy": 2.6679482460021973, + "loss/hidden": 0.9140625, + "loss/logits": 0.1850062906742096, + "loss/reg": 0.008359471336007118, + "step": 2471 + }, + { + "epoch": 0.309, + "grad_norm": 2.6177220344543457, + "grad_norm_var": 8.273260909846474, + "learning_rate": 0.0001, + "loss": 1.2408, + "loss/crossentropy": 2.391519546508789, + "loss/hidden": 0.9765625, + "loss/logits": 0.18065842986106873, + "loss/reg": 0.008354629389941692, + "step": 2472 + }, + { + "epoch": 0.309125, + "grad_norm": 3.6693973541259766, + "grad_norm_var": 8.254884432431362, + "learning_rate": 0.0001, + "loss": 1.2897, + "loss/crossentropy": 2.3759405612945557, + "loss/hidden": 1.015625, + "loss/logits": 0.19060981273651123, + "loss/reg": 0.008349529467523098, + "step": 2473 + }, + { + "epoch": 0.30925, + "grad_norm": 2.972973585128784, + "grad_norm_var": 8.305568703088834, + "learning_rate": 0.0001, + "loss": 1.316, + "loss/crossentropy": 2.351238489151001, + "loss/hidden": 1.0390625, + "loss/logits": 0.19352483749389648, + "loss/reg": 0.008344343863427639, + "step": 2474 + }, + { + "epoch": 0.309375, + "grad_norm": 2.8257896900177, + "grad_norm_var": 8.31621633522744, + "learning_rate": 0.0001, + "loss": 1.1418, + "loss/crossentropy": 2.752835988998413, + "loss/hidden": 0.8984375, + "loss/logits": 0.15997451543807983, + "loss/reg": 0.008339089341461658, + "step": 2475 + }, + { + "epoch": 0.3095, + "grad_norm": 3.59055233001709, + "grad_norm_var": 8.276601876260814, + "learning_rate": 0.0001, + "loss": 1.1864, + "loss/crossentropy": 2.531418800354004, + "loss/hidden": 0.9453125, + "loss/logits": 0.15772724151611328, + "loss/reg": 0.00833413191139698, + "step": 2476 + }, + { + "epoch": 0.309625, + "grad_norm": 2.9225127696990967, + "grad_norm_var": 8.243825905000179, + "learning_rate": 0.0001, + "loss": 1.2206, + "loss/crossentropy": 2.408013105392456, + "loss/hidden": 0.9453125, + "loss/logits": 0.19199243187904358, + "loss/reg": 0.008329110220074654, + "step": 2477 + }, + { + "epoch": 0.30975, + "grad_norm": 2.886647939682007, + "grad_norm_var": 8.26186542079585, + "learning_rate": 0.0001, + "loss": 1.0563, + "loss/crossentropy": 2.5499773025512695, + "loss/hidden": 0.83203125, + "loss/logits": 0.14107027649879456, + "loss/reg": 0.00832393579185009, + "step": 2478 + }, + { + "epoch": 0.309875, + "grad_norm": 3.3956942558288574, + "grad_norm_var": 8.205498809479822, + "learning_rate": 0.0001, + "loss": 1.515, + "loss/crossentropy": 2.2446517944335938, + "loss/hidden": 1.171875, + "loss/logits": 0.25993162393569946, + "loss/reg": 0.00831907894462347, + "step": 2479 + }, + { + "epoch": 0.31, + "grad_norm": 3.032118797302246, + "grad_norm_var": 8.251795578761755, + "learning_rate": 0.0001, + "loss": 1.2559, + "loss/crossentropy": 2.421250343322754, + "loss/hidden": 1.0, + "loss/logits": 0.1727350354194641, + "loss/reg": 0.008314297534525394, + "step": 2480 + }, + { + "epoch": 0.310125, + "grad_norm": 2.7298812866210938, + "grad_norm_var": 8.211107418551006, + "learning_rate": 0.0001, + "loss": 1.1409, + "loss/crossentropy": 2.616569995880127, + "loss/hidden": 0.890625, + "loss/logits": 0.16722124814987183, + "loss/reg": 0.008309799246490002, + "step": 2481 + }, + { + "epoch": 0.31025, + "grad_norm": 2.4837992191314697, + "grad_norm_var": 7.996004821474075, + "learning_rate": 0.0001, + "loss": 1.0577, + "loss/crossentropy": 2.470477819442749, + "loss/hidden": 0.83984375, + "loss/logits": 0.13481104373931885, + "loss/reg": 0.008305060677230358, + "step": 2482 + }, + { + "epoch": 0.310375, + "grad_norm": 2.642988443374634, + "grad_norm_var": 8.00146339987627, + "learning_rate": 0.0001, + "loss": 1.144, + "loss/crossentropy": 2.640923500061035, + "loss/hidden": 0.88671875, + "loss/logits": 0.1743232011795044, + "loss/reg": 0.008300449699163437, + "step": 2483 + }, + { + "epoch": 0.3105, + "grad_norm": 2.701859474182129, + "grad_norm_var": 7.926444160274639, + "learning_rate": 0.0001, + "loss": 1.1702, + "loss/crossentropy": 2.3798654079437256, + "loss/hidden": 0.921875, + "loss/logits": 0.16538016498088837, + "loss/reg": 0.008295743726193905, + "step": 2484 + }, + { + "epoch": 0.310625, + "grad_norm": 2.3794946670532227, + "grad_norm_var": 0.13602344526067683, + "learning_rate": 0.0001, + "loss": 1.0081, + "loss/crossentropy": 2.5220437049865723, + "loss/hidden": 0.79296875, + "loss/logits": 0.13217590749263763, + "loss/reg": 0.008291090838611126, + "step": 2485 + }, + { + "epoch": 0.31075, + "grad_norm": 3.3525218963623047, + "grad_norm_var": 0.14556432386626714, + "learning_rate": 0.0001, + "loss": 1.1081, + "loss/crossentropy": 2.465467929840088, + "loss/hidden": 0.87890625, + "loss/logits": 0.14636963605880737, + "loss/reg": 0.00828634388744831, + "step": 2486 + }, + { + "epoch": 0.310875, + "grad_norm": 4.075220584869385, + "grad_norm_var": 0.22445457359199944, + "learning_rate": 0.0001, + "loss": 1.4904, + "loss/crossentropy": 2.746276617050171, + "loss/hidden": 1.171875, + "loss/logits": 0.23566846549510956, + "loss/reg": 0.008281702175736427, + "step": 2487 + }, + { + "epoch": 0.311, + "grad_norm": 2.4073922634124756, + "grad_norm_var": 0.23842940074481261, + "learning_rate": 0.0001, + "loss": 0.9898, + "loss/crossentropy": 2.563324213027954, + "loss/hidden": 0.77734375, + "loss/logits": 0.1296350061893463, + "loss/reg": 0.008277208544313908, + "step": 2488 + }, + { + "epoch": 0.311125, + "grad_norm": 3.3396146297454834, + "grad_norm_var": 0.2159817978703425, + "learning_rate": 0.0001, + "loss": 1.1739, + "loss/crossentropy": 2.3436970710754395, + "loss/hidden": 0.9296875, + "loss/logits": 0.16149966418743134, + "loss/reg": 0.008272715844213963, + "step": 2489 + }, + { + "epoch": 0.31125, + "grad_norm": 2.958264112472534, + "grad_norm_var": 0.21601634129135525, + "learning_rate": 0.0001, + "loss": 1.363, + "loss/crossentropy": 2.5359067916870117, + "loss/hidden": 1.0625, + "loss/logits": 0.21784111857414246, + "loss/reg": 0.008268316276371479, + "step": 2490 + }, + { + "epoch": 0.311375, + "grad_norm": 3.207165241241455, + "grad_norm_var": 0.2171242350896551, + "learning_rate": 0.0001, + "loss": 1.212, + "loss/crossentropy": 2.702636480331421, + "loss/hidden": 0.9453125, + "loss/logits": 0.18409806489944458, + "loss/reg": 0.008263681083917618, + "step": 2491 + }, + { + "epoch": 0.3115, + "grad_norm": 3.0768396854400635, + "grad_norm_var": 0.1936207491423488, + "learning_rate": 0.0001, + "loss": 1.1725, + "loss/crossentropy": 2.5358407497406006, + "loss/hidden": 0.90234375, + "loss/logits": 0.18761487305164337, + "loss/reg": 0.008259044028818607, + "step": 2492 + }, + { + "epoch": 0.311625, + "grad_norm": 2.7717533111572266, + "grad_norm_var": 0.1960863031103976, + "learning_rate": 0.0001, + "loss": 1.091, + "loss/crossentropy": 2.6724939346313477, + "loss/hidden": 0.84375, + "loss/logits": 0.1647084653377533, + "loss/reg": 0.008254318498075008, + "step": 2493 + }, + { + "epoch": 0.31175, + "grad_norm": 2.3412671089172363, + "grad_norm_var": 0.22037958778339165, + "learning_rate": 0.0001, + "loss": 1.0774, + "loss/crossentropy": 2.6601550579071045, + "loss/hidden": 0.84765625, + "loss/logits": 0.14724040031433105, + "loss/reg": 0.008249460719525814, + "step": 2494 + }, + { + "epoch": 0.311875, + "grad_norm": 2.9546611309051514, + "grad_norm_var": 0.2052099422321246, + "learning_rate": 0.0001, + "loss": 1.2966, + "loss/crossentropy": 2.0416440963745117, + "loss/hidden": 1.0234375, + "loss/logits": 0.19068187475204468, + "loss/reg": 0.008244695141911507, + "step": 2495 + }, + { + "epoch": 0.312, + "grad_norm": 2.2250843048095703, + "grad_norm_var": 0.23206872125230665, + "learning_rate": 0.0001, + "loss": 1.0326, + "loss/crossentropy": 2.405212640762329, + "loss/hidden": 0.8046875, + "loss/logits": 0.14548514783382416, + "loss/reg": 0.00823995377868414, + "step": 2496 + }, + { + "epoch": 0.312125, + "grad_norm": 2.6060986518859863, + "grad_norm_var": 0.23505815082794068, + "learning_rate": 0.0001, + "loss": 1.038, + "loss/crossentropy": 2.5417017936706543, + "loss/hidden": 0.8203125, + "loss/logits": 0.13534501194953918, + "loss/reg": 0.008235367015004158, + "step": 2497 + }, + { + "epoch": 0.31225, + "grad_norm": 4.2700605392456055, + "grad_norm_var": 0.348392303569031, + "learning_rate": 0.0001, + "loss": 1.1426, + "loss/crossentropy": 2.49465274810791, + "loss/hidden": 0.91015625, + "loss/logits": 0.15012937784194946, + "loss/reg": 0.0082307830452919, + "step": 2498 + }, + { + "epoch": 0.312375, + "grad_norm": 2.6776180267333984, + "grad_norm_var": 0.3470178701011944, + "learning_rate": 0.0001, + "loss": 1.1661, + "loss/crossentropy": 2.4823501110076904, + "loss/hidden": 0.93359375, + "loss/logits": 0.15022966265678406, + "loss/reg": 0.008226322941482067, + "step": 2499 + }, + { + "epoch": 0.3125, + "grad_norm": 3.0893046855926514, + "grad_norm_var": 0.3431133104077086, + "learning_rate": 0.0001, + "loss": 1.001, + "loss/crossentropy": 2.1879446506500244, + "loss/hidden": 0.79296875, + "loss/logits": 0.1258106529712677, + "loss/reg": 0.008221834897994995, + "step": 2500 + }, + { + "epoch": 0.312625, + "grad_norm": 4.2277984619140625, + "grad_norm_var": 0.4078321652012657, + "learning_rate": 0.0001, + "loss": 1.1349, + "loss/crossentropy": 2.5304198265075684, + "loss/hidden": 0.90234375, + "loss/logits": 0.1503707319498062, + "loss/reg": 0.008217175491154194, + "step": 2501 + }, + { + "epoch": 0.31275, + "grad_norm": 3.031630754470825, + "grad_norm_var": 0.40341188399138045, + "learning_rate": 0.0001, + "loss": 1.2626, + "loss/crossentropy": 2.5491790771484375, + "loss/hidden": 0.96875, + "loss/logits": 0.21170274913311005, + "loss/reg": 0.008212613873183727, + "step": 2502 + }, + { + "epoch": 0.312875, + "grad_norm": 2.515403985977173, + "grad_norm_var": 0.34823166415208107, + "learning_rate": 0.0001, + "loss": 0.9965, + "loss/crossentropy": 2.7140889167785645, + "loss/hidden": 0.78125, + "loss/logits": 0.1331307291984558, + "loss/reg": 0.008207826875150204, + "step": 2503 + }, + { + "epoch": 0.313, + "grad_norm": 2.0359883308410645, + "grad_norm_var": 0.385270571821966, + "learning_rate": 0.0001, + "loss": 0.9553, + "loss/crossentropy": 2.3980460166931152, + "loss/hidden": 0.76171875, + "loss/logits": 0.1115872710943222, + "loss/reg": 0.008203042671084404, + "step": 2504 + }, + { + "epoch": 0.313125, + "grad_norm": 3.1058859825134277, + "grad_norm_var": 0.3767933968898423, + "learning_rate": 0.0001, + "loss": 1.1618, + "loss/crossentropy": 2.700930118560791, + "loss/hidden": 0.8984375, + "loss/logits": 0.1813761293888092, + "loss/reg": 0.008198052644729614, + "step": 2505 + }, + { + "epoch": 0.31325, + "grad_norm": 2.679569721221924, + "grad_norm_var": 0.38109645326810526, + "learning_rate": 0.0001, + "loss": 1.2594, + "loss/crossentropy": 2.3166518211364746, + "loss/hidden": 0.9921875, + "loss/logits": 0.18530996143817902, + "loss/reg": 0.008192971348762512, + "step": 2506 + }, + { + "epoch": 0.313375, + "grad_norm": 4.029979228973389, + "grad_norm_var": 0.45425571684662763, + "learning_rate": 0.0001, + "loss": 1.2466, + "loss/crossentropy": 2.8141579627990723, + "loss/hidden": 0.96484375, + "loss/logits": 0.19990737736225128, + "loss/reg": 0.008187885396182537, + "step": 2507 + }, + { + "epoch": 0.3135, + "grad_norm": 2.686330795288086, + "grad_norm_var": 0.4586109506167115, + "learning_rate": 0.0001, + "loss": 1.0777, + "loss/crossentropy": 2.555020332336426, + "loss/hidden": 0.85546875, + "loss/logits": 0.14035674929618835, + "loss/reg": 0.008183191530406475, + "step": 2508 + }, + { + "epoch": 0.313625, + "grad_norm": 2.5621328353881836, + "grad_norm_var": 0.46642374263757724, + "learning_rate": 0.0001, + "loss": 1.1441, + "loss/crossentropy": 2.498279094696045, + "loss/hidden": 0.9140625, + "loss/logits": 0.1482505351305008, + "loss/reg": 0.00817788951098919, + "step": 2509 + }, + { + "epoch": 0.31375, + "grad_norm": 2.7276833057403564, + "grad_norm_var": 0.4449118907286577, + "learning_rate": 0.0001, + "loss": 1.1044, + "loss/crossentropy": 2.7958109378814697, + "loss/hidden": 0.87109375, + "loss/logits": 0.15154990553855896, + "loss/reg": 0.00817323662340641, + "step": 2510 + }, + { + "epoch": 0.313875, + "grad_norm": 4.74147891998291, + "grad_norm_var": 0.6422135136917518, + "learning_rate": 0.0001, + "loss": 1.3536, + "loss/crossentropy": 2.4710745811462402, + "loss/hidden": 1.1015625, + "loss/logits": 0.17034141719341278, + "loss/reg": 0.00816851481795311, + "step": 2511 + }, + { + "epoch": 0.314, + "grad_norm": 3.099266290664673, + "grad_norm_var": 0.590823743948986, + "learning_rate": 0.0001, + "loss": 1.4926, + "loss/crossentropy": 2.315178394317627, + "loss/hidden": 1.1875, + "loss/logits": 0.2234746813774109, + "loss/reg": 0.008163852617144585, + "step": 2512 + }, + { + "epoch": 0.314125, + "grad_norm": 2.525214433670044, + "grad_norm_var": 0.5968868813786155, + "learning_rate": 0.0001, + "loss": 1.0313, + "loss/crossentropy": 2.290358066558838, + "loss/hidden": 0.8203125, + "loss/logits": 0.12944141030311584, + "loss/reg": 0.008159250020980835, + "step": 2513 + }, + { + "epoch": 0.31425, + "grad_norm": 2.755531072616577, + "grad_norm_var": 0.5090864361424468, + "learning_rate": 0.0001, + "loss": 1.098, + "loss/crossentropy": 2.727020502090454, + "loss/hidden": 0.87109375, + "loss/logits": 0.1454046070575714, + "loss/reg": 0.008154422976076603, + "step": 2514 + }, + { + "epoch": 0.314375, + "grad_norm": 4.692269802093506, + "grad_norm_var": 0.667924266825821, + "learning_rate": 0.0001, + "loss": 1.2908, + "loss/crossentropy": 2.917980194091797, + "loss/hidden": 1.046875, + "loss/logits": 0.16244551539421082, + "loss/reg": 0.008149709552526474, + "step": 2515 + }, + { + "epoch": 0.3145, + "grad_norm": 2.7078747749328613, + "grad_norm_var": 0.6804393571637396, + "learning_rate": 0.0001, + "loss": 1.2332, + "loss/crossentropy": 2.5379669666290283, + "loss/hidden": 0.94921875, + "loss/logits": 0.2024967521429062, + "loss/reg": 0.008145029656589031, + "step": 2516 + }, + { + "epoch": 0.314625, + "grad_norm": 3.172844886779785, + "grad_norm_var": 0.5959675990636432, + "learning_rate": 0.0001, + "loss": 1.1875, + "loss/crossentropy": 2.4681851863861084, + "loss/hidden": 0.93359375, + "loss/logits": 0.1725274622440338, + "loss/reg": 0.00814049318432808, + "step": 2517 + }, + { + "epoch": 0.31475, + "grad_norm": 2.3384783267974854, + "grad_norm_var": 0.6292483670555536, + "learning_rate": 0.0001, + "loss": 1.1444, + "loss/crossentropy": 2.5236966609954834, + "loss/hidden": 0.89453125, + "loss/logits": 0.16852623224258423, + "loss/reg": 0.008135776035487652, + "step": 2518 + }, + { + "epoch": 0.314875, + "grad_norm": 2.6271474361419678, + "grad_norm_var": 0.6224586552640773, + "learning_rate": 0.0001, + "loss": 1.0236, + "loss/crossentropy": 2.4808454513549805, + "loss/hidden": 0.8046875, + "loss/logits": 0.1375848352909088, + "loss/reg": 0.008131255395710468, + "step": 2519 + }, + { + "epoch": 0.315, + "grad_norm": 2.251837730407715, + "grad_norm_var": 0.5967492067849055, + "learning_rate": 0.0001, + "loss": 1.0499, + "loss/crossentropy": 2.274949550628662, + "loss/hidden": 0.8359375, + "loss/logits": 0.13274267315864563, + "loss/reg": 0.008126837201416492, + "step": 2520 + }, + { + "epoch": 0.315125, + "grad_norm": 2.243041753768921, + "grad_norm_var": 0.6361573270778381, + "learning_rate": 0.0001, + "loss": 1.0866, + "loss/crossentropy": 2.7266902923583984, + "loss/hidden": 0.8515625, + "loss/logits": 0.15385454893112183, + "loss/reg": 0.008122115395963192, + "step": 2521 + }, + { + "epoch": 0.31525, + "grad_norm": 2.948204278945923, + "grad_norm_var": 0.6295471113038124, + "learning_rate": 0.0001, + "loss": 1.3099, + "loss/crossentropy": 2.567535161972046, + "loss/hidden": 1.0390625, + "loss/logits": 0.18966838717460632, + "loss/reg": 0.00811740756034851, + "step": 2522 + }, + { + "epoch": 0.315375, + "grad_norm": 2.6897497177124023, + "grad_norm_var": 0.5589769862277905, + "learning_rate": 0.0001, + "loss": 1.1445, + "loss/crossentropy": 2.23712158203125, + "loss/hidden": 0.921875, + "loss/logits": 0.14150108397006989, + "loss/reg": 0.008112751878798008, + "step": 2523 + }, + { + "epoch": 0.3155, + "grad_norm": 2.5390353202819824, + "grad_norm_var": 0.564982357395013, + "learning_rate": 0.0001, + "loss": 1.1139, + "loss/crossentropy": 2.4869041442871094, + "loss/hidden": 0.8671875, + "loss/logits": 0.1655886173248291, + "loss/reg": 0.008108001202344894, + "step": 2524 + }, + { + "epoch": 0.315625, + "grad_norm": 2.460150957107544, + "grad_norm_var": 0.5704150421972465, + "learning_rate": 0.0001, + "loss": 1.1353, + "loss/crossentropy": 2.8104233741760254, + "loss/hidden": 0.875, + "loss/logits": 0.1792430579662323, + "loss/reg": 0.00810318160802126, + "step": 2525 + }, + { + "epoch": 0.31575, + "grad_norm": 2.6032872200012207, + "grad_norm_var": 0.5743644590644625, + "learning_rate": 0.0001, + "loss": 1.0926, + "loss/crossentropy": 2.662508487701416, + "loss/hidden": 0.85546875, + "loss/logits": 0.15619996190071106, + "loss/reg": 0.008098081685602665, + "step": 2526 + }, + { + "epoch": 0.315875, + "grad_norm": 2.670227527618408, + "grad_norm_var": 0.3338599928436906, + "learning_rate": 0.0001, + "loss": 1.3989, + "loss/crossentropy": 2.448843479156494, + "loss/hidden": 1.0703125, + "loss/logits": 0.24770912528038025, + "loss/reg": 0.008092799223959446, + "step": 2527 + }, + { + "epoch": 0.316, + "grad_norm": 2.7434043884277344, + "grad_norm_var": 0.3261640792952029, + "learning_rate": 0.0001, + "loss": 1.2423, + "loss/crossentropy": 2.5062313079833984, + "loss/hidden": 0.9765625, + "loss/logits": 0.18484677374362946, + "loss/reg": 0.008087786845862865, + "step": 2528 + }, + { + "epoch": 0.316125, + "grad_norm": 2.3189005851745605, + "grad_norm_var": 0.33495343187128884, + "learning_rate": 0.0001, + "loss": 1.1363, + "loss/crossentropy": 2.477735757827759, + "loss/hidden": 0.890625, + "loss/logits": 0.1648310422897339, + "loss/reg": 0.008082669228315353, + "step": 2529 + }, + { + "epoch": 0.31625, + "grad_norm": 2.6439194679260254, + "grad_norm_var": 0.3354283166033665, + "learning_rate": 0.0001, + "loss": 1.1496, + "loss/crossentropy": 2.0244314670562744, + "loss/hidden": 0.921875, + "loss/logits": 0.1469593644142151, + "loss/reg": 0.008077649399638176, + "step": 2530 + }, + { + "epoch": 0.316375, + "grad_norm": 3.3819785118103027, + "grad_norm_var": 0.0995894460610143, + "learning_rate": 0.0001, + "loss": 1.1011, + "loss/crossentropy": 2.3411011695861816, + "loss/hidden": 0.8671875, + "loss/logits": 0.1531715840101242, + "loss/reg": 0.00807263795286417, + "step": 2531 + }, + { + "epoch": 0.3165, + "grad_norm": 2.3509695529937744, + "grad_norm_var": 0.10461846563550713, + "learning_rate": 0.0001, + "loss": 1.0096, + "loss/crossentropy": 2.461629629135132, + "loss/hidden": 0.80078125, + "loss/logits": 0.128116175532341, + "loss/reg": 0.008067458868026733, + "step": 2532 + }, + { + "epoch": 0.316625, + "grad_norm": 2.800790786743164, + "grad_norm_var": 0.08604076646320115, + "learning_rate": 0.0001, + "loss": 1.0749, + "loss/crossentropy": 2.4930965900421143, + "loss/hidden": 0.85546875, + "loss/logits": 0.13885578513145447, + "loss/reg": 0.008062000386416912, + "step": 2533 + }, + { + "epoch": 0.31675, + "grad_norm": 2.240063190460205, + "grad_norm_var": 0.09008692752806055, + "learning_rate": 0.0001, + "loss": 1.0819, + "loss/crossentropy": 2.2248756885528564, + "loss/hidden": 0.84375, + "loss/logits": 0.15762293338775635, + "loss/reg": 0.008057336322963238, + "step": 2534 + }, + { + "epoch": 0.316875, + "grad_norm": 5.764140605926514, + "grad_norm_var": 0.7187690969076574, + "learning_rate": 0.0001, + "loss": 1.7771, + "loss/crossentropy": 2.9873297214508057, + "loss/hidden": 1.375, + "loss/logits": 0.32158011198043823, + "loss/reg": 0.008052188903093338, + "step": 2535 + }, + { + "epoch": 0.317, + "grad_norm": 2.8569881916046143, + "grad_norm_var": 0.6981855623242947, + "learning_rate": 0.0001, + "loss": 0.9504, + "loss/crossentropy": 2.7362804412841797, + "loss/hidden": 0.7421875, + "loss/logits": 0.1277720332145691, + "loss/reg": 0.008047577925026417, + "step": 2536 + }, + { + "epoch": 0.317125, + "grad_norm": 2.6561717987060547, + "grad_norm_var": 0.676607405990823, + "learning_rate": 0.0001, + "loss": 1.0372, + "loss/crossentropy": 2.2529711723327637, + "loss/hidden": 0.828125, + "loss/logits": 0.12862426042556763, + "loss/reg": 0.008042579516768456, + "step": 2537 + }, + { + "epoch": 0.31725, + "grad_norm": 2.360325574874878, + "grad_norm_var": 0.6908429080082509, + "learning_rate": 0.0001, + "loss": 1.1012, + "loss/crossentropy": 2.5445258617401123, + "loss/hidden": 0.8515625, + "loss/logits": 0.16922800242900848, + "loss/reg": 0.008037998341023922, + "step": 2538 + }, + { + "epoch": 0.317375, + "grad_norm": 2.924952983856201, + "grad_norm_var": 0.690293936885909, + "learning_rate": 0.0001, + "loss": 1.2245, + "loss/crossentropy": 2.581141948699951, + "loss/hidden": 0.96484375, + "loss/logits": 0.17930260300636292, + "loss/reg": 0.00803348422050476, + "step": 2539 + }, + { + "epoch": 0.3175, + "grad_norm": 3.1181447505950928, + "grad_norm_var": 0.6886173146499476, + "learning_rate": 0.0001, + "loss": 1.0597, + "loss/crossentropy": 2.483273983001709, + "loss/hidden": 0.8359375, + "loss/logits": 0.143497496843338, + "loss/reg": 0.008029043674468994, + "step": 2540 + }, + { + "epoch": 0.317625, + "grad_norm": 2.7864935398101807, + "grad_norm_var": 0.6775096155236343, + "learning_rate": 0.0001, + "loss": 1.5654, + "loss/crossentropy": 2.336775779724121, + "loss/hidden": 1.28125, + "loss/logits": 0.20392963290214539, + "loss/reg": 0.008024375885725021, + "step": 2541 + }, + { + "epoch": 0.31775, + "grad_norm": 4.5650715827941895, + "grad_norm_var": 0.8433657246787635, + "learning_rate": 0.0001, + "loss": 1.2705, + "loss/crossentropy": 2.6035873889923096, + "loss/hidden": 1.015625, + "loss/logits": 0.17469017207622528, + "loss/reg": 0.00801988784223795, + "step": 2542 + }, + { + "epoch": 0.317875, + "grad_norm": 2.4016213417053223, + "grad_norm_var": 0.8600941799229036, + "learning_rate": 0.0001, + "loss": 0.9866, + "loss/crossentropy": 2.430835247039795, + "loss/hidden": 0.7890625, + "loss/logits": 0.11737307906150818, + "loss/reg": 0.008015446364879608, + "step": 2543 + }, + { + "epoch": 0.318, + "grad_norm": 2.8635733127593994, + "grad_norm_var": 0.8569715907184815, + "learning_rate": 0.0001, + "loss": 1.1688, + "loss/crossentropy": 2.513808012008667, + "loss/hidden": 0.921875, + "loss/logits": 0.16680805385112762, + "loss/reg": 0.00801115669310093, + "step": 2544 + }, + { + "epoch": 0.318125, + "grad_norm": 2.8633594512939453, + "grad_norm_var": 0.825899981484118, + "learning_rate": 0.0001, + "loss": 1.0874, + "loss/crossentropy": 2.4013705253601074, + "loss/hidden": 0.86328125, + "loss/logits": 0.1440640091896057, + "loss/reg": 0.008006923831999302, + "step": 2545 + }, + { + "epoch": 0.31825, + "grad_norm": 3.336221933364868, + "grad_norm_var": 0.8196485786844673, + "learning_rate": 0.0001, + "loss": 1.1661, + "loss/crossentropy": 2.4172966480255127, + "loss/hidden": 0.94140625, + "loss/logits": 0.14471112191677094, + "loss/reg": 0.008002296090126038, + "step": 2546 + }, + { + "epoch": 0.318375, + "grad_norm": 25.402185440063477, + "grad_norm_var": 32.013536294963174, + "learning_rate": 0.0001, + "loss": 1.1739, + "loss/crossentropy": 2.476424217224121, + "loss/hidden": 0.96875, + "loss/logits": 0.12517961859703064, + "loss/reg": 0.007997564971446991, + "step": 2547 + }, + { + "epoch": 0.3185, + "grad_norm": 2.5788767337799072, + "grad_norm_var": 31.95282513171967, + "learning_rate": 0.0001, + "loss": 0.987, + "loss/crossentropy": 2.2598233222961426, + "loss/hidden": 0.78515625, + "loss/logits": 0.12186400592327118, + "loss/reg": 0.007993310689926147, + "step": 2548 + }, + { + "epoch": 0.318625, + "grad_norm": 5.203117847442627, + "grad_norm_var": 31.778879122039758, + "learning_rate": 0.0001, + "loss": 1.5493, + "loss/crossentropy": 2.8368258476257324, + "loss/hidden": 1.25, + "loss/logits": 0.21936602890491486, + "loss/reg": 0.007988505996763706, + "step": 2549 + }, + { + "epoch": 0.31875, + "grad_norm": 3.1387836933135986, + "grad_norm_var": 31.544164081956517, + "learning_rate": 0.0001, + "loss": 1.4324, + "loss/crossentropy": 2.2365758419036865, + "loss/hidden": 1.125, + "loss/logits": 0.22757811844348907, + "loss/reg": 0.007983655668795109, + "step": 2550 + }, + { + "epoch": 0.318875, + "grad_norm": 3.0909738540649414, + "grad_norm_var": 31.603030103660057, + "learning_rate": 0.0001, + "loss": 1.1948, + "loss/crossentropy": 2.4808223247528076, + "loss/hidden": 0.953125, + "loss/logits": 0.1618569791316986, + "loss/reg": 0.00797884352505207, + "step": 2551 + }, + { + "epoch": 0.319, + "grad_norm": 16.40399169921875, + "grad_norm_var": 40.0888138713829, + "learning_rate": 0.0001, + "loss": 1.1314, + "loss/crossentropy": 2.6985421180725098, + "loss/hidden": 0.89453125, + "loss/logits": 0.15714946389198303, + "loss/reg": 0.007974395528435707, + "step": 2552 + }, + { + "epoch": 0.319125, + "grad_norm": 2.6357789039611816, + "grad_norm_var": 40.09618047551386, + "learning_rate": 0.0001, + "loss": 1.138, + "loss/crossentropy": 2.3694357872009277, + "loss/hidden": 0.92578125, + "loss/logits": 0.13253375887870789, + "loss/reg": 0.007969595491886139, + "step": 2553 + }, + { + "epoch": 0.31925, + "grad_norm": 3.619582414627075, + "grad_norm_var": 39.69254839209577, + "learning_rate": 0.0001, + "loss": 1.1681, + "loss/crossentropy": 2.6467034816741943, + "loss/hidden": 0.92578125, + "loss/logits": 0.16270460188388824, + "loss/reg": 0.007964862510561943, + "step": 2554 + }, + { + "epoch": 0.319375, + "grad_norm": 2.9519665241241455, + "grad_norm_var": 39.68355943842979, + "learning_rate": 0.0001, + "loss": 1.0972, + "loss/crossentropy": 2.531005859375, + "loss/hidden": 0.86328125, + "loss/logits": 0.1543324589729309, + "loss/reg": 0.007960199378430843, + "step": 2555 + }, + { + "epoch": 0.3195, + "grad_norm": 2.5988266468048096, + "grad_norm_var": 39.860838682114135, + "learning_rate": 0.0001, + "loss": 1.0897, + "loss/crossentropy": 2.4022152423858643, + "loss/hidden": 0.859375, + "loss/logits": 0.15077164769172668, + "loss/reg": 0.007956115528941154, + "step": 2556 + }, + { + "epoch": 0.319625, + "grad_norm": 3.033994436264038, + "grad_norm_var": 39.77833782320421, + "learning_rate": 0.0001, + "loss": 1.0754, + "loss/crossentropy": 2.4406771659851074, + "loss/hidden": 0.84765625, + "loss/logits": 0.1482125073671341, + "loss/reg": 0.007952181622385979, + "step": 2557 + }, + { + "epoch": 0.31975, + "grad_norm": 2.8204097747802734, + "grad_norm_var": 40.16698659743211, + "learning_rate": 0.0001, + "loss": 1.0806, + "loss/crossentropy": 2.49525785446167, + "loss/hidden": 0.859375, + "loss/logits": 0.14175555109977722, + "loss/reg": 0.00794767215847969, + "step": 2558 + }, + { + "epoch": 0.319875, + "grad_norm": 2.7112882137298584, + "grad_norm_var": 40.05293933094223, + "learning_rate": 0.0001, + "loss": 1.1233, + "loss/crossentropy": 2.2404439449310303, + "loss/hidden": 0.90234375, + "loss/logits": 0.14156538248062134, + "loss/reg": 0.007943613454699516, + "step": 2559 + }, + { + "epoch": 0.32, + "grad_norm": 2.495206117630005, + "grad_norm_var": 40.18247722631502, + "learning_rate": 0.0001, + "loss": 1.134, + "loss/crossentropy": 2.5469627380371094, + "loss/hidden": 0.8984375, + "loss/logits": 0.1561906337738037, + "loss/reg": 0.007939061149954796, + "step": 2560 + }, + { + "epoch": 0.320125, + "grad_norm": 3.302274703979492, + "grad_norm_var": 40.05161117604535, + "learning_rate": 0.0001, + "loss": 1.1982, + "loss/crossentropy": 2.7550911903381348, + "loss/hidden": 0.9375, + "loss/logits": 0.18138551712036133, + "loss/reg": 0.007934242486953735, + "step": 2561 + }, + { + "epoch": 0.32025, + "grad_norm": 3.7350454330444336, + "grad_norm_var": 39.95538586120728, + "learning_rate": 0.0001, + "loss": 1.3011, + "loss/crossentropy": 2.4785315990448, + "loss/hidden": 0.99609375, + "loss/logits": 0.22566866874694824, + "loss/reg": 0.007929767481982708, + "step": 2562 + }, + { + "epoch": 0.320375, + "grad_norm": 3.059227228164673, + "grad_norm_var": 11.441958354222736, + "learning_rate": 0.0001, + "loss": 1.2334, + "loss/crossentropy": 2.086972951889038, + "loss/hidden": 0.984375, + "loss/logits": 0.1697504073381424, + "loss/reg": 0.007925317622721195, + "step": 2563 + }, + { + "epoch": 0.3205, + "grad_norm": 4.701015472412109, + "grad_norm_var": 11.332291954962598, + "learning_rate": 0.0001, + "loss": 1.4703, + "loss/crossentropy": 2.418721914291382, + "loss/hidden": 1.15625, + "loss/logits": 0.2348509132862091, + "loss/reg": 0.007920571602880955, + "step": 2564 + }, + { + "epoch": 0.320625, + "grad_norm": 2.6832938194274902, + "grad_norm_var": 11.35644609806377, + "learning_rate": 0.0001, + "loss": 1.3029, + "loss/crossentropy": 2.481543779373169, + "loss/hidden": 1.0390625, + "loss/logits": 0.1847291886806488, + "loss/reg": 0.007915699854493141, + "step": 2565 + }, + { + "epoch": 0.32075, + "grad_norm": 3.084307909011841, + "grad_norm_var": 11.362424673855896, + "learning_rate": 0.0001, + "loss": 1.1475, + "loss/crossentropy": 2.3587937355041504, + "loss/hidden": 0.91796875, + "loss/logits": 0.15041357278823853, + "loss/reg": 0.007911217398941517, + "step": 2566 + }, + { + "epoch": 0.320875, + "grad_norm": 2.499419927597046, + "grad_norm_var": 11.450705499809004, + "learning_rate": 0.0001, + "loss": 1.2532, + "loss/crossentropy": 2.5330963134765625, + "loss/hidden": 0.97265625, + "loss/logits": 0.20151573419570923, + "loss/reg": 0.007906860671937466, + "step": 2567 + }, + { + "epoch": 0.321, + "grad_norm": 2.9942328929901123, + "grad_norm_var": 0.32562910742197554, + "learning_rate": 0.0001, + "loss": 1.4124, + "loss/crossentropy": 2.047797441482544, + "loss/hidden": 1.1171875, + "loss/logits": 0.21620450913906097, + "loss/reg": 0.007902403362095356, + "step": 2568 + }, + { + "epoch": 0.321125, + "grad_norm": 2.7636852264404297, + "grad_norm_var": 0.31945324599887875, + "learning_rate": 0.0001, + "loss": 1.1161, + "loss/crossentropy": 2.628687620162964, + "loss/hidden": 0.87890625, + "loss/logits": 0.15823045372962952, + "loss/reg": 0.007897865027189255, + "step": 2569 + }, + { + "epoch": 0.32125, + "grad_norm": 3.103445053100586, + "grad_norm_var": 0.2979969355216892, + "learning_rate": 0.0001, + "loss": 1.1125, + "loss/crossentropy": 2.620704412460327, + "loss/hidden": 0.8828125, + "loss/logits": 0.1507074236869812, + "loss/reg": 0.007893339730799198, + "step": 2570 + }, + { + "epoch": 0.321375, + "grad_norm": 3.3367934226989746, + "grad_norm_var": 0.3030639087904395, + "learning_rate": 0.0001, + "loss": 1.1457, + "loss/crossentropy": 2.564239025115967, + "loss/hidden": 0.8984375, + "loss/logits": 0.1683800369501114, + "loss/reg": 0.007888550870120525, + "step": 2571 + }, + { + "epoch": 0.3215, + "grad_norm": 3.724079132080078, + "grad_norm_var": 0.3133614103887962, + "learning_rate": 0.0001, + "loss": 1.2817, + "loss/crossentropy": 2.6372389793395996, + "loss/hidden": 1.0078125, + "loss/logits": 0.19503168761730194, + "loss/reg": 0.00788370706140995, + "step": 2572 + }, + { + "epoch": 0.321625, + "grad_norm": 8.869174003601074, + "grad_norm_var": 2.368319043086323, + "learning_rate": 0.0001, + "loss": 1.2064, + "loss/crossentropy": 2.6313023567199707, + "loss/hidden": 0.96875, + "loss/logits": 0.15889215469360352, + "loss/reg": 0.007879137992858887, + "step": 2573 + }, + { + "epoch": 0.32175, + "grad_norm": 3.2357513904571533, + "grad_norm_var": 2.3418711972123463, + "learning_rate": 0.0001, + "loss": 1.148, + "loss/crossentropy": 2.729548692703247, + "loss/hidden": 0.89453125, + "loss/logits": 0.17475661635398865, + "loss/reg": 0.007874642498791218, + "step": 2574 + }, + { + "epoch": 0.321875, + "grad_norm": 2.6461427211761475, + "grad_norm_var": 2.349149153770658, + "learning_rate": 0.0001, + "loss": 1.0281, + "loss/crossentropy": 2.849135160446167, + "loss/hidden": 0.80859375, + "loss/logits": 0.14081352949142456, + "loss/reg": 0.00787015724927187, + "step": 2575 + }, + { + "epoch": 0.322, + "grad_norm": 3.000493288040161, + "grad_norm_var": 2.2964302577261813, + "learning_rate": 0.0001, + "loss": 1.2042, + "loss/crossentropy": 2.4427332878112793, + "loss/hidden": 0.9609375, + "loss/logits": 0.16465440392494202, + "loss/reg": 0.007865658961236477, + "step": 2576 + }, + { + "epoch": 0.322125, + "grad_norm": 2.3975491523742676, + "grad_norm_var": 2.377006834312514, + "learning_rate": 0.0001, + "loss": 1.1652, + "loss/crossentropy": 2.3622562885284424, + "loss/hidden": 0.921875, + "loss/logits": 0.16467657685279846, + "loss/reg": 0.007861320860683918, + "step": 2577 + }, + { + "epoch": 0.32225, + "grad_norm": 4.879811763763428, + "grad_norm_var": 2.496375610429292, + "learning_rate": 0.0001, + "loss": 1.1249, + "loss/crossentropy": 2.716374397277832, + "loss/hidden": 0.91015625, + "loss/logits": 0.1361505091190338, + "loss/reg": 0.007856857031583786, + "step": 2578 + }, + { + "epoch": 0.322375, + "grad_norm": 2.653228759765625, + "grad_norm_var": 2.5338485087143225, + "learning_rate": 0.0001, + "loss": 1.1286, + "loss/crossentropy": 2.3120439052581787, + "loss/hidden": 0.89453125, + "loss/logits": 0.15558606386184692, + "loss/reg": 0.007852481678128242, + "step": 2579 + }, + { + "epoch": 0.3225, + "grad_norm": 2.3732690811157227, + "grad_norm_var": 2.51084793626862, + "learning_rate": 0.0001, + "loss": 1.0542, + "loss/crossentropy": 2.48455810546875, + "loss/hidden": 0.8203125, + "loss/logits": 0.15541693568229675, + "loss/reg": 0.007848329842090607, + "step": 2580 + }, + { + "epoch": 0.322625, + "grad_norm": 2.2885780334472656, + "grad_norm_var": 2.5577939346555154, + "learning_rate": 0.0001, + "loss": 1.0968, + "loss/crossentropy": 2.5193240642547607, + "loss/hidden": 0.859375, + "loss/logits": 0.15894070267677307, + "loss/reg": 0.007843822240829468, + "step": 2581 + }, + { + "epoch": 0.32275, + "grad_norm": 2.368438243865967, + "grad_norm_var": 2.616674558840077, + "learning_rate": 0.0001, + "loss": 1.1216, + "loss/crossentropy": 2.543750286102295, + "loss/hidden": 0.87890625, + "loss/logits": 0.16430911421775818, + "loss/reg": 0.007839600555598736, + "step": 2582 + }, + { + "epoch": 0.322875, + "grad_norm": 2.7130985260009766, + "grad_norm_var": 2.5961244046821927, + "learning_rate": 0.0001, + "loss": 1.1149, + "loss/crossentropy": 2.5501976013183594, + "loss/hidden": 0.87890625, + "loss/logits": 0.15763741731643677, + "loss/reg": 0.007835152558982372, + "step": 2583 + }, + { + "epoch": 0.323, + "grad_norm": 3.332221269607544, + "grad_norm_var": 2.5879418987983693, + "learning_rate": 0.0001, + "loss": 1.3418, + "loss/crossentropy": 2.464792013168335, + "loss/hidden": 1.046875, + "loss/logits": 0.2166520357131958, + "loss/reg": 0.007831021212041378, + "step": 2584 + }, + { + "epoch": 0.323125, + "grad_norm": 2.6998746395111084, + "grad_norm_var": 2.5932304005165743, + "learning_rate": 0.0001, + "loss": 1.1051, + "loss/crossentropy": 2.629274606704712, + "loss/hidden": 0.875, + "loss/logits": 0.15186628699302673, + "loss/reg": 0.007826832123100758, + "step": 2585 + }, + { + "epoch": 0.32325, + "grad_norm": 2.593357801437378, + "grad_norm_var": 2.6263541149868805, + "learning_rate": 0.0001, + "loss": 1.1353, + "loss/crossentropy": 2.4835731983184814, + "loss/hidden": 0.890625, + "loss/logits": 0.1664915233850479, + "loss/reg": 0.00782258901745081, + "step": 2586 + }, + { + "epoch": 0.323375, + "grad_norm": 3.2135069370269775, + "grad_norm_var": 2.627019672054109, + "learning_rate": 0.0001, + "loss": 1.0558, + "loss/crossentropy": 2.5876874923706055, + "loss/hidden": 0.83203125, + "loss/logits": 0.14562730491161346, + "loss/reg": 0.00781804509460926, + "step": 2587 + }, + { + "epoch": 0.3235, + "grad_norm": 2.3434550762176514, + "grad_norm_var": 2.6702560894095986, + "learning_rate": 0.0001, + "loss": 0.9702, + "loss/crossentropy": 2.3595199584960938, + "loss/hidden": 0.76953125, + "loss/logits": 0.12251024693250656, + "loss/reg": 0.007813764736056328, + "step": 2588 + }, + { + "epoch": 0.323625, + "grad_norm": 2.7084426879882812, + "grad_norm_var": 0.40652881496496984, + "learning_rate": 0.0001, + "loss": 1.1934, + "loss/crossentropy": 2.40519380569458, + "loss/hidden": 0.9453125, + "loss/logits": 0.16999296844005585, + "loss/reg": 0.007809475529938936, + "step": 2589 + }, + { + "epoch": 0.32375, + "grad_norm": 3.2095394134521484, + "grad_norm_var": 0.40519021013584106, + "learning_rate": 0.0001, + "loss": 1.2046, + "loss/crossentropy": 2.4516022205352783, + "loss/hidden": 0.94140625, + "loss/logits": 0.185172900557518, + "loss/reg": 0.007804952561855316, + "step": 2590 + }, + { + "epoch": 0.323875, + "grad_norm": 2.906341791152954, + "grad_norm_var": 0.40273733338422985, + "learning_rate": 0.0001, + "loss": 1.1454, + "loss/crossentropy": 2.215069532394409, + "loss/hidden": 0.90625, + "loss/logits": 0.16119013726711273, + "loss/reg": 0.007800604682415724, + "step": 2591 + }, + { + "epoch": 0.324, + "grad_norm": 8.041544914245605, + "grad_norm_var": 2.088741135090609, + "learning_rate": 0.0001, + "loss": 1.1322, + "loss/crossentropy": 2.5879595279693604, + "loss/hidden": 0.90234375, + "loss/logits": 0.1518431007862091, + "loss/reg": 0.007796355057507753, + "step": 2592 + }, + { + "epoch": 0.324125, + "grad_norm": 4.211373805999756, + "grad_norm_var": 2.107517443561201, + "learning_rate": 0.0001, + "loss": 1.4008, + "loss/crossentropy": 2.5505714416503906, + "loss/hidden": 1.1171875, + "loss/logits": 0.20565779507160187, + "loss/reg": 0.007792046293616295, + "step": 2593 + }, + { + "epoch": 0.32425, + "grad_norm": 2.848515748977661, + "grad_norm_var": 1.9330598330694877, + "learning_rate": 0.0001, + "loss": 1.0428, + "loss/crossentropy": 2.484156608581543, + "loss/hidden": 0.828125, + "loss/logits": 0.13679122924804688, + "loss/reg": 0.00778788048774004, + "step": 2594 + }, + { + "epoch": 0.324375, + "grad_norm": 3.4857234954833984, + "grad_norm_var": 1.9205070885605189, + "learning_rate": 0.0001, + "loss": 1.353, + "loss/crossentropy": 2.4910690784454346, + "loss/hidden": 1.046875, + "loss/logits": 0.22832083702087402, + "loss/reg": 0.007783424109220505, + "step": 2595 + }, + { + "epoch": 0.3245, + "grad_norm": 3.0777857303619385, + "grad_norm_var": 1.8730631684861834, + "learning_rate": 0.0001, + "loss": 1.3452, + "loss/crossentropy": 2.078157901763916, + "loss/hidden": 1.109375, + "loss/logits": 0.15806248784065247, + "loss/reg": 0.007778877392411232, + "step": 2596 + }, + { + "epoch": 0.324625, + "grad_norm": 3.4929563999176025, + "grad_norm_var": 1.8089128397021774, + "learning_rate": 0.0001, + "loss": 1.3896, + "loss/crossentropy": 2.5213236808776855, + "loss/hidden": 1.1015625, + "loss/logits": 0.21032989025115967, + "loss/reg": 0.007774207275360823, + "step": 2597 + }, + { + "epoch": 0.32475, + "grad_norm": 3.1592681407928467, + "grad_norm_var": 1.7468330858856993, + "learning_rate": 0.0001, + "loss": 1.278, + "loss/crossentropy": 2.2738771438598633, + "loss/hidden": 0.99609375, + "loss/logits": 0.20421947538852692, + "loss/reg": 0.007769493386149406, + "step": 2598 + }, + { + "epoch": 0.324875, + "grad_norm": 4.697360992431641, + "grad_norm_var": 1.81718409529495, + "learning_rate": 0.0001, + "loss": 1.3252, + "loss/crossentropy": 2.7483253479003906, + "loss/hidden": 1.0390625, + "loss/logits": 0.2084609568119049, + "loss/reg": 0.007764711976051331, + "step": 2599 + }, + { + "epoch": 0.325, + "grad_norm": 4.689452171325684, + "grad_norm_var": 1.901711341382982, + "learning_rate": 0.0001, + "loss": 1.2521, + "loss/crossentropy": 2.4038500785827637, + "loss/hidden": 0.96484375, + "loss/logits": 0.20968744158744812, + "loss/reg": 0.0077603100799024105, + "step": 2600 + }, + { + "epoch": 0.325125, + "grad_norm": 3.7870068550109863, + "grad_norm_var": 1.8471099999524152, + "learning_rate": 0.0001, + "loss": 1.4223, + "loss/crossentropy": 2.95763897895813, + "loss/hidden": 1.140625, + "loss/logits": 0.20407956838607788, + "loss/reg": 0.007756032049655914, + "step": 2601 + }, + { + "epoch": 0.32525, + "grad_norm": 8.312338829040527, + "grad_norm_var": 3.082431375172884, + "learning_rate": 0.0001, + "loss": 1.2734, + "loss/crossentropy": 2.4641189575195312, + "loss/hidden": 1.0234375, + "loss/logits": 0.17245729267597198, + "loss/reg": 0.007751623634248972, + "step": 2602 + }, + { + "epoch": 0.325375, + "grad_norm": 2.8623106479644775, + "grad_norm_var": 3.1275088065821834, + "learning_rate": 0.0001, + "loss": 1.1114, + "loss/crossentropy": 2.725635290145874, + "loss/hidden": 0.87109375, + "loss/logits": 0.16285188496112823, + "loss/reg": 0.007747271563857794, + "step": 2603 + }, + { + "epoch": 0.3255, + "grad_norm": 3.544903516769409, + "grad_norm_var": 2.954026938784157, + "learning_rate": 0.0001, + "loss": 1.5288, + "loss/crossentropy": 2.289555549621582, + "loss/hidden": 1.2109375, + "loss/logits": 0.24041545391082764, + "loss/reg": 0.0077430433593690395, + "step": 2604 + }, + { + "epoch": 0.325625, + "grad_norm": 2.457275867462158, + "grad_norm_var": 3.003388614388156, + "learning_rate": 0.0001, + "loss": 1.2423, + "loss/crossentropy": 2.3157176971435547, + "loss/hidden": 0.97265625, + "loss/logits": 0.1922733634710312, + "loss/reg": 0.007738523650914431, + "step": 2605 + }, + { + "epoch": 0.32575, + "grad_norm": 3.030805826187134, + "grad_norm_var": 3.025390077937041, + "learning_rate": 0.0001, + "loss": 1.2629, + "loss/crossentropy": 2.3268895149230957, + "loss/hidden": 1.0, + "loss/logits": 0.18556368350982666, + "loss/reg": 0.007734215352684259, + "step": 2606 + }, + { + "epoch": 0.325875, + "grad_norm": 4.555939674377441, + "grad_norm_var": 2.9466009947603005, + "learning_rate": 0.0001, + "loss": 1.2761, + "loss/crossentropy": 2.376013994216919, + "loss/hidden": 1.0546875, + "loss/logits": 0.1441352367401123, + "loss/reg": 0.0077300663106143475, + "step": 2607 + }, + { + "epoch": 0.326, + "grad_norm": 2.4952762126922607, + "grad_norm_var": 1.984640402073895, + "learning_rate": 0.0001, + "loss": 1.2484, + "loss/crossentropy": 2.573808193206787, + "loss/hidden": 0.9765625, + "loss/logits": 0.19456154108047485, + "loss/reg": 0.007725674193352461, + "step": 2608 + }, + { + "epoch": 0.326125, + "grad_norm": 3.2436137199401855, + "grad_norm_var": 1.9893543103573392, + "learning_rate": 0.0001, + "loss": 1.0603, + "loss/crossentropy": 2.6704530715942383, + "loss/hidden": 0.84375, + "loss/logits": 0.13931196928024292, + "loss/reg": 0.0077212098985910416, + "step": 2609 + }, + { + "epoch": 0.32625, + "grad_norm": 2.6457607746124268, + "grad_norm_var": 2.0158559807709024, + "learning_rate": 0.0001, + "loss": 1.0643, + "loss/crossentropy": 2.48942494392395, + "loss/hidden": 0.84765625, + "loss/logits": 0.13949505984783173, + "loss/reg": 0.00771659379824996, + "step": 2610 + }, + { + "epoch": 0.326375, + "grad_norm": 2.9313836097717285, + "grad_norm_var": 2.052459745909231, + "learning_rate": 0.0001, + "loss": 1.1551, + "loss/crossentropy": 2.5049359798431396, + "loss/hidden": 0.9296875, + "loss/logits": 0.1482677161693573, + "loss/reg": 0.007712232414633036, + "step": 2611 + }, + { + "epoch": 0.3265, + "grad_norm": 2.5596418380737305, + "grad_norm_var": 2.1112904358991478, + "learning_rate": 0.0001, + "loss": 1.255, + "loss/crossentropy": 2.6059083938598633, + "loss/hidden": 1.0, + "loss/logits": 0.17793220281600952, + "loss/reg": 0.007707822602242231, + "step": 2612 + }, + { + "epoch": 0.326625, + "grad_norm": 3.1190755367279053, + "grad_norm_var": 2.1280593015372893, + "learning_rate": 0.0001, + "loss": 1.138, + "loss/crossentropy": 2.456433057785034, + "loss/hidden": 0.91015625, + "loss/logits": 0.15076090395450592, + "loss/reg": 0.007703661452978849, + "step": 2613 + }, + { + "epoch": 0.32675, + "grad_norm": 4.6586012840271, + "grad_norm_var": 2.174312162374879, + "learning_rate": 0.0001, + "loss": 1.0382, + "loss/crossentropy": 2.080876588821411, + "loss/hidden": 0.84375, + "loss/logits": 0.11746075749397278, + "loss/reg": 0.007699252106249332, + "step": 2614 + }, + { + "epoch": 0.326875, + "grad_norm": 2.835235834121704, + "grad_norm_var": 2.1494669151850565, + "learning_rate": 0.0001, + "loss": 1.0397, + "loss/crossentropy": 2.7537124156951904, + "loss/hidden": 0.8203125, + "loss/logits": 0.14241716265678406, + "loss/reg": 0.007695053704082966, + "step": 2615 + }, + { + "epoch": 0.327, + "grad_norm": 3.0049407482147217, + "grad_norm_var": 2.0839285154426372, + "learning_rate": 0.0001, + "loss": 1.3834, + "loss/crossentropy": 2.381675958633423, + "loss/hidden": 1.09375, + "loss/logits": 0.2127782702445984, + "loss/reg": 0.007690922357141972, + "step": 2616 + }, + { + "epoch": 0.327125, + "grad_norm": 9.94892406463623, + "grad_norm_var": 4.690541602611514, + "learning_rate": 0.0001, + "loss": 1.7381, + "loss/crossentropy": 2.1527769565582275, + "loss/hidden": 1.40625, + "loss/logits": 0.25497421622276306, + "loss/reg": 0.0076871225610375404, + "step": 2617 + }, + { + "epoch": 0.32725, + "grad_norm": 2.976547956466675, + "grad_norm_var": 3.322224199681947, + "learning_rate": 0.0001, + "loss": 1.0938, + "loss/crossentropy": 3.0534162521362305, + "loss/hidden": 0.83203125, + "loss/logits": 0.18496111035346985, + "loss/reg": 0.007683487143367529, + "step": 2618 + }, + { + "epoch": 0.327375, + "grad_norm": 3.3262860774993896, + "grad_norm_var": 3.2928644105144804, + "learning_rate": 0.0001, + "loss": 1.1325, + "loss/crossentropy": 3.1036295890808105, + "loss/hidden": 0.87109375, + "loss/logits": 0.18456748127937317, + "loss/reg": 0.007679822854697704, + "step": 2619 + }, + { + "epoch": 0.3275, + "grad_norm": 2.950965642929077, + "grad_norm_var": 3.317959722652934, + "learning_rate": 0.0001, + "loss": 1.3355, + "loss/crossentropy": 2.441296339035034, + "loss/hidden": 1.0390625, + "loss/logits": 0.2197001576423645, + "loss/reg": 0.007675455883145332, + "step": 2620 + }, + { + "epoch": 0.327625, + "grad_norm": 2.957763910293579, + "grad_norm_var": 3.2609449570121267, + "learning_rate": 0.0001, + "loss": 1.2508, + "loss/crossentropy": 2.7704644203186035, + "loss/hidden": 0.98828125, + "loss/logits": 0.18581868708133698, + "loss/reg": 0.00767112523317337, + "step": 2621 + }, + { + "epoch": 0.32775, + "grad_norm": 3.2371456623077393, + "grad_norm_var": 3.248564015366132, + "learning_rate": 0.0001, + "loss": 1.1846, + "loss/crossentropy": 2.5042285919189453, + "loss/hidden": 0.93359375, + "loss/logits": 0.1743517518043518, + "loss/reg": 0.007667165715247393, + "step": 2622 + }, + { + "epoch": 0.327875, + "grad_norm": 3.416496515274048, + "grad_norm_var": 3.1830260122763963, + "learning_rate": 0.0001, + "loss": 1.2052, + "loss/crossentropy": 2.668356418609619, + "loss/hidden": 0.96484375, + "loss/logits": 0.16371533274650574, + "loss/reg": 0.007663294207304716, + "step": 2623 + }, + { + "epoch": 0.328, + "grad_norm": 3.055330514907837, + "grad_norm_var": 3.126167279969479, + "learning_rate": 0.0001, + "loss": 1.0786, + "loss/crossentropy": 2.644609212875366, + "loss/hidden": 0.8515625, + "loss/logits": 0.15043522417545319, + "loss/reg": 0.007659241557121277, + "step": 2624 + }, + { + "epoch": 0.328125, + "grad_norm": 3.334918975830078, + "grad_norm_var": 3.1229068417539905, + "learning_rate": 0.0001, + "loss": 1.0906, + "loss/crossentropy": 2.4782259464263916, + "loss/hidden": 0.8828125, + "loss/logits": 0.1311902403831482, + "loss/reg": 0.007655306253582239, + "step": 2625 + }, + { + "epoch": 0.32825, + "grad_norm": 4.632652759552002, + "grad_norm_var": 3.1274575419183814, + "learning_rate": 0.0001, + "loss": 1.5088, + "loss/crossentropy": 2.262376070022583, + "loss/hidden": 1.1953125, + "loss/logits": 0.23699502646923065, + "loss/reg": 0.007651374209672213, + "step": 2626 + }, + { + "epoch": 0.328375, + "grad_norm": 2.788149356842041, + "grad_norm_var": 3.143115468894764, + "learning_rate": 0.0001, + "loss": 0.9977, + "loss/crossentropy": 2.8059229850769043, + "loss/hidden": 0.8046875, + "loss/logits": 0.11657983064651489, + "loss/reg": 0.007647271268069744, + "step": 2627 + }, + { + "epoch": 0.3285, + "grad_norm": 8.038895606994629, + "grad_norm_var": 4.204538062115246, + "learning_rate": 0.0001, + "loss": 1.8372, + "loss/crossentropy": 2.869004011154175, + "loss/hidden": 1.3359375, + "loss/logits": 0.4247875511646271, + "loss/reg": 0.007642901036888361, + "step": 2628 + }, + { + "epoch": 0.328625, + "grad_norm": 3.3649466037750244, + "grad_norm_var": 4.178859515598371, + "learning_rate": 0.0001, + "loss": 1.4136, + "loss/crossentropy": 2.3756937980651855, + "loss/hidden": 1.1484375, + "loss/logits": 0.18873977661132812, + "loss/reg": 0.007638789713382721, + "step": 2629 + }, + { + "epoch": 0.32875, + "grad_norm": 3.2156546115875244, + "grad_norm_var": 4.188626833799399, + "learning_rate": 0.0001, + "loss": 1.3637, + "loss/crossentropy": 2.5369019508361816, + "loss/hidden": 1.0859375, + "loss/logits": 0.2014039158821106, + "loss/reg": 0.007634490262717009, + "step": 2630 + }, + { + "epoch": 0.328875, + "grad_norm": 2.5186150074005127, + "grad_norm_var": 4.241649576155923, + "learning_rate": 0.0001, + "loss": 1.1466, + "loss/crossentropy": 2.465280771255493, + "loss/hidden": 0.89453125, + "loss/logits": 0.1757795661687851, + "loss/reg": 0.007630498148500919, + "step": 2631 + }, + { + "epoch": 0.329, + "grad_norm": 2.8743369579315186, + "grad_norm_var": 4.258702850958074, + "learning_rate": 0.0001, + "loss": 1.1462, + "loss/crossentropy": 2.1694412231445312, + "loss/hidden": 0.92578125, + "loss/logits": 0.14413660764694214, + "loss/reg": 0.007625923492014408, + "step": 2632 + }, + { + "epoch": 0.329125, + "grad_norm": 2.5247464179992676, + "grad_norm_var": 1.7305338737499953, + "learning_rate": 0.0001, + "loss": 1.1132, + "loss/crossentropy": 2.7393782138824463, + "loss/hidden": 0.89453125, + "loss/logits": 0.14242886006832123, + "loss/reg": 0.007621730677783489, + "step": 2633 + }, + { + "epoch": 0.32925, + "grad_norm": 2.2605559825897217, + "grad_norm_var": 1.8078528033206014, + "learning_rate": 0.0001, + "loss": 1.0102, + "loss/crossentropy": 2.6780014038085938, + "loss/hidden": 0.80859375, + "loss/logits": 0.12543730437755585, + "loss/reg": 0.007617744617164135, + "step": 2634 + }, + { + "epoch": 0.329375, + "grad_norm": 3.1765859127044678, + "grad_norm_var": 1.810846350779106, + "learning_rate": 0.0001, + "loss": 1.2796, + "loss/crossentropy": 2.6743502616882324, + "loss/hidden": 0.9921875, + "loss/logits": 0.21129342913627625, + "loss/reg": 0.007613205350935459, + "step": 2635 + }, + { + "epoch": 0.3295, + "grad_norm": 4.388557434082031, + "grad_norm_var": 1.8545686479322578, + "learning_rate": 0.0001, + "loss": 1.5406, + "loss/crossentropy": 2.2469587326049805, + "loss/hidden": 1.234375, + "loss/logits": 0.23012441396713257, + "loss/reg": 0.007608664222061634, + "step": 2636 + }, + { + "epoch": 0.329625, + "grad_norm": 2.4936611652374268, + "grad_norm_var": 1.900754220338436, + "learning_rate": 0.0001, + "loss": 1.0943, + "loss/crossentropy": 2.708010673522949, + "loss/hidden": 0.8515625, + "loss/logits": 0.16666361689567566, + "loss/reg": 0.007603816222399473, + "step": 2637 + }, + { + "epoch": 0.32975, + "grad_norm": 3.0768558979034424, + "grad_norm_var": 1.9070710958546129, + "learning_rate": 0.0001, + "loss": 1.118, + "loss/crossentropy": 2.61555552482605, + "loss/hidden": 0.89453125, + "loss/logits": 0.1474892795085907, + "loss/reg": 0.0075995223596692085, + "step": 2638 + }, + { + "epoch": 0.329875, + "grad_norm": 2.866813898086548, + "grad_norm_var": 1.9282322051466196, + "learning_rate": 0.0001, + "loss": 1.0025, + "loss/crossentropy": 2.629725217819214, + "loss/hidden": 0.8046875, + "loss/logits": 0.12184653431177139, + "loss/reg": 0.007594859227538109, + "step": 2639 + }, + { + "epoch": 0.33, + "grad_norm": 2.832630157470703, + "grad_norm_var": 1.9419584187792336, + "learning_rate": 0.0001, + "loss": 1.0373, + "loss/crossentropy": 2.4773855209350586, + "loss/hidden": 0.82421875, + "loss/logits": 0.13717564940452576, + "loss/reg": 0.007590660825371742, + "step": 2640 + }, + { + "epoch": 0.330125, + "grad_norm": 2.3241353034973145, + "grad_norm_var": 2.0144884703544164, + "learning_rate": 0.0001, + "loss": 1.0894, + "loss/crossentropy": 2.5991921424865723, + "loss/hidden": 0.859375, + "loss/logits": 0.1541684865951538, + "loss/reg": 0.00758618488907814, + "step": 2641 + }, + { + "epoch": 0.33025, + "grad_norm": 3.2769205570220947, + "grad_norm_var": 1.8949958206797182, + "learning_rate": 0.0001, + "loss": 1.4368, + "loss/crossentropy": 2.4339427947998047, + "loss/hidden": 1.1171875, + "loss/logits": 0.24381572008132935, + "loss/reg": 0.007581994403153658, + "step": 2642 + }, + { + "epoch": 0.330375, + "grad_norm": 2.4310824871063232, + "grad_norm_var": 1.9250182193644925, + "learning_rate": 0.0001, + "loss": 1.027, + "loss/crossentropy": 2.76615571975708, + "loss/hidden": 0.8203125, + "loss/logits": 0.1309587061405182, + "loss/reg": 0.007577815093100071, + "step": 2643 + }, + { + "epoch": 0.3305, + "grad_norm": 6.903796672821045, + "grad_norm_var": 1.2775947375799828, + "learning_rate": 0.0001, + "loss": 1.426, + "loss/crossentropy": 2.5195741653442383, + "loss/hidden": 1.15625, + "loss/logits": 0.1940373331308365, + "loss/reg": 0.007573540322482586, + "step": 2644 + }, + { + "epoch": 0.330625, + "grad_norm": 3.249500274658203, + "grad_norm_var": 1.2752440549023845, + "learning_rate": 0.0001, + "loss": 1.0498, + "loss/crossentropy": 2.5033602714538574, + "loss/hidden": 0.83203125, + "loss/logits": 0.14205285906791687, + "loss/reg": 0.0075692241080105305, + "step": 2645 + }, + { + "epoch": 0.33075, + "grad_norm": 2.7828736305236816, + "grad_norm_var": 1.2832138331973943, + "learning_rate": 0.0001, + "loss": 1.1224, + "loss/crossentropy": 2.771994113922119, + "loss/hidden": 0.89453125, + "loss/logits": 0.15220947563648224, + "loss/reg": 0.007565027102828026, + "step": 2646 + }, + { + "epoch": 0.330875, + "grad_norm": 3.1820526123046875, + "grad_norm_var": 1.2571847123818316, + "learning_rate": 0.0001, + "loss": 1.1327, + "loss/crossentropy": 2.57944917678833, + "loss/hidden": 0.90625, + "loss/logits": 0.15087124705314636, + "loss/reg": 0.007560811471194029, + "step": 2647 + }, + { + "epoch": 0.331, + "grad_norm": 2.606199026107788, + "grad_norm_var": 1.272081447057974, + "learning_rate": 0.0001, + "loss": 1.2982, + "loss/crossentropy": 2.3060500621795654, + "loss/hidden": 1.015625, + "loss/logits": 0.20703396201133728, + "loss/reg": 0.007556730415672064, + "step": 2648 + }, + { + "epoch": 0.331125, + "grad_norm": 3.2134037017822266, + "grad_norm_var": 1.244442788895654, + "learning_rate": 0.0001, + "loss": 1.2156, + "loss/crossentropy": 2.6045126914978027, + "loss/hidden": 0.97265625, + "loss/logits": 0.1674089878797531, + "loss/reg": 0.0075527154840528965, + "step": 2649 + }, + { + "epoch": 0.33125, + "grad_norm": 4.38543176651001, + "grad_norm_var": 1.2628555349996151, + "learning_rate": 0.0001, + "loss": 1.4408, + "loss/crossentropy": 2.4084393978118896, + "loss/hidden": 1.1171875, + "loss/logits": 0.24815234541893005, + "loss/reg": 0.00754834758117795, + "step": 2650 + }, + { + "epoch": 0.331375, + "grad_norm": 2.7211809158325195, + "grad_norm_var": 1.2847933932031756, + "learning_rate": 0.0001, + "loss": 1.2015, + "loss/crossentropy": 2.51731276512146, + "loss/hidden": 0.953125, + "loss/logits": 0.17294739186763763, + "loss/reg": 0.007543998770415783, + "step": 2651 + }, + { + "epoch": 0.3315, + "grad_norm": 2.955306053161621, + "grad_norm_var": 1.2043827583213584, + "learning_rate": 0.0001, + "loss": 1.2373, + "loss/crossentropy": 2.6782066822052, + "loss/hidden": 0.9609375, + "loss/logits": 0.2009318768978119, + "loss/reg": 0.007539518643170595, + "step": 2652 + }, + { + "epoch": 0.331625, + "grad_norm": 4.024037837982178, + "grad_norm_var": 1.2053336268686639, + "learning_rate": 0.0001, + "loss": 1.1001, + "loss/crossentropy": 2.6157238483428955, + "loss/hidden": 0.875, + "loss/logits": 0.14977209270000458, + "loss/reg": 0.007535218261182308, + "step": 2653 + }, + { + "epoch": 0.33175, + "grad_norm": 3.901939868927002, + "grad_norm_var": 1.2231114592416512, + "learning_rate": 0.0001, + "loss": 1.0614, + "loss/crossentropy": 2.7018167972564697, + "loss/hidden": 0.84765625, + "loss/logits": 0.13844364881515503, + "loss/reg": 0.007530762813985348, + "step": 2654 + }, + { + "epoch": 0.331875, + "grad_norm": 3.2456393241882324, + "grad_norm_var": 1.2074940915691188, + "learning_rate": 0.0001, + "loss": 1.2765, + "loss/crossentropy": 2.635643482208252, + "loss/hidden": 0.98046875, + "loss/logits": 0.22078126668930054, + "loss/reg": 0.007526170928031206, + "step": 2655 + }, + { + "epoch": 0.332, + "grad_norm": 2.4693031311035156, + "grad_norm_var": 1.242128241472132, + "learning_rate": 0.0001, + "loss": 1.0504, + "loss/crossentropy": 2.684950590133667, + "loss/hidden": 0.8203125, + "loss/logits": 0.15488868951797485, + "loss/reg": 0.007521675433963537, + "step": 2656 + }, + { + "epoch": 0.332125, + "grad_norm": 3.023791790008545, + "grad_norm_var": 1.1765983294781523, + "learning_rate": 0.0001, + "loss": 1.1087, + "loss/crossentropy": 2.178278923034668, + "loss/hidden": 0.89453125, + "loss/logits": 0.1390073299407959, + "loss/reg": 0.00751698249951005, + "step": 2657 + }, + { + "epoch": 0.33225, + "grad_norm": 3.209632158279419, + "grad_norm_var": 1.1779701121170563, + "learning_rate": 0.0001, + "loss": 1.2032, + "loss/crossentropy": 2.3827853202819824, + "loss/hidden": 0.94140625, + "loss/logits": 0.1866905391216278, + "loss/reg": 0.00751233845949173, + "step": 2658 + }, + { + "epoch": 0.332375, + "grad_norm": 3.0369887351989746, + "grad_norm_var": 1.123117648727407, + "learning_rate": 0.0001, + "loss": 1.1105, + "loss/crossentropy": 2.3656508922576904, + "loss/hidden": 0.890625, + "loss/logits": 0.14477841556072235, + "loss/reg": 0.007508023642003536, + "step": 2659 + }, + { + "epoch": 0.3325, + "grad_norm": 3.6339616775512695, + "grad_norm_var": 0.2777043502389072, + "learning_rate": 0.0001, + "loss": 1.0879, + "loss/crossentropy": 2.6167714595794678, + "loss/hidden": 0.87890625, + "loss/logits": 0.133995920419693, + "loss/reg": 0.007503731641918421, + "step": 2660 + }, + { + "epoch": 0.332625, + "grad_norm": 2.827454090118408, + "grad_norm_var": 0.28760338896061055, + "learning_rate": 0.0001, + "loss": 1.1345, + "loss/crossentropy": 2.5317904949188232, + "loss/hidden": 0.89453125, + "loss/logits": 0.16494180262088776, + "loss/reg": 0.007499096915125847, + "step": 2661 + }, + { + "epoch": 0.33275, + "grad_norm": 2.612074375152588, + "grad_norm_var": 0.2989533021455297, + "learning_rate": 0.0001, + "loss": 1.0942, + "loss/crossentropy": 2.4415173530578613, + "loss/hidden": 0.87109375, + "loss/logits": 0.1481645703315735, + "loss/reg": 0.007494812365621328, + "step": 2662 + }, + { + "epoch": 0.332875, + "grad_norm": 2.49878191947937, + "grad_norm_var": 0.3289038208037158, + "learning_rate": 0.0001, + "loss": 0.9211, + "loss/crossentropy": 2.6655967235565186, + "loss/hidden": 0.734375, + "loss/logits": 0.11183229088783264, + "loss/reg": 0.0074904439970850945, + "step": 2663 + }, + { + "epoch": 0.333, + "grad_norm": 5.007753372192383, + "grad_norm_var": 0.5159391876141891, + "learning_rate": 0.0001, + "loss": 1.5653, + "loss/crossentropy": 2.7459094524383545, + "loss/hidden": 1.2109375, + "loss/logits": 0.27949392795562744, + "loss/reg": 0.007486012764275074, + "step": 2664 + }, + { + "epoch": 0.333125, + "grad_norm": 2.8720390796661377, + "grad_norm_var": 0.5270689719211643, + "learning_rate": 0.0001, + "loss": 1.0758, + "loss/crossentropy": 2.4798243045806885, + "loss/hidden": 0.86328125, + "loss/logits": 0.13766393065452576, + "loss/reg": 0.007481765933334827, + "step": 2665 + }, + { + "epoch": 0.33325, + "grad_norm": 2.9388954639434814, + "grad_norm_var": 0.4439827004404189, + "learning_rate": 0.0001, + "loss": 1.3954, + "loss/crossentropy": 2.3968465328216553, + "loss/hidden": 1.09375, + "loss/logits": 0.22686097025871277, + "loss/reg": 0.0074773826636374, + "step": 2666 + }, + { + "epoch": 0.333375, + "grad_norm": 2.954888105392456, + "grad_norm_var": 0.432906769038726, + "learning_rate": 0.0001, + "loss": 1.1467, + "loss/crossentropy": 2.850186824798584, + "loss/hidden": 0.8984375, + "loss/logits": 0.17352868616580963, + "loss/reg": 0.007473174482584, + "step": 2667 + }, + { + "epoch": 0.3335, + "grad_norm": 3.909715175628662, + "grad_norm_var": 0.4586000852620541, + "learning_rate": 0.0001, + "loss": 1.4417, + "loss/crossentropy": 2.5017826557159424, + "loss/hidden": 1.1328125, + "loss/logits": 0.2342115342617035, + "loss/reg": 0.007469098549336195, + "step": 2668 + }, + { + "epoch": 0.333625, + "grad_norm": 2.8999171257019043, + "grad_norm_var": 0.42312654554805035, + "learning_rate": 0.0001, + "loss": 1.2469, + "loss/crossentropy": 2.2989230155944824, + "loss/hidden": 1.0, + "loss/logits": 0.172258198261261, + "loss/reg": 0.007465035654604435, + "step": 2669 + }, + { + "epoch": 0.33375, + "grad_norm": 3.3458523750305176, + "grad_norm_var": 0.38967970719240624, + "learning_rate": 0.0001, + "loss": 1.3621, + "loss/crossentropy": 2.175699234008789, + "loss/hidden": 1.0859375, + "loss/logits": 0.20157396793365479, + "loss/reg": 0.0074610929004848, + "step": 2670 + }, + { + "epoch": 0.333875, + "grad_norm": 2.948096513748169, + "grad_norm_var": 0.3916336455124451, + "learning_rate": 0.0001, + "loss": 1.1999, + "loss/crossentropy": 2.5479187965393066, + "loss/hidden": 0.96875, + "loss/logits": 0.156605064868927, + "loss/reg": 0.007457202300429344, + "step": 2671 + }, + { + "epoch": 0.334, + "grad_norm": 3.169522523880005, + "grad_norm_var": 0.3599565981141341, + "learning_rate": 0.0001, + "loss": 1.3399, + "loss/crossentropy": 2.1956872940063477, + "loss/hidden": 1.0546875, + "loss/logits": 0.2106775939464569, + "loss/reg": 0.0074531701393425465, + "step": 2672 + }, + { + "epoch": 0.334125, + "grad_norm": 2.5755181312561035, + "grad_norm_var": 0.38188744654541035, + "learning_rate": 0.0001, + "loss": 1.0826, + "loss/crossentropy": 2.8377108573913574, + "loss/hidden": 0.84765625, + "loss/logits": 0.1604166328907013, + "loss/reg": 0.007449386175721884, + "step": 2673 + }, + { + "epoch": 0.33425, + "grad_norm": 2.8647687435150146, + "grad_norm_var": 0.38669671601911254, + "learning_rate": 0.0001, + "loss": 1.2533, + "loss/crossentropy": 2.5471339225769043, + "loss/hidden": 0.99609375, + "loss/logits": 0.18275025486946106, + "loss/reg": 0.007445590570569038, + "step": 2674 + }, + { + "epoch": 0.334375, + "grad_norm": 2.439521074295044, + "grad_norm_var": 0.41649748235214756, + "learning_rate": 0.0001, + "loss": 1.0287, + "loss/crossentropy": 2.4946768283843994, + "loss/hidden": 0.8046875, + "loss/logits": 0.14957815408706665, + "loss/reg": 0.007441421039402485, + "step": 2675 + }, + { + "epoch": 0.3345, + "grad_norm": 4.1679606437683105, + "grad_norm_var": 0.4727881794444632, + "learning_rate": 0.0001, + "loss": 1.3481, + "loss/crossentropy": 2.5403878688812256, + "loss/hidden": 1.078125, + "loss/logits": 0.1956249475479126, + "loss/reg": 0.007437132298946381, + "step": 2676 + }, + { + "epoch": 0.334625, + "grad_norm": 6.467916965484619, + "grad_norm_var": 1.155677681994151, + "learning_rate": 0.0001, + "loss": 1.0959, + "loss/crossentropy": 2.721575975418091, + "loss/hidden": 0.88671875, + "loss/logits": 0.13484179973602295, + "loss/reg": 0.007433152291923761, + "step": 2677 + }, + { + "epoch": 0.33475, + "grad_norm": 13.4847993850708, + "grad_norm_var": 7.467784365488909, + "learning_rate": 0.0001, + "loss": 1.7002, + "loss/crossentropy": 2.5913195610046387, + "loss/hidden": 1.4140625, + "loss/logits": 0.21183787286281586, + "loss/reg": 0.007428886368870735, + "step": 2678 + }, + { + "epoch": 0.334875, + "grad_norm": 2.686387300491333, + "grad_norm_var": 7.431579035348867, + "learning_rate": 0.0001, + "loss": 1.0905, + "loss/crossentropy": 2.4739487171173096, + "loss/hidden": 0.8671875, + "loss/logits": 0.14906096458435059, + "loss/reg": 0.007424928713589907, + "step": 2679 + }, + { + "epoch": 0.335, + "grad_norm": 2.619624376296997, + "grad_norm_var": 7.481739008077784, + "learning_rate": 0.0001, + "loss": 1.1308, + "loss/crossentropy": 2.6736466884613037, + "loss/hidden": 0.90234375, + "loss/logits": 0.1542046070098877, + "loss/reg": 0.007420879323035479, + "step": 2680 + }, + { + "epoch": 0.335125, + "grad_norm": 3.416043281555176, + "grad_norm_var": 7.425920703522782, + "learning_rate": 0.0001, + "loss": 1.2924, + "loss/crossentropy": 2.2129032611846924, + "loss/hidden": 1.03125, + "loss/logits": 0.18702274560928345, + "loss/reg": 0.007417050190269947, + "step": 2681 + }, + { + "epoch": 0.33525, + "grad_norm": 2.372584104537964, + "grad_norm_var": 7.520845978559933, + "learning_rate": 0.0001, + "loss": 1.1241, + "loss/crossentropy": 2.6171751022338867, + "loss/hidden": 0.890625, + "loss/logits": 0.15932147204875946, + "loss/reg": 0.007413401734083891, + "step": 2682 + }, + { + "epoch": 0.335375, + "grad_norm": 2.609815835952759, + "grad_norm_var": 7.571551323881124, + "learning_rate": 0.0001, + "loss": 1.1616, + "loss/crossentropy": 2.6003637313842773, + "loss/hidden": 0.921875, + "loss/logits": 0.165657639503479, + "loss/reg": 0.007409400772303343, + "step": 2683 + }, + { + "epoch": 0.3355, + "grad_norm": 2.5692524909973145, + "grad_norm_var": 7.677403985654079, + "learning_rate": 0.0001, + "loss": 1.1022, + "loss/crossentropy": 2.506622314453125, + "loss/hidden": 0.8828125, + "loss/logits": 0.14531753957271576, + "loss/reg": 0.007405311334878206, + "step": 2684 + }, + { + "epoch": 0.335625, + "grad_norm": 2.387026071548462, + "grad_norm_var": 7.75470346232348, + "learning_rate": 0.0001, + "loss": 1.1483, + "loss/crossentropy": 2.532626152038574, + "loss/hidden": 0.8984375, + "loss/logits": 0.17586958408355713, + "loss/reg": 0.007401699665933847, + "step": 2685 + }, + { + "epoch": 0.33575, + "grad_norm": 3.1154205799102783, + "grad_norm_var": 7.77067870393536, + "learning_rate": 0.0001, + "loss": 1.077, + "loss/crossentropy": 2.2997701168060303, + "loss/hidden": 0.8671875, + "loss/logits": 0.13586124777793884, + "loss/reg": 0.0073981089517474174, + "step": 2686 + }, + { + "epoch": 0.335875, + "grad_norm": 3.1250596046447754, + "grad_norm_var": 7.753870910862953, + "learning_rate": 0.0001, + "loss": 1.2985, + "loss/crossentropy": 2.3689544200897217, + "loss/hidden": 1.015625, + "loss/logits": 0.20893289148807526, + "loss/reg": 0.0073945848271250725, + "step": 2687 + }, + { + "epoch": 0.336, + "grad_norm": 2.4977219104766846, + "grad_norm_var": 7.834472234706237, + "learning_rate": 0.0001, + "loss": 1.1304, + "loss/crossentropy": 2.5560646057128906, + "loss/hidden": 0.875, + "loss/logits": 0.18148204684257507, + "loss/reg": 0.0073910970240831375, + "step": 2688 + }, + { + "epoch": 0.336125, + "grad_norm": 2.8722658157348633, + "grad_norm_var": 7.794991135436762, + "learning_rate": 0.0001, + "loss": 1.1996, + "loss/crossentropy": 2.3883895874023438, + "loss/hidden": 0.9609375, + "loss/logits": 0.16482338309288025, + "loss/reg": 0.0073870699852705, + "step": 2689 + }, + { + "epoch": 0.33625, + "grad_norm": 2.5151114463806152, + "grad_norm_var": 7.843017433562011, + "learning_rate": 0.0001, + "loss": 1.0385, + "loss/crossentropy": 2.2971785068511963, + "loss/hidden": 0.83203125, + "loss/logits": 0.13261422514915466, + "loss/reg": 0.007382875774055719, + "step": 2690 + }, + { + "epoch": 0.336375, + "grad_norm": 3.686410427093506, + "grad_norm_var": 7.729108858899104, + "learning_rate": 0.0001, + "loss": 1.0878, + "loss/crossentropy": 2.3442211151123047, + "loss/hidden": 0.89453125, + "loss/logits": 0.11945460736751556, + "loss/reg": 0.007378695998340845, + "step": 2691 + }, + { + "epoch": 0.3365, + "grad_norm": 2.811128854751587, + "grad_norm_var": 7.775266787886191, + "learning_rate": 0.0001, + "loss": 1.1372, + "loss/crossentropy": 2.2078144550323486, + "loss/hidden": 0.92578125, + "loss/logits": 0.1377006322145462, + "loss/reg": 0.007374503184109926, + "step": 2692 + }, + { + "epoch": 0.336625, + "grad_norm": 2.3359124660491943, + "grad_norm_var": 7.31867790615113, + "learning_rate": 0.0001, + "loss": 1.2308, + "loss/crossentropy": 2.1839120388031006, + "loss/hidden": 0.96875, + "loss/logits": 0.1883184313774109, + "loss/reg": 0.0073702833615243435, + "step": 2693 + }, + { + "epoch": 0.33675, + "grad_norm": 2.983259916305542, + "grad_norm_var": 0.1521928213154183, + "learning_rate": 0.0001, + "loss": 1.1142, + "loss/crossentropy": 2.738924026489258, + "loss/hidden": 0.88671875, + "loss/logits": 0.15384671092033386, + "loss/reg": 0.007366090547293425, + "step": 2694 + }, + { + "epoch": 0.336875, + "grad_norm": 3.5933401584625244, + "grad_norm_var": 0.19135292012681476, + "learning_rate": 0.0001, + "loss": 1.2041, + "loss/crossentropy": 2.758913993835449, + "loss/hidden": 0.95703125, + "loss/logits": 0.1734953224658966, + "loss/reg": 0.0073621212504804134, + "step": 2695 + }, + { + "epoch": 0.337, + "grad_norm": 2.3198959827423096, + "grad_norm_var": 0.20594956868449873, + "learning_rate": 0.0001, + "loss": 1.1019, + "loss/crossentropy": 2.5430984497070312, + "loss/hidden": 0.859375, + "loss/logits": 0.16898423433303833, + "loss/reg": 0.007358179893344641, + "step": 2696 + }, + { + "epoch": 0.337125, + "grad_norm": 2.297726631164551, + "grad_norm_var": 0.1960797841966799, + "learning_rate": 0.0001, + "loss": 1.0445, + "loss/crossentropy": 2.425297737121582, + "loss/hidden": 0.83203125, + "loss/logits": 0.1388843059539795, + "loss/reg": 0.007354001980274916, + "step": 2697 + }, + { + "epoch": 0.33725, + "grad_norm": 25.74700164794922, + "grad_norm_var": 33.149634573064674, + "learning_rate": 0.0001, + "loss": 1.0081, + "loss/crossentropy": 2.459630012512207, + "loss/hidden": 0.80078125, + "loss/logits": 0.13382871448993683, + "loss/reg": 0.007350105792284012, + "step": 2698 + }, + { + "epoch": 0.337375, + "grad_norm": 2.8932130336761475, + "grad_norm_var": 33.0939380081812, + "learning_rate": 0.0001, + "loss": 1.1815, + "loss/crossentropy": 2.416797399520874, + "loss/hidden": 0.92578125, + "loss/logits": 0.1822102665901184, + "loss/reg": 0.0073462845757603645, + "step": 2699 + }, + { + "epoch": 0.3375, + "grad_norm": 2.63588285446167, + "grad_norm_var": 33.07942259490764, + "learning_rate": 0.0001, + "loss": 1.0588, + "loss/crossentropy": 2.4677910804748535, + "loss/hidden": 0.83984375, + "loss/logits": 0.1454915851354599, + "loss/reg": 0.007342379540205002, + "step": 2700 + }, + { + "epoch": 0.337625, + "grad_norm": 2.684563636779785, + "grad_norm_var": 33.011503624184115, + "learning_rate": 0.0001, + "loss": 1.2493, + "loss/crossentropy": 3.0150465965270996, + "loss/hidden": 0.984375, + "loss/logits": 0.19150957465171814, + "loss/reg": 0.007338451687246561, + "step": 2701 + }, + { + "epoch": 0.33775, + "grad_norm": 3.2277727127075195, + "grad_norm_var": 32.995189584524354, + "learning_rate": 0.0001, + "loss": 1.3624, + "loss/crossentropy": 2.67769193649292, + "loss/hidden": 1.0703125, + "loss/logits": 0.21872764825820923, + "loss/reg": 0.007334563881158829, + "step": 2702 + }, + { + "epoch": 0.337875, + "grad_norm": 2.590430498123169, + "grad_norm_var": 33.09425204405826, + "learning_rate": 0.0001, + "loss": 1.3327, + "loss/crossentropy": 3.005976676940918, + "loss/hidden": 1.03125, + "loss/logits": 0.22810745239257812, + "loss/reg": 0.007330411113798618, + "step": 2703 + }, + { + "epoch": 0.338, + "grad_norm": 2.9008147716522217, + "grad_norm_var": 33.01126566751924, + "learning_rate": 0.0001, + "loss": 1.0266, + "loss/crossentropy": 2.5566608905792236, + "loss/hidden": 0.8125, + "loss/logits": 0.14079444110393524, + "loss/reg": 0.007326404098421335, + "step": 2704 + }, + { + "epoch": 0.338125, + "grad_norm": 3.033874034881592, + "grad_norm_var": 32.98308332711485, + "learning_rate": 0.0001, + "loss": 1.1548, + "loss/crossentropy": 2.6121175289154053, + "loss/hidden": 0.9296875, + "loss/logits": 0.1518634855747223, + "loss/reg": 0.0073225172236561775, + "step": 2705 + }, + { + "epoch": 0.33825, + "grad_norm": 3.1882259845733643, + "grad_norm_var": 32.85425931864166, + "learning_rate": 0.0001, + "loss": 1.0558, + "loss/crossentropy": 2.530656099319458, + "loss/hidden": 0.84765625, + "loss/logits": 0.1349201649427414, + "loss/reg": 0.007318373303860426, + "step": 2706 + }, + { + "epoch": 0.338375, + "grad_norm": 2.486389398574829, + "grad_norm_var": 33.043733083795, + "learning_rate": 0.0001, + "loss": 1.1486, + "loss/crossentropy": 2.5659339427948, + "loss/hidden": 0.91015625, + "loss/logits": 0.16530078649520874, + "loss/reg": 0.007314260583370924, + "step": 2707 + }, + { + "epoch": 0.3385, + "grad_norm": 2.984323501586914, + "grad_norm_var": 33.01277106082387, + "learning_rate": 0.0001, + "loss": 1.0478, + "loss/crossentropy": 2.4988791942596436, + "loss/hidden": 0.83203125, + "loss/logits": 0.14268635213375092, + "loss/reg": 0.00730998395010829, + "step": 2708 + }, + { + "epoch": 0.338625, + "grad_norm": 2.884584665298462, + "grad_norm_var": 32.892003800239, + "learning_rate": 0.0001, + "loss": 1.1471, + "loss/crossentropy": 2.657707929611206, + "loss/hidden": 0.8984375, + "loss/logits": 0.17564672231674194, + "loss/reg": 0.0073058404959738255, + "step": 2709 + }, + { + "epoch": 0.33875, + "grad_norm": 2.9017322063446045, + "grad_norm_var": 32.90649575736703, + "learning_rate": 0.0001, + "loss": 1.3149, + "loss/crossentropy": 2.379610538482666, + "loss/hidden": 1.0546875, + "loss/logits": 0.18724608421325684, + "loss/reg": 0.0073013948276638985, + "step": 2710 + }, + { + "epoch": 0.338875, + "grad_norm": 3.631481409072876, + "grad_norm_var": 32.90312970624515, + "learning_rate": 0.0001, + "loss": 1.3581, + "loss/crossentropy": 2.5535435676574707, + "loss/hidden": 1.0859375, + "loss/logits": 0.19914889335632324, + "loss/reg": 0.0072969431057572365, + "step": 2711 + }, + { + "epoch": 0.339, + "grad_norm": 3.82477068901062, + "grad_norm_var": 32.65227942078553, + "learning_rate": 0.0001, + "loss": 1.4732, + "loss/crossentropy": 2.0533363819122314, + "loss/hidden": 1.2265625, + "loss/logits": 0.17369887232780457, + "loss/reg": 0.007292632013559341, + "step": 2712 + }, + { + "epoch": 0.339125, + "grad_norm": 2.816918134689331, + "grad_norm_var": 32.52570388403659, + "learning_rate": 0.0001, + "loss": 1.2175, + "loss/crossentropy": 2.6925830841064453, + "loss/hidden": 0.98046875, + "loss/logits": 0.16413944959640503, + "loss/reg": 0.007287960033863783, + "step": 2713 + }, + { + "epoch": 0.33925, + "grad_norm": 3.5551934242248535, + "grad_norm_var": 0.14758096770437892, + "learning_rate": 0.0001, + "loss": 1.0995, + "loss/crossentropy": 2.400327682495117, + "loss/hidden": 0.8828125, + "loss/logits": 0.14382781088352203, + "loss/reg": 0.007283810991793871, + "step": 2714 + }, + { + "epoch": 0.339375, + "grad_norm": 2.7028913497924805, + "grad_norm_var": 0.15293562870299957, + "learning_rate": 0.0001, + "loss": 1.3454, + "loss/crossentropy": 2.206462860107422, + "loss/hidden": 1.0859375, + "loss/logits": 0.18671034276485443, + "loss/reg": 0.007279713172465563, + "step": 2715 + }, + { + "epoch": 0.3395, + "grad_norm": 2.5447471141815186, + "grad_norm_var": 0.15791713990029924, + "learning_rate": 0.0001, + "loss": 1.1209, + "loss/crossentropy": 2.0532896518707275, + "loss/hidden": 0.90234375, + "loss/logits": 0.1457746922969818, + "loss/reg": 0.007275587413460016, + "step": 2716 + }, + { + "epoch": 0.339625, + "grad_norm": 3.3144025802612305, + "grad_norm_var": 0.15643752610202835, + "learning_rate": 0.0001, + "loss": 1.1687, + "loss/crossentropy": 2.429457187652588, + "loss/hidden": 0.9140625, + "loss/logits": 0.1818898469209671, + "loss/reg": 0.0072710756212472916, + "step": 2717 + }, + { + "epoch": 0.33975, + "grad_norm": 3.0427777767181396, + "grad_norm_var": 0.15386555860601234, + "learning_rate": 0.0001, + "loss": 1.2334, + "loss/crossentropy": 2.7028565406799316, + "loss/hidden": 0.97265625, + "loss/logits": 0.18810953199863434, + "loss/reg": 0.007266809232532978, + "step": 2718 + }, + { + "epoch": 0.339875, + "grad_norm": 3.270733594894409, + "grad_norm_var": 0.1433526288148336, + "learning_rate": 0.0001, + "loss": 1.0677, + "loss/crossentropy": 2.57698917388916, + "loss/hidden": 0.84375, + "loss/logits": 0.1513090282678604, + "loss/reg": 0.007262683007866144, + "step": 2719 + }, + { + "epoch": 0.34, + "grad_norm": 4.045888423919678, + "grad_norm_var": 0.19981647877183378, + "learning_rate": 0.0001, + "loss": 1.6391, + "loss/crossentropy": 1.4268994331359863, + "loss/hidden": 1.34375, + "loss/logits": 0.22279112040996552, + "loss/reg": 0.007258587516844273, + "step": 2720 + }, + { + "epoch": 0.340125, + "grad_norm": 11.00594711303711, + "grad_norm_var": 4.059867580436813, + "learning_rate": 0.0001, + "loss": 1.4557, + "loss/crossentropy": 2.3171794414520264, + "loss/hidden": 1.1875, + "loss/logits": 0.1956402063369751, + "loss/reg": 0.007254304364323616, + "step": 2721 + }, + { + "epoch": 0.34025, + "grad_norm": 3.896909236907959, + "grad_norm_var": 4.048798732190429, + "learning_rate": 0.0001, + "loss": 1.094, + "loss/crossentropy": 2.5109593868255615, + "loss/hidden": 0.8828125, + "loss/logits": 0.13870997726917267, + "loss/reg": 0.007250032387673855, + "step": 2722 + }, + { + "epoch": 0.340375, + "grad_norm": 3.3574206829071045, + "grad_norm_var": 3.9573787319998623, + "learning_rate": 0.0001, + "loss": 1.118, + "loss/crossentropy": 2.521843910217285, + "loss/hidden": 0.88671875, + "loss/logits": 0.158864364027977, + "loss/reg": 0.007245725952088833, + "step": 2723 + }, + { + "epoch": 0.3405, + "grad_norm": 2.801597833633423, + "grad_norm_var": 3.977786125999356, + "learning_rate": 0.0001, + "loss": 1.1872, + "loss/crossentropy": 2.4897663593292236, + "loss/hidden": 0.953125, + "loss/logits": 0.1616796851158142, + "loss/reg": 0.007241373881697655, + "step": 2724 + }, + { + "epoch": 0.340625, + "grad_norm": 3.8615167140960693, + "grad_norm_var": 3.9279817131308166, + "learning_rate": 0.0001, + "loss": 1.0631, + "loss/crossentropy": 2.5819499492645264, + "loss/hidden": 0.86328125, + "loss/logits": 0.1274602711200714, + "loss/reg": 0.007237263489514589, + "step": 2725 + }, + { + "epoch": 0.34075, + "grad_norm": 2.9434053897857666, + "grad_norm_var": 3.9231772590045533, + "learning_rate": 0.0001, + "loss": 1.1736, + "loss/crossentropy": 2.451287031173706, + "loss/hidden": 0.9296875, + "loss/logits": 0.17155399918556213, + "loss/reg": 0.007233177777379751, + "step": 2726 + }, + { + "epoch": 0.340875, + "grad_norm": 2.488546371459961, + "grad_norm_var": 4.028755042133049, + "learning_rate": 0.0001, + "loss": 0.9937, + "loss/crossentropy": 2.3518362045288086, + "loss/hidden": 0.8046875, + "loss/logits": 0.11668910086154938, + "loss/reg": 0.007228991948068142, + "step": 2727 + }, + { + "epoch": 0.341, + "grad_norm": 2.8890128135681152, + "grad_norm_var": 4.070049409213925, + "learning_rate": 0.0001, + "loss": 1.3061, + "loss/crossentropy": 2.4156317710876465, + "loss/hidden": 1.0390625, + "loss/logits": 0.19478872418403625, + "loss/reg": 0.007224582135677338, + "step": 2728 + }, + { + "epoch": 0.341125, + "grad_norm": 3.0107357501983643, + "grad_norm_var": 4.050645703822002, + "learning_rate": 0.0001, + "loss": 1.0589, + "loss/crossentropy": 2.2087247371673584, + "loss/hidden": 0.8515625, + "loss/logits": 0.13512656092643738, + "loss/reg": 0.0072203571908175945, + "step": 2729 + }, + { + "epoch": 0.34125, + "grad_norm": 4.275131702423096, + "grad_norm_var": 4.071949311646509, + "learning_rate": 0.0001, + "loss": 1.2238, + "loss/crossentropy": 2.5131940841674805, + "loss/hidden": 0.99609375, + "loss/logits": 0.15554986894130707, + "loss/reg": 0.007216315716505051, + "step": 2730 + }, + { + "epoch": 0.341375, + "grad_norm": 2.8808586597442627, + "grad_norm_var": 4.049895234758925, + "learning_rate": 0.0001, + "loss": 1.2116, + "loss/crossentropy": 2.4813549518585205, + "loss/hidden": 0.953125, + "loss/logits": 0.1863250434398651, + "loss/reg": 0.007212034426629543, + "step": 2731 + }, + { + "epoch": 0.3415, + "grad_norm": 5.694790840148926, + "grad_norm_var": 4.173577764469879, + "learning_rate": 0.0001, + "loss": 1.2256, + "loss/crossentropy": 2.2878901958465576, + "loss/hidden": 1.0, + "loss/logits": 0.15352553129196167, + "loss/reg": 0.007207753602415323, + "step": 2732 + }, + { + "epoch": 0.341625, + "grad_norm": 4.0843186378479, + "grad_norm_var": 4.14807516912477, + "learning_rate": 0.0001, + "loss": 1.1801, + "loss/crossentropy": 2.5597105026245117, + "loss/hidden": 0.9140625, + "loss/logits": 0.19396942853927612, + "loss/reg": 0.007203707471489906, + "step": 2733 + }, + { + "epoch": 0.34175, + "grad_norm": 2.312744379043579, + "grad_norm_var": 4.27181824885245, + "learning_rate": 0.0001, + "loss": 1.1649, + "loss/crossentropy": 2.4508650302886963, + "loss/hidden": 0.91015625, + "loss/logits": 0.18273323774337769, + "loss/reg": 0.0071997810155153275, + "step": 2734 + }, + { + "epoch": 0.341875, + "grad_norm": 2.4480528831481934, + "grad_norm_var": 4.386019535417343, + "learning_rate": 0.0001, + "loss": 1.0965, + "loss/crossentropy": 2.693187713623047, + "loss/hidden": 0.87109375, + "loss/logits": 0.15348191559314728, + "loss/reg": 0.007195714395493269, + "step": 2735 + }, + { + "epoch": 0.342, + "grad_norm": 3.3437867164611816, + "grad_norm_var": 4.400812967327364, + "learning_rate": 0.0001, + "loss": 1.3385, + "loss/crossentropy": 2.392199993133545, + "loss/hidden": 1.0546875, + "loss/logits": 0.21190574765205383, + "loss/reg": 0.0071916659362614155, + "step": 2736 + }, + { + "epoch": 0.342125, + "grad_norm": 2.3784148693084717, + "grad_norm_var": 0.7992578099174039, + "learning_rate": 0.0001, + "loss": 1.0975, + "loss/crossentropy": 2.4370079040527344, + "loss/hidden": 0.875, + "loss/logits": 0.1505891978740692, + "loss/reg": 0.0071874805726110935, + "step": 2737 + }, + { + "epoch": 0.34225, + "grad_norm": 3.011951208114624, + "grad_norm_var": 0.7767937470107121, + "learning_rate": 0.0001, + "loss": 0.9872, + "loss/crossentropy": 2.2599635124206543, + "loss/hidden": 0.77734375, + "loss/logits": 0.138058140873909, + "loss/reg": 0.007183091249316931, + "step": 2738 + }, + { + "epoch": 0.342375, + "grad_norm": 4.16599178314209, + "grad_norm_var": 0.8307033972521822, + "learning_rate": 0.0001, + "loss": 1.3049, + "loss/crossentropy": 2.6293485164642334, + "loss/hidden": 1.0234375, + "loss/logits": 0.20969051122665405, + "loss/reg": 0.007178781554102898, + "step": 2739 + }, + { + "epoch": 0.3425, + "grad_norm": 2.924079656600952, + "grad_norm_var": 0.8237151176973299, + "learning_rate": 0.0001, + "loss": 1.2213, + "loss/crossentropy": 2.7024574279785156, + "loss/hidden": 0.9453125, + "loss/logits": 0.20427921414375305, + "loss/reg": 0.007174347992986441, + "step": 2740 + }, + { + "epoch": 0.342625, + "grad_norm": 2.9334263801574707, + "grad_norm_var": 0.8073942505145432, + "learning_rate": 0.0001, + "loss": 1.1832, + "loss/crossentropy": 2.5371458530426025, + "loss/hidden": 0.9453125, + "loss/logits": 0.16614662110805511, + "loss/reg": 0.007170196622610092, + "step": 2741 + }, + { + "epoch": 0.34275, + "grad_norm": 3.0960853099823, + "grad_norm_var": 0.8028829884082918, + "learning_rate": 0.0001, + "loss": 1.1787, + "loss/crossentropy": 2.625760316848755, + "loss/hidden": 0.94140625, + "loss/logits": 0.16565638780593872, + "loss/reg": 0.0071657984517514706, + "step": 2742 + }, + { + "epoch": 0.342875, + "grad_norm": 5.033172130584717, + "grad_norm_var": 0.9505456528456987, + "learning_rate": 0.0001, + "loss": 1.1994, + "loss/crossentropy": 2.152758836746216, + "loss/hidden": 0.98046875, + "loss/logits": 0.14727801084518433, + "loss/reg": 0.007161812391132116, + "step": 2743 + }, + { + "epoch": 0.343, + "grad_norm": 3.580379009246826, + "grad_norm_var": 0.932840327831593, + "learning_rate": 0.0001, + "loss": 1.3989, + "loss/crossentropy": 2.4321463108062744, + "loss/hidden": 1.1015625, + "loss/logits": 0.2257457971572876, + "loss/reg": 0.007157756015658379, + "step": 2744 + }, + { + "epoch": 0.343125, + "grad_norm": 6.597163200378418, + "grad_norm_var": 1.5274717314166613, + "learning_rate": 0.0001, + "loss": 1.7613, + "loss/crossentropy": 2.4759538173675537, + "loss/hidden": 1.3671875, + "loss/logits": 0.3225797414779663, + "loss/reg": 0.007153267506510019, + "step": 2745 + }, + { + "epoch": 0.34325, + "grad_norm": 3.344834327697754, + "grad_norm_var": 1.5068150242799665, + "learning_rate": 0.0001, + "loss": 1.2378, + "loss/crossentropy": 2.4943525791168213, + "loss/hidden": 0.9921875, + "loss/logits": 0.1741321086883545, + "loss/reg": 0.007148735225200653, + "step": 2746 + }, + { + "epoch": 0.343375, + "grad_norm": 2.5262553691864014, + "grad_norm_var": 1.549355114752211, + "learning_rate": 0.0001, + "loss": 1.0268, + "loss/crossentropy": 2.660968780517578, + "loss/hidden": 0.81640625, + "loss/logits": 0.13892903923988342, + "loss/reg": 0.007143804337829351, + "step": 2747 + }, + { + "epoch": 0.3435, + "grad_norm": 2.5791540145874023, + "grad_norm_var": 1.2826064783883566, + "learning_rate": 0.0001, + "loss": 1.1596, + "loss/crossentropy": 2.466801643371582, + "loss/hidden": 0.89453125, + "loss/logits": 0.19370737671852112, + "loss/reg": 0.007139659486711025, + "step": 2748 + }, + { + "epoch": 0.343625, + "grad_norm": 2.584523916244507, + "grad_norm_var": 1.2858456860591196, + "learning_rate": 0.0001, + "loss": 1.0134, + "loss/crossentropy": 2.6055426597595215, + "loss/hidden": 0.80078125, + "loss/logits": 0.14130814373493195, + "loss/reg": 0.00713552488014102, + "step": 2749 + }, + { + "epoch": 0.34375, + "grad_norm": 2.473423957824707, + "grad_norm_var": 1.2662280374419501, + "learning_rate": 0.0001, + "loss": 1.0168, + "loss/crossentropy": 2.4587204456329346, + "loss/hidden": 0.8046875, + "loss/logits": 0.1408056616783142, + "loss/reg": 0.0071313148364424706, + "step": 2750 + }, + { + "epoch": 0.343875, + "grad_norm": 3.362675428390503, + "grad_norm_var": 1.2129346622506942, + "learning_rate": 0.0001, + "loss": 1.0593, + "loss/crossentropy": 2.4907772541046143, + "loss/hidden": 0.86328125, + "loss/logits": 0.12477576732635498, + "loss/reg": 0.007126666605472565, + "step": 2751 + }, + { + "epoch": 0.344, + "grad_norm": 2.9708597660064697, + "grad_norm_var": 1.2229778396825282, + "learning_rate": 0.0001, + "loss": 1.3628, + "loss/crossentropy": 2.544905424118042, + "loss/hidden": 1.109375, + "loss/logits": 0.18215501308441162, + "loss/reg": 0.007122104987502098, + "step": 2752 + }, + { + "epoch": 0.344125, + "grad_norm": 3.4313297271728516, + "grad_norm_var": 1.1561976713558348, + "learning_rate": 0.0001, + "loss": 1.5462, + "loss/crossentropy": 2.4867279529571533, + "loss/hidden": 1.171875, + "loss/logits": 0.30313462018966675, + "loss/reg": 0.007117684464901686, + "step": 2753 + }, + { + "epoch": 0.34425, + "grad_norm": 3.202347993850708, + "grad_norm_var": 1.1482706440388595, + "learning_rate": 0.0001, + "loss": 1.4303, + "loss/crossentropy": 2.0979440212249756, + "loss/hidden": 1.140625, + "loss/logits": 0.218541219830513, + "loss/reg": 0.007113028317689896, + "step": 2754 + }, + { + "epoch": 0.344375, + "grad_norm": 2.814393997192383, + "grad_norm_var": 1.128974522603901, + "learning_rate": 0.0001, + "loss": 1.0262, + "loss/crossentropy": 2.4360811710357666, + "loss/hidden": 0.8203125, + "loss/logits": 0.1348360776901245, + "loss/reg": 0.007108248304575682, + "step": 2755 + }, + { + "epoch": 0.3445, + "grad_norm": 2.2301011085510254, + "grad_norm_var": 1.1976417844940281, + "learning_rate": 0.0001, + "loss": 1.1534, + "loss/crossentropy": 2.23420786857605, + "loss/hidden": 0.91796875, + "loss/logits": 0.16439306735992432, + "loss/reg": 0.0071041639894247055, + "step": 2756 + }, + { + "epoch": 0.344625, + "grad_norm": 2.471613883972168, + "grad_norm_var": 1.2333895248090585, + "learning_rate": 0.0001, + "loss": 1.0291, + "loss/crossentropy": 2.6555330753326416, + "loss/hidden": 0.81640625, + "loss/logits": 0.14171165227890015, + "loss/reg": 0.007099650334566832, + "step": 2757 + }, + { + "epoch": 0.34475, + "grad_norm": 4.258299827575684, + "grad_norm_var": 1.2910708192117906, + "learning_rate": 0.0001, + "loss": 1.0449, + "loss/crossentropy": 2.6711483001708984, + "loss/hidden": 0.8203125, + "loss/logits": 0.15358658134937286, + "loss/reg": 0.007095130626112223, + "step": 2758 + }, + { + "epoch": 0.344875, + "grad_norm": 2.5334291458129883, + "grad_norm_var": 1.1177104342747837, + "learning_rate": 0.0001, + "loss": 1.273, + "loss/crossentropy": 2.297759771347046, + "loss/hidden": 0.99609375, + "loss/logits": 0.20603995025157928, + "loss/reg": 0.007091089617460966, + "step": 2759 + }, + { + "epoch": 0.345, + "grad_norm": 2.5365281105041504, + "grad_norm_var": 1.1307899057501247, + "learning_rate": 0.0001, + "loss": 0.9488, + "loss/crossentropy": 2.4554195404052734, + "loss/hidden": 0.7578125, + "loss/logits": 0.12010732293128967, + "loss/reg": 0.007086923345923424, + "step": 2760 + }, + { + "epoch": 0.345125, + "grad_norm": 3.1713600158691406, + "grad_norm_var": 0.2759334342523703, + "learning_rate": 0.0001, + "loss": 1.4291, + "loss/crossentropy": 2.46505069732666, + "loss/hidden": 1.140625, + "loss/logits": 0.2176586538553238, + "loss/reg": 0.007082792464643717, + "step": 2761 + }, + { + "epoch": 0.34525, + "grad_norm": 4.415699005126953, + "grad_norm_var": 0.4103064596801969, + "learning_rate": 0.0001, + "loss": 1.1729, + "loss/crossentropy": 2.3409478664398193, + "loss/hidden": 0.94140625, + "loss/logits": 0.16069410741329193, + "loss/reg": 0.007078777067363262, + "step": 2762 + }, + { + "epoch": 0.345375, + "grad_norm": 3.1050095558166504, + "grad_norm_var": 0.3967961523545218, + "learning_rate": 0.0001, + "loss": 1.0122, + "loss/crossentropy": 2.6151516437530518, + "loss/hidden": 0.8046875, + "loss/logits": 0.1367163062095642, + "loss/reg": 0.007074722088873386, + "step": 2763 + }, + { + "epoch": 0.3455, + "grad_norm": 3.1937289237976074, + "grad_norm_var": 0.3851961871635884, + "learning_rate": 0.0001, + "loss": 1.2173, + "loss/crossentropy": 2.1596972942352295, + "loss/hidden": 0.9765625, + "loss/logits": 0.16998474299907684, + "loss/reg": 0.007070708088576794, + "step": 2764 + }, + { + "epoch": 0.345625, + "grad_norm": 3.171050786972046, + "grad_norm_var": 0.37051351252971093, + "learning_rate": 0.0001, + "loss": 1.0422, + "loss/crossentropy": 2.63969087600708, + "loss/hidden": 0.8203125, + "loss/logits": 0.15126848220825195, + "loss/reg": 0.007066650316119194, + "step": 2765 + }, + { + "epoch": 0.34575, + "grad_norm": 4.102024555206299, + "grad_norm_var": 0.4037293180445573, + "learning_rate": 0.0001, + "loss": 1.2695, + "loss/crossentropy": 2.2774484157562256, + "loss/hidden": 1.03125, + "loss/logits": 0.16759267449378967, + "loss/reg": 0.007062658667564392, + "step": 2766 + }, + { + "epoch": 0.345875, + "grad_norm": 3.2308101654052734, + "grad_norm_var": 0.40170368568593984, + "learning_rate": 0.0001, + "loss": 1.1007, + "loss/crossentropy": 2.5677249431610107, + "loss/hidden": 0.89453125, + "loss/logits": 0.1355830729007721, + "loss/reg": 0.0070586842484772205, + "step": 2767 + }, + { + "epoch": 0.346, + "grad_norm": 3.5821774005889893, + "grad_norm_var": 0.40822467175396254, + "learning_rate": 0.0001, + "loss": 1.2128, + "loss/crossentropy": 2.4344499111175537, + "loss/hidden": 0.96484375, + "loss/logits": 0.17737522721290588, + "loss/reg": 0.007054819725453854, + "step": 2768 + }, + { + "epoch": 0.346125, + "grad_norm": 3.0213751792907715, + "grad_norm_var": 0.4069377140061, + "learning_rate": 0.0001, + "loss": 1.0831, + "loss/crossentropy": 2.6722168922424316, + "loss/hidden": 0.86328125, + "loss/logits": 0.14934036135673523, + "loss/reg": 0.007050634827464819, + "step": 2769 + }, + { + "epoch": 0.34625, + "grad_norm": 3.4182581901550293, + "grad_norm_var": 0.41020685476289503, + "learning_rate": 0.0001, + "loss": 1.3133, + "loss/crossentropy": 2.5303635597229004, + "loss/hidden": 1.0625, + "loss/logits": 0.1803501546382904, + "loss/reg": 0.007046606857329607, + "step": 2770 + }, + { + "epoch": 0.346375, + "grad_norm": 2.7508528232574463, + "grad_norm_var": 0.4137556900626064, + "learning_rate": 0.0001, + "loss": 1.4352, + "loss/crossentropy": 2.163203239440918, + "loss/hidden": 1.140625, + "loss/logits": 0.2241097092628479, + "loss/reg": 0.0070424918085336685, + "step": 2771 + }, + { + "epoch": 0.3465, + "grad_norm": 2.4582598209381104, + "grad_norm_var": 0.38751836864882894, + "learning_rate": 0.0001, + "loss": 1.0065, + "loss/crossentropy": 2.403536319732666, + "loss/hidden": 0.8046875, + "loss/logits": 0.13143528997898102, + "loss/reg": 0.007038217503577471, + "step": 2772 + }, + { + "epoch": 0.346625, + "grad_norm": 2.750002861022949, + "grad_norm_var": 0.3648140356663807, + "learning_rate": 0.0001, + "loss": 1.0029, + "loss/crossentropy": 2.4958133697509766, + "loss/hidden": 0.80859375, + "loss/logits": 0.12398457527160645, + "loss/reg": 0.00703391432762146, + "step": 2773 + }, + { + "epoch": 0.34675, + "grad_norm": 2.2871015071868896, + "grad_norm_var": 0.3377109873440245, + "learning_rate": 0.0001, + "loss": 0.9093, + "loss/crossentropy": 2.778756856918335, + "loss/hidden": 0.73046875, + "loss/logits": 0.10855208337306976, + "loss/reg": 0.007029656786471605, + "step": 2774 + }, + { + "epoch": 0.346875, + "grad_norm": 2.8151392936706543, + "grad_norm_var": 0.32109014588972185, + "learning_rate": 0.0001, + "loss": 1.3875, + "loss/crossentropy": 2.391831636428833, + "loss/hidden": 1.1015625, + "loss/logits": 0.2156672477722168, + "loss/reg": 0.007025003898888826, + "step": 2775 + }, + { + "epoch": 0.347, + "grad_norm": 3.550903797149658, + "grad_norm_var": 0.30572987095613724, + "learning_rate": 0.0001, + "loss": 1.3186, + "loss/crossentropy": 2.3475117683410645, + "loss/hidden": 1.0703125, + "loss/logits": 0.1780453622341156, + "loss/reg": 0.00702088326215744, + "step": 2776 + }, + { + "epoch": 0.347125, + "grad_norm": 2.8909690380096436, + "grad_norm_var": 0.3113024686433533, + "learning_rate": 0.0001, + "loss": 1.1038, + "loss/crossentropy": 2.4547886848449707, + "loss/hidden": 0.89453125, + "loss/logits": 0.1391197144985199, + "loss/reg": 0.007016368675976992, + "step": 2777 + }, + { + "epoch": 0.34725, + "grad_norm": 2.3362374305725098, + "grad_norm_var": 0.23658291969921946, + "learning_rate": 0.0001, + "loss": 1.0971, + "loss/crossentropy": 2.1467626094818115, + "loss/hidden": 0.88671875, + "loss/logits": 0.14026862382888794, + "loss/reg": 0.007012220099568367, + "step": 2778 + }, + { + "epoch": 0.347375, + "grad_norm": 2.211862564086914, + "grad_norm_var": 0.27887604127437554, + "learning_rate": 0.0001, + "loss": 1.0984, + "loss/crossentropy": 2.4441874027252197, + "loss/hidden": 0.86328125, + "loss/logits": 0.16507872939109802, + "loss/reg": 0.007007448934018612, + "step": 2779 + }, + { + "epoch": 0.3475, + "grad_norm": 2.563422441482544, + "grad_norm_var": 0.286221161202896, + "learning_rate": 0.0001, + "loss": 0.9938, + "loss/crossentropy": 2.7146382331848145, + "loss/hidden": 0.80078125, + "loss/logits": 0.12295642495155334, + "loss/reg": 0.00700345216318965, + "step": 2780 + }, + { + "epoch": 0.347625, + "grad_norm": 2.9547207355499268, + "grad_norm_var": 0.2826627313297306, + "learning_rate": 0.0001, + "loss": 1.1155, + "loss/crossentropy": 2.6305973529815674, + "loss/hidden": 0.88671875, + "loss/logits": 0.1587519645690918, + "loss/reg": 0.006999018602073193, + "step": 2781 + }, + { + "epoch": 0.34775, + "grad_norm": 2.786308765411377, + "grad_norm_var": 0.18573387611426118, + "learning_rate": 0.0001, + "loss": 1.1227, + "loss/crossentropy": 2.6042163372039795, + "loss/hidden": 0.89453125, + "loss/logits": 0.15824094414710999, + "loss/reg": 0.006994720082730055, + "step": 2782 + }, + { + "epoch": 0.347875, + "grad_norm": 2.4689371585845947, + "grad_norm_var": 0.18338151015169993, + "learning_rate": 0.0001, + "loss": 1.1113, + "loss/crossentropy": 2.308681011199951, + "loss/hidden": 0.88671875, + "loss/logits": 0.1547260284423828, + "loss/reg": 0.006990294437855482, + "step": 2783 + }, + { + "epoch": 0.348, + "grad_norm": 2.436281442642212, + "grad_norm_var": 0.14638731224816107, + "learning_rate": 0.0001, + "loss": 0.9468, + "loss/crossentropy": 2.578493356704712, + "loss/hidden": 0.75390625, + "loss/logits": 0.12301641702651978, + "loss/reg": 0.006986573804169893, + "step": 2784 + }, + { + "epoch": 0.348125, + "grad_norm": 2.808598756790161, + "grad_norm_var": 0.1409871412403748, + "learning_rate": 0.0001, + "loss": 0.9522, + "loss/crossentropy": 2.610727071762085, + "loss/hidden": 0.765625, + "loss/logits": 0.11671020090579987, + "loss/reg": 0.006982461083680391, + "step": 2785 + }, + { + "epoch": 0.34825, + "grad_norm": 4.061357498168945, + "grad_norm_var": 0.2268811956026087, + "learning_rate": 0.0001, + "loss": 1.1025, + "loss/crossentropy": 2.986910104751587, + "loss/hidden": 0.875, + "loss/logits": 0.1576857566833496, + "loss/reg": 0.006978299003094435, + "step": 2786 + }, + { + "epoch": 0.348375, + "grad_norm": 2.4104974269866943, + "grad_norm_var": 0.2344540357634834, + "learning_rate": 0.0001, + "loss": 1.0867, + "loss/crossentropy": 2.564767837524414, + "loss/hidden": 0.86328125, + "loss/logits": 0.1537053883075714, + "loss/reg": 0.006974075920879841, + "step": 2787 + }, + { + "epoch": 0.3485, + "grad_norm": 3.2263941764831543, + "grad_norm_var": 0.24279190543831344, + "learning_rate": 0.0001, + "loss": 1.1495, + "loss/crossentropy": 2.2021844387054443, + "loss/hidden": 0.93359375, + "loss/logits": 0.14620757102966309, + "loss/reg": 0.00696988869458437, + "step": 2788 + }, + { + "epoch": 0.348625, + "grad_norm": 6.390467643737793, + "grad_norm_var": 1.054154327937537, + "learning_rate": 0.0001, + "loss": 1.544, + "loss/crossentropy": 2.734853744506836, + "loss/hidden": 1.1875, + "loss/logits": 0.2868664860725403, + "loss/reg": 0.0069659799337387085, + "step": 2789 + }, + { + "epoch": 0.34875, + "grad_norm": 3.952646017074585, + "grad_norm_var": 1.066451712917084, + "learning_rate": 0.0001, + "loss": 1.2588, + "loss/crossentropy": 2.5528910160064697, + "loss/hidden": 1.0, + "loss/logits": 0.18918466567993164, + "loss/reg": 0.006962075363844633, + "step": 2790 + }, + { + "epoch": 0.348875, + "grad_norm": 3.128770589828491, + "grad_norm_var": 1.059995401594858, + "learning_rate": 0.0001, + "loss": 1.3478, + "loss/crossentropy": 2.765064239501953, + "loss/hidden": 1.0625, + "loss/logits": 0.2156769335269928, + "loss/reg": 0.00695825694128871, + "step": 2791 + }, + { + "epoch": 0.349, + "grad_norm": 2.427927255630493, + "grad_norm_var": 1.0767112704134054, + "learning_rate": 0.0001, + "loss": 1.0802, + "loss/crossentropy": 2.553795337677002, + "loss/hidden": 0.859375, + "loss/logits": 0.15131738781929016, + "loss/reg": 0.006954500451683998, + "step": 2792 + }, + { + "epoch": 0.349125, + "grad_norm": 4.388706684112549, + "grad_norm_var": 1.1819665060524127, + "learning_rate": 0.0001, + "loss": 1.2654, + "loss/crossentropy": 2.509208917617798, + "loss/hidden": 1.0234375, + "loss/logits": 0.1724545806646347, + "loss/reg": 0.006950253155082464, + "step": 2793 + }, + { + "epoch": 0.34925, + "grad_norm": 3.170996904373169, + "grad_norm_var": 1.133879896871439, + "learning_rate": 0.0001, + "loss": 1.295, + "loss/crossentropy": 2.251927375793457, + "loss/hidden": 1.0546875, + "loss/logits": 0.1708707958459854, + "loss/reg": 0.006946450565010309, + "step": 2794 + }, + { + "epoch": 0.349375, + "grad_norm": 2.3759429454803467, + "grad_norm_var": 1.1136877656354782, + "learning_rate": 0.0001, + "loss": 1.0187, + "loss/crossentropy": 2.3567440509796143, + "loss/hidden": 0.8203125, + "loss/logits": 0.12896063923835754, + "loss/reg": 0.006942687556147575, + "step": 2795 + }, + { + "epoch": 0.3495, + "grad_norm": 2.6718599796295166, + "grad_norm_var": 1.1049007684366337, + "learning_rate": 0.0001, + "loss": 0.9117, + "loss/crossentropy": 2.6315507888793945, + "loss/hidden": 0.73046875, + "loss/logits": 0.11187173426151276, + "loss/reg": 0.0069384160451591015, + "step": 2796 + }, + { + "epoch": 0.349625, + "grad_norm": 3.6650567054748535, + "grad_norm_var": 1.1104806798318416, + "learning_rate": 0.0001, + "loss": 1.0752, + "loss/crossentropy": 2.355313777923584, + "loss/hidden": 0.859375, + "loss/logits": 0.14648142457008362, + "loss/reg": 0.0069345817901194096, + "step": 2797 + }, + { + "epoch": 0.34975, + "grad_norm": 2.2831342220306396, + "grad_norm_var": 1.1589683348238753, + "learning_rate": 0.0001, + "loss": 1.0328, + "loss/crossentropy": 2.324018955230713, + "loss/hidden": 0.83203125, + "loss/logits": 0.13142096996307373, + "loss/reg": 0.006930700968950987, + "step": 2798 + }, + { + "epoch": 0.349875, + "grad_norm": 3.5909879207611084, + "grad_norm_var": 1.122041642806751, + "learning_rate": 0.0001, + "loss": 1.6079, + "loss/crossentropy": 2.3902969360351562, + "loss/hidden": 1.2421875, + "loss/logits": 0.2964108884334564, + "loss/reg": 0.006926889531314373, + "step": 2799 + }, + { + "epoch": 0.35, + "grad_norm": 2.748495578765869, + "grad_norm_var": 1.0916852781278361, + "learning_rate": 0.0001, + "loss": 1.2995, + "loss/crossentropy": 2.6936540603637695, + "loss/hidden": 1.0234375, + "loss/logits": 0.2068125307559967, + "loss/reg": 0.00692280288785696, + "step": 2800 + }, + { + "epoch": 0.350125, + "grad_norm": 3.219916820526123, + "grad_norm_var": 1.07358942656935, + "learning_rate": 0.0001, + "loss": 1.0645, + "loss/crossentropy": 2.45542049407959, + "loss/hidden": 0.86328125, + "loss/logits": 0.13199202716350555, + "loss/reg": 0.006918946746736765, + "step": 2801 + }, + { + "epoch": 0.35025, + "grad_norm": 2.701479911804199, + "grad_norm_var": 1.0614697475826003, + "learning_rate": 0.0001, + "loss": 1.1361, + "loss/crossentropy": 2.6068365573883057, + "loss/hidden": 0.90625, + "loss/logits": 0.16070356965065002, + "loss/reg": 0.00691502820700407, + "step": 2802 + }, + { + "epoch": 0.350375, + "grad_norm": 2.565516233444214, + "grad_norm_var": 1.0451634731842943, + "learning_rate": 0.0001, + "loss": 1.1464, + "loss/crossentropy": 2.3242340087890625, + "loss/hidden": 0.91796875, + "loss/logits": 0.15932051837444305, + "loss/reg": 0.0069109415635466576, + "step": 2803 + }, + { + "epoch": 0.3505, + "grad_norm": 3.4773285388946533, + "grad_norm_var": 1.0472462596601104, + "learning_rate": 0.0001, + "loss": 1.3019, + "loss/crossentropy": 2.405390977859497, + "loss/hidden": 1.0390625, + "loss/logits": 0.19374266266822815, + "loss/reg": 0.0069069042801856995, + "step": 2804 + }, + { + "epoch": 0.350625, + "grad_norm": 2.7527010440826416, + "grad_norm_var": 0.3741078999525861, + "learning_rate": 0.0001, + "loss": 1.1287, + "loss/crossentropy": 2.4707348346710205, + "loss/hidden": 0.8984375, + "loss/logits": 0.1612110435962677, + "loss/reg": 0.00690265279263258, + "step": 2805 + }, + { + "epoch": 0.35075, + "grad_norm": 3.325270414352417, + "grad_norm_var": 0.32488217037953454, + "learning_rate": 0.0001, + "loss": 0.9022, + "loss/crossentropy": 2.63603138923645, + "loss/hidden": 0.72265625, + "loss/logits": 0.11054549366235733, + "loss/reg": 0.006898204796016216, + "step": 2806 + }, + { + "epoch": 0.350875, + "grad_norm": 3.7777018547058105, + "grad_norm_var": 0.359671495404775, + "learning_rate": 0.0001, + "loss": 1.1584, + "loss/crossentropy": 2.6735293865203857, + "loss/hidden": 0.921875, + "loss/logits": 0.16758015751838684, + "loss/reg": 0.006893798243254423, + "step": 2807 + }, + { + "epoch": 0.351, + "grad_norm": 2.889845371246338, + "grad_norm_var": 0.3333737231510419, + "learning_rate": 0.0001, + "loss": 1.2425, + "loss/crossentropy": 2.5033421516418457, + "loss/hidden": 0.98828125, + "loss/logits": 0.18531502783298492, + "loss/reg": 0.006889672018587589, + "step": 2808 + }, + { + "epoch": 0.351125, + "grad_norm": 3.0140860080718994, + "grad_norm_var": 0.21533148605261943, + "learning_rate": 0.0001, + "loss": 1.2484, + "loss/crossentropy": 2.2085347175598145, + "loss/hidden": 1.0078125, + "loss/logits": 0.17171190679073334, + "loss/reg": 0.006885706912726164, + "step": 2809 + }, + { + "epoch": 0.35125, + "grad_norm": 3.193214178085327, + "grad_norm_var": 0.21582623873827061, + "learning_rate": 0.0001, + "loss": 1.2962, + "loss/crossentropy": 2.635164499282837, + "loss/hidden": 1.015625, + "loss/logits": 0.21172842383384705, + "loss/reg": 0.006881781853735447, + "step": 2810 + }, + { + "epoch": 0.351375, + "grad_norm": 2.405691623687744, + "grad_norm_var": 0.21334362836626586, + "learning_rate": 0.0001, + "loss": 1.1261, + "loss/crossentropy": 2.3777668476104736, + "loss/hidden": 0.91015625, + "loss/logits": 0.14712069928646088, + "loss/reg": 0.006877511274069548, + "step": 2811 + }, + { + "epoch": 0.3515, + "grad_norm": 3.866748332977295, + "grad_norm_var": 0.24748891645961824, + "learning_rate": 0.0001, + "loss": 1.0178, + "loss/crossentropy": 2.591752290725708, + "loss/hidden": 0.8046875, + "loss/logits": 0.1444096863269806, + "loss/reg": 0.006873234175145626, + "step": 2812 + }, + { + "epoch": 0.351625, + "grad_norm": 2.689413547515869, + "grad_norm_var": 0.23247694849763623, + "learning_rate": 0.0001, + "loss": 1.1894, + "loss/crossentropy": 2.535344123840332, + "loss/hidden": 0.94140625, + "loss/logits": 0.17930902540683746, + "loss/reg": 0.006868908181786537, + "step": 2813 + }, + { + "epoch": 0.35175, + "grad_norm": 2.8606173992156982, + "grad_norm_var": 0.19570926727996796, + "learning_rate": 0.0001, + "loss": 1.1547, + "loss/crossentropy": 2.651958703994751, + "loss/hidden": 0.9140625, + "loss/logits": 0.17198587954044342, + "loss/reg": 0.006864617113023996, + "step": 2814 + }, + { + "epoch": 0.351875, + "grad_norm": 2.631596088409424, + "grad_norm_var": 0.18626442876211488, + "learning_rate": 0.0001, + "loss": 1.0665, + "loss/crossentropy": 2.923027276992798, + "loss/hidden": 0.859375, + "loss/logits": 0.13852520287036896, + "loss/reg": 0.006860729772597551, + "step": 2815 + }, + { + "epoch": 0.352, + "grad_norm": 2.6410794258117676, + "grad_norm_var": 0.19069473175449783, + "learning_rate": 0.0001, + "loss": 1.1021, + "loss/crossentropy": 2.792271137237549, + "loss/hidden": 0.8828125, + "loss/logits": 0.15074054896831512, + "loss/reg": 0.006856528576463461, + "step": 2816 + }, + { + "epoch": 0.352125, + "grad_norm": 7.070716381072998, + "grad_norm_var": 1.230008173731672, + "learning_rate": 0.0001, + "loss": 1.3305, + "loss/crossentropy": 2.3979732990264893, + "loss/hidden": 1.078125, + "loss/logits": 0.18387383222579956, + "loss/reg": 0.006852543447166681, + "step": 2817 + }, + { + "epoch": 0.35225, + "grad_norm": 2.4672653675079346, + "grad_norm_var": 1.2502988371574246, + "learning_rate": 0.0001, + "loss": 0.9889, + "loss/crossentropy": 2.909008502960205, + "loss/hidden": 0.79296875, + "loss/logits": 0.12748822569847107, + "loss/reg": 0.006848541088402271, + "step": 2818 + }, + { + "epoch": 0.352375, + "grad_norm": 2.6316936016082764, + "grad_norm_var": 1.244737621024069, + "learning_rate": 0.0001, + "loss": 1.021, + "loss/crossentropy": 2.3428428173065186, + "loss/hidden": 0.82421875, + "loss/logits": 0.12834063172340393, + "loss/reg": 0.006844407878816128, + "step": 2819 + }, + { + "epoch": 0.3525, + "grad_norm": 2.6285533905029297, + "grad_norm_var": 1.2618795275471084, + "learning_rate": 0.0001, + "loss": 1.1444, + "loss/crossentropy": 2.3677148818969727, + "loss/hidden": 0.91015625, + "loss/logits": 0.16581983864307404, + "loss/reg": 0.006840238347649574, + "step": 2820 + }, + { + "epoch": 0.352625, + "grad_norm": 3.0947043895721436, + "grad_norm_var": 1.249801245534674, + "learning_rate": 0.0001, + "loss": 1.299, + "loss/crossentropy": 2.4461467266082764, + "loss/hidden": 1.046875, + "loss/logits": 0.18377745151519775, + "loss/reg": 0.006836321670562029, + "step": 2821 + }, + { + "epoch": 0.35275, + "grad_norm": 2.735858201980591, + "grad_norm_var": 1.2616114077505658, + "learning_rate": 0.0001, + "loss": 1.0899, + "loss/crossentropy": 2.413236618041992, + "loss/hidden": 0.87109375, + "loss/logits": 0.15047301352024078, + "loss/reg": 0.006832106504589319, + "step": 2822 + }, + { + "epoch": 0.352875, + "grad_norm": 4.363320827484131, + "grad_norm_var": 1.3310882022759716, + "learning_rate": 0.0001, + "loss": 1.7687, + "loss/crossentropy": 2.1993408203125, + "loss/hidden": 1.3515625, + "loss/logits": 0.3488839268684387, + "loss/reg": 0.006828185170888901, + "step": 2823 + }, + { + "epoch": 0.353, + "grad_norm": 4.198661804199219, + "grad_norm_var": 1.3841961017807551, + "learning_rate": 0.0001, + "loss": 1.4305, + "loss/crossentropy": 2.4798121452331543, + "loss/hidden": 1.1328125, + "loss/logits": 0.22939807176589966, + "loss/reg": 0.006824191194027662, + "step": 2824 + }, + { + "epoch": 0.353125, + "grad_norm": 2.7924790382385254, + "grad_norm_var": 1.395146988796075, + "learning_rate": 0.0001, + "loss": 1.0841, + "loss/crossentropy": 2.8277339935302734, + "loss/hidden": 0.85546875, + "loss/logits": 0.1604047417640686, + "loss/reg": 0.006820080801844597, + "step": 2825 + }, + { + "epoch": 0.35325, + "grad_norm": 5.363793849945068, + "grad_norm_var": 1.6682630844117263, + "learning_rate": 0.0001, + "loss": 1.1539, + "loss/crossentropy": 2.9747114181518555, + "loss/hidden": 0.9296875, + "loss/logits": 0.15600863099098206, + "loss/reg": 0.006816211622208357, + "step": 2826 + }, + { + "epoch": 0.353375, + "grad_norm": 3.024595022201538, + "grad_norm_var": 1.6099347822020027, + "learning_rate": 0.0001, + "loss": 1.0028, + "loss/crossentropy": 2.7025463581085205, + "loss/hidden": 0.796875, + "loss/logits": 0.13784104585647583, + "loss/reg": 0.006812310311943293, + "step": 2827 + }, + { + "epoch": 0.3535, + "grad_norm": 2.6304514408111572, + "grad_norm_var": 1.6353339870560126, + "learning_rate": 0.0001, + "loss": 1.1201, + "loss/crossentropy": 2.371079921722412, + "loss/hidden": 0.91015625, + "loss/logits": 0.1418907791376114, + "loss/reg": 0.006808441132307053, + "step": 2828 + }, + { + "epoch": 0.353625, + "grad_norm": 3.3734726905822754, + "grad_norm_var": 1.6030478808075919, + "learning_rate": 0.0001, + "loss": 1.091, + "loss/crossentropy": 2.474904775619507, + "loss/hidden": 0.89453125, + "loss/logits": 0.12837998569011688, + "loss/reg": 0.006804563570767641, + "step": 2829 + }, + { + "epoch": 0.35375, + "grad_norm": 5.210881233215332, + "grad_norm_var": 1.7771240539816433, + "learning_rate": 0.0001, + "loss": 1.1383, + "loss/crossentropy": 2.8242027759552, + "loss/hidden": 0.890625, + "loss/logits": 0.17967696487903595, + "loss/reg": 0.006800902541726828, + "step": 2830 + }, + { + "epoch": 0.353875, + "grad_norm": 4.681361675262451, + "grad_norm_var": 1.787708643132524, + "learning_rate": 0.0001, + "loss": 1.0603, + "loss/crossentropy": 2.511890411376953, + "loss/hidden": 0.8515625, + "loss/logits": 0.14080020785331726, + "loss/reg": 0.006797020323574543, + "step": 2831 + }, + { + "epoch": 0.354, + "grad_norm": 2.5539495944976807, + "grad_norm_var": 1.8002735571535327, + "learning_rate": 0.0001, + "loss": 1.1243, + "loss/crossentropy": 2.5387470722198486, + "loss/hidden": 0.89453125, + "loss/logits": 0.16180011630058289, + "loss/reg": 0.00679308595135808, + "step": 2832 + }, + { + "epoch": 0.354125, + "grad_norm": 2.537637710571289, + "grad_norm_var": 1.0329890388947562, + "learning_rate": 0.0001, + "loss": 1.0704, + "loss/crossentropy": 2.5041210651397705, + "loss/hidden": 0.84375, + "loss/logits": 0.1587318778038025, + "loss/reg": 0.006789327599108219, + "step": 2833 + }, + { + "epoch": 0.35425, + "grad_norm": 3.75996470451355, + "grad_norm_var": 0.9778641494130509, + "learning_rate": 0.0001, + "loss": 1.262, + "loss/crossentropy": 2.258767604827881, + "loss/hidden": 0.99609375, + "loss/logits": 0.19803112745285034, + "loss/reg": 0.0067856465466320515, + "step": 2834 + }, + { + "epoch": 0.354375, + "grad_norm": 2.8594603538513184, + "grad_norm_var": 0.9555315606911979, + "learning_rate": 0.0001, + "loss": 1.2785, + "loss/crossentropy": 2.2535250186920166, + "loss/hidden": 1.03125, + "loss/logits": 0.17941920459270477, + "loss/reg": 0.006781726144254208, + "step": 2835 + }, + { + "epoch": 0.3545, + "grad_norm": 2.842460870742798, + "grad_norm_var": 0.9338770191504387, + "learning_rate": 0.0001, + "loss": 1.1883, + "loss/crossentropy": 2.8690316677093506, + "loss/hidden": 0.96875, + "loss/logits": 0.15173578262329102, + "loss/reg": 0.006777776870876551, + "step": 2836 + }, + { + "epoch": 0.354625, + "grad_norm": 12.214866638183594, + "grad_norm_var": 5.637861663959794, + "learning_rate": 0.0001, + "loss": 1.7289, + "loss/crossentropy": 2.578017234802246, + "loss/hidden": 1.4140625, + "loss/logits": 0.24708537757396698, + "loss/reg": 0.006774050183594227, + "step": 2837 + }, + { + "epoch": 0.35475, + "grad_norm": 2.883404016494751, + "grad_norm_var": 5.612947457877968, + "learning_rate": 0.0001, + "loss": 1.1919, + "loss/crossentropy": 2.533013343811035, + "loss/hidden": 0.953125, + "loss/logits": 0.17104831337928772, + "loss/reg": 0.0067701456137001514, + "step": 2838 + }, + { + "epoch": 0.354875, + "grad_norm": 2.8976144790649414, + "grad_norm_var": 5.691978659851372, + "learning_rate": 0.0001, + "loss": 1.0495, + "loss/crossentropy": 2.6040146350860596, + "loss/hidden": 0.8359375, + "loss/logits": 0.1459062248468399, + "loss/reg": 0.00676692696288228, + "step": 2839 + }, + { + "epoch": 0.355, + "grad_norm": 2.930161714553833, + "grad_norm_var": 5.757097290891412, + "learning_rate": 0.0001, + "loss": 1.0689, + "loss/crossentropy": 2.6926674842834473, + "loss/hidden": 0.859375, + "loss/logits": 0.14189040660858154, + "loss/reg": 0.0067633651196956635, + "step": 2840 + }, + { + "epoch": 0.355125, + "grad_norm": 2.8705530166625977, + "grad_norm_var": 5.745847262881331, + "learning_rate": 0.0001, + "loss": 1.422, + "loss/crossentropy": 2.1456615924835205, + "loss/hidden": 1.140625, + "loss/logits": 0.213736429810524, + "loss/reg": 0.006759498734027147, + "step": 2841 + }, + { + "epoch": 0.35525, + "grad_norm": 2.8019299507141113, + "grad_norm_var": 5.661047575033918, + "learning_rate": 0.0001, + "loss": 1.2269, + "loss/crossentropy": 2.5318074226379395, + "loss/hidden": 0.9921875, + "loss/logits": 0.1671692579984665, + "loss/reg": 0.006755680311471224, + "step": 2842 + }, + { + "epoch": 0.355375, + "grad_norm": 3.0711703300476074, + "grad_norm_var": 5.656650116907287, + "learning_rate": 0.0001, + "loss": 1.1378, + "loss/crossentropy": 2.311663866043091, + "loss/hidden": 0.92578125, + "loss/logits": 0.14447444677352905, + "loss/reg": 0.006752366665750742, + "step": 2843 + }, + { + "epoch": 0.3555, + "grad_norm": 3.0888113975524902, + "grad_norm_var": 5.600904311393808, + "learning_rate": 0.0001, + "loss": 1.1423, + "loss/crossentropy": 2.778822422027588, + "loss/hidden": 0.9140625, + "loss/logits": 0.16076108813285828, + "loss/reg": 0.006748510990291834, + "step": 2844 + }, + { + "epoch": 0.355625, + "grad_norm": 3.703735589981079, + "grad_norm_var": 5.589551068514529, + "learning_rate": 0.0001, + "loss": 1.3573, + "loss/crossentropy": 2.571119546890259, + "loss/hidden": 1.09375, + "loss/logits": 0.1961050182580948, + "loss/reg": 0.006745323073118925, + "step": 2845 + }, + { + "epoch": 0.35575, + "grad_norm": 3.1959891319274902, + "grad_norm_var": 5.466064280664754, + "learning_rate": 0.0001, + "loss": 1.2446, + "loss/crossentropy": 2.601706027984619, + "loss/hidden": 1.0078125, + "loss/logits": 0.16933943331241608, + "loss/reg": 0.006741812918335199, + "step": 2846 + }, + { + "epoch": 0.355875, + "grad_norm": 3.283367156982422, + "grad_norm_var": 5.401712766945622, + "learning_rate": 0.0001, + "loss": 1.2027, + "loss/crossentropy": 2.347984790802002, + "loss/hidden": 0.953125, + "loss/logits": 0.18218892812728882, + "loss/reg": 0.006738013122230768, + "step": 2847 + }, + { + "epoch": 0.356, + "grad_norm": 2.770186424255371, + "grad_norm_var": 5.374664950820928, + "learning_rate": 0.0001, + "loss": 1.1609, + "loss/crossentropy": 2.379267930984497, + "loss/hidden": 0.92578125, + "loss/logits": 0.16781800985336304, + "loss/reg": 0.006734793074429035, + "step": 2848 + }, + { + "epoch": 0.356125, + "grad_norm": 4.03292989730835, + "grad_norm_var": 5.301215965387299, + "learning_rate": 0.0001, + "loss": 1.3325, + "loss/crossentropy": 2.56318736076355, + "loss/hidden": 1.078125, + "loss/logits": 0.18708620965480804, + "loss/reg": 0.0067315008491277695, + "step": 2849 + }, + { + "epoch": 0.35625, + "grad_norm": 4.332407474517822, + "grad_norm_var": 5.326241972488862, + "learning_rate": 0.0001, + "loss": 1.4429, + "loss/crossentropy": 2.4864964485168457, + "loss/hidden": 1.140625, + "loss/logits": 0.23501905798912048, + "loss/reg": 0.0067283641546964645, + "step": 2850 + }, + { + "epoch": 0.356375, + "grad_norm": 3.862065553665161, + "grad_norm_var": 5.271866149175641, + "learning_rate": 0.0001, + "loss": 1.2958, + "loss/crossentropy": 2.670090436935425, + "loss/hidden": 1.0546875, + "loss/logits": 0.1738210916519165, + "loss/reg": 0.006725169252604246, + "step": 2851 + }, + { + "epoch": 0.3565, + "grad_norm": 9.186538696289062, + "grad_norm_var": 6.9783334202673775, + "learning_rate": 0.0001, + "loss": 1.2396, + "loss/crossentropy": 2.4273734092712402, + "loss/hidden": 1.03125, + "loss/logits": 0.14108848571777344, + "loss/reg": 0.006721911486238241, + "step": 2852 + }, + { + "epoch": 0.356625, + "grad_norm": 3.0828006267547607, + "grad_norm_var": 2.4258737347688717, + "learning_rate": 0.0001, + "loss": 1.0124, + "loss/crossentropy": 2.6282575130462646, + "loss/hidden": 0.8125, + "loss/logits": 0.13276182115077972, + "loss/reg": 0.006718279793858528, + "step": 2853 + }, + { + "epoch": 0.35675, + "grad_norm": 3.211562156677246, + "grad_norm_var": 2.4001734416835605, + "learning_rate": 0.0001, + "loss": 1.1361, + "loss/crossentropy": 2.7054433822631836, + "loss/hidden": 0.91015625, + "loss/logits": 0.15881550312042236, + "loss/reg": 0.006715097464621067, + "step": 2854 + }, + { + "epoch": 0.356875, + "grad_norm": 2.7019524574279785, + "grad_norm_var": 2.422067136687671, + "learning_rate": 0.0001, + "loss": 1.0836, + "loss/crossentropy": 2.5598132610321045, + "loss/hidden": 0.87109375, + "loss/logits": 0.1454331874847412, + "loss/reg": 0.006711202207952738, + "step": 2855 + }, + { + "epoch": 0.357, + "grad_norm": 3.171034812927246, + "grad_norm_var": 2.4031244227786335, + "learning_rate": 0.0001, + "loss": 1.2962, + "loss/crossentropy": 2.1901423931121826, + "loss/hidden": 1.0625, + "loss/logits": 0.16660726070404053, + "loss/reg": 0.006707904394716024, + "step": 2856 + }, + { + "epoch": 0.357125, + "grad_norm": 3.7662479877471924, + "grad_norm_var": 2.360426090236585, + "learning_rate": 0.0001, + "loss": 1.2526, + "loss/crossentropy": 2.558736562728882, + "loss/hidden": 0.99609375, + "loss/logits": 0.18949384987354279, + "loss/reg": 0.006704638246446848, + "step": 2857 + }, + { + "epoch": 0.35725, + "grad_norm": 2.783151388168335, + "grad_norm_var": 2.362706541604073, + "learning_rate": 0.0001, + "loss": 1.3155, + "loss/crossentropy": 2.403660297393799, + "loss/hidden": 1.03125, + "loss/logits": 0.21722620725631714, + "loss/reg": 0.006701475474983454, + "step": 2858 + }, + { + "epoch": 0.357375, + "grad_norm": 2.3664562702178955, + "grad_norm_var": 2.453089533237081, + "learning_rate": 0.0001, + "loss": 1.0674, + "loss/crossentropy": 2.558664560317993, + "loss/hidden": 0.84765625, + "loss/logits": 0.15275700390338898, + "loss/reg": 0.0066977450624108315, + "step": 2859 + }, + { + "epoch": 0.3575, + "grad_norm": 2.705460786819458, + "grad_norm_var": 2.491403459686654, + "learning_rate": 0.0001, + "loss": 1.1939, + "loss/crossentropy": 2.3926823139190674, + "loss/hidden": 0.96484375, + "loss/logits": 0.1621232032775879, + "loss/reg": 0.0066938879899680614, + "step": 2860 + }, + { + "epoch": 0.357625, + "grad_norm": 3.4779458045959473, + "grad_norm_var": 2.4925127279524264, + "learning_rate": 0.0001, + "loss": 1.3198, + "loss/crossentropy": 2.3816492557525635, + "loss/hidden": 1.046875, + "loss/logits": 0.20599167048931122, + "loss/reg": 0.006690531969070435, + "step": 2861 + }, + { + "epoch": 0.35775, + "grad_norm": 2.9604477882385254, + "grad_norm_var": 2.5093163074607143, + "learning_rate": 0.0001, + "loss": 1.2037, + "loss/crossentropy": 2.7264418601989746, + "loss/hidden": 0.98046875, + "loss/logits": 0.15633198618888855, + "loss/reg": 0.006686678621917963, + "step": 2862 + }, + { + "epoch": 0.357875, + "grad_norm": 4.753926753997803, + "grad_norm_var": 2.581232997694579, + "learning_rate": 0.0001, + "loss": 1.4275, + "loss/crossentropy": 2.495388984680176, + "loss/hidden": 1.1484375, + "loss/logits": 0.2122369408607483, + "loss/reg": 0.0066827707923948765, + "step": 2863 + }, + { + "epoch": 0.358, + "grad_norm": 2.6990163326263428, + "grad_norm_var": 2.5903522040736826, + "learning_rate": 0.0001, + "loss": 1.1401, + "loss/crossentropy": 1.747976541519165, + "loss/hidden": 0.95703125, + "loss/logits": 0.1162765771150589, + "loss/reg": 0.006678814068436623, + "step": 2864 + }, + { + "epoch": 0.358125, + "grad_norm": 3.8729324340820312, + "grad_norm_var": 2.5847083567643625, + "learning_rate": 0.0001, + "loss": 1.1398, + "loss/crossentropy": 2.410270929336548, + "loss/hidden": 0.91015625, + "loss/logits": 0.16293761134147644, + "loss/reg": 0.006674793548882008, + "step": 2865 + }, + { + "epoch": 0.35825, + "grad_norm": 2.9574732780456543, + "grad_norm_var": 2.5838769135005917, + "learning_rate": 0.0001, + "loss": 1.2066, + "loss/crossentropy": 2.7548558712005615, + "loss/hidden": 0.9453125, + "loss/logits": 0.1945320963859558, + "loss/reg": 0.006670871749520302, + "step": 2866 + }, + { + "epoch": 0.358375, + "grad_norm": 3.082350969314575, + "grad_norm_var": 2.594362928819282, + "learning_rate": 0.0001, + "loss": 1.2005, + "loss/crossentropy": 2.8796327114105225, + "loss/hidden": 0.953125, + "loss/logits": 0.18068921566009521, + "loss/reg": 0.006666826084256172, + "step": 2867 + }, + { + "epoch": 0.3585, + "grad_norm": 3.353243589401245, + "grad_norm_var": 0.33611903947230437, + "learning_rate": 0.0001, + "loss": 1.4163, + "loss/crossentropy": 2.21570086479187, + "loss/hidden": 1.1796875, + "loss/logits": 0.17003032565116882, + "loss/reg": 0.006663014180958271, + "step": 2868 + }, + { + "epoch": 0.358625, + "grad_norm": 3.633741617202759, + "grad_norm_var": 0.34764685538668044, + "learning_rate": 0.0001, + "loss": 1.208, + "loss/crossentropy": 2.436885118484497, + "loss/hidden": 0.97265625, + "loss/logits": 0.16879867017269135, + "loss/reg": 0.006659196224063635, + "step": 2869 + }, + { + "epoch": 0.35875, + "grad_norm": 2.733994960784912, + "grad_norm_var": 0.36234678712943663, + "learning_rate": 0.0001, + "loss": 1.1475, + "loss/crossentropy": 2.5145859718322754, + "loss/hidden": 0.91796875, + "loss/logits": 0.1629822850227356, + "loss/reg": 0.006655503064393997, + "step": 2870 + }, + { + "epoch": 0.358875, + "grad_norm": 2.6376380920410156, + "grad_norm_var": 0.3667793844575044, + "learning_rate": 0.0001, + "loss": 1.1549, + "loss/crossentropy": 2.3565375804901123, + "loss/hidden": 0.921875, + "loss/logits": 0.16647452116012573, + "loss/reg": 0.006651906296610832, + "step": 2871 + }, + { + "epoch": 0.359, + "grad_norm": 2.437222957611084, + "grad_norm_var": 0.40177055931237554, + "learning_rate": 0.0001, + "loss": 1.019, + "loss/crossentropy": 2.6894030570983887, + "loss/hidden": 0.8203125, + "loss/logits": 0.13218355178833008, + "loss/reg": 0.006648419890552759, + "step": 2872 + }, + { + "epoch": 0.359125, + "grad_norm": 2.7955782413482666, + "grad_norm_var": 0.37945572173417286, + "learning_rate": 0.0001, + "loss": 1.0123, + "loss/crossentropy": 2.387749433517456, + "loss/hidden": 0.81640625, + "loss/logits": 0.12944383919239044, + "loss/reg": 0.0066446177661418915, + "step": 2873 + }, + { + "epoch": 0.35925, + "grad_norm": 2.3675384521484375, + "grad_norm_var": 0.4065995967944633, + "learning_rate": 0.0001, + "loss": 1.0601, + "loss/crossentropy": 2.6212000846862793, + "loss/hidden": 0.84375, + "loss/logits": 0.1499435007572174, + "loss/reg": 0.006641164422035217, + "step": 2874 + }, + { + "epoch": 0.359375, + "grad_norm": 2.272757053375244, + "grad_norm_var": 0.41571529074281366, + "learning_rate": 0.0001, + "loss": 1.1525, + "loss/crossentropy": 2.560260772705078, + "loss/hidden": 0.92578125, + "loss/logits": 0.16035974025726318, + "loss/reg": 0.0066377646289765835, + "step": 2875 + }, + { + "epoch": 0.3595, + "grad_norm": 3.2367563247680664, + "grad_norm_var": 0.409210550975992, + "learning_rate": 0.0001, + "loss": 1.4227, + "loss/crossentropy": 2.497131586074829, + "loss/hidden": 1.15625, + "loss/logits": 0.2001260370016098, + "loss/reg": 0.006634165998548269, + "step": 2876 + }, + { + "epoch": 0.359625, + "grad_norm": 2.387392520904541, + "grad_norm_var": 0.42561048577206456, + "learning_rate": 0.0001, + "loss": 1.1455, + "loss/crossentropy": 2.5698604583740234, + "loss/hidden": 0.9140625, + "loss/logits": 0.1651347279548645, + "loss/reg": 0.006630334071815014, + "step": 2877 + }, + { + "epoch": 0.35975, + "grad_norm": 3.205293655395508, + "grad_norm_var": 0.4276947306043778, + "learning_rate": 0.0001, + "loss": 1.1507, + "loss/crossentropy": 2.52469801902771, + "loss/hidden": 0.93359375, + "loss/logits": 0.15082001686096191, + "loss/reg": 0.006626829970628023, + "step": 2878 + }, + { + "epoch": 0.359875, + "grad_norm": 2.6675567626953125, + "grad_norm_var": 0.21926296254120886, + "learning_rate": 0.0001, + "loss": 1.2039, + "loss/crossentropy": 2.3201608657836914, + "loss/hidden": 0.94921875, + "loss/logits": 0.18839901685714722, + "loss/reg": 0.006623328197747469, + "step": 2879 + }, + { + "epoch": 0.36, + "grad_norm": 2.7795443534851074, + "grad_norm_var": 0.21755022161935264, + "learning_rate": 0.0001, + "loss": 1.1007, + "loss/crossentropy": 2.522721529006958, + "loss/hidden": 0.87890625, + "loss/logits": 0.15562480688095093, + "loss/reg": 0.006619665306061506, + "step": 2880 + }, + { + "epoch": 0.360125, + "grad_norm": 6.588879585266113, + "grad_norm_var": 1.0304220531178685, + "learning_rate": 0.0001, + "loss": 2.4275, + "loss/crossentropy": 2.2891228199005127, + "loss/hidden": 1.703125, + "loss/logits": 0.6582626104354858, + "loss/reg": 0.006616054568439722, + "step": 2881 + }, + { + "epoch": 0.36025, + "grad_norm": 2.730316400527954, + "grad_norm_var": 1.0370873404697134, + "learning_rate": 0.0001, + "loss": 1.2042, + "loss/crossentropy": 2.1128990650177, + "loss/hidden": 0.984375, + "loss/logits": 0.1536768525838852, + "loss/reg": 0.006612489931285381, + "step": 2882 + }, + { + "epoch": 0.360375, + "grad_norm": 2.2931172847747803, + "grad_norm_var": 1.0733358087921807, + "learning_rate": 0.0001, + "loss": 1.0524, + "loss/crossentropy": 2.439734697341919, + "loss/hidden": 0.83984375, + "loss/logits": 0.1464591920375824, + "loss/reg": 0.006609074305742979, + "step": 2883 + }, + { + "epoch": 0.3605, + "grad_norm": 3.1156201362609863, + "grad_norm_var": 1.065911759681064, + "learning_rate": 0.0001, + "loss": 1.1877, + "loss/crossentropy": 2.335137128829956, + "loss/hidden": 0.9609375, + "loss/logits": 0.16069793701171875, + "loss/reg": 0.0066053117625415325, + "step": 2884 + }, + { + "epoch": 0.360625, + "grad_norm": 3.106614828109741, + "grad_norm_var": 1.0382223756217779, + "learning_rate": 0.0001, + "loss": 1.1332, + "loss/crossentropy": 2.79343318939209, + "loss/hidden": 0.91015625, + "loss/logits": 0.15698067843914032, + "loss/reg": 0.006601545959711075, + "step": 2885 + }, + { + "epoch": 0.36075, + "grad_norm": 3.9636623859405518, + "grad_norm_var": 1.0957155114555839, + "learning_rate": 0.0001, + "loss": 1.3255, + "loss/crossentropy": 3.009531021118164, + "loss/hidden": 1.0546875, + "loss/logits": 0.20478734374046326, + "loss/reg": 0.0065977489575743675, + "step": 2886 + }, + { + "epoch": 0.360875, + "grad_norm": 22.31818389892578, + "grad_norm_var": 24.256571333194323, + "learning_rate": 0.0001, + "loss": 1.6966, + "loss/crossentropy": 2.792743682861328, + "loss/hidden": 1.40625, + "loss/logits": 0.2244512438774109, + "loss/reg": 0.006593991070985794, + "step": 2887 + }, + { + "epoch": 0.361, + "grad_norm": 3.9209213256835938, + "grad_norm_var": 24.032251845393244, + "learning_rate": 0.0001, + "loss": 1.5375, + "loss/crossentropy": 2.3459908962249756, + "loss/hidden": 1.1796875, + "loss/logits": 0.2918829917907715, + "loss/reg": 0.006590109318494797, + "step": 2888 + }, + { + "epoch": 0.361125, + "grad_norm": 3.829087734222412, + "grad_norm_var": 23.883519837472242, + "learning_rate": 0.0001, + "loss": 1.4311, + "loss/crossentropy": 2.0121724605560303, + "loss/hidden": 1.15625, + "loss/logits": 0.2090192437171936, + "loss/reg": 0.006586330011487007, + "step": 2889 + }, + { + "epoch": 0.36125, + "grad_norm": 5.210920333862305, + "grad_norm_var": 23.60919833027089, + "learning_rate": 0.0001, + "loss": 1.3466, + "loss/crossentropy": 2.4832828044891357, + "loss/hidden": 1.0859375, + "loss/logits": 0.19486665725708008, + "loss/reg": 0.006582567002624273, + "step": 2890 + }, + { + "epoch": 0.361375, + "grad_norm": 3.052110195159912, + "grad_norm_var": 23.405154824515535, + "learning_rate": 0.0001, + "loss": 1.1598, + "loss/crossentropy": 2.469416856765747, + "loss/hidden": 0.9375, + "loss/logits": 0.15650694072246552, + "loss/reg": 0.006578729022294283, + "step": 2891 + }, + { + "epoch": 0.3615, + "grad_norm": 4.892289638519287, + "grad_norm_var": 23.264415500064125, + "learning_rate": 0.0001, + "loss": 1.2697, + "loss/crossentropy": 2.4329347610473633, + "loss/hidden": 1.03125, + "loss/logits": 0.17267325520515442, + "loss/reg": 0.00657490361481905, + "step": 2892 + }, + { + "epoch": 0.361625, + "grad_norm": 3.051300525665283, + "grad_norm_var": 23.082483015393834, + "learning_rate": 0.0001, + "loss": 1.1141, + "loss/crossentropy": 2.696091413497925, + "loss/hidden": 0.8984375, + "loss/logits": 0.14999747276306152, + "loss/reg": 0.0065711429342627525, + "step": 2893 + }, + { + "epoch": 0.36175, + "grad_norm": 2.4621989727020264, + "grad_norm_var": 23.274535406864825, + "learning_rate": 0.0001, + "loss": 1.1178, + "loss/crossentropy": 2.6220836639404297, + "loss/hidden": 0.90234375, + "loss/logits": 0.1497473120689392, + "loss/reg": 0.006567361298948526, + "step": 2894 + }, + { + "epoch": 0.361875, + "grad_norm": 2.80969500541687, + "grad_norm_var": 23.236353072093127, + "learning_rate": 0.0001, + "loss": 0.9475, + "loss/crossentropy": 2.591700553894043, + "loss/hidden": 0.7578125, + "loss/logits": 0.1240551620721817, + "loss/reg": 0.0065633803606033325, + "step": 2895 + }, + { + "epoch": 0.362, + "grad_norm": 2.9202463626861572, + "grad_norm_var": 23.200478177766655, + "learning_rate": 0.0001, + "loss": 1.1908, + "loss/crossentropy": 2.524949073791504, + "loss/hidden": 0.96875, + "loss/logits": 0.1564422994852066, + "loss/reg": 0.006559543777257204, + "step": 2896 + }, + { + "epoch": 0.362125, + "grad_norm": 2.6297080516815186, + "grad_norm_var": 23.218191437018, + "learning_rate": 0.0001, + "loss": 1.1195, + "loss/crossentropy": 2.46774959564209, + "loss/hidden": 0.8984375, + "loss/logits": 0.15547922253608704, + "loss/reg": 0.006555567961186171, + "step": 2897 + }, + { + "epoch": 0.36225, + "grad_norm": 2.9142045974731445, + "grad_norm_var": 23.176446105194728, + "learning_rate": 0.0001, + "loss": 1.0626, + "loss/crossentropy": 2.3928418159484863, + "loss/hidden": 0.85546875, + "loss/logits": 0.14161977171897888, + "loss/reg": 0.006551443599164486, + "step": 2898 + }, + { + "epoch": 0.362375, + "grad_norm": 2.4919652938842773, + "grad_norm_var": 23.119594391158326, + "learning_rate": 0.0001, + "loss": 1.0934, + "loss/crossentropy": 2.487020969390869, + "loss/hidden": 0.87890625, + "loss/logits": 0.1490069031715393, + "loss/reg": 0.006547501776367426, + "step": 2899 + }, + { + "epoch": 0.3625, + "grad_norm": 3.1773722171783447, + "grad_norm_var": 23.108079858089706, + "learning_rate": 0.0001, + "loss": 1.1537, + "loss/crossentropy": 2.5389323234558105, + "loss/hidden": 0.9375, + "loss/logits": 0.150716170668602, + "loss/reg": 0.006543751340359449, + "step": 2900 + }, + { + "epoch": 0.362625, + "grad_norm": 2.850186586380005, + "grad_norm_var": 23.161433719722833, + "learning_rate": 0.0001, + "loss": 1.253, + "loss/crossentropy": 2.7134439945220947, + "loss/hidden": 1.015625, + "loss/logits": 0.17201505601406097, + "loss/reg": 0.006540067959576845, + "step": 2901 + }, + { + "epoch": 0.36275, + "grad_norm": 3.3370282649993896, + "grad_norm_var": 23.23336719594284, + "learning_rate": 0.0001, + "loss": 1.2434, + "loss/crossentropy": 2.9030580520629883, + "loss/hidden": 1.0, + "loss/logits": 0.17802169919013977, + "loss/reg": 0.006536413449794054, + "step": 2902 + }, + { + "epoch": 0.362875, + "grad_norm": 3.4266459941864014, + "grad_norm_var": 0.6364132777819946, + "learning_rate": 0.0001, + "loss": 1.1669, + "loss/crossentropy": 2.308927297592163, + "loss/hidden": 0.94140625, + "loss/logits": 0.16015183925628662, + "loss/reg": 0.006532760336995125, + "step": 2903 + }, + { + "epoch": 0.363, + "grad_norm": 3.043642044067383, + "grad_norm_var": 0.6131707465059046, + "learning_rate": 0.0001, + "loss": 1.0242, + "loss/crossentropy": 2.2907464504241943, + "loss/hidden": 0.828125, + "loss/logits": 0.1307801604270935, + "loss/reg": 0.006529004313051701, + "step": 2904 + }, + { + "epoch": 0.363125, + "grad_norm": 2.2269046306610107, + "grad_norm_var": 0.6512168720470316, + "learning_rate": 0.0001, + "loss": 1.0469, + "loss/crossentropy": 2.810088634490967, + "loss/hidden": 0.83984375, + "loss/logits": 0.14180462062358856, + "loss/reg": 0.006525283679366112, + "step": 2905 + }, + { + "epoch": 0.36325, + "grad_norm": 2.2434792518615723, + "grad_norm_var": 0.38853655139039894, + "learning_rate": 0.0001, + "loss": 1.1572, + "loss/crossentropy": 2.3118579387664795, + "loss/hidden": 0.921875, + "loss/logits": 0.17006686329841614, + "loss/reg": 0.006521598435938358, + "step": 2906 + }, + { + "epoch": 0.363375, + "grad_norm": 3.0525450706481934, + "grad_norm_var": 0.38854129170375107, + "learning_rate": 0.0001, + "loss": 1.2339, + "loss/crossentropy": 2.538013458251953, + "loss/hidden": 1.0, + "loss/logits": 0.1687142252922058, + "loss/reg": 0.006517799571156502, + "step": 2907 + }, + { + "epoch": 0.3635, + "grad_norm": 3.2148561477661133, + "grad_norm_var": 0.13459924110492782, + "learning_rate": 0.0001, + "loss": 1.0987, + "loss/crossentropy": 2.675201892852783, + "loss/hidden": 0.890625, + "loss/logits": 0.14290809631347656, + "loss/reg": 0.00651393411681056, + "step": 2908 + }, + { + "epoch": 0.363625, + "grad_norm": 2.7242767810821533, + "grad_norm_var": 0.1331926266562192, + "learning_rate": 0.0001, + "loss": 0.9937, + "loss/crossentropy": 2.4053194522857666, + "loss/hidden": 0.80078125, + "loss/logits": 0.12776824831962585, + "loss/reg": 0.0065100486390292645, + "step": 2909 + }, + { + "epoch": 0.36375, + "grad_norm": 3.1906521320343018, + "grad_norm_var": 0.12914744725896982, + "learning_rate": 0.0001, + "loss": 1.5223, + "loss/crossentropy": 1.9234015941619873, + "loss/hidden": 1.28125, + "loss/logits": 0.17596769332885742, + "loss/reg": 0.0065062325447797775, + "step": 2910 + }, + { + "epoch": 0.363875, + "grad_norm": 3.0963900089263916, + "grad_norm_var": 0.1311828006427798, + "learning_rate": 0.0001, + "loss": 1.0918, + "loss/crossentropy": 2.3263516426086426, + "loss/hidden": 0.890625, + "loss/logits": 0.13611821830272675, + "loss/reg": 0.006502555217593908, + "step": 2911 + }, + { + "epoch": 0.364, + "grad_norm": 2.779703378677368, + "grad_norm_var": 0.13220201135042373, + "learning_rate": 0.0001, + "loss": 1.1591, + "loss/crossentropy": 2.4273808002471924, + "loss/hidden": 0.9140625, + "loss/logits": 0.18001694977283478, + "loss/reg": 0.0064989011734724045, + "step": 2912 + }, + { + "epoch": 0.364125, + "grad_norm": 3.54826283454895, + "grad_norm_var": 0.1518355782459139, + "learning_rate": 0.0001, + "loss": 1.3013, + "loss/crossentropy": 2.3089253902435303, + "loss/hidden": 1.0, + "loss/logits": 0.23632323741912842, + "loss/reg": 0.0064953831024467945, + "step": 2913 + }, + { + "epoch": 0.36425, + "grad_norm": 2.628416061401367, + "grad_norm_var": 0.1585855597961237, + "learning_rate": 0.0001, + "loss": 1.114, + "loss/crossentropy": 2.6731762886047363, + "loss/hidden": 0.890625, + "loss/logits": 0.15847691893577576, + "loss/reg": 0.006491933949291706, + "step": 2914 + }, + { + "epoch": 0.364375, + "grad_norm": 4.108644485473633, + "grad_norm_var": 0.22546504435674705, + "learning_rate": 0.0001, + "loss": 1.4292, + "loss/crossentropy": 2.3468849658966064, + "loss/hidden": 1.1484375, + "loss/logits": 0.2158311903476715, + "loss/reg": 0.00648848433047533, + "step": 2915 + }, + { + "epoch": 0.3645, + "grad_norm": 3.378568410873413, + "grad_norm_var": 0.2316651080014547, + "learning_rate": 0.0001, + "loss": 1.1055, + "loss/crossentropy": 2.8706247806549072, + "loss/hidden": 0.87890625, + "loss/logits": 0.16177114844322205, + "loss/reg": 0.006485124118626118, + "step": 2916 + }, + { + "epoch": 0.364625, + "grad_norm": 2.6398634910583496, + "grad_norm_var": 0.24012121801007022, + "learning_rate": 0.0001, + "loss": 1.0408, + "loss/crossentropy": 2.5529143810272217, + "loss/hidden": 0.83203125, + "loss/logits": 0.14397668838500977, + "loss/reg": 0.006481767166405916, + "step": 2917 + }, + { + "epoch": 0.36475, + "grad_norm": 3.220629930496216, + "grad_norm_var": 0.2363580736847228, + "learning_rate": 0.0001, + "loss": 1.1635, + "loss/crossentropy": 2.5548501014709473, + "loss/hidden": 0.9453125, + "loss/logits": 0.15341481566429138, + "loss/reg": 0.006478061433881521, + "step": 2918 + }, + { + "epoch": 0.364875, + "grad_norm": 3.03249192237854, + "grad_norm_var": 0.22536544602095862, + "learning_rate": 0.0001, + "loss": 1.323, + "loss/crossentropy": 2.4935107231140137, + "loss/hidden": 1.0625, + "loss/logits": 0.19577205181121826, + "loss/reg": 0.0064745377749204636, + "step": 2919 + }, + { + "epoch": 0.365, + "grad_norm": 2.9565083980560303, + "grad_norm_var": 0.22542684345356273, + "learning_rate": 0.0001, + "loss": 1.2527, + "loss/crossentropy": 2.616593599319458, + "loss/hidden": 1.015625, + "loss/logits": 0.17231954634189606, + "loss/reg": 0.006471022963523865, + "step": 2920 + }, + { + "epoch": 0.365125, + "grad_norm": 3.893911361694336, + "grad_norm_var": 0.2266886513777912, + "learning_rate": 0.0001, + "loss": 1.182, + "loss/crossentropy": 2.83146595954895, + "loss/hidden": 0.94140625, + "loss/logits": 0.1759631335735321, + "loss/reg": 0.006467149592936039, + "step": 2921 + }, + { + "epoch": 0.36525, + "grad_norm": 3.465529680252075, + "grad_norm_var": 0.17935300234991303, + "learning_rate": 0.0001, + "loss": 1.1511, + "loss/crossentropy": 2.6543097496032715, + "loss/hidden": 0.9296875, + "loss/logits": 0.156746968626976, + "loss/reg": 0.0064636156894266605, + "step": 2922 + }, + { + "epoch": 0.365375, + "grad_norm": 2.6630501747131348, + "grad_norm_var": 0.1956200641529667, + "learning_rate": 0.0001, + "loss": 1.2214, + "loss/crossentropy": 2.4735219478607178, + "loss/hidden": 0.9765625, + "loss/logits": 0.18027830123901367, + "loss/reg": 0.006460051983594894, + "step": 2923 + }, + { + "epoch": 0.3655, + "grad_norm": 3.5610761642456055, + "grad_norm_var": 0.2056967783685792, + "learning_rate": 0.0001, + "loss": 1.4353, + "loss/crossentropy": 2.696284294128418, + "loss/hidden": 1.109375, + "loss/logits": 0.26133716106414795, + "loss/reg": 0.006456304341554642, + "step": 2924 + }, + { + "epoch": 0.365625, + "grad_norm": 2.617219924926758, + "grad_norm_var": 0.21292532254713378, + "learning_rate": 0.0001, + "loss": 1.2023, + "loss/crossentropy": 2.6485471725463867, + "loss/hidden": 0.9453125, + "loss/logits": 0.19246652722358704, + "loss/reg": 0.006452751811593771, + "step": 2925 + }, + { + "epoch": 0.36575, + "grad_norm": 2.901887893676758, + "grad_norm_var": 0.21748831737276836, + "learning_rate": 0.0001, + "loss": 1.1497, + "loss/crossentropy": 2.4927761554718018, + "loss/hidden": 0.9140625, + "loss/logits": 0.17110440135002136, + "loss/reg": 0.006449414417147636, + "step": 2926 + }, + { + "epoch": 0.365875, + "grad_norm": 4.41480827331543, + "grad_norm_var": 0.31569095454031504, + "learning_rate": 0.0001, + "loss": 1.5064, + "loss/crossentropy": 1.946433424949646, + "loss/hidden": 1.21875, + "loss/logits": 0.22322967648506165, + "loss/reg": 0.006445558276027441, + "step": 2927 + }, + { + "epoch": 0.366, + "grad_norm": 3.533135175704956, + "grad_norm_var": 0.3051141543482172, + "learning_rate": 0.0001, + "loss": 1.0166, + "loss/crossentropy": 2.587625026702881, + "loss/hidden": 0.828125, + "loss/logits": 0.1240384429693222, + "loss/reg": 0.006441666278988123, + "step": 2928 + }, + { + "epoch": 0.366125, + "grad_norm": 2.39128041267395, + "grad_norm_var": 0.34820371811981443, + "learning_rate": 0.0001, + "loss": 1.0455, + "loss/crossentropy": 2.59517240524292, + "loss/hidden": 0.8359375, + "loss/logits": 0.1451393961906433, + "loss/reg": 0.00643783388659358, + "step": 2929 + }, + { + "epoch": 0.36625, + "grad_norm": 3.758814811706543, + "grad_norm_var": 0.3399671227210696, + "learning_rate": 0.0001, + "loss": 1.0727, + "loss/crossentropy": 2.593083143234253, + "loss/hidden": 0.85546875, + "loss/logits": 0.15284234285354614, + "loss/reg": 0.0064340573735535145, + "step": 2930 + }, + { + "epoch": 0.366375, + "grad_norm": 3.4733479022979736, + "grad_norm_var": 0.2953048894247246, + "learning_rate": 0.0001, + "loss": 1.1556, + "loss/crossentropy": 2.4476308822631836, + "loss/hidden": 0.9296875, + "loss/logits": 0.16165688633918762, + "loss/reg": 0.006430534180253744, + "step": 2931 + }, + { + "epoch": 0.3665, + "grad_norm": 3.143385410308838, + "grad_norm_var": 0.294538392470781, + "learning_rate": 0.0001, + "loss": 1.2013, + "loss/crossentropy": 2.528491735458374, + "loss/hidden": 0.93359375, + "loss/logits": 0.20340821146965027, + "loss/reg": 0.006426732987165451, + "step": 2932 + }, + { + "epoch": 0.366625, + "grad_norm": 3.3090200424194336, + "grad_norm_var": 0.26994437465532023, + "learning_rate": 0.0001, + "loss": 1.2912, + "loss/crossentropy": 2.257890462875366, + "loss/hidden": 1.0390625, + "loss/logits": 0.18785947561264038, + "loss/reg": 0.0064232428558170795, + "step": 2933 + }, + { + "epoch": 0.36675, + "grad_norm": 2.6405813694000244, + "grad_norm_var": 0.2948689781812142, + "learning_rate": 0.0001, + "loss": 1.1785, + "loss/crossentropy": 2.5351452827453613, + "loss/hidden": 0.9453125, + "loss/logits": 0.16901114583015442, + "loss/reg": 0.0064197625033557415, + "step": 2934 + }, + { + "epoch": 0.366875, + "grad_norm": 2.809861421585083, + "grad_norm_var": 0.30397068246021847, + "learning_rate": 0.0001, + "loss": 1.2605, + "loss/crossentropy": 2.6845130920410156, + "loss/hidden": 0.98828125, + "loss/logits": 0.20806646347045898, + "loss/reg": 0.006416547577828169, + "step": 2935 + }, + { + "epoch": 0.367, + "grad_norm": 2.443189859390259, + "grad_norm_var": 0.3385305934868671, + "learning_rate": 0.0001, + "loss": 0.9895, + "loss/crossentropy": 2.5384156703948975, + "loss/hidden": 0.7890625, + "loss/logits": 0.13630658388137817, + "loss/reg": 0.006413432769477367, + "step": 2936 + }, + { + "epoch": 0.367125, + "grad_norm": 2.9335296154022217, + "grad_norm_var": 0.3058806648327201, + "learning_rate": 0.0001, + "loss": 1.1383, + "loss/crossentropy": 2.414463758468628, + "loss/hidden": 0.92578125, + "loss/logits": 0.14841397106647491, + "loss/reg": 0.0064104096964001656, + "step": 2937 + }, + { + "epoch": 0.36725, + "grad_norm": 2.2469804286956787, + "grad_norm_var": 0.34396401134545823, + "learning_rate": 0.0001, + "loss": 1.1026, + "loss/crossentropy": 2.7363431453704834, + "loss/hidden": 0.875, + "loss/logits": 0.16351598501205444, + "loss/reg": 0.006406768225133419, + "step": 2938 + }, + { + "epoch": 0.367375, + "grad_norm": 2.5279767513275146, + "grad_norm_var": 0.3521195383928649, + "learning_rate": 0.0001, + "loss": 1.1859, + "loss/crossentropy": 2.3711323738098145, + "loss/hidden": 0.953125, + "loss/logits": 0.1687922179698944, + "loss/reg": 0.006403085310012102, + "step": 2939 + }, + { + "epoch": 0.3675, + "grad_norm": 2.547192335128784, + "grad_norm_var": 0.3464840809805603, + "learning_rate": 0.0001, + "loss": 1.017, + "loss/crossentropy": 2.51830792427063, + "loss/hidden": 0.80859375, + "loss/logits": 0.1443764865398407, + "loss/reg": 0.0063994331285357475, + "step": 2940 + }, + { + "epoch": 0.367625, + "grad_norm": 2.467339277267456, + "grad_norm_var": 0.35515317475169633, + "learning_rate": 0.0001, + "loss": 1.1816, + "loss/crossentropy": 2.3114559650421143, + "loss/hidden": 0.9375, + "loss/logits": 0.18010586500167847, + "loss/reg": 0.0063957045786082745, + "step": 2941 + }, + { + "epoch": 0.36775, + "grad_norm": 5.6721601486206055, + "grad_norm_var": 0.8091296297493232, + "learning_rate": 0.0001, + "loss": 1.1383, + "loss/crossentropy": 2.76481032371521, + "loss/hidden": 0.92578125, + "loss/logits": 0.1485617458820343, + "loss/reg": 0.006392012815922499, + "step": 2942 + }, + { + "epoch": 0.367875, + "grad_norm": 2.7251062393188477, + "grad_norm_var": 0.7013891042820828, + "learning_rate": 0.0001, + "loss": 1.0188, + "loss/crossentropy": 2.7316436767578125, + "loss/hidden": 0.8125, + "loss/logits": 0.14243966341018677, + "loss/reg": 0.006388540379703045, + "step": 2943 + }, + { + "epoch": 0.368, + "grad_norm": 3.4232337474823, + "grad_norm_var": 0.6949021716584807, + "learning_rate": 0.0001, + "loss": 1.3331, + "loss/crossentropy": 2.597306966781616, + "loss/hidden": 1.0390625, + "loss/logits": 0.23018494248390198, + "loss/reg": 0.006384837441146374, + "step": 2944 + }, + { + "epoch": 0.368125, + "grad_norm": 3.4376792907714844, + "grad_norm_var": 0.673934765921869, + "learning_rate": 0.0001, + "loss": 1.43, + "loss/crossentropy": 2.006058692932129, + "loss/hidden": 1.15625, + "loss/logits": 0.20994441211223602, + "loss/reg": 0.006381357088685036, + "step": 2945 + }, + { + "epoch": 0.36825, + "grad_norm": 2.7537946701049805, + "grad_norm_var": 0.6484408615775389, + "learning_rate": 0.0001, + "loss": 1.3179, + "loss/crossentropy": 2.286041021347046, + "loss/hidden": 1.078125, + "loss/logits": 0.17600281536579132, + "loss/reg": 0.006377674173563719, + "step": 2946 + }, + { + "epoch": 0.368375, + "grad_norm": 3.7587153911590576, + "grad_norm_var": 0.6702225912474044, + "learning_rate": 0.0001, + "loss": 1.2178, + "loss/crossentropy": 2.450220823287964, + "loss/hidden": 0.97265625, + "loss/logits": 0.1814137101173401, + "loss/reg": 0.006373940035700798, + "step": 2947 + }, + { + "epoch": 0.3685, + "grad_norm": 2.813281297683716, + "grad_norm_var": 0.6730322181497587, + "learning_rate": 0.0001, + "loss": 1.1702, + "loss/crossentropy": 2.110886573791504, + "loss/hidden": 0.95703125, + "loss/logits": 0.1494581401348114, + "loss/reg": 0.006370212882757187, + "step": 2948 + }, + { + "epoch": 0.368625, + "grad_norm": 3.15108060836792, + "grad_norm_var": 0.6687545167118797, + "learning_rate": 0.0001, + "loss": 1.1503, + "loss/crossentropy": 2.3451642990112305, + "loss/hidden": 0.9453125, + "loss/logits": 0.14134803414344788, + "loss/reg": 0.006366708781570196, + "step": 2949 + }, + { + "epoch": 0.36875, + "grad_norm": 2.8874895572662354, + "grad_norm_var": 0.6600086395450279, + "learning_rate": 0.0001, + "loss": 1.338, + "loss/crossentropy": 2.5204317569732666, + "loss/hidden": 1.0390625, + "loss/logits": 0.23534736037254333, + "loss/reg": 0.006363026797771454, + "step": 2950 + }, + { + "epoch": 0.368875, + "grad_norm": 4.647323131561279, + "grad_norm_var": 0.8152762201199379, + "learning_rate": 0.0001, + "loss": 1.2054, + "loss/crossentropy": 2.4526150226593018, + "loss/hidden": 0.9921875, + "loss/logits": 0.14957469701766968, + "loss/reg": 0.006359660532325506, + "step": 2951 + }, + { + "epoch": 0.369, + "grad_norm": 2.748318672180176, + "grad_norm_var": 0.792247720158381, + "learning_rate": 0.0001, + "loss": 1.2577, + "loss/crossentropy": 2.218827486038208, + "loss/hidden": 1.015625, + "loss/logits": 0.17854490876197815, + "loss/reg": 0.006355958059430122, + "step": 2952 + }, + { + "epoch": 0.369125, + "grad_norm": 3.18152117729187, + "grad_norm_var": 0.7882286230011192, + "learning_rate": 0.0001, + "loss": 1.2341, + "loss/crossentropy": 2.397681951522827, + "loss/hidden": 1.0078125, + "loss/logits": 0.16274084150791168, + "loss/reg": 0.006352248601615429, + "step": 2953 + }, + { + "epoch": 0.36925, + "grad_norm": 2.3362183570861816, + "grad_norm_var": 0.7775437037295726, + "learning_rate": 0.0001, + "loss": 1.0782, + "loss/crossentropy": 2.4071547985076904, + "loss/hidden": 0.859375, + "loss/logits": 0.155318021774292, + "loss/reg": 0.006348692812025547, + "step": 2954 + }, + { + "epoch": 0.369375, + "grad_norm": 2.270404577255249, + "grad_norm_var": 0.8045084923322359, + "learning_rate": 0.0001, + "loss": 0.9727, + "loss/crossentropy": 2.5868589878082275, + "loss/hidden": 0.78515625, + "loss/logits": 0.12414056062698364, + "loss/reg": 0.0063451011665165424, + "step": 2955 + }, + { + "epoch": 0.3695, + "grad_norm": 2.661360025405884, + "grad_norm_var": 0.7957465755019864, + "learning_rate": 0.0001, + "loss": 1.1902, + "loss/crossentropy": 2.6802990436553955, + "loss/hidden": 0.98046875, + "loss/logits": 0.146291121840477, + "loss/reg": 0.006341275293380022, + "step": 2956 + }, + { + "epoch": 0.369625, + "grad_norm": 2.1359386444091797, + "grad_norm_var": 0.8342528503815039, + "learning_rate": 0.0001, + "loss": 0.9695, + "loss/crossentropy": 2.425105333328247, + "loss/hidden": 0.77734375, + "loss/logits": 0.12875565886497498, + "loss/reg": 0.006337776780128479, + "step": 2957 + }, + { + "epoch": 0.36975, + "grad_norm": 7.899991989135742, + "grad_norm_var": 1.8898678157576643, + "learning_rate": 0.0001, + "loss": 1.4037, + "loss/crossentropy": 2.5431063175201416, + "loss/hidden": 1.140625, + "loss/logits": 0.19971221685409546, + "loss/reg": 0.006334193050861359, + "step": 2958 + }, + { + "epoch": 0.369875, + "grad_norm": 2.7537424564361572, + "grad_norm_var": 1.8877165233069482, + "learning_rate": 0.0001, + "loss": 1.1538, + "loss/crossentropy": 2.4700024127960205, + "loss/hidden": 0.92578125, + "loss/logits": 0.1647024154663086, + "loss/reg": 0.006330742500722408, + "step": 2959 + }, + { + "epoch": 0.37, + "grad_norm": 4.308285236358643, + "grad_norm_var": 1.9507729941174126, + "learning_rate": 0.0001, + "loss": 1.6261, + "loss/crossentropy": 2.4107391834259033, + "loss/hidden": 1.296875, + "loss/logits": 0.26593559980392456, + "loss/reg": 0.006327059119939804, + "step": 2960 + }, + { + "epoch": 0.370125, + "grad_norm": 2.3286261558532715, + "grad_norm_var": 2.0160239037217003, + "learning_rate": 0.0001, + "loss": 1.135, + "loss/crossentropy": 2.4957242012023926, + "loss/hidden": 0.91796875, + "loss/logits": 0.15382561087608337, + "loss/reg": 0.006323335692286491, + "step": 2961 + }, + { + "epoch": 0.37025, + "grad_norm": 3.0754542350769043, + "grad_norm_var": 1.9995041908891094, + "learning_rate": 0.0001, + "loss": 1.331, + "loss/crossentropy": 2.3459532260894775, + "loss/hidden": 1.0859375, + "loss/logits": 0.18189483880996704, + "loss/reg": 0.006319581065326929, + "step": 2962 + }, + { + "epoch": 0.370375, + "grad_norm": 3.3157567977905273, + "grad_norm_var": 1.9852575155815648, + "learning_rate": 0.0001, + "loss": 1.3319, + "loss/crossentropy": 2.5656378269195557, + "loss/hidden": 1.0859375, + "loss/logits": 0.1828194409608841, + "loss/reg": 0.0063160136342048645, + "step": 2963 + }, + { + "epoch": 0.3705, + "grad_norm": 2.980442523956299, + "grad_norm_var": 1.9765531756655508, + "learning_rate": 0.0001, + "loss": 1.3363, + "loss/crossentropy": 2.6723766326904297, + "loss/hidden": 1.0625, + "loss/logits": 0.21069511771202087, + "loss/reg": 0.006312500219792128, + "step": 2964 + }, + { + "epoch": 0.370625, + "grad_norm": 2.3983352184295654, + "grad_norm_var": 2.026173241174706, + "learning_rate": 0.0001, + "loss": 1.2704, + "loss/crossentropy": 2.3377437591552734, + "loss/hidden": 1.015625, + "loss/logits": 0.19166290760040283, + "loss/reg": 0.006308842916041613, + "step": 2965 + }, + { + "epoch": 0.37075, + "grad_norm": 3.226989507675171, + "grad_norm_var": 2.0171676479177365, + "learning_rate": 0.0001, + "loss": 1.2274, + "loss/crossentropy": 2.9660701751708984, + "loss/hidden": 0.9765625, + "loss/logits": 0.18782290816307068, + "loss/reg": 0.006305369548499584, + "step": 2966 + }, + { + "epoch": 0.370875, + "grad_norm": 3.16386079788208, + "grad_norm_var": 1.8816472580958807, + "learning_rate": 0.0001, + "loss": 1.0678, + "loss/crossentropy": 2.330775737762451, + "loss/hidden": 0.84375, + "loss/logits": 0.16104839742183685, + "loss/reg": 0.006301956716924906, + "step": 2967 + }, + { + "epoch": 0.371, + "grad_norm": 2.557389736175537, + "grad_norm_var": 1.8947642583959994, + "learning_rate": 0.0001, + "loss": 1.1327, + "loss/crossentropy": 2.802546739578247, + "loss/hidden": 0.9140625, + "loss/logits": 0.15561771392822266, + "loss/reg": 0.006298637483268976, + "step": 2968 + }, + { + "epoch": 0.371125, + "grad_norm": 2.631152629852295, + "grad_norm_var": 1.9122739709563232, + "learning_rate": 0.0001, + "loss": 1.2839, + "loss/crossentropy": 2.466240406036377, + "loss/hidden": 1.03125, + "loss/logits": 0.18970970809459686, + "loss/reg": 0.006295159924775362, + "step": 2969 + }, + { + "epoch": 0.37125, + "grad_norm": 2.792259454727173, + "grad_norm_var": 1.8771430466339694, + "learning_rate": 0.0001, + "loss": 1.0216, + "loss/crossentropy": 2.691854953765869, + "loss/hidden": 0.82421875, + "loss/logits": 0.13450004160404205, + "loss/reg": 0.006291534285992384, + "step": 2970 + }, + { + "epoch": 0.371375, + "grad_norm": 2.8353843688964844, + "grad_norm_var": 1.830361927440587, + "learning_rate": 0.0001, + "loss": 1.196, + "loss/crossentropy": 2.5591063499450684, + "loss/hidden": 0.95703125, + "loss/logits": 0.17609842121601105, + "loss/reg": 0.006287666503340006, + "step": 2971 + }, + { + "epoch": 0.3715, + "grad_norm": 2.8736965656280518, + "grad_norm_var": 1.8181690584356922, + "learning_rate": 0.0001, + "loss": 1.1379, + "loss/crossentropy": 2.7044715881347656, + "loss/hidden": 0.8984375, + "loss/logits": 0.17660945653915405, + "loss/reg": 0.006284040864557028, + "step": 2972 + }, + { + "epoch": 0.371625, + "grad_norm": 2.791693925857544, + "grad_norm_var": 1.7515873645413724, + "learning_rate": 0.0001, + "loss": 1.0889, + "loss/crossentropy": 2.799600839614868, + "loss/hidden": 0.875, + "loss/logits": 0.15106239914894104, + "loss/reg": 0.006280391477048397, + "step": 2973 + }, + { + "epoch": 0.37175, + "grad_norm": 2.8314950466156006, + "grad_norm_var": 0.21190119346286618, + "learning_rate": 0.0001, + "loss": 1.2831, + "loss/crossentropy": 2.494168519973755, + "loss/hidden": 1.0390625, + "loss/logits": 0.18123218417167664, + "loss/reg": 0.006276885978877544, + "step": 2974 + }, + { + "epoch": 0.371875, + "grad_norm": 3.881524085998535, + "grad_norm_var": 0.26503546771812786, + "learning_rate": 0.0001, + "loss": 1.4513, + "loss/crossentropy": 2.4907546043395996, + "loss/hidden": 1.171875, + "loss/logits": 0.21664096415042877, + "loss/reg": 0.006273434031754732, + "step": 2975 + }, + { + "epoch": 0.372, + "grad_norm": 2.536984443664551, + "grad_norm_var": 0.15203442512206872, + "learning_rate": 0.0001, + "loss": 1.1686, + "loss/crossentropy": 2.363800525665283, + "loss/hidden": 0.91796875, + "loss/logits": 0.18794508278369904, + "loss/reg": 0.006270069163292646, + "step": 2976 + }, + { + "epoch": 0.372125, + "grad_norm": 3.7481820583343506, + "grad_norm_var": 0.171951294450925, + "learning_rate": 0.0001, + "loss": 1.3981, + "loss/crossentropy": 2.4065988063812256, + "loss/hidden": 1.15625, + "loss/logits": 0.1791473925113678, + "loss/reg": 0.006266482640057802, + "step": 2977 + }, + { + "epoch": 0.37225, + "grad_norm": 3.820974111557007, + "grad_norm_var": 0.21642196162905805, + "learning_rate": 0.0001, + "loss": 1.3685, + "loss/crossentropy": 2.464829683303833, + "loss/hidden": 1.1015625, + "loss/logits": 0.20430535078048706, + "loss/reg": 0.006262919865548611, + "step": 2978 + }, + { + "epoch": 0.372375, + "grad_norm": 2.9135947227478027, + "grad_norm_var": 0.21089299744382212, + "learning_rate": 0.0001, + "loss": 1.1686, + "loss/crossentropy": 2.5210120677948, + "loss/hidden": 0.93359375, + "loss/logits": 0.1724550426006317, + "loss/reg": 0.006259588059037924, + "step": 2979 + }, + { + "epoch": 0.3725, + "grad_norm": 3.0941245555877686, + "grad_norm_var": 0.21141947449020387, + "learning_rate": 0.0001, + "loss": 1.3898, + "loss/crossentropy": 2.1830661296844482, + "loss/hidden": 1.140625, + "loss/logits": 0.1865786612033844, + "loss/reg": 0.006256050430238247, + "step": 2980 + }, + { + "epoch": 0.372625, + "grad_norm": 2.9306225776672363, + "grad_norm_var": 0.18599333807440718, + "learning_rate": 0.0001, + "loss": 1.2798, + "loss/crossentropy": 2.544529914855957, + "loss/hidden": 1.03125, + "loss/logits": 0.18599992990493774, + "loss/reg": 0.006252431310713291, + "step": 2981 + }, + { + "epoch": 0.37275, + "grad_norm": 2.7183332443237305, + "grad_norm_var": 0.18943956242721333, + "learning_rate": 0.0001, + "loss": 1.1795, + "loss/crossentropy": 2.234020709991455, + "loss/hidden": 0.9609375, + "loss/logits": 0.1560869812965393, + "loss/reg": 0.0062490543350577354, + "step": 2982 + }, + { + "epoch": 0.372875, + "grad_norm": 2.683260440826416, + "grad_norm_var": 0.19386109467259846, + "learning_rate": 0.0001, + "loss": 1.1093, + "loss/crossentropy": 2.532911539077759, + "loss/hidden": 0.890625, + "loss/logits": 0.15620505809783936, + "loss/reg": 0.006245526950806379, + "step": 2983 + }, + { + "epoch": 0.373, + "grad_norm": 3.546365261077881, + "grad_norm_var": 0.19958792431539768, + "learning_rate": 0.0001, + "loss": 1.3208, + "loss/crossentropy": 2.5171666145324707, + "loss/hidden": 1.0390625, + "loss/logits": 0.2192699909210205, + "loss/reg": 0.006241983734071255, + "step": 2984 + }, + { + "epoch": 0.373125, + "grad_norm": 3.3966946601867676, + "grad_norm_var": 0.19455040137438548, + "learning_rate": 0.0001, + "loss": 1.3442, + "loss/crossentropy": 2.1332437992095947, + "loss/hidden": 1.109375, + "loss/logits": 0.1723998785018921, + "loss/reg": 0.006238510832190514, + "step": 2985 + }, + { + "epoch": 0.37325, + "grad_norm": 4.044903755187988, + "grad_norm_var": 0.2433596239659323, + "learning_rate": 0.0001, + "loss": 1.296, + "loss/crossentropy": 2.672873020172119, + "loss/hidden": 1.0390625, + "loss/logits": 0.19460225105285645, + "loss/reg": 0.0062351408414542675, + "step": 2986 + }, + { + "epoch": 0.373375, + "grad_norm": 4.2409443855285645, + "grad_norm_var": 0.30497019447985374, + "learning_rate": 0.0001, + "loss": 1.6891, + "loss/crossentropy": 2.4617483615875244, + "loss/hidden": 1.2890625, + "loss/logits": 0.3377698063850403, + "loss/reg": 0.006231660954654217, + "step": 2987 + }, + { + "epoch": 0.3735, + "grad_norm": 5.106016635894775, + "grad_norm_var": 0.5034262714257103, + "learning_rate": 0.0001, + "loss": 1.6151, + "loss/crossentropy": 2.3135275840759277, + "loss/hidden": 1.2265625, + "loss/logits": 0.3262255787849426, + "loss/reg": 0.0062282453291118145, + "step": 2988 + }, + { + "epoch": 0.373625, + "grad_norm": 2.6131954193115234, + "grad_norm_var": 0.519725193027503, + "learning_rate": 0.0001, + "loss": 1.0703, + "loss/crossentropy": 2.468320608139038, + "loss/hidden": 0.84375, + "loss/logits": 0.16427499055862427, + "loss/reg": 0.006224988028407097, + "step": 2989 + }, + { + "epoch": 0.37375, + "grad_norm": 2.2861011028289795, + "grad_norm_var": 0.5783266325824712, + "learning_rate": 0.0001, + "loss": 1.0721, + "loss/crossentropy": 2.581345319747925, + "loss/hidden": 0.859375, + "loss/logits": 0.15052086114883423, + "loss/reg": 0.006221612449735403, + "step": 2990 + }, + { + "epoch": 0.373875, + "grad_norm": 7.513643264770508, + "grad_norm_var": 1.661408159236626, + "learning_rate": 0.0001, + "loss": 1.3809, + "loss/crossentropy": 2.3798274993896484, + "loss/hidden": 1.125, + "loss/logits": 0.19375327229499817, + "loss/reg": 0.006218081805855036, + "step": 2991 + }, + { + "epoch": 0.374, + "grad_norm": 3.358820915222168, + "grad_norm_var": 1.5899192344239046, + "learning_rate": 0.0001, + "loss": 1.1574, + "loss/crossentropy": 2.314020872116089, + "loss/hidden": 0.921875, + "loss/logits": 0.17342783510684967, + "loss/reg": 0.0062146601267158985, + "step": 2992 + }, + { + "epoch": 0.374125, + "grad_norm": 2.687399387359619, + "grad_norm_var": 1.6429648582831418, + "learning_rate": 0.0001, + "loss": 1.196, + "loss/crossentropy": 2.5449864864349365, + "loss/hidden": 0.95703125, + "loss/logits": 0.17687903344631195, + "loss/reg": 0.006211317144334316, + "step": 2993 + }, + { + "epoch": 0.37425, + "grad_norm": 2.6222400665283203, + "grad_norm_var": 1.691013255424202, + "learning_rate": 0.0001, + "loss": 1.0748, + "loss/crossentropy": 2.5636496543884277, + "loss/hidden": 0.84765625, + "loss/logits": 0.16510482132434845, + "loss/reg": 0.006207973696291447, + "step": 2994 + }, + { + "epoch": 0.374375, + "grad_norm": 2.57047963142395, + "grad_norm_var": 1.7245015971867812, + "learning_rate": 0.0001, + "loss": 1.2024, + "loss/crossentropy": 2.159039258956909, + "loss/hidden": 0.94921875, + "loss/logits": 0.1911052167415619, + "loss/reg": 0.006204445846378803, + "step": 2995 + }, + { + "epoch": 0.3745, + "grad_norm": 2.674190044403076, + "grad_norm_var": 1.7561949689720413, + "learning_rate": 0.0001, + "loss": 1.2525, + "loss/crossentropy": 2.339940309524536, + "loss/hidden": 1.0, + "loss/logits": 0.19047331809997559, + "loss/reg": 0.00620116526260972, + "step": 2996 + }, + { + "epoch": 0.374625, + "grad_norm": 4.393868923187256, + "grad_norm_var": 1.7912043332795824, + "learning_rate": 0.0001, + "loss": 1.5526, + "loss/crossentropy": 2.4916675090789795, + "loss/hidden": 1.203125, + "loss/logits": 0.28752923011779785, + "loss/reg": 0.006197982467710972, + "step": 2997 + }, + { + "epoch": 0.37475, + "grad_norm": 6.245306968688965, + "grad_norm_var": 2.18767079431957, + "learning_rate": 0.0001, + "loss": 1.4976, + "loss/crossentropy": 2.634570360183716, + "loss/hidden": 1.21875, + "loss/logits": 0.21695156395435333, + "loss/reg": 0.006194387096911669, + "step": 2998 + }, + { + "epoch": 0.374875, + "grad_norm": 4.258429050445557, + "grad_norm_var": 2.1189212577268868, + "learning_rate": 0.0001, + "loss": 1.3176, + "loss/crossentropy": 2.242029905319214, + "loss/hidden": 1.09375, + "loss/logits": 0.16192983090877533, + "loss/reg": 0.006190957501530647, + "step": 2999 + }, + { + "epoch": 0.375, + "grad_norm": 3.3935165405273438, + "grad_norm_var": 2.126516719137884, + "learning_rate": 0.0001, + "loss": 1.2273, + "loss/crossentropy": 2.7435600757598877, + "loss/hidden": 0.9765625, + "loss/logits": 0.1888415366411209, + "loss/reg": 0.0061873747035861015, + "step": 3000 + } + ], + "logging_steps": 1, + "max_steps": 8000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.93217584693248e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}