{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 250, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, "grad_norm": 1.8149200677871704, "learning_rate": 2.0000000000000002e-07, "loss": 0.6055, "loss/crossentropy": 2.1694753170013428, "loss/hidden": 0.296875, "loss/logits": 0.04434104636311531, "loss/reg": 0.026429571211338043, "step": 1 }, { "epoch": 0.001, "grad_norm": 2.5396013259887695, "learning_rate": 4.0000000000000003e-07, "loss": 0.6507, "loss/crossentropy": 2.5328911542892456, "loss/hidden": 0.314453125, "loss/logits": 0.07194863818585873, "loss/reg": 0.026429571211338043, "step": 2 }, { "epoch": 0.0015, "grad_norm": 1.488558292388916, "learning_rate": 6.000000000000001e-07, "loss": 0.5344, "loss/crossentropy": 2.451871395111084, "loss/hidden": 0.2373046875, "loss/logits": 0.03276057913899422, "loss/reg": 0.02642955631017685, "step": 3 }, { "epoch": 0.002, "grad_norm": 2.1853861808776855, "learning_rate": 8.000000000000001e-07, "loss": 0.5659, "loss/crossentropy": 2.3267983198165894, "loss/hidden": 0.2646484375, "loss/logits": 0.03696209378540516, "loss/reg": 0.02642953395843506, "step": 4 }, { "epoch": 0.0025, "grad_norm": 1.4397950172424316, "learning_rate": 1.0000000000000002e-06, "loss": 0.5414, "loss/crossentropy": 2.410401225090027, "loss/hidden": 0.24462890625, "loss/logits": 0.03243397735059261, "loss/reg": 0.02642953023314476, "step": 5 }, { "epoch": 0.003, "grad_norm": 5.599375247955322, "learning_rate": 1.2000000000000002e-06, "loss": 0.7887, "loss/crossentropy": 2.808457851409912, "loss/hidden": 0.4482421875, "loss/logits": 0.07614399120211601, "loss/reg": 0.02642950788140297, "step": 6 }, { "epoch": 0.0035, "grad_norm": 1.8009779453277588, "learning_rate": 1.4000000000000001e-06, "loss": 0.6491, "loss/crossentropy": 2.0596200227737427, "loss/hidden": 0.3349609375, "loss/logits": 0.049886807799339294, "loss/reg": 0.02642947994172573, "step": 7 }, { "epoch": 0.004, "grad_norm": 1.524167776107788, "learning_rate": 1.6000000000000001e-06, "loss": 0.5283, "loss/crossentropy": 2.5316779613494873, "loss/hidden": 0.234375, "loss/logits": 0.029637396335601807, "loss/reg": 0.026429446414113045, "step": 8 }, { "epoch": 0.0045, "grad_norm": 1.5922240018844604, "learning_rate": 1.8000000000000001e-06, "loss": 0.5713, "loss/crossentropy": 2.3954519033432007, "loss/hidden": 0.26171875, "loss/logits": 0.04526849649846554, "loss/reg": 0.02642940729856491, "step": 9 }, { "epoch": 0.005, "grad_norm": 1.6532399654388428, "learning_rate": 2.0000000000000003e-06, "loss": 0.5624, "loss/crossentropy": 2.3280714750289917, "loss/hidden": 0.2578125, "loss/logits": 0.040291883051395416, "loss/reg": 0.02642936259508133, "step": 10 }, { "epoch": 0.0055, "grad_norm": 2.008364200592041, "learning_rate": 2.2e-06, "loss": 0.5498, "loss/crossentropy": 2.3053948879241943, "loss/hidden": 0.24609375, "loss/logits": 0.039378101006150246, "loss/reg": 0.026429304853081703, "step": 11 }, { "epoch": 0.006, "grad_norm": 1.6782885789871216, "learning_rate": 2.4000000000000003e-06, "loss": 0.5776, "loss/crossentropy": 2.244460344314575, "loss/hidden": 0.2724609375, "loss/logits": 0.04084986075758934, "loss/reg": 0.026429247111082077, "step": 12 }, { "epoch": 0.0065, "grad_norm": 1.4042738676071167, "learning_rate": 2.6e-06, "loss": 0.5512, "loss/crossentropy": 2.2852554321289062, "loss/hidden": 0.25634765625, "loss/logits": 0.03055955469608307, "loss/reg": 0.026429180055856705, "step": 13 }, { "epoch": 0.007, "grad_norm": 3.2632105350494385, "learning_rate": 2.8000000000000003e-06, "loss": 0.5593, "loss/crossentropy": 2.300649642944336, "loss/hidden": 0.2568359375, "loss/logits": 0.03812449052929878, "loss/reg": 0.02642911858856678, "step": 14 }, { "epoch": 0.0075, "grad_norm": 1.1468082666397095, "learning_rate": 3e-06, "loss": 0.5263, "loss/crossentropy": 2.4939738512039185, "loss/hidden": 0.23046875, "loss/logits": 0.03151892125606537, "loss/reg": 0.02642902545630932, "step": 15 }, { "epoch": 0.008, "grad_norm": 1.2633907794952393, "grad_norm_var": 1.1838536622732618, "learning_rate": 3.2000000000000003e-06, "loss": 0.5162, "loss/crossentropy": 2.3341073989868164, "loss/hidden": 0.22216796875, "loss/logits": 0.02972777932882309, "loss/reg": 0.02642892673611641, "step": 16 }, { "epoch": 0.0085, "grad_norm": 1.3773301839828491, "grad_norm_var": 1.2080880649963361, "learning_rate": 3.4000000000000005e-06, "loss": 0.57, "loss/crossentropy": 2.4178860187530518, "loss/hidden": 0.2705078125, "loss/logits": 0.03520551137626171, "loss/reg": 0.02642882987856865, "step": 17 }, { "epoch": 0.009, "grad_norm": 2.9784727096557617, "grad_norm_var": 1.2518295142571243, "learning_rate": 3.6000000000000003e-06, "loss": 0.7119, "loss/crossentropy": 2.143317699432373, "loss/hidden": 0.39453125, "loss/logits": 0.053122956305742264, "loss/reg": 0.026428721845149994, "step": 18 }, { "epoch": 0.0095, "grad_norm": 3.6081793308258057, "grad_norm_var": 1.3809537706703447, "learning_rate": 3.8000000000000005e-06, "loss": 0.616, "loss/crossentropy": 2.280970811843872, "loss/hidden": 0.306640625, "loss/logits": 0.0450353492051363, "loss/reg": 0.02642863430082798, "step": 19 }, { "epoch": 0.01, "grad_norm": 2.2921319007873535, "grad_norm_var": 1.3820597339022322, "learning_rate": 4.000000000000001e-06, "loss": 0.6631, "loss/crossentropy": 2.447663187980652, "loss/hidden": 0.3447265625, "loss/logits": 0.05406281352043152, "loss/reg": 0.02642853744328022, "step": 20 }, { "epoch": 0.0105, "grad_norm": 1.4713051319122314, "grad_norm_var": 1.3790775157724358, "learning_rate": 4.2000000000000004e-06, "loss": 0.5979, "loss/crossentropy": 2.0740893483161926, "loss/hidden": 0.28515625, "loss/logits": 0.04845273308455944, "loss/reg": 0.02642839401960373, "step": 21 }, { "epoch": 0.011, "grad_norm": 1.3936915397644043, "grad_norm_var": 0.5594726223515398, "learning_rate": 4.4e-06, "loss": 0.5342, "loss/crossentropy": 2.308709979057312, "loss/hidden": 0.23681640625, "loss/logits": 0.03306800499558449, "loss/reg": 0.026428284123539925, "step": 22 }, { "epoch": 0.0115, "grad_norm": 1.5905181169509888, "grad_norm_var": 0.5651179587326415, "learning_rate": 4.600000000000001e-06, "loss": 0.5387, "loss/crossentropy": 2.518093228340149, "loss/hidden": 0.2392578125, "loss/logits": 0.03512653335928917, "loss/reg": 0.02642817609012127, "step": 23 }, { "epoch": 0.012, "grad_norm": 1.5539664030075073, "grad_norm_var": 0.5637185598957045, "learning_rate": 4.800000000000001e-06, "loss": 0.5216, "loss/crossentropy": 2.4222742319107056, "loss/hidden": 0.22900390625, "loss/logits": 0.028284232132136822, "loss/reg": 0.02642805129289627, "step": 24 }, { "epoch": 0.0125, "grad_norm": 1.4515613317489624, "grad_norm_var": 0.5705814698960205, "learning_rate": 5e-06, "loss": 0.5546, "loss/crossentropy": 2.1840826272964478, "loss/hidden": 0.26025390625, "loss/logits": 0.03005337156355381, "loss/reg": 0.02642790600657463, "step": 25 }, { "epoch": 0.013, "grad_norm": 1.3925954103469849, "grad_norm_var": 0.5828268281563851, "learning_rate": 5.2e-06, "loss": 0.5187, "loss/crossentropy": 2.417304754257202, "loss/hidden": 0.2255859375, "loss/logits": 0.028857468627393246, "loss/reg": 0.0264277420938015, "step": 26 }, { "epoch": 0.0135, "grad_norm": 1.3494521379470825, "grad_norm_var": 0.5975540703483029, "learning_rate": 5.400000000000001e-06, "loss": 0.581, "loss/crossentropy": 2.4872124195098877, "loss/hidden": 0.275390625, "loss/logits": 0.04128789156675339, "loss/reg": 0.02642754837870598, "step": 27 }, { "epoch": 0.014, "grad_norm": 1.7983005046844482, "grad_norm_var": 0.5960914554887113, "learning_rate": 5.600000000000001e-06, "loss": 0.5793, "loss/crossentropy": 2.5761152505874634, "loss/hidden": 0.28125, "loss/logits": 0.03379652462899685, "loss/reg": 0.02642735280096531, "step": 28 }, { "epoch": 0.0145, "grad_norm": 1.2769767045974731, "grad_norm_var": 0.6043886156117831, "learning_rate": 5.8e-06, "loss": 0.5439, "loss/crossentropy": 2.338332176208496, "loss/hidden": 0.24658203125, "loss/logits": 0.03306223638355732, "loss/reg": 0.02642717957496643, "step": 29 }, { "epoch": 0.015, "grad_norm": 1.1405447721481323, "grad_norm_var": 0.47908970131792705, "learning_rate": 6e-06, "loss": 0.4911, "loss/crossentropy": 2.541923403739929, "loss/hidden": 0.201171875, "loss/logits": 0.025660399347543716, "loss/reg": 0.026426956057548523, "step": 30 }, { "epoch": 0.0155, "grad_norm": 1.4948232173919678, "grad_norm_var": 0.4613230136594038, "learning_rate": 6.200000000000001e-06, "loss": 0.5016, "loss/crossentropy": 2.3482922315597534, "loss/hidden": 0.2109375, "loss/logits": 0.026443324983119965, "loss/reg": 0.026426780968904495, "step": 31 }, { "epoch": 0.016, "grad_norm": 1.9969562292099, "grad_norm_var": 0.45082540579723746, "learning_rate": 6.4000000000000006e-06, "loss": 0.5719, "loss/crossentropy": 2.294642925262451, "loss/hidden": 0.27392578125, "loss/logits": 0.03373559284955263, "loss/reg": 0.02642657607793808, "step": 32 }, { "epoch": 0.0165, "grad_norm": 1.2221813201904297, "grad_norm_var": 0.46025475791477294, "learning_rate": 6.600000000000001e-06, "loss": 0.5252, "loss/crossentropy": 2.3495378494262695, "loss/hidden": 0.22900390625, "loss/logits": 0.03191899135708809, "loss/reg": 0.026426387950778008, "step": 33 }, { "epoch": 0.017, "grad_norm": 1.5299986600875854, "grad_norm_var": 0.3542705004937232, "learning_rate": 6.800000000000001e-06, "loss": 0.5302, "loss/crossentropy": 2.421632170677185, "loss/hidden": 0.23095703125, "loss/logits": 0.03493742551654577, "loss/reg": 0.02642618492245674, "step": 34 }, { "epoch": 0.0175, "grad_norm": 1.3914459943771362, "grad_norm_var": 0.08563591942271481, "learning_rate": 7e-06, "loss": 0.5396, "loss/crossentropy": 2.501790404319763, "loss/hidden": 0.2412109375, "loss/logits": 0.03411697968840599, "loss/reg": 0.02642594650387764, "step": 35 }, { "epoch": 0.018, "grad_norm": 1.2283117771148682, "grad_norm_var": 0.04708121548131293, "learning_rate": 7.2000000000000005e-06, "loss": 0.5276, "loss/crossentropy": 2.369629979133606, "loss/hidden": 0.22802734375, "loss/logits": 0.03534604236483574, "loss/reg": 0.026425734162330627, "step": 36 }, { "epoch": 0.0185, "grad_norm": 2.441415548324585, "grad_norm_var": 0.1079886358057666, "learning_rate": 7.4e-06, "loss": 0.6458, "loss/crossentropy": 2.286492109298706, "loss/hidden": 0.333984375, "loss/logits": 0.047583552077412605, "loss/reg": 0.02642551064491272, "step": 37 }, { "epoch": 0.019, "grad_norm": 1.4532129764556885, "grad_norm_var": 0.107241014688942, "learning_rate": 7.600000000000001e-06, "loss": 0.5401, "loss/crossentropy": 2.5449851751327515, "loss/hidden": 0.24072265625, "loss/logits": 0.035121435299515724, "loss/reg": 0.026425251737236977, "step": 38 }, { "epoch": 0.0195, "grad_norm": 2.312504768371582, "grad_norm_var": 0.1466550400336051, "learning_rate": 7.800000000000002e-06, "loss": 0.6225, "loss/crossentropy": 2.248945951461792, "loss/hidden": 0.3115234375, "loss/logits": 0.04672851786017418, "loss/reg": 0.02642502635717392, "step": 39 }, { "epoch": 0.02, "grad_norm": 1.8888795375823975, "grad_norm_var": 0.15318881349217175, "learning_rate": 8.000000000000001e-06, "loss": 0.5732, "loss/crossentropy": 2.476779580116272, "loss/hidden": 0.26953125, "loss/logits": 0.03942425549030304, "loss/reg": 0.02642476186156273, "step": 40 }, { "epoch": 0.0205, "grad_norm": 1.5608994960784912, "grad_norm_var": 0.15198231846540716, "learning_rate": 8.2e-06, "loss": 0.5381, "loss/crossentropy": 2.38312304019928, "loss/hidden": 0.24072265625, "loss/logits": 0.03310199826955795, "loss/reg": 0.0264244694262743, "step": 41 }, { "epoch": 0.021, "grad_norm": 1.2987440824508667, "grad_norm_var": 0.15503315722776131, "learning_rate": 8.400000000000001e-06, "loss": 0.4821, "loss/crossentropy": 2.515058755874634, "loss/hidden": 0.193359375, "loss/logits": 0.024530705995857716, "loss/reg": 0.026424190029501915, "step": 42 }, { "epoch": 0.0215, "grad_norm": 1.6956250667572021, "grad_norm_var": 0.15157974603150895, "learning_rate": 8.6e-06, "loss": 0.6312, "loss/crossentropy": 2.2517104148864746, "loss/hidden": 0.3251953125, "loss/logits": 0.041729243472218513, "loss/reg": 0.026423903182148933, "step": 43 }, { "epoch": 0.022, "grad_norm": 1.4108027219772339, "grad_norm_var": 0.15114137609056771, "learning_rate": 8.8e-06, "loss": 0.5171, "loss/crossentropy": 2.4841147661209106, "loss/hidden": 0.22412109375, "loss/logits": 0.02869710698723793, "loss/reg": 0.026423608884215355, "step": 44 }, { "epoch": 0.0225, "grad_norm": 1.3235130310058594, "grad_norm_var": 0.14937195903162886, "learning_rate": 9e-06, "loss": 0.52, "loss/crossentropy": 2.2738723754882812, "loss/hidden": 0.2275390625, "loss/logits": 0.028268495574593544, "loss/reg": 0.026423312723636627, "step": 45 }, { "epoch": 0.023, "grad_norm": 1.3061593770980835, "grad_norm_var": 0.14123057510749676, "learning_rate": 9.200000000000002e-06, "loss": 0.4971, "loss/crossentropy": 2.326944351196289, "loss/hidden": 0.2080078125, "loss/logits": 0.024814478121697903, "loss/reg": 0.026423051953315735, "step": 46 }, { "epoch": 0.0235, "grad_norm": 1.972931146621704, "grad_norm_var": 0.14898989683507768, "learning_rate": 9.4e-06, "loss": 0.5559, "loss/crossentropy": 2.4830580949783325, "loss/hidden": 0.26025390625, "loss/logits": 0.03139444626867771, "loss/reg": 0.026422718539834023, "step": 47 }, { "epoch": 0.024, "grad_norm": 1.698211669921875, "grad_norm_var": 0.13983553268258544, "learning_rate": 9.600000000000001e-06, "loss": 0.5664, "loss/crossentropy": 2.416160821914673, "loss/hidden": 0.2578125, "loss/logits": 0.04433598928153515, "loss/reg": 0.026422368362545967, "step": 48 }, { "epoch": 0.0245, "grad_norm": 7.356233596801758, "grad_norm_var": 2.1755974425683684, "learning_rate": 9.800000000000001e-06, "loss": 0.6608, "loss/crossentropy": 2.1511300802230835, "loss/hidden": 0.3544921875, "loss/logits": 0.042037611827254295, "loss/reg": 0.02642207033932209, "step": 49 }, { "epoch": 0.025, "grad_norm": 1.6962363719940186, "grad_norm_var": 2.1670886649573458, "learning_rate": 1e-05, "loss": 0.5114, "loss/crossentropy": 2.493433117866516, "loss/hidden": 0.21875, "loss/logits": 0.028404117561876774, "loss/reg": 0.026421738788485527, "step": 50 }, { "epoch": 0.0255, "grad_norm": 1.5135979652404785, "grad_norm_var": 2.1580740006997834, "learning_rate": 1.02e-05, "loss": 0.4992, "loss/crossentropy": 2.4469869136810303, "loss/hidden": 0.20947265625, "loss/logits": 0.025465765967965126, "loss/reg": 0.02642141655087471, "step": 51 }, { "epoch": 0.026, "grad_norm": 2.1058454513549805, "grad_norm_var": 2.1147619503580235, "learning_rate": 1.04e-05, "loss": 0.5947, "loss/crossentropy": 2.0783703327178955, "loss/hidden": 0.29736328125, "loss/logits": 0.03310043551027775, "loss/reg": 0.02642105147242546, "step": 52 }, { "epoch": 0.0265, "grad_norm": 1.4466326236724854, "grad_norm_var": 2.126641614633889, "learning_rate": 1.0600000000000002e-05, "loss": 0.5175, "loss/crossentropy": 2.5233154296875, "loss/hidden": 0.22607421875, "loss/logits": 0.027255047112703323, "loss/reg": 0.0264207124710083, "step": 53 }, { "epoch": 0.027, "grad_norm": 1.2315421104431152, "grad_norm_var": 2.145947583831748, "learning_rate": 1.0800000000000002e-05, "loss": 0.4939, "loss/crossentropy": 2.482948899269104, "loss/hidden": 0.20263671875, "loss/logits": 0.02701568230986595, "loss/reg": 0.02642032690346241, "step": 54 }, { "epoch": 0.0275, "grad_norm": 1.3502835035324097, "grad_norm_var": 2.1622647893893476, "learning_rate": 1.1000000000000001e-05, "loss": 0.5642, "loss/crossentropy": 2.4037868976593018, "loss/hidden": 0.26220703125, "loss/logits": 0.03778073936700821, "loss/reg": 0.02641993761062622, "step": 55 }, { "epoch": 0.028, "grad_norm": 1.66973078250885, "grad_norm_var": 2.166424380346859, "learning_rate": 1.1200000000000001e-05, "loss": 0.5326, "loss/crossentropy": 2.3202576637268066, "loss/hidden": 0.23681640625, "loss/logits": 0.03157219849526882, "loss/reg": 0.02641947939991951, "step": 56 }, { "epoch": 0.0285, "grad_norm": 1.4568390846252441, "grad_norm_var": 2.1720116007570036, "learning_rate": 1.14e-05, "loss": 0.5873, "loss/crossentropy": 2.3086094856262207, "loss/hidden": 0.27783203125, "loss/logits": 0.0453144833445549, "loss/reg": 0.026419078931212425, "step": 57 }, { "epoch": 0.029, "grad_norm": 1.2021527290344238, "grad_norm_var": 2.1804451998311927, "learning_rate": 1.16e-05, "loss": 0.4861, "loss/crossentropy": 2.5664894580841064, "loss/hidden": 0.19580078125, "loss/logits": 0.0260773915797472, "loss/reg": 0.02641867846250534, "step": 58 }, { "epoch": 0.0295, "grad_norm": 1.2372887134552002, "grad_norm_var": 2.2062031636320434, "learning_rate": 1.18e-05, "loss": 0.5491, "loss/crossentropy": 2.3016046285629272, "loss/hidden": 0.2490234375, "loss/logits": 0.035935116931796074, "loss/reg": 0.0264182947576046, "step": 59 }, { "epoch": 0.03, "grad_norm": 1.4047211408615112, "grad_norm_var": 2.206580767441871, "learning_rate": 1.2e-05, "loss": 0.5279, "loss/crossentropy": 2.2995004653930664, "loss/hidden": 0.23095703125, "loss/logits": 0.032775900326669216, "loss/reg": 0.026417918503284454, "step": 60 }, { "epoch": 0.0305, "grad_norm": 1.2555537223815918, "grad_norm_var": 2.211850675210066, "learning_rate": 1.22e-05, "loss": 0.5124, "loss/crossentropy": 2.3773516416549683, "loss/hidden": 0.22021484375, "loss/logits": 0.028029106557369232, "loss/reg": 0.026417534798383713, "step": 61 }, { "epoch": 0.031, "grad_norm": 1.3694956302642822, "grad_norm_var": 2.207348318396743, "learning_rate": 1.2400000000000002e-05, "loss": 0.5246, "loss/crossentropy": 2.462360382080078, "loss/hidden": 0.2294921875, "loss/logits": 0.030931759625673294, "loss/reg": 0.026417037472128868, "step": 62 }, { "epoch": 0.0315, "grad_norm": 0.8940879106521606, "grad_norm_var": 2.2657112396397707, "learning_rate": 1.2600000000000001e-05, "loss": 0.4918, "loss/crossentropy": 2.4237685203552246, "loss/hidden": 0.20166015625, "loss/logits": 0.026003433391451836, "loss/reg": 0.02641662023961544, "step": 63 }, { "epoch": 0.032, "grad_norm": 1.3153444528579712, "grad_norm_var": 2.2803513495186505, "learning_rate": 1.2800000000000001e-05, "loss": 0.5112, "loss/crossentropy": 2.3414171934127808, "loss/hidden": 0.220703125, "loss/logits": 0.026362700387835503, "loss/reg": 0.026416106149554253, "step": 64 }, { "epoch": 0.0325, "grad_norm": 1.281063437461853, "grad_norm_var": 0.0715017189536231, "learning_rate": 1.3000000000000001e-05, "loss": 0.52, "loss/crossentropy": 2.420620918273926, "loss/hidden": 0.2255859375, "loss/logits": 0.030298630706965923, "loss/reg": 0.026415672153234482, "step": 65 }, { "epoch": 0.033, "grad_norm": 1.3108336925506592, "grad_norm_var": 0.0656601505461642, "learning_rate": 1.3200000000000002e-05, "loss": 0.5189, "loss/crossentropy": 2.2853455543518066, "loss/hidden": 0.22265625, "loss/logits": 0.0321119399741292, "loss/reg": 0.026415223255753517, "step": 66 }, { "epoch": 0.0335, "grad_norm": 1.0983670949935913, "grad_norm_var": 0.06891859533181677, "learning_rate": 1.3400000000000002e-05, "loss": 0.5318, "loss/crossentropy": 2.3410117626190186, "loss/hidden": 0.23681640625, "loss/logits": 0.030876386910676956, "loss/reg": 0.026414690539240837, "step": 67 }, { "epoch": 0.034, "grad_norm": 1.7166627645492554, "grad_norm_var": 0.039260036839271824, "learning_rate": 1.3600000000000002e-05, "loss": 0.5701, "loss/crossentropy": 2.407397150993347, "loss/hidden": 0.2685546875, "loss/logits": 0.03741579130291939, "loss/reg": 0.026414209976792336, "step": 68 }, { "epoch": 0.0345, "grad_norm": 0.9616859555244446, "grad_norm_var": 0.046257726034885954, "learning_rate": 1.38e-05, "loss": 0.455, "loss/crossentropy": 2.5552138090133667, "loss/hidden": 0.169921875, "loss/logits": 0.02096631657332182, "loss/reg": 0.02641364373266697, "step": 69 }, { "epoch": 0.035, "grad_norm": 1.3926982879638672, "grad_norm_var": 0.046469501868423045, "learning_rate": 1.4e-05, "loss": 0.5899, "loss/crossentropy": 2.184352159500122, "loss/hidden": 0.2880859375, "loss/logits": 0.03772860765457153, "loss/reg": 0.02641312964260578, "step": 70 }, { "epoch": 0.0355, "grad_norm": 1.6911873817443848, "grad_norm_var": 0.055686708202271486, "learning_rate": 1.4200000000000001e-05, "loss": 0.5065, "loss/crossentropy": 2.3083138465881348, "loss/hidden": 0.21533203125, "loss/logits": 0.026994884945452213, "loss/reg": 0.02641255594789982, "step": 71 }, { "epoch": 0.036, "grad_norm": 1.5207164287567139, "grad_norm_var": 0.05029689369081134, "learning_rate": 1.4400000000000001e-05, "loss": 0.5155, "loss/crossentropy": 2.514549493789673, "loss/hidden": 0.2265625, "loss/logits": 0.024821529164910316, "loss/reg": 0.02641192451119423, "step": 72 }, { "epoch": 0.0365, "grad_norm": 1.5217493772506714, "grad_norm_var": 0.05175064306116064, "learning_rate": 1.46e-05, "loss": 0.5331, "loss/crossentropy": 2.2549461126327515, "loss/hidden": 0.2353515625, "loss/logits": 0.03362779691815376, "loss/reg": 0.026411263272166252, "step": 73 }, { "epoch": 0.037, "grad_norm": 1.4319448471069336, "grad_norm_var": 0.05133754544456459, "learning_rate": 1.48e-05, "loss": 0.543, "loss/crossentropy": 2.2208691835403442, "loss/hidden": 0.251953125, "loss/logits": 0.026933430694043636, "loss/reg": 0.026410607621073723, "step": 74 }, { "epoch": 0.0375, "grad_norm": 1.5548027753829956, "grad_norm_var": 0.05338703002904901, "learning_rate": 1.5000000000000002e-05, "loss": 0.5053, "loss/crossentropy": 2.4420419931411743, "loss/hidden": 0.21630859375, "loss/logits": 0.02489750273525715, "loss/reg": 0.026409907266497612, "step": 75 }, { "epoch": 0.038, "grad_norm": 1.0714695453643799, "grad_norm_var": 0.058232407176660186, "learning_rate": 1.5200000000000002e-05, "loss": 0.4938, "loss/crossentropy": 2.3792872428894043, "loss/hidden": 0.20458984375, "loss/logits": 0.025158749893307686, "loss/reg": 0.026409264653921127, "step": 76 }, { "epoch": 0.0385, "grad_norm": 1.2519381046295166, "grad_norm_var": 0.05827235736891852, "learning_rate": 1.54e-05, "loss": 0.4813, "loss/crossentropy": 2.3257339000701904, "loss/hidden": 0.1962890625, "loss/logits": 0.02092126850038767, "loss/reg": 0.026408692821860313, "step": 77 }, { "epoch": 0.039, "grad_norm": 1.2653789520263672, "grad_norm_var": 0.05849186368942368, "learning_rate": 1.5600000000000003e-05, "loss": 0.5246, "loss/crossentropy": 2.5811800956726074, "loss/hidden": 0.23095703125, "loss/logits": 0.029558134265244007, "loss/reg": 0.02640816569328308, "step": 78 }, { "epoch": 0.0395, "grad_norm": 2.259216070175171, "grad_norm_var": 0.09562263018362811, "learning_rate": 1.58e-05, "loss": 0.5206, "loss/crossentropy": 2.4250094890594482, "loss/hidden": 0.2265625, "loss/logits": 0.03000534698367119, "loss/reg": 0.026407474651932716, "step": 79 }, { "epoch": 0.04, "grad_norm": 1.7354488372802734, "grad_norm_var": 0.10105330191861767, "learning_rate": 1.6000000000000003e-05, "loss": 0.5139, "loss/crossentropy": 2.3155272006988525, "loss/hidden": 0.22509765625, "loss/logits": 0.02471769694238901, "loss/reg": 0.026406768709421158, "step": 80 }, { "epoch": 0.0405, "grad_norm": 1.6819829940795898, "grad_norm_var": 0.1025191577706432, "learning_rate": 1.62e-05, "loss": 0.5918, "loss/crossentropy": 2.446201205253601, "loss/hidden": 0.2861328125, "loss/logits": 0.04164840281009674, "loss/reg": 0.02640613541007042, "step": 81 }, { "epoch": 0.041, "grad_norm": 1.1699199676513672, "grad_norm_var": 0.1066873821895888, "learning_rate": 1.64e-05, "loss": 0.5134, "loss/crossentropy": 2.456650495529175, "loss/hidden": 0.2177734375, "loss/logits": 0.031566061079502106, "loss/reg": 0.026405589655041695, "step": 82 }, { "epoch": 0.0415, "grad_norm": 1.0190843343734741, "grad_norm_var": 0.11088006372520322, "learning_rate": 1.66e-05, "loss": 0.4661, "loss/crossentropy": 2.4336618185043335, "loss/hidden": 0.18115234375, "loss/logits": 0.02087457850575447, "loss/reg": 0.0264048483222723, "step": 83 }, { "epoch": 0.042, "grad_norm": 1.3154826164245605, "grad_norm_var": 0.10682859054876676, "learning_rate": 1.6800000000000002e-05, "loss": 0.5325, "loss/crossentropy": 2.4335875511169434, "loss/hidden": 0.2392578125, "loss/logits": 0.029201870784163475, "loss/reg": 0.026404235512018204, "step": 84 }, { "epoch": 0.0425, "grad_norm": 1.1499496698379517, "grad_norm_var": 0.0973436240677034, "learning_rate": 1.7e-05, "loss": 0.4708, "loss/crossentropy": 2.3389049768447876, "loss/hidden": 0.1826171875, "loss/logits": 0.024132695980370045, "loss/reg": 0.02640344202518463, "step": 85 }, { "epoch": 0.043, "grad_norm": 1.07028067111969, "grad_norm_var": 0.10585526029347007, "learning_rate": 1.72e-05, "loss": 0.4749, "loss/crossentropy": 2.347036838531494, "loss/hidden": 0.18896484375, "loss/logits": 0.02186472900211811, "loss/reg": 0.02640284039080143, "step": 86 }, { "epoch": 0.0435, "grad_norm": 2.0228259563446045, "grad_norm_var": 0.12474687162745439, "learning_rate": 1.7400000000000003e-05, "loss": 0.5076, "loss/crossentropy": 2.3726253509521484, "loss/hidden": 0.21240234375, "loss/logits": 0.03117395006120205, "loss/reg": 0.026402218267321587, "step": 87 }, { "epoch": 0.044, "grad_norm": 1.689095377922058, "grad_norm_var": 0.12832789033596606, "learning_rate": 1.76e-05, "loss": 0.5393, "loss/crossentropy": 2.6106048822402954, "loss/hidden": 0.2451171875, "loss/logits": 0.030183403752744198, "loss/reg": 0.026401378214359283, "step": 88 }, { "epoch": 0.0445, "grad_norm": 1.4513983726501465, "grad_norm_var": 0.1279703973651166, "learning_rate": 1.7800000000000002e-05, "loss": 0.5203, "loss/crossentropy": 2.3146345615386963, "loss/hidden": 0.22705078125, "loss/logits": 0.029247512109577656, "loss/reg": 0.02640063315629959, "step": 89 }, { "epoch": 0.045, "grad_norm": 1.0706562995910645, "grad_norm_var": 0.13681825045996157, "learning_rate": 1.8e-05, "loss": 0.472, "loss/crossentropy": 2.458780884742737, "loss/hidden": 0.18310546875, "loss/logits": 0.024928967468440533, "loss/reg": 0.026399986818432808, "step": 90 }, { "epoch": 0.0455, "grad_norm": 1.243531346321106, "grad_norm_var": 0.13743203065561993, "learning_rate": 1.8200000000000002e-05, "loss": 0.46, "loss/crossentropy": 2.273237943649292, "loss/hidden": 0.1748046875, "loss/logits": 0.021158389747142792, "loss/reg": 0.026399333029985428, "step": 91 }, { "epoch": 0.046, "grad_norm": 1.248246669769287, "grad_norm_var": 0.13154193773160655, "learning_rate": 1.8400000000000003e-05, "loss": 0.5025, "loss/crossentropy": 2.7035024166107178, "loss/hidden": 0.2138671875, "loss/logits": 0.024649174883961678, "loss/reg": 0.026398882269859314, "step": 92 }, { "epoch": 0.0465, "grad_norm": 1.5103347301483154, "grad_norm_var": 0.13008748368884535, "learning_rate": 1.86e-05, "loss": 0.4831, "loss/crossentropy": 2.3471440076828003, "loss/hidden": 0.193359375, "loss/logits": 0.025768487714231014, "loss/reg": 0.026398463174700737, "step": 93 }, { "epoch": 0.047, "grad_norm": 1.6160238981246948, "grad_norm_var": 0.1300087857040161, "learning_rate": 1.88e-05, "loss": 0.5294, "loss/crossentropy": 2.2618273496627808, "loss/hidden": 0.23681640625, "loss/logits": 0.028604180552065372, "loss/reg": 0.02639804780483246, "step": 94 }, { "epoch": 0.0475, "grad_norm": 1.6858937740325928, "grad_norm_var": 0.08894905728247575, "learning_rate": 1.9e-05, "loss": 0.5078, "loss/crossentropy": 2.2833045721054077, "loss/hidden": 0.21630859375, "loss/logits": 0.027537615969777107, "loss/reg": 0.0263975840061903, "step": 95 }, { "epoch": 0.048, "grad_norm": 1.3516885042190552, "grad_norm_var": 0.08188523397349545, "learning_rate": 1.9200000000000003e-05, "loss": 0.5472, "loss/crossentropy": 2.288330078125, "loss/hidden": 0.25244140625, "loss/logits": 0.030751955695450306, "loss/reg": 0.026396671310067177, "step": 96 }, { "epoch": 0.0485, "grad_norm": 1.1249408721923828, "grad_norm_var": 0.07985427321204851, "learning_rate": 1.94e-05, "loss": 0.4777, "loss/crossentropy": 2.3718440532684326, "loss/hidden": 0.1923828125, "loss/logits": 0.021322906017303467, "loss/reg": 0.026395753026008606, "step": 97 }, { "epoch": 0.049, "grad_norm": 1.2627309560775757, "grad_norm_var": 0.07805640745137694, "learning_rate": 1.9600000000000002e-05, "loss": 0.5154, "loss/crossentropy": 2.2158303260803223, "loss/hidden": 0.21923828125, "loss/logits": 0.03221841435879469, "loss/reg": 0.026394877582788467, "step": 98 }, { "epoch": 0.0495, "grad_norm": 1.2408559322357178, "grad_norm_var": 0.07091623482815752, "learning_rate": 1.98e-05, "loss": 0.5423, "loss/crossentropy": 2.3226230144500732, "loss/hidden": 0.23828125, "loss/logits": 0.04008352570235729, "loss/reg": 0.026393810287117958, "step": 99 }, { "epoch": 0.05, "grad_norm": 1.1801763772964478, "grad_norm_var": 0.07319502933235617, "learning_rate": 2e-05, "loss": 0.5028, "loss/crossentropy": 2.242385983467102, "loss/hidden": 0.20947265625, "loss/logits": 0.02935761120170355, "loss/reg": 0.026393063366413116, "step": 100 }, { "epoch": 0.0505, "grad_norm": 1.55876624584198, "grad_norm_var": 0.07165068938641829, "learning_rate": 2e-05, "loss": 0.6025, "loss/crossentropy": 2.240237832069397, "loss/hidden": 0.28955078125, "loss/logits": 0.04904773272573948, "loss/reg": 0.02639206498861313, "step": 101 }, { "epoch": 0.051, "grad_norm": 2.615293025970459, "grad_norm_var": 0.15385355345349763, "learning_rate": 2e-05, "loss": 0.5793, "loss/crossentropy": 2.32190477848053, "loss/hidden": 0.283203125, "loss/logits": 0.03213760443031788, "loss/reg": 0.026391005143523216, "step": 102 }, { "epoch": 0.0515, "grad_norm": 1.30605149269104, "grad_norm_var": 0.1352356444631638, "learning_rate": 2e-05, "loss": 0.4823, "loss/crossentropy": 2.4284926652908325, "loss/hidden": 0.193359375, "loss/logits": 0.025081547908484936, "loss/reg": 0.02638992853462696, "step": 103 }, { "epoch": 0.052, "grad_norm": 1.141875147819519, "grad_norm_var": 0.13630413553074583, "learning_rate": 2e-05, "loss": 0.508, "loss/crossentropy": 2.3841702938079834, "loss/hidden": 0.2158203125, "loss/logits": 0.02830567955970764, "loss/reg": 0.026389040052890778, "step": 104 }, { "epoch": 0.0525, "grad_norm": 1.3670423030853271, "grad_norm_var": 0.1363173233399147, "learning_rate": 2e-05, "loss": 0.5535, "loss/crossentropy": 2.3601726293563843, "loss/hidden": 0.25048828125, "loss/logits": 0.03918229416012764, "loss/reg": 0.02638789638876915, "step": 105 }, { "epoch": 0.053, "grad_norm": 1.5876195430755615, "grad_norm_var": 0.1297847067338589, "learning_rate": 2e-05, "loss": 0.5052, "loss/crossentropy": 2.3636826276779175, "loss/hidden": 0.21484375, "loss/logits": 0.026523033156991005, "loss/reg": 0.02638677880167961, "step": 106 }, { "epoch": 0.0535, "grad_norm": 1.3877314329147339, "grad_norm_var": 0.12730558444343335, "learning_rate": 2e-05, "loss": 0.5385, "loss/crossentropy": 2.2610294818878174, "loss/hidden": 0.2421875, "loss/logits": 0.03246981091797352, "loss/reg": 0.02638590894639492, "step": 107 }, { "epoch": 0.054, "grad_norm": 1.3986035585403442, "grad_norm_var": 0.12469232217100033, "learning_rate": 2e-05, "loss": 0.5716, "loss/crossentropy": 2.212199330329895, "loss/hidden": 0.2626953125, "loss/logits": 0.045047592371702194, "loss/reg": 0.026384945958852768, "step": 108 }, { "epoch": 0.0545, "grad_norm": 1.056304931640625, "grad_norm_var": 0.1344369200763623, "learning_rate": 2e-05, "loss": 0.4859, "loss/crossentropy": 2.617898106575012, "loss/hidden": 0.193359375, "loss/logits": 0.02872647438198328, "loss/reg": 0.026384029537439346, "step": 109 }, { "epoch": 0.055, "grad_norm": 3.573809862136841, "grad_norm_var": 0.42252804674691424, "learning_rate": 2e-05, "loss": 0.6846, "loss/crossentropy": 2.3089191913604736, "loss/hidden": 0.33984375, "loss/logits": 0.08094017207622528, "loss/reg": 0.026383111253380775, "step": 110 }, { "epoch": 0.0555, "grad_norm": 1.2283390760421753, "grad_norm_var": 0.42747247360055096, "learning_rate": 2e-05, "loss": 0.5406, "loss/crossentropy": 2.066853880882263, "loss/hidden": 0.24658203125, "loss/logits": 0.030207395553588867, "loss/reg": 0.026382330805063248, "step": 111 }, { "epoch": 0.056, "grad_norm": 1.1344459056854248, "grad_norm_var": 0.4354093100708122, "learning_rate": 2e-05, "loss": 0.4825, "loss/crossentropy": 2.4759345054626465, "loss/hidden": 0.19384765625, "loss/logits": 0.02485422883182764, "loss/reg": 0.02638155035674572, "step": 112 }, { "epoch": 0.0565, "grad_norm": 1.476331353187561, "grad_norm_var": 0.425072268588233, "learning_rate": 2e-05, "loss": 0.4962, "loss/crossentropy": 2.257875084877014, "loss/hidden": 0.20751953125, "loss/logits": 0.024832582101225853, "loss/reg": 0.026380501687526703, "step": 113 }, { "epoch": 0.057, "grad_norm": 1.1095691919326782, "grad_norm_var": 0.43204239892278623, "learning_rate": 2e-05, "loss": 0.494, "loss/crossentropy": 2.5208946466445923, "loss/hidden": 0.20361328125, "loss/logits": 0.02655248437076807, "loss/reg": 0.026379412040114403, "step": 114 }, { "epoch": 0.0575, "grad_norm": 1.2755762338638306, "grad_norm_var": 0.4308130924435341, "learning_rate": 2e-05, "loss": 0.494, "loss/crossentropy": 2.5310138463974, "loss/hidden": 0.20458984375, "loss/logits": 0.025613101199269295, "loss/reg": 0.026378460228443146, "step": 115 }, { "epoch": 0.058, "grad_norm": 1.1098158359527588, "grad_norm_var": 0.4343559906164728, "learning_rate": 2e-05, "loss": 0.4746, "loss/crossentropy": 2.6709823608398438, "loss/hidden": 0.18701171875, "loss/logits": 0.023796855472028255, "loss/reg": 0.02637753076851368, "step": 116 }, { "epoch": 0.0585, "grad_norm": 1.940610647201538, "grad_norm_var": 0.44541967059291204, "learning_rate": 2e-05, "loss": 0.5661, "loss/crossentropy": 2.4929665327072144, "loss/hidden": 0.26611328125, "loss/logits": 0.036208903416991234, "loss/reg": 0.026376651600003242, "step": 117 }, { "epoch": 0.059, "grad_norm": 2.426042318344116, "grad_norm_var": 0.42063368115552174, "learning_rate": 2e-05, "loss": 0.5937, "loss/crossentropy": 2.052187740802765, "loss/hidden": 0.298828125, "loss/logits": 0.031148137524724007, "loss/reg": 0.026375625282526016, "step": 118 }, { "epoch": 0.0595, "grad_norm": 1.9228861331939697, "grad_norm_var": 0.4257910091262336, "learning_rate": 2e-05, "loss": 0.6001, "loss/crossentropy": 2.324827551841736, "loss/hidden": 0.291015625, "loss/logits": 0.04536169767379761, "loss/reg": 0.02637471817433834, "step": 119 }, { "epoch": 0.06, "grad_norm": 1.3524922132492065, "grad_norm_var": 0.41651161943220427, "learning_rate": 2e-05, "loss": 0.5096, "loss/crossentropy": 2.5075470209121704, "loss/hidden": 0.2177734375, "loss/logits": 0.028110167011618614, "loss/reg": 0.02637365460395813, "step": 120 }, { "epoch": 0.0605, "grad_norm": 1.4671199321746826, "grad_norm_var": 0.4142398893830403, "learning_rate": 2e-05, "loss": 0.5239, "loss/crossentropy": 2.441853404045105, "loss/hidden": 0.22607421875, "loss/logits": 0.03414294868707657, "loss/reg": 0.026372529566287994, "step": 121 }, { "epoch": 0.061, "grad_norm": 1.0777528285980225, "grad_norm_var": 0.4306804814207595, "learning_rate": 2e-05, "loss": 0.5097, "loss/crossentropy": 2.321939468383789, "loss/hidden": 0.21533203125, "loss/logits": 0.03067285381257534, "loss/reg": 0.026371382176876068, "step": 122 }, { "epoch": 0.0615, "grad_norm": 1.3190155029296875, "grad_norm_var": 0.4325410213447808, "learning_rate": 2e-05, "loss": 0.5256, "loss/crossentropy": 2.414122700691223, "loss/hidden": 0.2294921875, "loss/logits": 0.032370791770517826, "loss/reg": 0.026370296254754066, "step": 123 }, { "epoch": 0.062, "grad_norm": 1.133116364479065, "grad_norm_var": 0.44245743827275397, "learning_rate": 2e-05, "loss": 0.5005, "loss/crossentropy": 2.212061285972595, "loss/hidden": 0.2080078125, "loss/logits": 0.02877300512045622, "loss/reg": 0.026369236409664154, "step": 124 }, { "epoch": 0.0625, "grad_norm": 1.609708547592163, "grad_norm_var": 0.4260775107173165, "learning_rate": 2e-05, "loss": 0.5155, "loss/crossentropy": 2.397601008415222, "loss/hidden": 0.21923828125, "loss/logits": 0.03255470283329487, "loss/reg": 0.026368040591478348, "step": 125 }, { "epoch": 0.063, "grad_norm": 1.7017152309417725, "grad_norm_var": 0.14551891758514066, "learning_rate": 2e-05, "loss": 0.5923, "loss/crossentropy": 2.1400970220565796, "loss/hidden": 0.283203125, "loss/logits": 0.04546245560050011, "loss/reg": 0.026366816833615303, "step": 126 }, { "epoch": 0.0635, "grad_norm": 1.1147139072418213, "grad_norm_var": 0.14976404939792326, "learning_rate": 2e-05, "loss": 0.4842, "loss/crossentropy": 2.1656835079193115, "loss/hidden": 0.19775390625, "loss/logits": 0.022837044671177864, "loss/reg": 0.02636570855975151, "step": 127 }, { "epoch": 0.064, "grad_norm": 1.277297854423523, "grad_norm_var": 0.14506375834877663, "learning_rate": 2e-05, "loss": 0.5123, "loss/crossentropy": 2.5118154287338257, "loss/hidden": 0.21875, "loss/logits": 0.029942544177174568, "loss/reg": 0.02636442333459854, "step": 128 }, { "epoch": 0.0645, "grad_norm": 1.191677451133728, "grad_norm_var": 0.14939848388545987, "learning_rate": 2e-05, "loss": 0.4912, "loss/crossentropy": 2.3038079738616943, "loss/hidden": 0.19921875, "loss/logits": 0.02833767607808113, "loss/reg": 0.026363197714090347, "step": 129 }, { "epoch": 0.065, "grad_norm": 1.2800445556640625, "grad_norm_var": 0.14371961156019347, "learning_rate": 2e-05, "loss": 0.4781, "loss/crossentropy": 2.164215087890625, "loss/hidden": 0.18896484375, "loss/logits": 0.02550451084971428, "loss/reg": 0.026361893862485886, "step": 130 }, { "epoch": 0.0655, "grad_norm": 2.83632230758667, "grad_norm_var": 0.2596730266397259, "learning_rate": 2e-05, "loss": 0.5199, "loss/crossentropy": 2.4381964206695557, "loss/hidden": 0.228515625, "loss/logits": 0.027756940573453903, "loss/reg": 0.02636053040623665, "step": 131 }, { "epoch": 0.066, "grad_norm": 1.4346998929977417, "grad_norm_var": 0.24730943436193792, "learning_rate": 2e-05, "loss": 0.4843, "loss/crossentropy": 2.401941180229187, "loss/hidden": 0.1904296875, "loss/logits": 0.030300754122436047, "loss/reg": 0.026359396055340767, "step": 132 }, { "epoch": 0.0665, "grad_norm": 1.3330755233764648, "grad_norm_var": 0.24018081345897185, "learning_rate": 2e-05, "loss": 0.5346, "loss/crossentropy": 2.3078893423080444, "loss/hidden": 0.244140625, "loss/logits": 0.02686551958322525, "loss/reg": 0.02635829895734787, "step": 133 }, { "epoch": 0.067, "grad_norm": 3.5527265071868896, "grad_norm_var": 0.4541487312436425, "learning_rate": 2e-05, "loss": 0.5719, "loss/crossentropy": 2.3654850721359253, "loss/hidden": 0.271484375, "loss/logits": 0.03688213415443897, "loss/reg": 0.02635718323290348, "step": 134 }, { "epoch": 0.0675, "grad_norm": 1.5558003187179565, "grad_norm_var": 0.4467804937083296, "learning_rate": 2e-05, "loss": 0.5577, "loss/crossentropy": 2.413025140762329, "loss/hidden": 0.255859375, "loss/logits": 0.03832230344414711, "loss/reg": 0.026355979964137077, "step": 135 }, { "epoch": 0.068, "grad_norm": 1.61518394947052, "grad_norm_var": 0.44321835982398144, "learning_rate": 2e-05, "loss": 0.5304, "loss/crossentropy": 2.3400243520736694, "loss/hidden": 0.232421875, "loss/logits": 0.034462086856365204, "loss/reg": 0.026354758068919182, "step": 136 }, { "epoch": 0.0685, "grad_norm": 1.122028112411499, "grad_norm_var": 0.45648783165066575, "learning_rate": 2e-05, "loss": 0.5084, "loss/crossentropy": 2.297537922859192, "loss/hidden": 0.2138671875, "loss/logits": 0.030963504686951637, "loss/reg": 0.02635359950363636, "step": 137 }, { "epoch": 0.069, "grad_norm": 1.678496241569519, "grad_norm_var": 0.43944044570977725, "learning_rate": 2e-05, "loss": 0.5425, "loss/crossentropy": 1.9657554626464844, "loss/hidden": 0.25048828125, "loss/logits": 0.028447046875953674, "loss/reg": 0.02635251358151436, "step": 138 }, { "epoch": 0.0695, "grad_norm": 1.2920198440551758, "grad_norm_var": 0.44053238449102083, "learning_rate": 2e-05, "loss": 0.5061, "loss/crossentropy": 2.2413735389709473, "loss/hidden": 0.212890625, "loss/logits": 0.029677780345082283, "loss/reg": 0.026351330801844597, "step": 139 }, { "epoch": 0.07, "grad_norm": 1.7133574485778809, "grad_norm_var": 0.4248322374530742, "learning_rate": 2e-05, "loss": 0.5116, "loss/crossentropy": 2.4616912603378296, "loss/hidden": 0.21728515625, "loss/logits": 0.03084972407668829, "loss/reg": 0.026350252330303192, "step": 140 }, { "epoch": 0.0705, "grad_norm": 1.637211561203003, "grad_norm_var": 0.42475264869840973, "learning_rate": 2e-05, "loss": 0.5188, "loss/crossentropy": 2.404749631881714, "loss/hidden": 0.22119140625, "loss/logits": 0.03416546434164047, "loss/reg": 0.02634907327592373, "step": 141 }, { "epoch": 0.071, "grad_norm": 1.6117165088653564, "grad_norm_var": 0.4245905890698488, "learning_rate": 2e-05, "loss": 0.5128, "loss/crossentropy": 2.2999398708343506, "loss/hidden": 0.220703125, "loss/logits": 0.0286036329343915, "loss/reg": 0.026347877457737923, "step": 142 }, { "epoch": 0.0715, "grad_norm": 1.5995277166366577, "grad_norm_var": 0.40529966216021474, "learning_rate": 2e-05, "loss": 0.5082, "loss/crossentropy": 2.391393780708313, "loss/hidden": 0.2138671875, "loss/logits": 0.030896139331161976, "loss/reg": 0.02634662576019764, "step": 143 }, { "epoch": 0.072, "grad_norm": 1.5376816987991333, "grad_norm_var": 0.3958791020628865, "learning_rate": 2e-05, "loss": 0.4819, "loss/crossentropy": 2.288747191429138, "loss/hidden": 0.19091796875, "loss/logits": 0.027577555738389492, "loss/reg": 0.026345305144786835, "step": 144 }, { "epoch": 0.0725, "grad_norm": 1.2494720220565796, "grad_norm_var": 0.39227114538706565, "learning_rate": 2e-05, "loss": 0.4809, "loss/crossentropy": 2.2762606143951416, "loss/hidden": 0.19091796875, "loss/logits": 0.026529721915721893, "loss/reg": 0.02634395658969879, "step": 145 }, { "epoch": 0.073, "grad_norm": 1.2957813739776611, "grad_norm_var": 0.39142520941635195, "learning_rate": 2e-05, "loss": 0.5373, "loss/crossentropy": 2.247607469558716, "loss/hidden": 0.236328125, "loss/logits": 0.037526827305555344, "loss/reg": 0.026342619210481644, "step": 146 }, { "epoch": 0.0735, "grad_norm": 1.5920614004135132, "grad_norm_var": 0.2982705153831809, "learning_rate": 2e-05, "loss": 0.5551, "loss/crossentropy": 2.5578393936157227, "loss/hidden": 0.25634765625, "loss/logits": 0.035370574332773685, "loss/reg": 0.026341425254940987, "step": 147 }, { "epoch": 0.074, "grad_norm": 1.115143895149231, "grad_norm_var": 0.3122838762450205, "learning_rate": 2e-05, "loss": 0.4949, "loss/crossentropy": 2.293186843395233, "loss/hidden": 0.20263671875, "loss/logits": 0.028887784108519554, "loss/reg": 0.02634003758430481, "step": 148 }, { "epoch": 0.0745, "grad_norm": 1.2242144346237183, "grad_norm_var": 0.3168093531880851, "learning_rate": 2e-05, "loss": 0.4976, "loss/crossentropy": 2.541364312171936, "loss/hidden": 0.205078125, "loss/logits": 0.029183853417634964, "loss/reg": 0.026338616386055946, "step": 149 }, { "epoch": 0.075, "grad_norm": 1.2801847457885742, "grad_norm_var": 0.043969165908166435, "learning_rate": 2e-05, "loss": 0.5246, "loss/crossentropy": 2.365533709526062, "loss/hidden": 0.22607421875, "loss/logits": 0.035141369327902794, "loss/reg": 0.02633722312748432, "step": 150 }, { "epoch": 0.0755, "grad_norm": 1.456945538520813, "grad_norm_var": 0.0431194160041447, "learning_rate": 2e-05, "loss": 0.4969, "loss/crossentropy": 2.5154623985290527, "loss/hidden": 0.20361328125, "loss/logits": 0.029950300231575966, "loss/reg": 0.02633577026426792, "step": 151 }, { "epoch": 0.076, "grad_norm": 1.2066655158996582, "grad_norm_var": 0.043943164667008955, "learning_rate": 2e-05, "loss": 0.4752, "loss/crossentropy": 2.528536558151245, "loss/hidden": 0.18798828125, "loss/logits": 0.023877541534602642, "loss/reg": 0.026334302499890327, "step": 152 }, { "epoch": 0.0765, "grad_norm": 1.2901597023010254, "grad_norm_var": 0.03918073743505299, "learning_rate": 2e-05, "loss": 0.521, "loss/crossentropy": 2.3224003314971924, "loss/hidden": 0.21484375, "loss/logits": 0.04283316247165203, "loss/reg": 0.02633279375731945, "step": 153 }, { "epoch": 0.077, "grad_norm": 1.74579656124115, "grad_norm_var": 0.04174939581046431, "learning_rate": 2e-05, "loss": 0.4896, "loss/crossentropy": 2.3139768838882446, "loss/hidden": 0.201171875, "loss/logits": 0.02508167363703251, "loss/reg": 0.026331480592489243, "step": 154 }, { "epoch": 0.0775, "grad_norm": 1.2306878566741943, "grad_norm_var": 0.04309645701489041, "learning_rate": 2e-05, "loss": 0.4816, "loss/crossentropy": 2.252236247062683, "loss/hidden": 0.18896484375, "loss/logits": 0.029315452091395855, "loss/reg": 0.02633025124669075, "step": 155 }, { "epoch": 0.078, "grad_norm": 1.297144889831543, "grad_norm_var": 0.03787466463763825, "learning_rate": 2e-05, "loss": 0.5241, "loss/crossentropy": 2.2772055864334106, "loss/hidden": 0.22900390625, "loss/logits": 0.03178275562822819, "loss/reg": 0.026328938081860542, "step": 156 }, { "epoch": 0.0785, "grad_norm": 1.3461697101593018, "grad_norm_var": 0.033891815904075646, "learning_rate": 2e-05, "loss": 0.5533, "loss/crossentropy": 2.2572057247161865, "loss/hidden": 0.2568359375, "loss/logits": 0.03316341433674097, "loss/reg": 0.02632747031748295, "step": 157 }, { "epoch": 0.079, "grad_norm": 1.6142765283584595, "grad_norm_var": 0.033971332471514334, "learning_rate": 2e-05, "loss": 0.477, "loss/crossentropy": 2.3103591203689575, "loss/hidden": 0.189453125, "loss/logits": 0.02428613882511854, "loss/reg": 0.026326211169362068, "step": 158 }, { "epoch": 0.0795, "grad_norm": 1.0435117483139038, "grad_norm_var": 0.03702752005093206, "learning_rate": 2e-05, "loss": 0.4774, "loss/crossentropy": 2.236763596534729, "loss/hidden": 0.18994140625, "loss/logits": 0.02417835220694542, "loss/reg": 0.026324694976210594, "step": 159 }, { "epoch": 0.08, "grad_norm": 3.194502115249634, "grad_norm_var": 0.251077157144137, "learning_rate": 2e-05, "loss": 0.5767, "loss/crossentropy": 2.4404300451278687, "loss/hidden": 0.2236328125, "loss/logits": 0.0898615438491106, "loss/reg": 0.02632344886660576, "step": 160 }, { "epoch": 0.0805, "grad_norm": 1.223811149597168, "grad_norm_var": 0.25180071296473483, "learning_rate": 2e-05, "loss": 0.4781, "loss/crossentropy": 2.2644309997558594, "loss/hidden": 0.19140625, "loss/logits": 0.023457905277609825, "loss/reg": 0.026322180405259132, "step": 161 }, { "epoch": 0.081, "grad_norm": 1.5841586589813232, "grad_norm_var": 0.25117174878087756, "learning_rate": 2e-05, "loss": 0.5629, "loss/crossentropy": 1.9194682240486145, "loss/hidden": 0.25048828125, "loss/logits": 0.049169132485985756, "loss/reg": 0.026320787146687508, "step": 162 }, { "epoch": 0.0815, "grad_norm": 1.2795405387878418, "grad_norm_var": 0.2519956540566284, "learning_rate": 2e-05, "loss": 0.5141, "loss/crossentropy": 2.444055438041687, "loss/hidden": 0.2177734375, "loss/logits": 0.03311499021947384, "loss/reg": 0.026319410651922226, "step": 163 }, { "epoch": 0.082, "grad_norm": 1.0281555652618408, "grad_norm_var": 0.25630376830439533, "learning_rate": 2e-05, "loss": 0.4718, "loss/crossentropy": 2.4007210731506348, "loss/hidden": 0.18359375, "loss/logits": 0.024980327114462852, "loss/reg": 0.026317832991480827, "step": 164 }, { "epoch": 0.0825, "grad_norm": 1.3523935079574585, "grad_norm_var": 0.25363641385507896, "learning_rate": 2e-05, "loss": 0.5099, "loss/crossentropy": 2.6051762104034424, "loss/hidden": 0.21630859375, "loss/logits": 0.030417022295296192, "loss/reg": 0.026316583156585693, "step": 165 }, { "epoch": 0.083, "grad_norm": 1.538618564605713, "grad_norm_var": 0.2520149682902304, "learning_rate": 2e-05, "loss": 0.5429, "loss/crossentropy": 2.453674077987671, "loss/hidden": 0.24951171875, "loss/logits": 0.030248504132032394, "loss/reg": 0.026315055787563324, "step": 166 }, { "epoch": 0.0835, "grad_norm": 1.152441143989563, "grad_norm_var": 0.25811823232553094, "learning_rate": 2e-05, "loss": 0.5287, "loss/crossentropy": 2.23244309425354, "loss/hidden": 0.2294921875, "loss/logits": 0.03604980930685997, "loss/reg": 0.02631353586912155, "step": 167 }, { "epoch": 0.084, "grad_norm": 3.3678812980651855, "grad_norm_var": 0.4812229304062583, "learning_rate": 2e-05, "loss": 0.6537, "loss/crossentropy": 2.2121087312698364, "loss/hidden": 0.322265625, "loss/logits": 0.06832050159573555, "loss/reg": 0.026312291622161865, "step": 168 }, { "epoch": 0.0845, "grad_norm": 1.3094780445098877, "grad_norm_var": 0.48049820171389107, "learning_rate": 2e-05, "loss": 0.5458, "loss/crossentropy": 2.29573655128479, "loss/hidden": 0.24365234375, "loss/logits": 0.03900368791073561, "loss/reg": 0.026311108842492104, "step": 169 }, { "epoch": 0.085, "grad_norm": 1.4413907527923584, "grad_norm_var": 0.47963284313486815, "learning_rate": 2e-05, "loss": 0.5115, "loss/crossentropy": 2.3498464822769165, "loss/hidden": 0.22021484375, "loss/logits": 0.028182944282889366, "loss/reg": 0.026309916749596596, "step": 170 }, { "epoch": 0.0855, "grad_norm": 1.1035057306289673, "grad_norm_var": 0.48627495331464554, "learning_rate": 2e-05, "loss": 0.5094, "loss/crossentropy": 2.3309890031814575, "loss/hidden": 0.20947265625, "loss/logits": 0.036838797852396965, "loss/reg": 0.02630869299173355, "step": 171 }, { "epoch": 0.086, "grad_norm": 1.0321089029312134, "grad_norm_var": 0.4997706555859033, "learning_rate": 2e-05, "loss": 0.4599, "loss/crossentropy": 2.512625813484192, "loss/hidden": 0.17333984375, "loss/logits": 0.023489498533308506, "loss/reg": 0.026307322084903717, "step": 172 }, { "epoch": 0.0865, "grad_norm": 1.2687665224075317, "grad_norm_var": 0.5021274230125977, "learning_rate": 2e-05, "loss": 0.4478, "loss/crossentropy": 2.55221164226532, "loss/hidden": 0.1640625, "loss/logits": 0.020719519816339016, "loss/reg": 0.026306064799427986, "step": 173 }, { "epoch": 0.087, "grad_norm": 1.6230545043945312, "grad_norm_var": 0.5022268861494524, "learning_rate": 2e-05, "loss": 0.5206, "loss/crossentropy": 2.54874849319458, "loss/hidden": 0.22216796875, "loss/logits": 0.03534366935491562, "loss/reg": 0.026304682716727257, "step": 174 }, { "epoch": 0.0875, "grad_norm": 1.4153763055801392, "grad_norm_var": 0.4865523407786817, "learning_rate": 2e-05, "loss": 0.4923, "loss/crossentropy": 2.5351545810699463, "loss/hidden": 0.18896484375, "loss/logits": 0.04026305489242077, "loss/reg": 0.026303274556994438, "step": 175 }, { "epoch": 0.088, "grad_norm": 1.0160194635391235, "grad_norm_var": 0.3075858037077518, "learning_rate": 2e-05, "loss": 0.439, "loss/crossentropy": 2.543141722679138, "loss/hidden": 0.15869140625, "loss/logits": 0.017316540703177452, "loss/reg": 0.0263019111007452, "step": 176 }, { "epoch": 0.0885, "grad_norm": 1.3745949268341064, "grad_norm_var": 0.3050415235722406, "learning_rate": 2e-05, "loss": 0.5442, "loss/crossentropy": 2.3582804203033447, "loss/hidden": 0.24169921875, "loss/logits": 0.03952281177043915, "loss/reg": 0.026300618425011635, "step": 177 }, { "epoch": 0.089, "grad_norm": 1.2340662479400635, "grad_norm_var": 0.30552768222984095, "learning_rate": 2e-05, "loss": 0.5201, "loss/crossentropy": 2.3681315183639526, "loss/hidden": 0.22412109375, "loss/logits": 0.03298753686249256, "loss/reg": 0.026299230754375458, "step": 178 }, { "epoch": 0.0895, "grad_norm": 2.601248264312744, "grad_norm_var": 0.39196807835765096, "learning_rate": 2e-05, "loss": 0.5363, "loss/crossentropy": 2.617705225944519, "loss/hidden": 0.240234375, "loss/logits": 0.03311354760080576, "loss/reg": 0.026297833770513535, "step": 179 }, { "epoch": 0.09, "grad_norm": 1.4031890630722046, "grad_norm_var": 0.37760473459329563, "learning_rate": 2e-05, "loss": 0.5719, "loss/crossentropy": 2.3656851053237915, "loss/hidden": 0.26318359375, "loss/logits": 0.045768093317747116, "loss/reg": 0.02629653364419937, "step": 180 }, { "epoch": 0.0905, "grad_norm": 1.2391202449798584, "grad_norm_var": 0.38085698610252045, "learning_rate": 2e-05, "loss": 0.4815, "loss/crossentropy": 2.306247353553772, "loss/hidden": 0.18798828125, "loss/logits": 0.03056285623461008, "loss/reg": 0.026295220479369164, "step": 181 }, { "epoch": 0.091, "grad_norm": 1.3922662734985352, "grad_norm_var": 0.3815894855763109, "learning_rate": 2e-05, "loss": 0.5416, "loss/crossentropy": 2.421887755393982, "loss/hidden": 0.2333984375, "loss/logits": 0.04522215947508812, "loss/reg": 0.026293916627764702, "step": 182 }, { "epoch": 0.0915, "grad_norm": 1.1777185201644897, "grad_norm_var": 0.38046340604863593, "learning_rate": 2e-05, "loss": 0.481, "loss/crossentropy": 2.294826030731201, "loss/hidden": 0.193359375, "loss/logits": 0.024756859987974167, "loss/reg": 0.026292625814676285, "step": 183 }, { "epoch": 0.092, "grad_norm": 1.3863762617111206, "grad_norm_var": 0.13236200174767798, "learning_rate": 2e-05, "loss": 0.5306, "loss/crossentropy": 2.2481424808502197, "loss/hidden": 0.234375, "loss/logits": 0.03326253779232502, "loss/reg": 0.026291374117136, "step": 184 }, { "epoch": 0.0925, "grad_norm": 1.0816987752914429, "grad_norm_var": 0.13762935148172814, "learning_rate": 2e-05, "loss": 0.4559, "loss/crossentropy": 2.3464468717575073, "loss/hidden": 0.1689453125, "loss/logits": 0.02402583882212639, "loss/reg": 0.026290148496627808, "step": 185 }, { "epoch": 0.093, "grad_norm": 1.0776005983352661, "grad_norm_var": 0.1420453846262613, "learning_rate": 2e-05, "loss": 0.4634, "loss/crossentropy": 2.316567063331604, "loss/hidden": 0.1748046875, "loss/logits": 0.025691150687634945, "loss/reg": 0.02628881298005581, "step": 186 }, { "epoch": 0.0935, "grad_norm": 2.1526918411254883, "grad_norm_var": 0.17787751141178104, "learning_rate": 2e-05, "loss": 0.4898, "loss/crossentropy": 2.2931246757507324, "loss/hidden": 0.20361328125, "loss/logits": 0.023339038714766502, "loss/reg": 0.026287470012903214, "step": 187 }, { "epoch": 0.094, "grad_norm": 1.3883178234100342, "grad_norm_var": 0.16810970663468652, "learning_rate": 2e-05, "loss": 0.4682, "loss/crossentropy": 2.4850372076034546, "loss/hidden": 0.16845703125, "loss/logits": 0.036926812492311, "loss/reg": 0.0262861680239439, "step": 188 }, { "epoch": 0.0945, "grad_norm": 1.1316860914230347, "grad_norm_var": 0.172176362699476, "learning_rate": 2e-05, "loss": 0.4799, "loss/crossentropy": 2.5390676259994507, "loss/hidden": 0.19091796875, "loss/logits": 0.02617151476442814, "loss/reg": 0.026284806430339813, "step": 189 }, { "epoch": 0.095, "grad_norm": 1.310356616973877, "grad_norm_var": 0.1697565690965554, "learning_rate": 2e-05, "loss": 0.5577, "loss/crossentropy": 2.2394298315048218, "loss/hidden": 0.2578125, "loss/logits": 0.03705478459596634, "loss/reg": 0.026283571496605873, "step": 190 }, { "epoch": 0.0955, "grad_norm": 1.224501371383667, "grad_norm_var": 0.1716142091861707, "learning_rate": 2e-05, "loss": 0.4853, "loss/crossentropy": 2.3653067350387573, "loss/hidden": 0.18701171875, "loss/logits": 0.03547767084091902, "loss/reg": 0.026282308623194695, "step": 191 }, { "epoch": 0.096, "grad_norm": 1.1369792222976685, "grad_norm_var": 0.16654605297517922, "learning_rate": 2e-05, "loss": 0.4612, "loss/crossentropy": 2.4437299966812134, "loss/hidden": 0.173828125, "loss/logits": 0.024531416594982147, "loss/reg": 0.02628110721707344, "step": 192 }, { "epoch": 0.0965, "grad_norm": 1.639382004737854, "grad_norm_var": 0.1702244083590602, "learning_rate": 2e-05, "loss": 0.5584, "loss/crossentropy": 2.369232177734375, "loss/hidden": 0.251953125, "loss/logits": 0.04362649656832218, "loss/reg": 0.026279788464307785, "step": 193 }, { "epoch": 0.097, "grad_norm": 1.7320666313171387, "grad_norm_var": 0.17397129527364066, "learning_rate": 2e-05, "loss": 0.584, "loss/crossentropy": 2.290635347366333, "loss/hidden": 0.25537109375, "loss/logits": 0.0658609364181757, "loss/reg": 0.026278505101799965, "step": 194 }, { "epoch": 0.0975, "grad_norm": 1.3818726539611816, "grad_norm_var": 0.07845907156529677, "learning_rate": 2e-05, "loss": 0.47, "loss/crossentropy": 2.1524158716201782, "loss/hidden": 0.1826171875, "loss/logits": 0.024603160098195076, "loss/reg": 0.02627725526690483, "step": 195 }, { "epoch": 0.098, "grad_norm": 1.499199628829956, "grad_norm_var": 0.07951141157999278, "learning_rate": 2e-05, "loss": 0.5272, "loss/crossentropy": 2.4975160360336304, "loss/hidden": 0.22265625, "loss/logits": 0.04179301019757986, "loss/reg": 0.02627602592110634, "step": 196 }, { "epoch": 0.0985, "grad_norm": 1.5929518938064575, "grad_norm_var": 0.0810677599209079, "learning_rate": 2e-05, "loss": 0.6116, "loss/crossentropy": 2.5046887397766113, "loss/hidden": 0.306640625, "loss/logits": 0.04225216433405876, "loss/reg": 0.026274660602211952, "step": 197 }, { "epoch": 0.099, "grad_norm": 1.1331342458724976, "grad_norm_var": 0.0853280978459693, "learning_rate": 2e-05, "loss": 0.4498, "loss/crossentropy": 2.4783315658569336, "loss/hidden": 0.1640625, "loss/logits": 0.022989329881966114, "loss/reg": 0.026273364201188087, "step": 198 }, { "epoch": 0.0995, "grad_norm": 1.2823922634124756, "grad_norm_var": 0.0832189351924588, "learning_rate": 2e-05, "loss": 0.5028, "loss/crossentropy": 2.428224563598633, "loss/hidden": 0.21044921875, "loss/logits": 0.029634020291268826, "loss/reg": 0.02627207711338997, "step": 199 }, { "epoch": 0.1, "grad_norm": 1.5657204389572144, "grad_norm_var": 0.0852752560623344, "learning_rate": 2e-05, "loss": 0.5566, "loss/crossentropy": 2.205379009246826, "loss/hidden": 0.25341796875, "loss/logits": 0.04043233580887318, "loss/reg": 0.026270678266882896, "step": 200 }, { "epoch": 0.1005, "grad_norm": 2.498617649078369, "grad_norm_var": 0.15143969810336458, "learning_rate": 2e-05, "loss": 0.5453, "loss/crossentropy": 2.2436362504959106, "loss/hidden": 0.25048828125, "loss/logits": 0.03208579681813717, "loss/reg": 0.026269439607858658, "step": 201 }, { "epoch": 0.101, "grad_norm": 1.1255189180374146, "grad_norm_var": 0.1489852922170759, "learning_rate": 2e-05, "loss": 0.4846, "loss/crossentropy": 2.423098921775818, "loss/hidden": 0.19384765625, "loss/logits": 0.02810557559132576, "loss/reg": 0.02626824378967285, "step": 202 }, { "epoch": 0.1015, "grad_norm": 1.7557874917984009, "grad_norm_var": 0.1236135205651595, "learning_rate": 2e-05, "loss": 0.5679, "loss/crossentropy": 2.62020206451416, "loss/hidden": 0.2685546875, "loss/logits": 0.03669197857379913, "loss/reg": 0.026267159730196, "step": 203 }, { "epoch": 0.102, "grad_norm": 1.1842639446258545, "grad_norm_var": 0.12823160649832796, "learning_rate": 2e-05, "loss": 0.5182, "loss/crossentropy": 2.43496835231781, "loss/hidden": 0.22412109375, "loss/logits": 0.03145230747759342, "loss/reg": 0.026265980675816536, "step": 204 }, { "epoch": 0.1025, "grad_norm": 3.2057254314422607, "grad_norm_var": 0.30915423120596724, "learning_rate": 2e-05, "loss": 0.495, "loss/crossentropy": 2.6262258291244507, "loss/hidden": 0.2099609375, "loss/logits": 0.022387961857020855, "loss/reg": 0.026264773681759834, "step": 205 }, { "epoch": 0.103, "grad_norm": 1.5268100500106812, "grad_norm_var": 0.3043212521218976, "learning_rate": 2e-05, "loss": 0.5423, "loss/crossentropy": 2.3472981452941895, "loss/hidden": 0.2353515625, "loss/logits": 0.04430149123072624, "loss/reg": 0.026263581588864326, "step": 206 }, { "epoch": 0.1035, "grad_norm": 1.2183657884597778, "grad_norm_var": 0.30462490819346133, "learning_rate": 2e-05, "loss": 0.5171, "loss/crossentropy": 2.2207844257354736, "loss/hidden": 0.22119140625, "loss/logits": 0.0333048552274704, "loss/reg": 0.026262367144227028, "step": 207 }, { "epoch": 0.104, "grad_norm": 1.3168612718582153, "grad_norm_var": 0.29572373678734315, "learning_rate": 2e-05, "loss": 0.4704, "loss/crossentropy": 2.4785603284835815, "loss/hidden": 0.18212890625, "loss/logits": 0.025700876489281654, "loss/reg": 0.026260942220687866, "step": 208 }, { "epoch": 0.1045, "grad_norm": 1.104201316833496, "grad_norm_var": 0.31107634650352317, "learning_rate": 2e-05, "loss": 0.5125, "loss/crossentropy": 2.440949320793152, "loss/hidden": 0.21435546875, "loss/logits": 0.03553314134478569, "loss/reg": 0.026259683072566986, "step": 209 }, { "epoch": 0.105, "grad_norm": 1.179359793663025, "grad_norm_var": 0.3182418442890669, "learning_rate": 2e-05, "loss": 0.5404, "loss/crossentropy": 2.4222298860549927, "loss/hidden": 0.23583984375, "loss/logits": 0.04196472465991974, "loss/reg": 0.026258250698447227, "step": 210 }, { "epoch": 0.1055, "grad_norm": 1.9198130369186401, "grad_norm_var": 0.3252966300662526, "learning_rate": 2e-05, "loss": 0.7362, "loss/crossentropy": 2.1343027353286743, "loss/hidden": 0.42236328125, "loss/logits": 0.051308806985616684, "loss/reg": 0.02625690959393978, "step": 211 }, { "epoch": 0.106, "grad_norm": 1.945879578590393, "grad_norm_var": 0.33359211146878015, "learning_rate": 2e-05, "loss": 0.5191, "loss/crossentropy": 2.629801630973816, "loss/hidden": 0.22802734375, "loss/logits": 0.028506163507699966, "loss/reg": 0.026255663484334946, "step": 212 }, { "epoch": 0.1065, "grad_norm": 1.10844886302948, "grad_norm_var": 0.3485388100979046, "learning_rate": 2e-05, "loss": 0.5026, "loss/crossentropy": 2.4873945713043213, "loss/hidden": 0.2060546875, "loss/logits": 0.033975718542933464, "loss/reg": 0.026254238560795784, "step": 213 }, { "epoch": 0.107, "grad_norm": 1.5501041412353516, "grad_norm_var": 0.3352879309740613, "learning_rate": 2e-05, "loss": 0.5128, "loss/crossentropy": 2.2922143936157227, "loss/hidden": 0.220703125, "loss/logits": 0.029572436586022377, "loss/reg": 0.026252800598740578, "step": 214 }, { "epoch": 0.1075, "grad_norm": 1.4351506233215332, "grad_norm_var": 0.3304201508174941, "learning_rate": 2e-05, "loss": 0.5019, "loss/crossentropy": 2.3728071451187134, "loss/hidden": 0.208984375, "loss/logits": 0.030389025807380676, "loss/reg": 0.02625151537358761, "step": 215 }, { "epoch": 0.108, "grad_norm": 1.1031361818313599, "grad_norm_var": 0.3460650712842908, "learning_rate": 2e-05, "loss": 0.491, "loss/crossentropy": 2.4348747730255127, "loss/hidden": 0.19970703125, "loss/logits": 0.028779378160834312, "loss/reg": 0.026250220835208893, "step": 216 }, { "epoch": 0.1085, "grad_norm": 1.664985179901123, "grad_norm_var": 0.28668546672827777, "learning_rate": 2e-05, "loss": 0.5599, "loss/crossentropy": 2.399816870689392, "loss/hidden": 0.248046875, "loss/logits": 0.04935701750218868, "loss/reg": 0.026248781010508537, "step": 217 }, { "epoch": 0.109, "grad_norm": 1.4927318096160889, "grad_norm_var": 0.2757241244708178, "learning_rate": 2e-05, "loss": 0.5111, "loss/crossentropy": 2.343783974647522, "loss/hidden": 0.22021484375, "loss/logits": 0.02840256877243519, "loss/reg": 0.026247289031744003, "step": 218 }, { "epoch": 0.1095, "grad_norm": 1.477570652961731, "grad_norm_var": 0.2727232102389791, "learning_rate": 2e-05, "loss": 0.5197, "loss/crossentropy": 2.4229984283447266, "loss/hidden": 0.22314453125, "loss/logits": 0.034054605290293694, "loss/reg": 0.026245808228850365, "step": 219 }, { "epoch": 0.11, "grad_norm": 1.3535478115081787, "grad_norm_var": 0.26677633070284795, "learning_rate": 2e-05, "loss": 0.4955, "loss/crossentropy": 2.4335192441940308, "loss/hidden": 0.2021484375, "loss/logits": 0.03086682688444853, "loss/reg": 0.02624441497027874, "step": 220 }, { "epoch": 0.1105, "grad_norm": 1.4819797277450562, "grad_norm_var": 0.06910834048590857, "learning_rate": 2e-05, "loss": 0.494, "loss/crossentropy": 2.3202240467071533, "loss/hidden": 0.1982421875, "loss/logits": 0.03335183020681143, "loss/reg": 0.026242973282933235, "step": 221 }, { "epoch": 0.111, "grad_norm": 3.001047372817993, "grad_norm_var": 0.2239867367636629, "learning_rate": 2e-05, "loss": 0.5824, "loss/crossentropy": 2.5001453161239624, "loss/hidden": 0.259765625, "loss/logits": 0.06020928919315338, "loss/reg": 0.02624150738120079, "step": 222 }, { "epoch": 0.1115, "grad_norm": 1.3792407512664795, "grad_norm_var": 0.21908974537501535, "learning_rate": 2e-05, "loss": 0.5162, "loss/crossentropy": 2.067277252674103, "loss/hidden": 0.22119140625, "loss/logits": 0.03264045529067516, "loss/reg": 0.026240520179271698, "step": 223 }, { "epoch": 0.112, "grad_norm": 1.0752317905426025, "grad_norm_var": 0.2296741211773119, "learning_rate": 2e-05, "loss": 0.4715, "loss/crossentropy": 2.3376221656799316, "loss/hidden": 0.18017578125, "loss/logits": 0.028950211592018604, "loss/reg": 0.026239832863211632, "step": 224 }, { "epoch": 0.1125, "grad_norm": 1.2668484449386597, "grad_norm_var": 0.22237485135677842, "learning_rate": 2e-05, "loss": 0.4997, "loss/crossentropy": 2.0572392344474792, "loss/hidden": 0.20654296875, "loss/logits": 0.03081146441400051, "loss/reg": 0.026238473132252693, "step": 225 }, { "epoch": 0.113, "grad_norm": 1.2038859128952026, "grad_norm_var": 0.2212749830240483, "learning_rate": 2e-05, "loss": 0.496, "loss/crossentropy": 2.3101898431777954, "loss/hidden": 0.20068359375, "loss/logits": 0.032975198701024055, "loss/reg": 0.026237143203616142, "step": 226 }, { "epoch": 0.1135, "grad_norm": 1.1962757110595703, "grad_norm_var": 0.21626523006927076, "learning_rate": 2e-05, "loss": 0.5397, "loss/crossentropy": 2.3421201705932617, "loss/hidden": 0.234375, "loss/logits": 0.042991749942302704, "loss/reg": 0.026236219331622124, "step": 227 }, { "epoch": 0.114, "grad_norm": 1.3072717189788818, "grad_norm_var": 0.20238375910022696, "learning_rate": 2e-05, "loss": 0.4899, "loss/crossentropy": 2.545662522315979, "loss/hidden": 0.19873046875, "loss/logits": 0.028848190791904926, "loss/reg": 0.02623477764427662, "step": 228 }, { "epoch": 0.1145, "grad_norm": 1.4646738767623901, "grad_norm_var": 0.1943966383373566, "learning_rate": 2e-05, "loss": 0.5319, "loss/crossentropy": 2.481440782546997, "loss/hidden": 0.236328125, "loss/logits": 0.03321322426199913, "loss/reg": 0.02623329870402813, "step": 229 }, { "epoch": 0.115, "grad_norm": 1.460798978805542, "grad_norm_var": 0.1938919184279494, "learning_rate": 2e-05, "loss": 0.5487, "loss/crossentropy": 2.2658169269561768, "loss/hidden": 0.25048828125, "loss/logits": 0.03588264063000679, "loss/reg": 0.026232033967971802, "step": 230 }, { "epoch": 0.1155, "grad_norm": 1.8251186609268188, "grad_norm_var": 0.20209032603179977, "learning_rate": 2e-05, "loss": 0.4954, "loss/crossentropy": 2.0918792486190796, "loss/hidden": 0.20458984375, "loss/logits": 0.028470170684158802, "loss/reg": 0.02623056247830391, "step": 231 }, { "epoch": 0.116, "grad_norm": 1.0807620286941528, "grad_norm_var": 0.20325974318190695, "learning_rate": 2e-05, "loss": 0.4663, "loss/crossentropy": 2.4854743480682373, "loss/hidden": 0.18017578125, "loss/logits": 0.023857600055634975, "loss/reg": 0.026229269802570343, "step": 232 }, { "epoch": 0.1165, "grad_norm": 1.2416105270385742, "grad_norm_var": 0.20420357740239006, "learning_rate": 2e-05, "loss": 0.4939, "loss/crossentropy": 2.5404441356658936, "loss/hidden": 0.2001953125, "loss/logits": 0.031377360224723816, "loss/reg": 0.0262277964502573, "step": 233 }, { "epoch": 0.117, "grad_norm": 1.0784002542495728, "grad_norm_var": 0.21294726278598167, "learning_rate": 2e-05, "loss": 0.4764, "loss/crossentropy": 2.1334633231163025, "loss/hidden": 0.18603515625, "loss/logits": 0.028105400502681732, "loss/reg": 0.02622636966407299, "step": 234 }, { "epoch": 0.1175, "grad_norm": 1.4805059432983398, "grad_norm_var": 0.21296607019170413, "learning_rate": 2e-05, "loss": 0.5299, "loss/crossentropy": 2.363998532295227, "loss/hidden": 0.22998046875, "loss/logits": 0.03765851445496082, "loss/reg": 0.026224961504340172, "step": 235 }, { "epoch": 0.118, "grad_norm": 1.4707082509994507, "grad_norm_var": 0.21261289860814922, "learning_rate": 2e-05, "loss": 0.5063, "loss/crossentropy": 2.5150575637817383, "loss/hidden": 0.21142578125, "loss/logits": 0.032685703597962856, "loss/reg": 0.02622355706989765, "step": 236 }, { "epoch": 0.1185, "grad_norm": 1.2693709135055542, "grad_norm_var": 0.2142025931629329, "learning_rate": 2e-05, "loss": 0.4777, "loss/crossentropy": 2.364490509033203, "loss/hidden": 0.189453125, "loss/logits": 0.026029310189187527, "loss/reg": 0.026222191751003265, "step": 237 }, { "epoch": 0.119, "grad_norm": 1.4452778100967407, "grad_norm_var": 0.03857260853696444, "learning_rate": 2e-05, "loss": 0.4884, "loss/crossentropy": 2.370029330253601, "loss/hidden": 0.19873046875, "loss/logits": 0.027492761611938477, "loss/reg": 0.026220764964818954, "step": 238 }, { "epoch": 0.1195, "grad_norm": 1.3660000562667847, "grad_norm_var": 0.03849288132132358, "learning_rate": 2e-05, "loss": 0.4504, "loss/crossentropy": 2.579773426055908, "loss/hidden": 0.1650390625, "loss/logits": 0.02312152460217476, "loss/reg": 0.026219261810183525, "step": 239 }, { "epoch": 0.12, "grad_norm": 1.1201462745666504, "grad_norm_var": 0.03711094738648981, "learning_rate": 2e-05, "loss": 0.4707, "loss/crossentropy": 2.5026135444641113, "loss/hidden": 0.1708984375, "loss/logits": 0.037659027613699436, "loss/reg": 0.02621796354651451, "step": 240 }, { "epoch": 0.1205, "grad_norm": 1.5148764848709106, "grad_norm_var": 0.03887221528161528, "learning_rate": 2e-05, "loss": 0.5356, "loss/crossentropy": 2.2153124809265137, "loss/hidden": 0.23974609375, "loss/logits": 0.03367053158581257, "loss/reg": 0.02621665596961975, "step": 241 }, { "epoch": 0.121, "grad_norm": 4.675024509429932, "grad_norm_var": 0.726447806322074, "learning_rate": 2e-05, "loss": 0.9674, "loss/crossentropy": 2.5507571697235107, "loss/hidden": 0.47119140625, "loss/logits": 0.23407735768705606, "loss/reg": 0.026215286925435066, "step": 242 }, { "epoch": 0.1215, "grad_norm": 1.3312729597091675, "grad_norm_var": 0.7209984947184022, "learning_rate": 2e-05, "loss": 0.4611, "loss/crossentropy": 2.38046658039093, "loss/hidden": 0.1767578125, "loss/logits": 0.022188683971762657, "loss/reg": 0.02621396817266941, "step": 243 }, { "epoch": 0.122, "grad_norm": 2.4490838050842285, "grad_norm_var": 0.7623712839956812, "learning_rate": 2e-05, "loss": 0.6231, "loss/crossentropy": 2.5406309366226196, "loss/hidden": 0.3056640625, "loss/logits": 0.0553472563624382, "loss/reg": 0.02621266432106495, "step": 244 }, { "epoch": 0.1225, "grad_norm": 1.5570958852767944, "grad_norm_var": 0.7607187136014618, "learning_rate": 2e-05, "loss": 0.4948, "loss/crossentropy": 2.2163580656051636, "loss/hidden": 0.2060546875, "loss/logits": 0.026676415465772152, "loss/reg": 0.02621115930378437, "step": 245 }, { "epoch": 0.123, "grad_norm": 1.2748626470565796, "grad_norm_var": 0.767517463439591, "learning_rate": 2e-05, "loss": 0.5207, "loss/crossentropy": 2.3726441860198975, "loss/hidden": 0.2294921875, "loss/logits": 0.02906488999724388, "loss/reg": 0.026209814473986626, "step": 246 }, { "epoch": 0.1235, "grad_norm": 1.5057262182235718, "grad_norm_var": 0.7658503992600496, "learning_rate": 2e-05, "loss": 0.4962, "loss/crossentropy": 2.442053437232971, "loss/hidden": 0.2021484375, "loss/logits": 0.0319626173004508, "loss/reg": 0.02620851993560791, "step": 247 }, { "epoch": 0.124, "grad_norm": 1.2367428541183472, "grad_norm_var": 0.7562333027864989, "learning_rate": 2e-05, "loss": 0.4891, "loss/crossentropy": 2.32527756690979, "loss/hidden": 0.19775390625, "loss/logits": 0.029276075772941113, "loss/reg": 0.02620730549097061, "step": 248 }, { "epoch": 0.1245, "grad_norm": 1.3585014343261719, "grad_norm_var": 0.7510956988655692, "learning_rate": 2e-05, "loss": 0.505, "loss/crossentropy": 2.4313305616378784, "loss/hidden": 0.2060546875, "loss/logits": 0.036865890957415104, "loss/reg": 0.02620592899620533, "step": 249 }, { "epoch": 0.125, "grad_norm": 1.1339526176452637, "grad_norm_var": 0.7471780769863924, "learning_rate": 2e-05, "loss": 0.4488, "loss/crossentropy": 2.309004545211792, "loss/hidden": 0.1650390625, "loss/logits": 0.021719621494412422, "loss/reg": 0.0262046679854393, "step": 250 }, { "epoch": 0.1255, "grad_norm": 1.6961034536361694, "grad_norm_var": 0.7455897121963819, "learning_rate": 2e-05, "loss": 0.4853, "loss/crossentropy": 2.3145781755447388, "loss/hidden": 0.19189453125, "loss/logits": 0.03134002722799778, "loss/reg": 0.02620331011712551, "step": 251 }, { "epoch": 0.126, "grad_norm": 1.137927532196045, "grad_norm_var": 0.760479623698442, "learning_rate": 2e-05, "loss": 0.4469, "loss/crossentropy": 2.371696949005127, "loss/hidden": 0.1630859375, "loss/logits": 0.021795951761305332, "loss/reg": 0.02620198018848896, "step": 252 }, { "epoch": 0.1265, "grad_norm": 1.1879764795303345, "grad_norm_var": 0.7648019998891016, "learning_rate": 2e-05, "loss": 0.4938, "loss/crossentropy": 2.3237900733947754, "loss/hidden": 0.20703125, "loss/logits": 0.02474562544375658, "loss/reg": 0.026200512424111366, "step": 253 }, { "epoch": 0.127, "grad_norm": 1.437303066253662, "grad_norm_var": 0.7649964465157646, "learning_rate": 2e-05, "loss": 0.4641, "loss/crossentropy": 2.4125940799713135, "loss/hidden": 0.17626953125, "loss/logits": 0.025886863470077515, "loss/reg": 0.026198983192443848, "step": 254 }, { "epoch": 0.1275, "grad_norm": 1.298660159111023, "grad_norm_var": 0.7675955671113466, "learning_rate": 2e-05, "loss": 0.4572, "loss/crossentropy": 2.531725764274597, "loss/hidden": 0.16943359375, "loss/logits": 0.025743640959262848, "loss/reg": 0.02619752287864685, "step": 255 }, { "epoch": 0.128, "grad_norm": 1.39458429813385, "grad_norm_var": 0.7540231641910907, "learning_rate": 2e-05, "loss": 0.4862, "loss/crossentropy": 2.186591327190399, "loss/hidden": 0.20166015625, "loss/logits": 0.022591713815927505, "loss/reg": 0.026196002960205078, "step": 256 }, { "epoch": 0.1285, "grad_norm": 1.212915062904358, "grad_norm_var": 0.7646330349939954, "learning_rate": 2e-05, "loss": 0.5087, "loss/crossentropy": 2.471588611602783, "loss/hidden": 0.21337890625, "loss/logits": 0.033330729231238365, "loss/reg": 0.026194443926215172, "step": 257 }, { "epoch": 0.129, "grad_norm": 1.0683094263076782, "grad_norm_var": 0.10754076085599748, "learning_rate": 2e-05, "loss": 0.4712, "loss/crossentropy": 2.2822721004486084, "loss/hidden": 0.18115234375, "loss/logits": 0.028153350576758385, "loss/reg": 0.026192834600806236, "step": 258 }, { "epoch": 0.1295, "grad_norm": 1.2983660697937012, "grad_norm_var": 0.10787735781459536, "learning_rate": 2e-05, "loss": 0.5124, "loss/crossentropy": 2.3575881719589233, "loss/hidden": 0.22021484375, "loss/logits": 0.03026559017598629, "loss/reg": 0.02619129605591297, "step": 259 }, { "epoch": 0.13, "grad_norm": 1.4962793588638306, "grad_norm_var": 0.030134894623511776, "learning_rate": 2e-05, "loss": 0.4676, "loss/crossentropy": 2.409846782684326, "loss/hidden": 0.18212890625, "loss/logits": 0.02358458936214447, "loss/reg": 0.026189813390374184, "step": 260 }, { "epoch": 0.1305, "grad_norm": 1.3754230737686157, "grad_norm_var": 0.026719927934763098, "learning_rate": 2e-05, "loss": 0.5312, "loss/crossentropy": 2.177566409111023, "loss/hidden": 0.23876953125, "loss/logits": 0.030562346801161766, "loss/reg": 0.026188237592577934, "step": 261 }, { "epoch": 0.131, "grad_norm": 1.342571496963501, "grad_norm_var": 0.02660255745073622, "learning_rate": 2e-05, "loss": 0.4839, "loss/crossentropy": 2.513023018836975, "loss/hidden": 0.18896484375, "loss/logits": 0.033111236058175564, "loss/reg": 0.026186756789684296, "step": 262 }, { "epoch": 0.1315, "grad_norm": 1.2367901802062988, "grad_norm_var": 0.02460065001579365, "learning_rate": 2e-05, "loss": 0.4956, "loss/crossentropy": 2.3763153553009033, "loss/hidden": 0.20458984375, "loss/logits": 0.029151923954486847, "loss/reg": 0.02618517354130745, "step": 263 }, { "epoch": 0.132, "grad_norm": 1.9415297508239746, "grad_norm_var": 0.04904137234389789, "learning_rate": 2e-05, "loss": 0.5627, "loss/crossentropy": 2.240867018699646, "loss/hidden": 0.26611328125, "loss/logits": 0.03479018062353134, "loss/reg": 0.026183558627963066, "step": 264 }, { "epoch": 0.1325, "grad_norm": 0.9934033751487732, "grad_norm_var": 0.05701087259719828, "learning_rate": 2e-05, "loss": 0.4713, "loss/crossentropy": 2.3560155630111694, "loss/hidden": 0.18017578125, "loss/logits": 0.029294829815626144, "loss/reg": 0.026182031258940697, "step": 265 }, { "epoch": 0.133, "grad_norm": 1.0373915433883667, "grad_norm_var": 0.06009524379270439, "learning_rate": 2e-05, "loss": 0.494, "loss/crossentropy": 2.400221347808838, "loss/hidden": 0.2001953125, "loss/logits": 0.031994713470339775, "loss/reg": 0.026180392131209373, "step": 266 }, { "epoch": 0.1335, "grad_norm": 1.267191767692566, "grad_norm_var": 0.05021139115474562, "learning_rate": 2e-05, "loss": 0.5615, "loss/crossentropy": 2.1523420810699463, "loss/hidden": 0.2490234375, "loss/logits": 0.05070135369896889, "loss/reg": 0.026178674772381783, "step": 267 }, { "epoch": 0.134, "grad_norm": 1.6182429790496826, "grad_norm_var": 0.05454457187013859, "learning_rate": 2e-05, "loss": 0.5, "loss/crossentropy": 2.299275279045105, "loss/hidden": 0.20166015625, "loss/logits": 0.03661351092159748, "loss/reg": 0.02617703191936016, "step": 268 }, { "epoch": 0.1345, "grad_norm": 1.1830179691314697, "grad_norm_var": 0.05463698624171962, "learning_rate": 2e-05, "loss": 0.542, "loss/crossentropy": 2.237685799598694, "loss/hidden": 0.24462890625, "loss/logits": 0.03558643162250519, "loss/reg": 0.02617518976330757, "step": 269 }, { "epoch": 0.135, "grad_norm": 1.0215861797332764, "grad_norm_var": 0.05922028974216963, "learning_rate": 2e-05, "loss": 0.4509, "loss/crossentropy": 2.386792778968811, "loss/hidden": 0.1650390625, "loss/logits": 0.02410146687179804, "loss/reg": 0.026173612102866173, "step": 270 }, { "epoch": 0.1355, "grad_norm": 1.2516766786575317, "grad_norm_var": 0.059361270037086425, "learning_rate": 2e-05, "loss": 0.5417, "loss/crossentropy": 2.2572768926620483, "loss/hidden": 0.23974609375, "loss/logits": 0.04025658965110779, "loss/reg": 0.026171868667006493, "step": 271 }, { "epoch": 0.136, "grad_norm": 1.1899913549423218, "grad_norm_var": 0.05929371602914331, "learning_rate": 2e-05, "loss": 0.4991, "loss/crossentropy": 2.5554966926574707, "loss/hidden": 0.208984375, "loss/logits": 0.028443695977330208, "loss/reg": 0.026170162484049797, "step": 272 }, { "epoch": 0.1365, "grad_norm": 1.716871976852417, "grad_norm_var": 0.0704296166532296, "learning_rate": 2e-05, "loss": 0.512, "loss/crossentropy": 2.3532203435897827, "loss/hidden": 0.2060546875, "loss/logits": 0.04425806552171707, "loss/reg": 0.026168543845415115, "step": 273 }, { "epoch": 0.137, "grad_norm": 1.4646930694580078, "grad_norm_var": 0.06721621691666481, "learning_rate": 2e-05, "loss": 0.5178, "loss/crossentropy": 2.343596935272217, "loss/hidden": 0.22119140625, "loss/logits": 0.034918731078505516, "loss/reg": 0.026166997849941254, "step": 274 }, { "epoch": 0.1375, "grad_norm": 1.0874199867248535, "grad_norm_var": 0.07115961720678651, "learning_rate": 2e-05, "loss": 0.4609, "loss/crossentropy": 2.172752797603607, "loss/hidden": 0.1748046875, "loss/logits": 0.02447379007935524, "loss/reg": 0.026165394112467766, "step": 275 }, { "epoch": 0.138, "grad_norm": 1.1732720136642456, "grad_norm_var": 0.07036869627631123, "learning_rate": 2e-05, "loss": 0.4846, "loss/crossentropy": 2.4434475898742676, "loss/hidden": 0.1943359375, "loss/logits": 0.02860554587095976, "loss/reg": 0.026163768023252487, "step": 276 }, { "epoch": 0.1385, "grad_norm": 1.5107827186584473, "grad_norm_var": 0.07276105744027898, "learning_rate": 2e-05, "loss": 0.5892, "loss/crossentropy": 2.5824127197265625, "loss/hidden": 0.287109375, "loss/logits": 0.04050498828291893, "loss/reg": 0.026162203401327133, "step": 277 }, { "epoch": 0.139, "grad_norm": 1.420068621635437, "grad_norm_var": 0.07342361868488892, "learning_rate": 2e-05, "loss": 0.5488, "loss/crossentropy": 2.234652519226074, "loss/hidden": 0.2470703125, "loss/logits": 0.04010407812893391, "loss/reg": 0.026160722598433495, "step": 278 }, { "epoch": 0.1395, "grad_norm": 0.9972831010818481, "grad_norm_var": 0.0796539769611795, "learning_rate": 2e-05, "loss": 0.4622, "loss/crossentropy": 2.396607279777527, "loss/hidden": 0.17041015625, "loss/logits": 0.030245795845985413, "loss/reg": 0.026159239932894707, "step": 279 }, { "epoch": 0.14, "grad_norm": 2.338772773742676, "grad_norm_var": 0.1232493317334492, "learning_rate": 2e-05, "loss": 0.5912, "loss/crossentropy": 2.0176676511764526, "loss/hidden": 0.27783203125, "loss/logits": 0.05181618873029947, "loss/reg": 0.026157628744840622, "step": 280 }, { "epoch": 0.1405, "grad_norm": 1.2386250495910645, "grad_norm_var": 0.11601927811151326, "learning_rate": 2e-05, "loss": 0.454, "loss/crossentropy": 2.2258787155151367, "loss/hidden": 0.16943359375, "loss/logits": 0.02302493341267109, "loss/reg": 0.02615603432059288, "step": 281 }, { "epoch": 0.141, "grad_norm": 3.4386959075927734, "grad_norm_var": 0.37798476794662456, "learning_rate": 2e-05, "loss": 0.6987, "loss/crossentropy": 2.291516423225403, "loss/hidden": 0.39892578125, "loss/logits": 0.038274774327874184, "loss/reg": 0.026154499500989914, "step": 282 }, { "epoch": 0.1415, "grad_norm": 2.358877658843994, "grad_norm_var": 0.4193280072280107, "learning_rate": 2e-05, "loss": 0.5369, "loss/crossentropy": 2.0343876481056213, "loss/hidden": 0.2392578125, "loss/logits": 0.036153580993413925, "loss/reg": 0.026152830570936203, "step": 283 }, { "epoch": 0.142, "grad_norm": 1.7734060287475586, "grad_norm_var": 0.42197319133869365, "learning_rate": 2e-05, "loss": 0.5995, "loss/crossentropy": 2.410479426383972, "loss/hidden": 0.28271484375, "loss/logits": 0.055306099355220795, "loss/reg": 0.026151135563850403, "step": 284 }, { "epoch": 0.1425, "grad_norm": 1.7683607339859009, "grad_norm_var": 0.4129653376453054, "learning_rate": 2e-05, "loss": 0.4774, "loss/crossentropy": 2.422680377960205, "loss/hidden": 0.17138671875, "loss/logits": 0.04454575851559639, "loss/reg": 0.026149341836571693, "step": 285 }, { "epoch": 0.143, "grad_norm": 1.890203833580017, "grad_norm_var": 0.3920434322764975, "learning_rate": 2e-05, "loss": 0.6648, "loss/crossentropy": 2.3643598556518555, "loss/hidden": 0.3388671875, "loss/logits": 0.06448590569198132, "loss/reg": 0.02614753320813179, "step": 286 }, { "epoch": 0.1435, "grad_norm": 1.29023015499115, "grad_norm_var": 0.39001840335736465, "learning_rate": 2e-05, "loss": 0.4522, "loss/crossentropy": 2.5188199281692505, "loss/hidden": 0.16748046875, "loss/logits": 0.02329123578965664, "loss/reg": 0.02614591456949711, "step": 287 }, { "epoch": 0.144, "grad_norm": 10.653407096862793, "grad_norm_var": 5.386538257885738, "learning_rate": 2e-05, "loss": 0.5673, "loss/crossentropy": 2.5609625577926636, "loss/hidden": 0.27880859375, "loss/logits": 0.02702578529715538, "loss/reg": 0.026144322007894516, "step": 288 }, { "epoch": 0.1445, "grad_norm": 1.2127407789230347, "grad_norm_var": 5.43876626293414, "learning_rate": 2e-05, "loss": 0.5081, "loss/crossentropy": 2.4100780487060547, "loss/hidden": 0.21044921875, "loss/logits": 0.03620042186230421, "loss/reg": 0.02614261396229267, "step": 289 }, { "epoch": 0.145, "grad_norm": 1.4402183294296265, "grad_norm_var": 5.4412882443615285, "learning_rate": 2e-05, "loss": 0.4768, "loss/crossentropy": 2.271009087562561, "loss/hidden": 0.189453125, "loss/logits": 0.02590431459248066, "loss/reg": 0.026140958070755005, "step": 290 }, { "epoch": 0.1455, "grad_norm": 1.5095008611679077, "grad_norm_var": 5.388429514304694, "learning_rate": 2e-05, "loss": 0.5205, "loss/crossentropy": 2.3384816646575928, "loss/hidden": 0.22265625, "loss/logits": 0.036461083218455315, "loss/reg": 0.026139242574572563, "step": 291 }, { "epoch": 0.146, "grad_norm": 1.1620471477508545, "grad_norm_var": 5.390050224047064, "learning_rate": 2e-05, "loss": 0.4969, "loss/crossentropy": 2.433851480484009, "loss/hidden": 0.20068359375, "loss/logits": 0.0348251610994339, "loss/reg": 0.026137609034776688, "step": 292 }, { "epoch": 0.1465, "grad_norm": 1.4650121927261353, "grad_norm_var": 5.394693634841302, "learning_rate": 2e-05, "loss": 0.4378, "loss/crossentropy": 2.5522985458374023, "loss/hidden": 0.154296875, "loss/logits": 0.022188137285411358, "loss/reg": 0.02613597922027111, "step": 293 }, { "epoch": 0.147, "grad_norm": 1.9892516136169434, "grad_norm_var": 5.352159159580765, "learning_rate": 2e-05, "loss": 0.5504, "loss/crossentropy": 2.1993319392204285, "loss/hidden": 0.24267578125, "loss/logits": 0.04638373479247093, "loss/reg": 0.026134170591831207, "step": 294 }, { "epoch": 0.1475, "grad_norm": 1.465166687965393, "grad_norm_var": 5.285637901292613, "learning_rate": 2e-05, "loss": 0.494, "loss/crossentropy": 2.223472237586975, "loss/hidden": 0.19287109375, "loss/logits": 0.03982667811214924, "loss/reg": 0.026132365688681602, "step": 295 }, { "epoch": 0.148, "grad_norm": 2.5565261840820312, "grad_norm_var": 5.2893741834582775, "learning_rate": 2e-05, "loss": 0.5916, "loss/crossentropy": 2.2553144693374634, "loss/hidden": 0.27392578125, "loss/logits": 0.056398073211312294, "loss/reg": 0.026130499318242073, "step": 296 }, { "epoch": 0.1485, "grad_norm": 1.2621214389801025, "grad_norm_var": 5.286002834073586, "learning_rate": 2e-05, "loss": 0.4855, "loss/crossentropy": 2.241385817527771, "loss/hidden": 0.1953125, "loss/logits": 0.028942352160811424, "loss/reg": 0.026128675788640976, "step": 297 }, { "epoch": 0.149, "grad_norm": 1.841597080230713, "grad_norm_var": 5.2087414924686675, "learning_rate": 2e-05, "loss": 0.5784, "loss/crossentropy": 2.296829104423523, "loss/hidden": 0.2802734375, "loss/logits": 0.03681251127272844, "loss/reg": 0.026126863434910774, "step": 298 }, { "epoch": 0.1495, "grad_norm": 1.258812427520752, "grad_norm_var": 5.265093383729075, "learning_rate": 2e-05, "loss": 0.492, "loss/crossentropy": 2.4392940998077393, "loss/hidden": 0.20166015625, "loss/logits": 0.0290931249037385, "loss/reg": 0.026125235483050346, "step": 299 }, { "epoch": 0.15, "grad_norm": 1.3167269229888916, "grad_norm_var": 5.3015866088773915, "learning_rate": 2e-05, "loss": 0.4889, "loss/crossentropy": 2.401396870613098, "loss/hidden": 0.19775390625, "loss/logits": 0.029924746602773666, "loss/reg": 0.02612358331680298, "step": 300 }, { "epoch": 0.1505, "grad_norm": 1.6229465007781982, "grad_norm_var": 5.309922187137865, "learning_rate": 2e-05, "loss": 0.5287, "loss/crossentropy": 2.36386775970459, "loss/hidden": 0.2255859375, "loss/logits": 0.04194306582212448, "loss/reg": 0.02612200565636158, "step": 301 }, { "epoch": 0.151, "grad_norm": 1.777891755104065, "grad_norm_var": 5.31416719857012, "learning_rate": 2e-05, "loss": 0.5103, "loss/crossentropy": 2.4092063903808594, "loss/hidden": 0.2138671875, "loss/logits": 0.03518137149512768, "loss/reg": 0.026120424270629883, "step": 302 }, { "epoch": 0.1515, "grad_norm": 1.1520640850067139, "grad_norm_var": 5.330536147630553, "learning_rate": 2e-05, "loss": 0.5057, "loss/crossentropy": 2.2741400003433228, "loss/hidden": 0.21142578125, "loss/logits": 0.03307824395596981, "loss/reg": 0.0261188056319952, "step": 303 }, { "epoch": 0.152, "grad_norm": 1.2208960056304932, "grad_norm_var": 0.1407175104424084, "learning_rate": 2e-05, "loss": 0.4876, "loss/crossentropy": 2.202619433403015, "loss/hidden": 0.19140625, "loss/logits": 0.03499259799718857, "loss/reg": 0.026117179542779922, "step": 304 }, { "epoch": 0.1525, "grad_norm": 1.2294107675552368, "grad_norm_var": 0.14006117928402068, "learning_rate": 2e-05, "loss": 0.4935, "loss/crossentropy": 2.3829336166381836, "loss/hidden": 0.1982421875, "loss/logits": 0.03412310779094696, "loss/reg": 0.026115482673048973, "step": 305 }, { "epoch": 0.153, "grad_norm": 1.2149832248687744, "grad_norm_var": 0.1455343172725079, "learning_rate": 2e-05, "loss": 0.4618, "loss/crossentropy": 2.3216532468795776, "loss/hidden": 0.17529296875, "loss/logits": 0.025372054427862167, "loss/reg": 0.02611370198428631, "step": 306 }, { "epoch": 0.1535, "grad_norm": 1.4385122060775757, "grad_norm_var": 0.14578594604365136, "learning_rate": 2e-05, "loss": 0.51, "loss/crossentropy": 2.449226975440979, "loss/hidden": 0.20654296875, "loss/logits": 0.04237618204206228, "loss/reg": 0.026111874729394913, "step": 307 }, { "epoch": 0.154, "grad_norm": 1.118850588798523, "grad_norm_var": 0.14783964943001873, "learning_rate": 2e-05, "loss": 0.4752, "loss/crossentropy": 2.3721216917037964, "loss/hidden": 0.1865234375, "loss/logits": 0.027595724910497665, "loss/reg": 0.0261102132499218, "step": 308 }, { "epoch": 0.1545, "grad_norm": 1.3892052173614502, "grad_norm_var": 0.14850872616204683, "learning_rate": 2e-05, "loss": 0.4986, "loss/crossentropy": 2.3415383100509644, "loss/hidden": 0.205078125, "loss/logits": 0.03244547359645367, "loss/reg": 0.02610846608877182, "step": 309 }, { "epoch": 0.155, "grad_norm": 1.1625828742980957, "grad_norm_var": 0.13629436785804921, "learning_rate": 2e-05, "loss": 0.4995, "loss/crossentropy": 2.3235228061676025, "loss/hidden": 0.2099609375, "loss/logits": 0.028443023562431335, "loss/reg": 0.026106812059879303, "step": 310 }, { "epoch": 0.1555, "grad_norm": 1.27708899974823, "grad_norm_var": 0.1378557412128671, "learning_rate": 2e-05, "loss": 0.517, "loss/crossentropy": 2.4090656042099, "loss/hidden": 0.220703125, "loss/logits": 0.035252392292022705, "loss/reg": 0.026105303317308426, "step": 311 }, { "epoch": 0.156, "grad_norm": 1.1573866605758667, "grad_norm_var": 0.049585704844170665, "learning_rate": 2e-05, "loss": 0.509, "loss/crossentropy": 2.1680856943130493, "loss/hidden": 0.2158203125, "loss/logits": 0.03210577368736267, "loss/reg": 0.026103774085640907, "step": 312 }, { "epoch": 0.1565, "grad_norm": 1.265214443206787, "grad_norm_var": 0.04955415784550207, "learning_rate": 2e-05, "loss": 0.4997, "loss/crossentropy": 2.3472299575805664, "loss/hidden": 0.20458984375, "loss/logits": 0.03409944660961628, "loss/reg": 0.026102419942617416, "step": 313 }, { "epoch": 0.157, "grad_norm": 1.9698238372802734, "grad_norm_var": 0.05915308914134864, "learning_rate": 2e-05, "loss": 0.5882, "loss/crossentropy": 2.3045787811279297, "loss/hidden": 0.27392578125, "loss/logits": 0.05324920453131199, "loss/reg": 0.02610074356198311, "step": 314 }, { "epoch": 0.1575, "grad_norm": 1.3615264892578125, "grad_norm_var": 0.058587269718664695, "learning_rate": 2e-05, "loss": 0.5097, "loss/crossentropy": 2.010044515132904, "loss/hidden": 0.2138671875, "loss/logits": 0.0348639115691185, "loss/reg": 0.026099352166056633, "step": 315 }, { "epoch": 0.158, "grad_norm": 1.450539231300354, "grad_norm_var": 0.05902897578693942, "learning_rate": 2e-05, "loss": 0.5259, "loss/crossentropy": 2.258197784423828, "loss/hidden": 0.22412109375, "loss/logits": 0.040814803913235664, "loss/reg": 0.026097897440195084, "step": 316 }, { "epoch": 0.1585, "grad_norm": 1.2342588901519775, "grad_norm_var": 0.055002612504784484, "learning_rate": 2e-05, "loss": 0.5114, "loss/crossentropy": 2.450056791305542, "loss/hidden": 0.212890625, "loss/logits": 0.03752759099006653, "loss/reg": 0.026096461340785027, "step": 317 }, { "epoch": 0.159, "grad_norm": 1.5264660120010376, "grad_norm_var": 0.04423249803009378, "learning_rate": 2e-05, "loss": 0.5069, "loss/crossentropy": 2.3556759357452393, "loss/hidden": 0.212890625, "loss/logits": 0.03307069279253483, "loss/reg": 0.026094747707247734, "step": 318 }, { "epoch": 0.1595, "grad_norm": 1.394983172416687, "grad_norm_var": 0.04238248493052742, "learning_rate": 2e-05, "loss": 0.4826, "loss/crossentropy": 2.3402878046035767, "loss/hidden": 0.19384765625, "loss/logits": 0.027848311699926853, "loss/reg": 0.026093317195773125, "step": 319 }, { "epoch": 0.16, "grad_norm": 1.3035892248153687, "grad_norm_var": 0.04151614019492621, "learning_rate": 2e-05, "loss": 0.5366, "loss/crossentropy": 2.4592941999435425, "loss/hidden": 0.2333984375, "loss/logits": 0.04223489202558994, "loss/reg": 0.026091884821653366, "step": 320 }, { "epoch": 0.1605, "grad_norm": 1.8944873809814453, "grad_norm_var": 0.05905324081961657, "learning_rate": 2e-05, "loss": 0.5082, "loss/crossentropy": 2.2413108348846436, "loss/hidden": 0.21142578125, "loss/logits": 0.03591745160520077, "loss/reg": 0.026090335100889206, "step": 321 }, { "epoch": 0.161, "grad_norm": 2.45639705657959, "grad_norm_var": 0.12723620805793795, "learning_rate": 2e-05, "loss": 0.6455, "loss/crossentropy": 1.9915293455123901, "loss/hidden": 0.3408203125, "loss/logits": 0.043817924335598946, "loss/reg": 0.02608887292444706, "step": 322 }, { "epoch": 0.1615, "grad_norm": 1.7480417490005493, "grad_norm_var": 0.13223189773490632, "learning_rate": 2e-05, "loss": 0.5439, "loss/crossentropy": 2.4047733545303345, "loss/hidden": 0.22265625, "loss/logits": 0.06036931276321411, "loss/reg": 0.026087457314133644, "step": 323 }, { "epoch": 0.162, "grad_norm": 1.853732705116272, "grad_norm_var": 0.1304117384352215, "learning_rate": 2e-05, "loss": 0.4878, "loss/crossentropy": 2.5980257987976074, "loss/hidden": 0.189453125, "loss/logits": 0.037442656233906746, "loss/reg": 0.02608575113117695, "step": 324 }, { "epoch": 0.1625, "grad_norm": 1.894579291343689, "grad_norm_var": 0.13703325521176069, "learning_rate": 2e-05, "loss": 0.6235, "loss/crossentropy": 2.3670873641967773, "loss/hidden": 0.2626953125, "loss/logits": 0.09991350024938583, "loss/reg": 0.02608424238860607, "step": 325 }, { "epoch": 0.163, "grad_norm": 1.3630772829055786, "grad_norm_var": 0.1289371841207372, "learning_rate": 2e-05, "loss": 0.5014, "loss/crossentropy": 2.1478903889656067, "loss/hidden": 0.2099609375, "loss/logits": 0.030608470551669598, "loss/reg": 0.026082569733262062, "step": 326 }, { "epoch": 0.1635, "grad_norm": 1.2252777814865112, "grad_norm_var": 0.13114190368244535, "learning_rate": 2e-05, "loss": 0.5137, "loss/crossentropy": 2.228654980659485, "loss/hidden": 0.216796875, "loss/logits": 0.036048877984285355, "loss/reg": 0.026081033051013947, "step": 327 }, { "epoch": 0.164, "grad_norm": 1.1830672025680542, "grad_norm_var": 0.12977471644483457, "learning_rate": 2e-05, "loss": 0.4567, "loss/crossentropy": 2.5576133728027344, "loss/hidden": 0.16796875, "loss/logits": 0.02798423543572426, "loss/reg": 0.026079514995217323, "step": 328 }, { "epoch": 0.1645, "grad_norm": 1.9584394693374634, "grad_norm_var": 0.13160920382139138, "learning_rate": 2e-05, "loss": 0.5043, "loss/crossentropy": 2.321745753288269, "loss/hidden": 0.2109375, "loss/logits": 0.032608283683657646, "loss/reg": 0.02607780508697033, "step": 329 }, { "epoch": 0.165, "grad_norm": 2.176175355911255, "grad_norm_var": 0.14407030947683092, "learning_rate": 2e-05, "loss": 0.5287, "loss/crossentropy": 2.5101382732391357, "loss/hidden": 0.23388671875, "loss/logits": 0.03408687189221382, "loss/reg": 0.026076283305883408, "step": 330 }, { "epoch": 0.1655, "grad_norm": 1.3150840997695923, "grad_norm_var": 0.14584616287976904, "learning_rate": 2e-05, "loss": 0.5012, "loss/crossentropy": 2.4776118993759155, "loss/hidden": 0.20751953125, "loss/logits": 0.032900793477892876, "loss/reg": 0.026074586436152458, "step": 331 }, { "epoch": 0.166, "grad_norm": 1.6297320127487183, "grad_norm_var": 0.14371731927044115, "learning_rate": 2e-05, "loss": 0.5161, "loss/crossentropy": 2.4321337938308716, "loss/hidden": 0.22119140625, "loss/logits": 0.03421984426677227, "loss/reg": 0.02607305720448494, "step": 332 }, { "epoch": 0.1665, "grad_norm": 1.3825711011886597, "grad_norm_var": 0.13717068867274657, "learning_rate": 2e-05, "loss": 0.4764, "loss/crossentropy": 2.212525486946106, "loss/hidden": 0.18701171875, "loss/logits": 0.028680726885795593, "loss/reg": 0.026071617379784584, "step": 333 }, { "epoch": 0.167, "grad_norm": 1.1411490440368652, "grad_norm_var": 0.15249385172816404, "learning_rate": 2e-05, "loss": 0.4697, "loss/crossentropy": 2.309118866920471, "loss/hidden": 0.1806640625, "loss/logits": 0.028305926360189915, "loss/reg": 0.026069944724440575, "step": 334 }, { "epoch": 0.1675, "grad_norm": 1.5472488403320312, "grad_norm_var": 0.14937410499563786, "learning_rate": 2e-05, "loss": 0.5375, "loss/crossentropy": 2.2855429649353027, "loss/hidden": 0.24072265625, "loss/logits": 0.03612595796585083, "loss/reg": 0.026068488135933876, "step": 335 }, { "epoch": 0.168, "grad_norm": 5.691341400146484, "grad_norm_var": 1.161954663002865, "learning_rate": 2e-05, "loss": 0.5703, "loss/crossentropy": 2.4927643537521362, "loss/hidden": 0.26953125, "loss/logits": 0.04008688498288393, "loss/reg": 0.026066886261105537, "step": 336 }, { "epoch": 0.1685, "grad_norm": 2.456817388534546, "grad_norm_var": 1.1810217336131037, "learning_rate": 2e-05, "loss": 0.7493, "loss/crossentropy": 2.5297993421554565, "loss/hidden": 0.37744140625, "loss/logits": 0.11121575441211462, "loss/reg": 0.026065362617373466, "step": 337 }, { "epoch": 0.169, "grad_norm": 1.8542064428329468, "grad_norm_var": 1.1621370785149523, "learning_rate": 2e-05, "loss": 0.5021, "loss/crossentropy": 2.4378503561019897, "loss/hidden": 0.2021484375, "loss/logits": 0.03930371440947056, "loss/reg": 0.026063820347189903, "step": 338 }, { "epoch": 0.1695, "grad_norm": 1.8168195486068726, "grad_norm_var": 1.1610274406018892, "learning_rate": 2e-05, "loss": 0.4994, "loss/crossentropy": 2.1696581840515137, "loss/hidden": 0.201171875, "loss/logits": 0.03763199597597122, "loss/reg": 0.02606227435171604, "step": 339 }, { "epoch": 0.17, "grad_norm": 1.1088804006576538, "grad_norm_var": 1.20085213579918, "learning_rate": 2e-05, "loss": 0.4517, "loss/crossentropy": 2.512749671936035, "loss/hidden": 0.16748046875, "loss/logits": 0.02365376614034176, "loss/reg": 0.026060722768306732, "step": 340 }, { "epoch": 0.1705, "grad_norm": 1.490470051765442, "grad_norm_var": 1.2091431686160143, "learning_rate": 2e-05, "loss": 0.4908, "loss/crossentropy": 2.487561345100403, "loss/hidden": 0.2001953125, "loss/logits": 0.030045345425605774, "loss/reg": 0.026059186086058617, "step": 341 }, { "epoch": 0.171, "grad_norm": 1.0408867597579956, "grad_norm_var": 1.2358515542870572, "learning_rate": 2e-05, "loss": 0.4727, "loss/crossentropy": 2.2461780309677124, "loss/hidden": 0.1865234375, "loss/logits": 0.025552313774824142, "loss/reg": 0.02605745941400528, "step": 342 }, { "epoch": 0.1715, "grad_norm": 1.5709069967269897, "grad_norm_var": 1.2162039640705784, "learning_rate": 2e-05, "loss": 0.4876, "loss/crossentropy": 2.2603927850723267, "loss/hidden": 0.19775390625, "loss/logits": 0.02933008223772049, "loss/reg": 0.026055721566081047, "step": 343 }, { "epoch": 0.172, "grad_norm": 1.2913998365402222, "grad_norm_var": 1.2075172882359821, "learning_rate": 2e-05, "loss": 0.5198, "loss/crossentropy": 2.196273446083069, "loss/hidden": 0.2216796875, "loss/logits": 0.03755245357751846, "loss/reg": 0.026053981855511665, "step": 344 }, { "epoch": 0.1725, "grad_norm": 1.2019914388656616, "grad_norm_var": 1.2315373969600656, "learning_rate": 2e-05, "loss": 0.4743, "loss/crossentropy": 2.2306121587753296, "loss/hidden": 0.18994140625, "loss/logits": 0.023790341801941395, "loss/reg": 0.02605227194726467, "step": 345 }, { "epoch": 0.173, "grad_norm": 1.3895491361618042, "grad_norm_var": 1.2302038798211163, "learning_rate": 2e-05, "loss": 0.4963, "loss/crossentropy": 2.4988722801208496, "loss/hidden": 0.2080078125, "loss/logits": 0.02780199982225895, "loss/reg": 0.02605038322508335, "step": 346 }, { "epoch": 0.1735, "grad_norm": 1.50831937789917, "grad_norm_var": 1.2214463856538722, "learning_rate": 2e-05, "loss": 0.4932, "loss/crossentropy": 2.409302234649658, "loss/hidden": 0.19921875, "loss/logits": 0.03346476424485445, "loss/reg": 0.02604857087135315, "step": 347 }, { "epoch": 0.174, "grad_norm": 1.7516964673995972, "grad_norm_var": 1.220296012686515, "learning_rate": 2e-05, "loss": 0.5642, "loss/crossentropy": 2.2191531658172607, "loss/hidden": 0.25927734375, "loss/logits": 0.04449588805437088, "loss/reg": 0.02604682371020317, "step": 348 }, { "epoch": 0.1745, "grad_norm": 2.329723358154297, "grad_norm_var": 1.228035235727617, "learning_rate": 2e-05, "loss": 0.5682, "loss/crossentropy": 2.1749590635299683, "loss/hidden": 0.2646484375, "loss/logits": 0.04313355404883623, "loss/reg": 0.026045063510537148, "step": 349 }, { "epoch": 0.175, "grad_norm": 1.3271498680114746, "grad_norm_var": 1.2132512460490317, "learning_rate": 2e-05, "loss": 0.45, "loss/crossentropy": 2.588584542274475, "loss/hidden": 0.16650390625, "loss/logits": 0.02306409552693367, "loss/reg": 0.026043301448225975, "step": 350 }, { "epoch": 0.1755, "grad_norm": 1.875108003616333, "grad_norm_var": 1.2073429995003617, "learning_rate": 2e-05, "loss": 0.5202, "loss/crossentropy": 2.287582039833069, "loss/hidden": 0.20703125, "loss/logits": 0.052729660645127296, "loss/reg": 0.026041487231850624, "step": 351 }, { "epoch": 0.176, "grad_norm": 1.146688461303711, "grad_norm_var": 0.1745214276224876, "learning_rate": 2e-05, "loss": 0.4424, "loss/crossentropy": 2.3722145557403564, "loss/hidden": 0.159912109375, "loss/logits": 0.022133183665573597, "loss/reg": 0.026039764285087585, "step": 352 }, { "epoch": 0.1765, "grad_norm": 1.562357783317566, "grad_norm_var": 0.11906480060907014, "learning_rate": 2e-05, "loss": 0.5252, "loss/crossentropy": 2.2052918672561646, "loss/hidden": 0.2275390625, "loss/logits": 0.0372452475130558, "loss/reg": 0.02603817544877529, "step": 353 }, { "epoch": 0.177, "grad_norm": 1.3673954010009766, "grad_norm_var": 0.11196520379043946, "learning_rate": 2e-05, "loss": 0.462, "loss/crossentropy": 2.3004499673843384, "loss/hidden": 0.17578125, "loss/logits": 0.025897801853716373, "loss/reg": 0.026036500930786133, "step": 354 }, { "epoch": 0.1775, "grad_norm": 1.2918845415115356, "grad_norm_var": 0.10604762311465758, "learning_rate": 2e-05, "loss": 0.4731, "loss/crossentropy": 2.265425443649292, "loss/hidden": 0.18701171875, "loss/logits": 0.025708286091685295, "loss/reg": 0.026034945622086525, "step": 355 }, { "epoch": 0.178, "grad_norm": 7.662310600280762, "grad_norm_var": 2.4892246344001143, "learning_rate": 2e-05, "loss": 0.5369, "loss/crossentropy": 2.398472547531128, "loss/hidden": 0.2412109375, "loss/logits": 0.0353584922850132, "loss/reg": 0.026033204048871994, "step": 356 }, { "epoch": 0.1785, "grad_norm": 1.422759771347046, "grad_norm_var": 2.492874299968556, "learning_rate": 2e-05, "loss": 0.5149, "loss/crossentropy": 2.226934790611267, "loss/hidden": 0.220703125, "loss/logits": 0.03390590753406286, "loss/reg": 0.026031551882624626, "step": 357 }, { "epoch": 0.179, "grad_norm": 1.271759271621704, "grad_norm_var": 2.471029150963487, "learning_rate": 2e-05, "loss": 0.5199, "loss/crossentropy": 2.3659080266952515, "loss/hidden": 0.21630859375, "loss/logits": 0.04329786077141762, "loss/reg": 0.02602977305650711, "step": 358 }, { "epoch": 0.1795, "grad_norm": 1.2337300777435303, "grad_norm_var": 2.491724270181853, "learning_rate": 2e-05, "loss": 0.5058, "loss/crossentropy": 2.3398635387420654, "loss/hidden": 0.20849609375, "loss/logits": 0.0370652936398983, "loss/reg": 0.026028025895357132, "step": 359 }, { "epoch": 0.18, "grad_norm": 1.1331290006637573, "grad_norm_var": 2.505122499555146, "learning_rate": 2e-05, "loss": 0.4673, "loss/crossentropy": 2.4402376413345337, "loss/hidden": 0.17919921875, "loss/logits": 0.027860145084559917, "loss/reg": 0.026026224717497826, "step": 360 }, { "epoch": 0.1805, "grad_norm": 1.8800278902053833, "grad_norm_var": 2.475975881369847, "learning_rate": 2e-05, "loss": 0.5447, "loss/crossentropy": 2.1927571296691895, "loss/hidden": 0.24658203125, "loss/logits": 0.037921242415905, "loss/reg": 0.02602434903383255, "step": 361 }, { "epoch": 0.181, "grad_norm": 1.1613508462905884, "grad_norm_var": 2.4942931489268525, "learning_rate": 2e-05, "loss": 0.4629, "loss/crossentropy": 2.3627922534942627, "loss/hidden": 0.173828125, "loss/logits": 0.02883315272629261, "loss/reg": 0.026022551581263542, "step": 362 }, { "epoch": 0.1815, "grad_norm": 1.2477275133132935, "grad_norm_var": 2.5111159165951857, "learning_rate": 2e-05, "loss": 0.5399, "loss/crossentropy": 2.3385051488876343, "loss/hidden": 0.23779296875, "loss/logits": 0.041944630444049835, "loss/reg": 0.026020534336566925, "step": 363 }, { "epoch": 0.182, "grad_norm": 1.0904345512390137, "grad_norm_var": 2.547469450312644, "learning_rate": 2e-05, "loss": 0.4478, "loss/crossentropy": 2.353084683418274, "loss/hidden": 0.16259765625, "loss/logits": 0.02504115179181099, "loss/reg": 0.0260187778621912, "step": 364 }, { "epoch": 0.1825, "grad_norm": 1.6713463068008423, "grad_norm_var": 2.5291763950799013, "learning_rate": 2e-05, "loss": 0.4939, "loss/crossentropy": 2.2228282690048218, "loss/hidden": 0.2099609375, "loss/logits": 0.023817350156605244, "loss/reg": 0.026017041876912117, "step": 365 }, { "epoch": 0.183, "grad_norm": 1.2542800903320312, "grad_norm_var": 2.5338262674117384, "learning_rate": 2e-05, "loss": 0.5238, "loss/crossentropy": 2.228682041168213, "loss/hidden": 0.2275390625, "loss/logits": 0.03613369073718786, "loss/reg": 0.026015128940343857, "step": 366 }, { "epoch": 0.1835, "grad_norm": 1.2646586894989014, "grad_norm_var": 2.548319005158211, "learning_rate": 2e-05, "loss": 0.4777, "loss/crossentropy": 2.3304578065872192, "loss/hidden": 0.1875, "loss/logits": 0.030102724209427834, "loss/reg": 0.026013074442744255, "step": 367 }, { "epoch": 0.184, "grad_norm": 1.0247364044189453, "grad_norm_var": 2.5587148751606645, "learning_rate": 2e-05, "loss": 0.4394, "loss/crossentropy": 2.5192004442214966, "loss/hidden": 0.1572265625, "loss/logits": 0.022060595452785492, "loss/reg": 0.026011094450950623, "step": 368 }, { "epoch": 0.1845, "grad_norm": 1.4839156866073608, "grad_norm_var": 2.5607612202403485, "learning_rate": 2e-05, "loss": 0.5208, "loss/crossentropy": 2.1315367221832275, "loss/hidden": 0.22705078125, "loss/logits": 0.03365152329206467, "loss/reg": 0.02600909397006035, "step": 369 }, { "epoch": 0.185, "grad_norm": 1.2327549457550049, "grad_norm_var": 2.5681585055774634, "learning_rate": 2e-05, "loss": 0.441, "loss/crossentropy": 2.418115019798279, "loss/hidden": 0.158203125, "loss/logits": 0.022698544897139072, "loss/reg": 0.026007305830717087, "step": 370 }, { "epoch": 0.1855, "grad_norm": 1.2444417476654053, "grad_norm_var": 2.5709309337522748, "learning_rate": 2e-05, "loss": 0.467, "loss/crossentropy": 2.2915507555007935, "loss/hidden": 0.1806640625, "loss/logits": 0.026262402534484863, "loss/reg": 0.02600536122918129, "step": 371 }, { "epoch": 0.186, "grad_norm": 1.2689179182052612, "grad_norm_var": 0.0472904244491535, "learning_rate": 2e-05, "loss": 0.4999, "loss/crossentropy": 2.518853783607483, "loss/hidden": 0.21435546875, "loss/logits": 0.025527067482471466, "loss/reg": 0.026003584265708923, "step": 372 }, { "epoch": 0.1865, "grad_norm": 1.4123287200927734, "grad_norm_var": 0.047133962787962426, "learning_rate": 2e-05, "loss": 0.4964, "loss/crossentropy": 2.3583970069885254, "loss/hidden": 0.2080078125, "loss/logits": 0.028406362980604172, "loss/reg": 0.02600177377462387, "step": 373 }, { "epoch": 0.187, "grad_norm": 1.3444428443908691, "grad_norm_var": 0.04714470510582007, "learning_rate": 2e-05, "loss": 0.4758, "loss/crossentropy": 2.250472664833069, "loss/hidden": 0.1875, "loss/logits": 0.02829747088253498, "loss/reg": 0.02599998004734516, "step": 374 }, { "epoch": 0.1875, "grad_norm": 1.2682015895843506, "grad_norm_var": 0.046871804013897095, "learning_rate": 2e-05, "loss": 0.5168, "loss/crossentropy": 2.1512317657470703, "loss/hidden": 0.2216796875, "loss/logits": 0.03511458821594715, "loss/reg": 0.025998059660196304, "step": 375 }, { "epoch": 0.188, "grad_norm": 1.2203181982040405, "grad_norm_var": 0.04527427140258874, "learning_rate": 2e-05, "loss": 0.4928, "loss/crossentropy": 2.4670302867889404, "loss/hidden": 0.20458984375, "loss/logits": 0.028278429992496967, "loss/reg": 0.025996318086981773, "step": 376 }, { "epoch": 0.1885, "grad_norm": 1.6124721765518188, "grad_norm_var": 0.02965817159038971, "learning_rate": 2e-05, "loss": 0.5482, "loss/crossentropy": 2.2732619047164917, "loss/hidden": 0.24169921875, "loss/logits": 0.046598936431109905, "loss/reg": 0.02599457837641239, "step": 377 }, { "epoch": 0.189, "grad_norm": 1.2982152700424194, "grad_norm_var": 0.0282961065281843, "learning_rate": 2e-05, "loss": 0.4767, "loss/crossentropy": 2.362215518951416, "loss/hidden": 0.189453125, "loss/logits": 0.02727901004254818, "loss/reg": 0.025992868468165398, "step": 378 }, { "epoch": 0.1895, "grad_norm": 1.4476395845413208, "grad_norm_var": 0.02916870288535254, "learning_rate": 2e-05, "loss": 0.5588, "loss/crossentropy": 2.1909669637680054, "loss/hidden": 0.25244140625, "loss/logits": 0.04647276923060417, "loss/reg": 0.02599099464714527, "step": 379 }, { "epoch": 0.19, "grad_norm": 1.3061769008636475, "grad_norm_var": 0.025439804416175528, "learning_rate": 2e-05, "loss": 0.4942, "loss/crossentropy": 2.291175603866577, "loss/hidden": 0.19970703125, "loss/logits": 0.034558966755867004, "loss/reg": 0.02598922699689865, "step": 380 }, { "epoch": 0.1905, "grad_norm": 1.635046362876892, "grad_norm_var": 0.02389268741876922, "learning_rate": 2e-05, "loss": 0.5255, "loss/crossentropy": 2.6519399881362915, "loss/hidden": 0.22705078125, "loss/logits": 0.0385761484503746, "loss/reg": 0.02598743885755539, "step": 381 }, { "epoch": 0.191, "grad_norm": 1.4028866291046143, "grad_norm_var": 0.023724865257600848, "learning_rate": 2e-05, "loss": 0.4627, "loss/crossentropy": 2.4420300722122192, "loss/hidden": 0.17724609375, "loss/logits": 0.025584472343325615, "loss/reg": 0.025985730811953545, "step": 382 }, { "epoch": 0.1915, "grad_norm": 3.43645977973938, "grad_norm_var": 0.29621158197049285, "learning_rate": 2e-05, "loss": 0.509, "loss/crossentropy": 2.3477495908737183, "loss/hidden": 0.216796875, "loss/logits": 0.03231562860310078, "loss/reg": 0.02598407492041588, "step": 383 }, { "epoch": 0.192, "grad_norm": 1.156148076057434, "grad_norm_var": 0.28935891803297004, "learning_rate": 2e-05, "loss": 0.4697, "loss/crossentropy": 2.4156278371810913, "loss/hidden": 0.17822265625, "loss/logits": 0.0316650066524744, "loss/reg": 0.0259822029620409, "step": 384 }, { "epoch": 0.1925, "grad_norm": 1.7708622217178345, "grad_norm_var": 0.2944387889021565, "learning_rate": 2e-05, "loss": 0.5722, "loss/crossentropy": 2.334781527519226, "loss/hidden": 0.2587890625, "loss/logits": 0.05361687205731869, "loss/reg": 0.025980478152632713, "step": 385 }, { "epoch": 0.193, "grad_norm": 2.3118906021118164, "grad_norm_var": 0.3282542563962823, "learning_rate": 2e-05, "loss": 0.5078, "loss/crossentropy": 2.3158434629440308, "loss/hidden": 0.21875, "loss/logits": 0.029261935502290726, "loss/reg": 0.02597857639193535, "step": 386 }, { "epoch": 0.1935, "grad_norm": 2.3003060817718506, "grad_norm_var": 0.3519549073980194, "learning_rate": 2e-05, "loss": 0.5687, "loss/crossentropy": 2.4877541065216064, "loss/hidden": 0.2783203125, "loss/logits": 0.03062661923468113, "loss/reg": 0.02597683109343052, "step": 387 }, { "epoch": 0.194, "grad_norm": 1.1055262088775635, "grad_norm_var": 0.3616427614209148, "learning_rate": 2e-05, "loss": 0.5022, "loss/crossentropy": 2.312312960624695, "loss/hidden": 0.20947265625, "loss/logits": 0.03301689215004444, "loss/reg": 0.025975055992603302, "step": 388 }, { "epoch": 0.1945, "grad_norm": 1.2821520566940308, "grad_norm_var": 0.3664245697624286, "learning_rate": 2e-05, "loss": 0.461, "loss/crossentropy": 2.3373734951019287, "loss/hidden": 0.17431640625, "loss/logits": 0.026956655085086823, "loss/reg": 0.02597302943468094, "step": 389 }, { "epoch": 0.195, "grad_norm": 1.158923625946045, "grad_norm_var": 0.37535894838822137, "learning_rate": 2e-05, "loss": 0.4653, "loss/crossentropy": 2.3373029232025146, "loss/hidden": 0.1787109375, "loss/logits": 0.02689830120652914, "loss/reg": 0.025970980525016785, "step": 390 }, { "epoch": 0.1955, "grad_norm": 4.394406318664551, "grad_norm_var": 0.8449288503424893, "learning_rate": 2e-05, "loss": 0.621, "loss/crossentropy": 2.483940362930298, "loss/hidden": 0.22021484375, "loss/logits": 0.14109261147677898, "loss/reg": 0.025968806818127632, "step": 391 }, { "epoch": 0.196, "grad_norm": 1.3220263719558716, "grad_norm_var": 0.837680848201209, "learning_rate": 2e-05, "loss": 0.571, "loss/crossentropy": 2.3968313932418823, "loss/hidden": 0.26318359375, "loss/logits": 0.04814390931278467, "loss/reg": 0.02596699632704258, "step": 392 }, { "epoch": 0.1965, "grad_norm": 1.476704478263855, "grad_norm_var": 0.8423872820531374, "learning_rate": 2e-05, "loss": 0.4767, "loss/crossentropy": 2.6665027141571045, "loss/hidden": 0.18115234375, "loss/logits": 0.03589140065014362, "loss/reg": 0.025965221226215363, "step": 393 }, { "epoch": 0.197, "grad_norm": 1.3346498012542725, "grad_norm_var": 0.8400309797725388, "learning_rate": 2e-05, "loss": 0.4879, "loss/crossentropy": 2.550223231315613, "loss/hidden": 0.189453125, "loss/logits": 0.03877757303416729, "loss/reg": 0.025963468477129936, "step": 394 }, { "epoch": 0.1975, "grad_norm": 1.381104826927185, "grad_norm_var": 0.8434567338089674, "learning_rate": 2e-05, "loss": 0.4542, "loss/crossentropy": 2.3325024843215942, "loss/hidden": 0.16259765625, "loss/logits": 0.03194664418697357, "loss/reg": 0.02596171200275421, "step": 395 }, { "epoch": 0.198, "grad_norm": 1.517006516456604, "grad_norm_var": 0.8323965808807104, "learning_rate": 2e-05, "loss": 0.5682, "loss/crossentropy": 2.144330859184265, "loss/hidden": 0.2607421875, "loss/logits": 0.04788592271506786, "loss/reg": 0.025959979742765427, "step": 396 }, { "epoch": 0.1985, "grad_norm": 1.0620001554489136, "grad_norm_var": 0.8664126262366226, "learning_rate": 2e-05, "loss": 0.4494, "loss/crossentropy": 2.410404920578003, "loss/hidden": 0.162109375, "loss/logits": 0.027681468054652214, "loss/reg": 0.025957921519875526, "step": 397 }, { "epoch": 0.199, "grad_norm": 1.4343640804290771, "grad_norm_var": 0.8649093715486228, "learning_rate": 2e-05, "loss": 0.5069, "loss/crossentropy": 2.295978307723999, "loss/hidden": 0.21142578125, "loss/logits": 0.035917842760682106, "loss/reg": 0.025956083089113235, "step": 398 }, { "epoch": 0.1995, "grad_norm": 1.325333595275879, "grad_norm_var": 0.676572657470614, "learning_rate": 2e-05, "loss": 0.4739, "loss/crossentropy": 2.3820759057998657, "loss/hidden": 0.18603515625, "loss/logits": 0.028354477137327194, "loss/reg": 0.025954021140933037, "step": 399 }, { "epoch": 0.2, "grad_norm": 2.790135622024536, "grad_norm_var": 0.736756106069653, "learning_rate": 2e-05, "loss": 0.5812, "loss/crossentropy": 2.283258855342865, "loss/hidden": 0.26953125, "loss/logits": 0.05216490104794502, "loss/reg": 0.02595207281410694, "step": 400 }, { "epoch": 0.2005, "grad_norm": 2.0463712215423584, "grad_norm_var": 0.7423414092941923, "learning_rate": 2e-05, "loss": 0.5143, "loss/crossentropy": 2.5675315856933594, "loss/hidden": 0.21728515625, "loss/logits": 0.037514453753829, "loss/reg": 0.025950025767087936, "step": 401 }, { "epoch": 0.201, "grad_norm": 1.8808186054229736, "grad_norm_var": 0.7225325442870276, "learning_rate": 2e-05, "loss": 0.463, "loss/crossentropy": 2.3908499479293823, "loss/hidden": 0.17724609375, "loss/logits": 0.026285232976078987, "loss/reg": 0.02594805508852005, "step": 402 }, { "epoch": 0.2015, "grad_norm": 1.2097140550613403, "grad_norm_var": 0.7151380800453793, "learning_rate": 2e-05, "loss": 0.4623, "loss/crossentropy": 2.3907727003097534, "loss/hidden": 0.1708984375, "loss/logits": 0.031923141330480576, "loss/reg": 0.025945995002985, "step": 403 }, { "epoch": 0.202, "grad_norm": 1.301154613494873, "grad_norm_var": 0.7028043528110918, "learning_rate": 2e-05, "loss": 0.4544, "loss/crossentropy": 2.606261968612671, "loss/hidden": 0.16650390625, "loss/logits": 0.02846657857298851, "loss/reg": 0.02594408206641674, "step": 404 }, { "epoch": 0.2025, "grad_norm": 1.1995950937271118, "grad_norm_var": 0.7076350429627898, "learning_rate": 2e-05, "loss": 0.4391, "loss/crossentropy": 2.3680388927459717, "loss/hidden": 0.15673828125, "loss/logits": 0.022917790338397026, "loss/reg": 0.025941966101527214, "step": 405 }, { "epoch": 0.203, "grad_norm": 8.632776260375977, "grad_norm_var": 3.6823756133748495, "learning_rate": 2e-05, "loss": 1.2499, "loss/crossentropy": 2.4126373529434204, "loss/hidden": 0.72021484375, "loss/logits": 0.270312886685133, "loss/reg": 0.025939757004380226, "step": 406 }, { "epoch": 0.2035, "grad_norm": 1.2286854982376099, "grad_norm_var": 3.358959418903309, "learning_rate": 2e-05, "loss": 0.4674, "loss/crossentropy": 2.290730118751526, "loss/hidden": 0.17822265625, "loss/logits": 0.029834291897714138, "loss/reg": 0.025937531143426895, "step": 407 }, { "epoch": 0.204, "grad_norm": 1.3198645114898682, "grad_norm_var": 3.359139686229141, "learning_rate": 2e-05, "loss": 0.4864, "loss/crossentropy": 2.424551844596863, "loss/hidden": 0.1953125, "loss/logits": 0.03177413158118725, "loss/reg": 0.025935430079698563, "step": 408 }, { "epoch": 0.2045, "grad_norm": 1.1165919303894043, "grad_norm_var": 3.389790819966483, "learning_rate": 2e-05, "loss": 0.5246, "loss/crossentropy": 2.2340330481529236, "loss/hidden": 0.22509765625, "loss/logits": 0.04018213599920273, "loss/reg": 0.02593357115983963, "step": 409 }, { "epoch": 0.205, "grad_norm": 1.4326255321502686, "grad_norm_var": 3.382694967184837, "learning_rate": 2e-05, "loss": 0.4669, "loss/crossentropy": 2.4408915042877197, "loss/hidden": 0.17822265625, "loss/logits": 0.02933754399418831, "loss/reg": 0.0259317085146904, "step": 410 }, { "epoch": 0.2055, "grad_norm": 1.4279175996780396, "grad_norm_var": 3.379406616020368, "learning_rate": 2e-05, "loss": 0.4722, "loss/crossentropy": 2.398142695426941, "loss/hidden": 0.18359375, "loss/logits": 0.02931864559650421, "loss/reg": 0.02592984400689602, "step": 411 }, { "epoch": 0.206, "grad_norm": 3.493486166000366, "grad_norm_var": 3.5139842381908477, "learning_rate": 2e-05, "loss": 0.5326, "loss/crossentropy": 2.3516749143600464, "loss/hidden": 0.17431640625, "loss/logits": 0.0990044642239809, "loss/reg": 0.025927875190973282, "step": 412 }, { "epoch": 0.2065, "grad_norm": 1.1578741073608398, "grad_norm_var": 3.5018478922430516, "learning_rate": 2e-05, "loss": 0.4537, "loss/crossentropy": 2.4094560146331787, "loss/hidden": 0.16845703125, "loss/logits": 0.025964444503188133, "loss/reg": 0.0259258896112442, "step": 413 }, { "epoch": 0.207, "grad_norm": 1.1830717325210571, "grad_norm_var": 3.5268350962116277, "learning_rate": 2e-05, "loss": 0.4649, "loss/crossentropy": 2.5271745920181274, "loss/hidden": 0.1650390625, "loss/logits": 0.04061662219464779, "loss/reg": 0.02592400461435318, "step": 414 }, { "epoch": 0.2075, "grad_norm": 1.8626422882080078, "grad_norm_var": 3.493204661138373, "learning_rate": 2e-05, "loss": 0.5312, "loss/crossentropy": 2.4808900356292725, "loss/hidden": 0.23193359375, "loss/logits": 0.040002613328397274, "loss/reg": 0.025921940803527832, "step": 415 }, { "epoch": 0.208, "grad_norm": 1.3317103385925293, "grad_norm_var": 3.4880922061323294, "learning_rate": 2e-05, "loss": 0.4776, "loss/crossentropy": 2.2604642510414124, "loss/hidden": 0.1884765625, "loss/logits": 0.029966252855956554, "loss/reg": 0.0259199608117342, "step": 416 }, { "epoch": 0.2085, "grad_norm": 1.5000430345535278, "grad_norm_var": 3.502571821664609, "learning_rate": 2e-05, "loss": 0.4815, "loss/crossentropy": 2.813089966773987, "loss/hidden": 0.1875, "loss/logits": 0.0347793884575367, "loss/reg": 0.025918107479810715, "step": 417 }, { "epoch": 0.209, "grad_norm": 2.0012242794036865, "grad_norm_var": 3.502288435747643, "learning_rate": 2e-05, "loss": 0.5129, "loss/crossentropy": 2.4542036056518555, "loss/hidden": 0.21923828125, "loss/logits": 0.034517631866037846, "loss/reg": 0.025916218757629395, "step": 418 }, { "epoch": 0.2095, "grad_norm": 1.627580165863037, "grad_norm_var": 3.4712634219787057, "learning_rate": 2e-05, "loss": 0.5077, "loss/crossentropy": 2.2533979415893555, "loss/hidden": 0.2109375, "loss/logits": 0.0376081969588995, "loss/reg": 0.02591414749622345, "step": 419 }, { "epoch": 0.21, "grad_norm": 1.2047749757766724, "grad_norm_var": 3.480677477073349, "learning_rate": 2e-05, "loss": 0.4928, "loss/crossentropy": 2.3260152339935303, "loss/hidden": 0.19873046875, "loss/logits": 0.03491301275789738, "loss/reg": 0.02591288462281227, "step": 420 }, { "epoch": 0.2105, "grad_norm": 1.4506651163101196, "grad_norm_var": 3.4584077400506277, "learning_rate": 2e-05, "loss": 0.4793, "loss/crossentropy": 2.378847122192383, "loss/hidden": 0.18212890625, "loss/logits": 0.038035670295357704, "loss/reg": 0.02591102570295334, "step": 421 }, { "epoch": 0.211, "grad_norm": 1.5155658721923828, "grad_norm_var": 0.3283885764687509, "learning_rate": 2e-05, "loss": 0.4815, "loss/crossentropy": 2.5136163234710693, "loss/hidden": 0.1904296875, "loss/logits": 0.03193356655538082, "loss/reg": 0.025909241288900375, "step": 422 }, { "epoch": 0.2115, "grad_norm": 1.1012701988220215, "grad_norm_var": 0.3349196404218929, "learning_rate": 2e-05, "loss": 0.4696, "loss/crossentropy": 2.4695777893066406, "loss/hidden": 0.18115234375, "loss/logits": 0.02937779761850834, "loss/reg": 0.025907844305038452, "step": 423 }, { "epoch": 0.212, "grad_norm": 1.069792628288269, "grad_norm_var": 0.3463492066639103, "learning_rate": 2e-05, "loss": 0.4535, "loss/crossentropy": 2.4616788625717163, "loss/hidden": 0.169921875, "loss/logits": 0.024515327997505665, "loss/reg": 0.025905968621373177, "step": 424 }, { "epoch": 0.2125, "grad_norm": 1.3047641515731812, "grad_norm_var": 0.3381949619476723, "learning_rate": 2e-05, "loss": 0.4452, "loss/crossentropy": 2.3210073709487915, "loss/hidden": 0.1640625, "loss/logits": 0.022073786705732346, "loss/reg": 0.025904452428221703, "step": 425 }, { "epoch": 0.213, "grad_norm": 1.2246006727218628, "grad_norm_var": 0.34392116884757395, "learning_rate": 2e-05, "loss": 0.439, "loss/crossentropy": 2.378546953201294, "loss/hidden": 0.1572265625, "loss/logits": 0.022725941613316536, "loss/reg": 0.025902574881911278, "step": 426 }, { "epoch": 0.2135, "grad_norm": 1.0174260139465332, "grad_norm_var": 0.35996108865228293, "learning_rate": 2e-05, "loss": 0.4365, "loss/crossentropy": 2.5196746587753296, "loss/hidden": 0.1552734375, "loss/logits": 0.02219019364565611, "loss/reg": 0.025900712236762047, "step": 427 }, { "epoch": 0.214, "grad_norm": 1.0695523023605347, "grad_norm_var": 0.08383900724757207, "learning_rate": 2e-05, "loss": 0.4605, "loss/crossentropy": 2.321962356567383, "loss/hidden": 0.17578125, "loss/logits": 0.025690771639347076, "loss/reg": 0.025898825377225876, "step": 428 }, { "epoch": 0.2145, "grad_norm": 2.6451644897460938, "grad_norm_var": 0.183711866568535, "learning_rate": 2e-05, "loss": 0.609, "loss/crossentropy": 2.2449337244033813, "loss/hidden": 0.28857421875, "loss/logits": 0.061411263421177864, "loss/reg": 0.02589711733162403, "step": 429 }, { "epoch": 0.215, "grad_norm": 1.713813304901123, "grad_norm_var": 0.1828266836214993, "learning_rate": 2e-05, "loss": 0.5079, "loss/crossentropy": 2.4451547861099243, "loss/hidden": 0.21533203125, "loss/logits": 0.033602748066186905, "loss/reg": 0.025895224884152412, "step": 430 }, { "epoch": 0.2155, "grad_norm": 1.0962016582489014, "grad_norm_var": 0.1801864102764767, "learning_rate": 2e-05, "loss": 0.4605, "loss/crossentropy": 2.357746958732605, "loss/hidden": 0.17578125, "loss/logits": 0.025804596953094006, "loss/reg": 0.025893518701195717, "step": 431 }, { "epoch": 0.216, "grad_norm": 4.617275238037109, "grad_norm_var": 0.8119718727911261, "learning_rate": 2e-05, "loss": 0.8169, "loss/crossentropy": 2.3531649112701416, "loss/hidden": 0.4423828125, "loss/logits": 0.11559372302144766, "loss/reg": 0.025891879573464394, "step": 432 }, { "epoch": 0.2165, "grad_norm": 0.9944002032279968, "grad_norm_var": 0.8370490047784728, "learning_rate": 2e-05, "loss": 0.4543, "loss/crossentropy": 2.474943161010742, "loss/hidden": 0.1708984375, "loss/logits": 0.024509361945092678, "loss/reg": 0.02588999830186367, "step": 433 }, { "epoch": 0.217, "grad_norm": 1.6105306148529053, "grad_norm_var": 0.8258643739880323, "learning_rate": 2e-05, "loss": 0.5185, "loss/crossentropy": 2.1941992044448853, "loss/hidden": 0.22705078125, "loss/logits": 0.03256234619766474, "loss/reg": 0.025888269767165184, "step": 434 }, { "epoch": 0.2175, "grad_norm": 1.2945845127105713, "grad_norm_var": 0.8306360972250484, "learning_rate": 2e-05, "loss": 0.4599, "loss/crossentropy": 2.3870290517807007, "loss/hidden": 0.1748046875, "loss/logits": 0.026204396039247513, "loss/reg": 0.025886395946145058, "step": 435 }, { "epoch": 0.218, "grad_norm": 1.646968126296997, "grad_norm_var": 0.8220224189189824, "learning_rate": 2e-05, "loss": 0.5252, "loss/crossentropy": 2.1381598711013794, "loss/hidden": 0.22705078125, "loss/logits": 0.03925580158829689, "loss/reg": 0.025884483009576797, "step": 436 }, { "epoch": 0.2185, "grad_norm": 1.2482175827026367, "grad_norm_var": 0.8282312987681753, "learning_rate": 2e-05, "loss": 0.4699, "loss/crossentropy": 2.4693511724472046, "loss/hidden": 0.18359375, "loss/logits": 0.027454238384962082, "loss/reg": 0.025882598012685776, "step": 437 }, { "epoch": 0.219, "grad_norm": 1.6927727460861206, "grad_norm_var": 0.828833769560893, "learning_rate": 2e-05, "loss": 0.6173, "loss/crossentropy": 2.264186978340149, "loss/hidden": 0.30419921875, "loss/logits": 0.05433515552431345, "loss/reg": 0.025880809873342514, "step": 438 }, { "epoch": 0.2195, "grad_norm": 1.3382420539855957, "grad_norm_var": 0.8170844633700326, "learning_rate": 2e-05, "loss": 0.4594, "loss/crossentropy": 2.4065046310424805, "loss/hidden": 0.16943359375, "loss/logits": 0.03115204442292452, "loss/reg": 0.025879191234707832, "step": 439 }, { "epoch": 0.22, "grad_norm": 1.280760407447815, "grad_norm_var": 0.8049795437588958, "learning_rate": 2e-05, "loss": 0.4746, "loss/crossentropy": 2.4252418279647827, "loss/hidden": 0.18701171875, "loss/logits": 0.028856026008725166, "loss/reg": 0.025877289474010468, "step": 440 }, { "epoch": 0.2205, "grad_norm": 1.1407486200332642, "grad_norm_var": 0.8133841973004384, "learning_rate": 2e-05, "loss": 0.4791, "loss/crossentropy": 2.264625906944275, "loss/hidden": 0.19091796875, "loss/logits": 0.029384871013462543, "loss/reg": 0.02587556093931198, "step": 441 }, { "epoch": 0.221, "grad_norm": 1.1531625986099243, "grad_norm_var": 0.8172974757844712, "learning_rate": 2e-05, "loss": 0.4645, "loss/crossentropy": 2.3819206953048706, "loss/hidden": 0.16015625, "loss/logits": 0.045620132237672806, "loss/reg": 0.025873858481645584, "step": 442 }, { "epoch": 0.2215, "grad_norm": 1.2209059000015259, "grad_norm_var": 0.8041477490589808, "learning_rate": 2e-05, "loss": 0.4922, "loss/crossentropy": 2.2260149717330933, "loss/hidden": 0.19775390625, "loss/logits": 0.03575233928859234, "loss/reg": 0.025872183963656425, "step": 443 }, { "epoch": 0.222, "grad_norm": 1.4616377353668213, "grad_norm_var": 0.7854915962697572, "learning_rate": 2e-05, "loss": 0.4759, "loss/crossentropy": 2.326699376106262, "loss/hidden": 0.1875, "loss/logits": 0.02973231580108404, "loss/reg": 0.02587028034031391, "step": 444 }, { "epoch": 0.2225, "grad_norm": 1.1616874933242798, "grad_norm_var": 0.7231711161910169, "learning_rate": 2e-05, "loss": 0.477, "loss/crossentropy": 2.3070465326309204, "loss/hidden": 0.18701171875, "loss/logits": 0.03132193721830845, "loss/reg": 0.02586846426129341, "step": 445 }, { "epoch": 0.223, "grad_norm": 1.1598429679870605, "grad_norm_var": 0.7296602944536337, "learning_rate": 2e-05, "loss": 0.4613, "loss/crossentropy": 2.353983521461487, "loss/hidden": 0.171875, "loss/logits": 0.030772192403674126, "loss/reg": 0.025866517797112465, "step": 446 }, { "epoch": 0.2235, "grad_norm": 1.3874998092651367, "grad_norm_var": 0.7189939859161824, "learning_rate": 2e-05, "loss": 0.5004, "loss/crossentropy": 2.47870934009552, "loss/hidden": 0.19970703125, "loss/logits": 0.04206428676843643, "loss/reg": 0.025864504277706146, "step": 447 }, { "epoch": 0.224, "grad_norm": 2.931767463684082, "grad_norm_var": 0.2017417237659708, "learning_rate": 2e-05, "loss": 0.8347, "loss/crossentropy": 2.5819171667099, "loss/hidden": 0.40234375, "loss/logits": 0.1736808605492115, "loss/reg": 0.025862593203783035, "step": 448 }, { "epoch": 0.2245, "grad_norm": 1.3012363910675049, "grad_norm_var": 0.19020454457909727, "learning_rate": 2e-05, "loss": 0.4471, "loss/crossentropy": 2.3639878034591675, "loss/hidden": 0.16162109375, "loss/logits": 0.026854592375457287, "loss/reg": 0.025860626250505447, "step": 449 }, { "epoch": 0.225, "grad_norm": 1.6675218343734741, "grad_norm_var": 0.19170785847398542, "learning_rate": 2e-05, "loss": 0.5328, "loss/crossentropy": 2.3093976974487305, "loss/hidden": 0.22119140625, "loss/logits": 0.05306573584675789, "loss/reg": 0.025858718901872635, "step": 450 }, { "epoch": 0.2255, "grad_norm": 1.4241790771484375, "grad_norm_var": 0.19019349759976567, "learning_rate": 2e-05, "loss": 0.4946, "loss/crossentropy": 2.3994137048721313, "loss/hidden": 0.2060546875, "loss/logits": 0.029971184208989143, "loss/reg": 0.02585672214627266, "step": 451 }, { "epoch": 0.226, "grad_norm": 2.4200472831726074, "grad_norm_var": 0.24773914499477828, "learning_rate": 2e-05, "loss": 0.4804, "loss/crossentropy": 2.349528431892395, "loss/hidden": 0.19482421875, "loss/logits": 0.02699958346784115, "loss/reg": 0.02585473842918873, "step": 452 }, { "epoch": 0.2265, "grad_norm": 1.597848653793335, "grad_norm_var": 0.24367026793009636, "learning_rate": 2e-05, "loss": 0.4561, "loss/crossentropy": 2.2601643800735474, "loss/hidden": 0.17041015625, "loss/logits": 0.027189917862415314, "loss/reg": 0.025852810591459274, "step": 453 }, { "epoch": 0.227, "grad_norm": 1.4791680574417114, "grad_norm_var": 0.24163663071934274, "learning_rate": 2e-05, "loss": 0.5426, "loss/crossentropy": 2.184678077697754, "loss/hidden": 0.2412109375, "loss/logits": 0.04290330223739147, "loss/reg": 0.025850806385278702, "step": 454 }, { "epoch": 0.2275, "grad_norm": 1.54658842086792, "grad_norm_var": 0.2396368776147885, "learning_rate": 2e-05, "loss": 0.495, "loss/crossentropy": 2.427361249923706, "loss/hidden": 0.203125, "loss/logits": 0.03336675837635994, "loss/reg": 0.025848930701613426, "step": 455 }, { "epoch": 0.228, "grad_norm": 1.0083175897598267, "grad_norm_var": 0.2529996468682663, "learning_rate": 2e-05, "loss": 0.51, "loss/crossentropy": 2.1102696657180786, "loss/hidden": 0.21484375, "loss/logits": 0.03667537495493889, "loss/reg": 0.025847142562270164, "step": 456 }, { "epoch": 0.2285, "grad_norm": 1.2189358472824097, "grad_norm_var": 0.2495960410376004, "learning_rate": 2e-05, "loss": 0.5198, "loss/crossentropy": 2.3419077396392822, "loss/hidden": 0.2275390625, "loss/logits": 0.03385118395090103, "loss/reg": 0.025845320895314217, "step": 457 }, { "epoch": 0.229, "grad_norm": 1.3370299339294434, "grad_norm_var": 0.24299100458264036, "learning_rate": 2e-05, "loss": 0.5765, "loss/crossentropy": 1.9379181265830994, "loss/hidden": 0.27734375, "loss/logits": 0.0406951867043972, "loss/reg": 0.025843370705842972, "step": 458 }, { "epoch": 0.2295, "grad_norm": 1.1177793741226196, "grad_norm_var": 0.24777192368354406, "learning_rate": 2e-05, "loss": 0.4456, "loss/crossentropy": 2.4968008995056152, "loss/hidden": 0.158203125, "loss/logits": 0.02897755615413189, "loss/reg": 0.02584136091172695, "step": 459 }, { "epoch": 0.23, "grad_norm": 1.4649550914764404, "grad_norm_var": 0.24774953141933906, "learning_rate": 2e-05, "loss": 0.4868, "loss/crossentropy": 2.2792553901672363, "loss/hidden": 0.193359375, "loss/logits": 0.035073790699243546, "loss/reg": 0.025839168578386307, "step": 460 }, { "epoch": 0.2305, "grad_norm": 2.2920172214508057, "grad_norm_var": 0.2745013047452227, "learning_rate": 2e-05, "loss": 0.621, "loss/crossentropy": 2.418063998222351, "loss/hidden": 0.29736328125, "loss/logits": 0.06528288684785366, "loss/reg": 0.025837266817688942, "step": 461 }, { "epoch": 0.231, "grad_norm": 1.5773580074310303, "grad_norm_var": 0.2617466213370638, "learning_rate": 2e-05, "loss": 0.5073, "loss/crossentropy": 2.2258150577545166, "loss/hidden": 0.2138671875, "loss/logits": 0.03509692847728729, "loss/reg": 0.025835072621703148, "step": 462 }, { "epoch": 0.2315, "grad_norm": 1.6675801277160645, "grad_norm_var": 0.2583117846520134, "learning_rate": 2e-05, "loss": 0.5136, "loss/crossentropy": 2.485268235206604, "loss/hidden": 0.22412109375, "loss/logits": 0.031180618330836296, "loss/reg": 0.025833170861005783, "step": 463 }, { "epoch": 0.232, "grad_norm": 1.4492632150650024, "grad_norm_var": 0.13801685370827765, "learning_rate": 2e-05, "loss": 0.4973, "loss/crossentropy": 2.257757544517517, "loss/hidden": 0.20703125, "loss/logits": 0.03195131104439497, "loss/reg": 0.025831099599599838, "step": 464 }, { "epoch": 0.2325, "grad_norm": 1.2513377666473389, "grad_norm_var": 0.1397318210080442, "learning_rate": 2e-05, "loss": 0.4871, "loss/crossentropy": 2.425398826599121, "loss/hidden": 0.1953125, "loss/logits": 0.0334627740085125, "loss/reg": 0.025828994810581207, "step": 465 }, { "epoch": 0.233, "grad_norm": 1.3394722938537598, "grad_norm_var": 0.14055180736722211, "learning_rate": 2e-05, "loss": 0.4586, "loss/crossentropy": 2.373073101043701, "loss/hidden": 0.171875, "loss/logits": 0.02840618882328272, "loss/reg": 0.025827039033174515, "step": 466 }, { "epoch": 0.2335, "grad_norm": 1.447240948677063, "grad_norm_var": 0.14031502946211252, "learning_rate": 2e-05, "loss": 0.5055, "loss/crossentropy": 2.207805633544922, "loss/hidden": 0.21337890625, "loss/logits": 0.03383456543087959, "loss/reg": 0.025825195014476776, "step": 467 }, { "epoch": 0.234, "grad_norm": 2.395975351333618, "grad_norm_var": 0.13744138699080868, "learning_rate": 2e-05, "loss": 0.5234, "loss/crossentropy": 2.323424220085144, "loss/hidden": 0.232421875, "loss/logits": 0.03275643941015005, "loss/reg": 0.02582353726029396, "step": 468 }, { "epoch": 0.2345, "grad_norm": 2.5749197006225586, "grad_norm_var": 0.2083013754485968, "learning_rate": 2e-05, "loss": 0.5199, "loss/crossentropy": 2.2507615089416504, "loss/hidden": 0.2275390625, "loss/logits": 0.0341134462505579, "loss/reg": 0.025821637362241745, "step": 469 }, { "epoch": 0.235, "grad_norm": 1.052276611328125, "grad_norm_var": 0.22503173458550182, "learning_rate": 2e-05, "loss": 0.4509, "loss/crossentropy": 2.543000817298889, "loss/hidden": 0.16796875, "loss/logits": 0.024687878787517548, "loss/reg": 0.025819703936576843, "step": 470 }, { "epoch": 0.2355, "grad_norm": 1.2194154262542725, "grad_norm_var": 0.2317099631068041, "learning_rate": 2e-05, "loss": 0.4424, "loss/crossentropy": 2.2912293672561646, "loss/hidden": 0.16015625, "loss/logits": 0.024061255156993866, "loss/reg": 0.025817908346652985, "step": 471 }, { "epoch": 0.236, "grad_norm": 1.3160464763641357, "grad_norm_var": 0.2163932029026758, "learning_rate": 2e-05, "loss": 0.5102, "loss/crossentropy": 2.2214205265045166, "loss/hidden": 0.2119140625, "loss/logits": 0.040074046701192856, "loss/reg": 0.025816213339567184, "step": 472 }, { "epoch": 0.2365, "grad_norm": 1.4499560594558716, "grad_norm_var": 0.20968210761966072, "learning_rate": 2e-05, "loss": 0.465, "loss/crossentropy": 2.4633235931396484, "loss/hidden": 0.17626953125, "loss/logits": 0.030628393404185772, "loss/reg": 0.025814484804868698, "step": 473 }, { "epoch": 0.237, "grad_norm": 1.5795156955718994, "grad_norm_var": 0.20616303007269987, "learning_rate": 2e-05, "loss": 0.4987, "loss/crossentropy": 2.5177139043807983, "loss/hidden": 0.21240234375, "loss/logits": 0.02812807820737362, "loss/reg": 0.025812778621912003, "step": 474 }, { "epoch": 0.2375, "grad_norm": 1.834425449371338, "grad_norm_var": 0.19460237139374303, "learning_rate": 2e-05, "loss": 0.548, "loss/crossentropy": 2.566522002220154, "loss/hidden": 0.251953125, "loss/logits": 0.03790239989757538, "loss/reg": 0.0258110873401165, "step": 475 }, { "epoch": 0.238, "grad_norm": 1.8921895027160645, "grad_norm_var": 0.19720773265526592, "learning_rate": 2e-05, "loss": 0.4895, "loss/crossentropy": 2.6173768043518066, "loss/hidden": 0.19873046875, "loss/logits": 0.0326268021017313, "loss/reg": 0.025809384882450104, "step": 476 }, { "epoch": 0.2385, "grad_norm": 1.4226226806640625, "grad_norm_var": 0.16958397715451046, "learning_rate": 2e-05, "loss": 0.498, "loss/crossentropy": 2.3383296728134155, "loss/hidden": 0.20751953125, "loss/logits": 0.03235785476863384, "loss/reg": 0.025807524099946022, "step": 477 }, { "epoch": 0.239, "grad_norm": 1.141805648803711, "grad_norm_var": 0.1822821790845537, "learning_rate": 2e-05, "loss": 0.4543, "loss/crossentropy": 2.428423523902893, "loss/hidden": 0.169921875, "loss/logits": 0.026319866999983788, "loss/reg": 0.025805801153182983, "step": 478 }, { "epoch": 0.2395, "grad_norm": 1.0476349592208862, "grad_norm_var": 0.19779294720925691, "learning_rate": 2e-05, "loss": 0.4542, "loss/crossentropy": 2.3026620149612427, "loss/hidden": 0.16845703125, "loss/logits": 0.027672583237290382, "loss/reg": 0.025803864002227783, "step": 479 }, { "epoch": 0.24, "grad_norm": 1.3201205730438232, "grad_norm_var": 0.20015459609518108, "learning_rate": 2e-05, "loss": 0.4768, "loss/crossentropy": 2.4549564123153687, "loss/hidden": 0.18603515625, "loss/logits": 0.0327040059491992, "loss/reg": 0.025801965966820717, "step": 480 }, { "epoch": 0.2405, "grad_norm": 2.7316701412200928, "grad_norm_var": 0.28452048900647253, "learning_rate": 2e-05, "loss": 0.6865, "loss/crossentropy": 2.420086145401001, "loss/hidden": 0.318359375, "loss/logits": 0.11013734713196754, "loss/reg": 0.025800272822380066, "step": 481 }, { "epoch": 0.241, "grad_norm": 1.2162243127822876, "grad_norm_var": 0.28992089783955044, "learning_rate": 2e-05, "loss": 0.5028, "loss/crossentropy": 2.3521331548690796, "loss/hidden": 0.2177734375, "loss/logits": 0.027009712532162666, "loss/reg": 0.025798635557293892, "step": 482 }, { "epoch": 0.2415, "grad_norm": 1.079655647277832, "grad_norm_var": 0.3059815393422553, "learning_rate": 2e-05, "loss": 0.4912, "loss/crossentropy": 2.3484867811203003, "loss/hidden": 0.19873046875, "loss/logits": 0.03451688028872013, "loss/reg": 0.025796744972467422, "step": 483 }, { "epoch": 0.242, "grad_norm": 1.3099355697631836, "grad_norm_var": 0.2614914398097065, "learning_rate": 2e-05, "loss": 0.4762, "loss/crossentropy": 2.4110331535339355, "loss/hidden": 0.18798828125, "loss/logits": 0.03024720586836338, "loss/reg": 0.025794848799705505, "step": 484 }, { "epoch": 0.2425, "grad_norm": 1.11648690700531, "grad_norm_var": 0.1876940743940563, "learning_rate": 2e-05, "loss": 0.4663, "loss/crossentropy": 2.3808969259262085, "loss/hidden": 0.17919921875, "loss/logits": 0.02915147691965103, "loss/reg": 0.025793053209781647, "step": 485 }, { "epoch": 0.243, "grad_norm": 1.1367552280426025, "grad_norm_var": 0.18399111878470176, "learning_rate": 2e-05, "loss": 0.4665, "loss/crossentropy": 2.3539984226226807, "loss/hidden": 0.1787109375, "loss/logits": 0.02986688818782568, "loss/reg": 0.02579127438366413, "step": 486 }, { "epoch": 0.2435, "grad_norm": 1.1363672018051147, "grad_norm_var": 0.18670864710500906, "learning_rate": 2e-05, "loss": 0.457, "loss/crossentropy": 2.5751640796661377, "loss/hidden": 0.17041015625, "loss/logits": 0.028702068142592907, "loss/reg": 0.02578934282064438, "step": 487 }, { "epoch": 0.244, "grad_norm": 1.59341299533844, "grad_norm_var": 0.1876461007769971, "learning_rate": 2e-05, "loss": 0.549, "loss/crossentropy": 2.4539562463760376, "loss/hidden": 0.248046875, "loss/logits": 0.043077923357486725, "loss/reg": 0.02578747272491455, "step": 488 }, { "epoch": 0.2445, "grad_norm": 1.3884077072143555, "grad_norm_var": 0.18778514582004005, "learning_rate": 2e-05, "loss": 0.4548, "loss/crossentropy": 2.4432852268218994, "loss/hidden": 0.171875, "loss/logits": 0.0251072458922863, "loss/reg": 0.025785457342863083, "step": 489 }, { "epoch": 0.245, "grad_norm": 1.168309211730957, "grad_norm_var": 0.1903861218173105, "learning_rate": 2e-05, "loss": 0.4505, "loss/crossentropy": 2.2845112085342407, "loss/hidden": 0.16162109375, "loss/logits": 0.031048119068145752, "loss/reg": 0.025783469900488853, "step": 490 }, { "epoch": 0.2455, "grad_norm": 1.2630984783172607, "grad_norm_var": 0.17834144864876003, "learning_rate": 2e-05, "loss": 0.4489, "loss/crossentropy": 2.357891082763672, "loss/hidden": 0.1650390625, "loss/logits": 0.02602921612560749, "loss/reg": 0.02578144334256649, "step": 491 }, { "epoch": 0.246, "grad_norm": 1.0908071994781494, "grad_norm_var": 0.16298183484375428, "learning_rate": 2e-05, "loss": 0.453, "loss/crossentropy": 2.3261715173721313, "loss/hidden": 0.15625, "loss/logits": 0.03899524360895157, "loss/reg": 0.02577943727374077, "step": 492 }, { "epoch": 0.2465, "grad_norm": 1.1049315929412842, "grad_norm_var": 0.165057508559335, "learning_rate": 2e-05, "loss": 0.4612, "loss/crossentropy": 2.347619652748108, "loss/hidden": 0.17529296875, "loss/logits": 0.028116335161030293, "loss/reg": 0.025777503848075867, "step": 493 }, { "epoch": 0.247, "grad_norm": 1.2722063064575195, "grad_norm_var": 0.1633202153049367, "learning_rate": 2e-05, "loss": 0.4791, "loss/crossentropy": 2.428340435028076, "loss/hidden": 0.1904296875, "loss/logits": 0.030900001525878906, "loss/reg": 0.02577553130686283, "step": 494 }, { "epoch": 0.2475, "grad_norm": 1.0491212606430054, "grad_norm_var": 0.16326816109757653, "learning_rate": 2e-05, "loss": 0.4459, "loss/crossentropy": 2.44633686542511, "loss/hidden": 0.1630859375, "loss/logits": 0.02505970373749733, "loss/reg": 0.025773610919713974, "step": 495 }, { "epoch": 0.248, "grad_norm": 1.4577767848968506, "grad_norm_var": 0.1646181560542212, "learning_rate": 2e-05, "loss": 0.5875, "loss/crossentropy": 2.1383886337280273, "loss/hidden": 0.2783203125, "loss/logits": 0.05150237772613764, "loss/reg": 0.02577175572514534, "step": 496 }, { "epoch": 0.2485, "grad_norm": 1.1691800355911255, "grad_norm_var": 0.023045095234002843, "learning_rate": 2e-05, "loss": 0.4948, "loss/crossentropy": 2.406825542449951, "loss/hidden": 0.2001953125, "loss/logits": 0.036906635388731956, "loss/reg": 0.025769958272576332, "step": 497 }, { "epoch": 0.249, "grad_norm": 1.1311383247375488, "grad_norm_var": 0.023563575455448373, "learning_rate": 2e-05, "loss": 0.4495, "loss/crossentropy": 2.2107361555099487, "loss/hidden": 0.16943359375, "loss/logits": 0.02235421910881996, "loss/reg": 0.025767968967556953, "step": 498 }, { "epoch": 0.2495, "grad_norm": 1.2846966981887817, "grad_norm_var": 0.022443893755450736, "learning_rate": 2e-05, "loss": 0.484, "loss/crossentropy": 2.169008255004883, "loss/hidden": 0.18798828125, "loss/logits": 0.03831418417394161, "loss/reg": 0.025765718892216682, "step": 499 }, { "epoch": 0.25, "grad_norm": 1.2635072469711304, "grad_norm_var": 0.02208093059473833, "learning_rate": 2e-05, "loss": 0.5241, "loss/crossentropy": 2.3311681747436523, "loss/hidden": 0.228515625, "loss/logits": 0.037904972210526466, "loss/reg": 0.025763733312487602, "step": 500 }, { "epoch": 0.2505, "grad_norm": 1.8094271421432495, "grad_norm_var": 0.04191426078620844, "learning_rate": 2e-05, "loss": 0.5198, "loss/crossentropy": 2.18564236164093, "loss/hidden": 0.22705078125, "loss/logits": 0.03517603315412998, "loss/reg": 0.025761688128113747, "step": 501 }, { "epoch": 0.251, "grad_norm": 1.4268393516540527, "grad_norm_var": 0.042022005671670054, "learning_rate": 2e-05, "loss": 0.5027, "loss/crossentropy": 2.3186033964157104, "loss/hidden": 0.2109375, "loss/logits": 0.03412244841456413, "loss/reg": 0.025759579613804817, "step": 502 }, { "epoch": 0.2515, "grad_norm": 1.6704895496368408, "grad_norm_var": 0.04904823070484075, "learning_rate": 2e-05, "loss": 0.507, "loss/crossentropy": 2.444745898246765, "loss/hidden": 0.21240234375, "loss/logits": 0.03699003718793392, "loss/reg": 0.02575748972594738, "step": 503 }, { "epoch": 0.252, "grad_norm": 1.4041507244110107, "grad_norm_var": 0.04442425217177727, "learning_rate": 2e-05, "loss": 0.4537, "loss/crossentropy": 2.3990856409072876, "loss/hidden": 0.166015625, "loss/logits": 0.030156807973980904, "loss/reg": 0.025755319744348526, "step": 504 }, { "epoch": 0.2525, "grad_norm": 1.5246555805206299, "grad_norm_var": 0.04701556722156628, "learning_rate": 2e-05, "loss": 0.5252, "loss/crossentropy": 2.551340937614441, "loss/hidden": 0.23583984375, "loss/logits": 0.031838640570640564, "loss/reg": 0.025753194466233253, "step": 505 }, { "epoch": 0.253, "grad_norm": 1.8362479209899902, "grad_norm_var": 0.06155521373344843, "learning_rate": 2e-05, "loss": 0.5572, "loss/crossentropy": 2.1370293498039246, "loss/hidden": 0.23974609375, "loss/logits": 0.05993914417922497, "loss/reg": 0.02575111947953701, "step": 506 }, { "epoch": 0.2535, "grad_norm": 1.1423455476760864, "grad_norm_var": 0.06402495885733686, "learning_rate": 2e-05, "loss": 0.4659, "loss/crossentropy": 2.4107199907302856, "loss/hidden": 0.1806640625, "loss/logits": 0.027789254672825336, "loss/reg": 0.025749139487743378, "step": 507 }, { "epoch": 0.254, "grad_norm": 1.2471706867218018, "grad_norm_var": 0.06010039179400053, "learning_rate": 2e-05, "loss": 0.5008, "loss/crossentropy": 2.2391830682754517, "loss/hidden": 0.20703125, "loss/logits": 0.03628289885818958, "loss/reg": 0.025747055187821388, "step": 508 }, { "epoch": 0.2545, "grad_norm": 1.6316094398498535, "grad_norm_var": 0.059376668774846306, "learning_rate": 2e-05, "loss": 0.5185, "loss/crossentropy": 2.4537373781204224, "loss/hidden": 0.2294921875, "loss/logits": 0.031561460345983505, "loss/reg": 0.02574506774544716, "step": 509 }, { "epoch": 0.255, "grad_norm": 1.7221488952636719, "grad_norm_var": 0.06466089846806326, "learning_rate": 2e-05, "loss": 0.5166, "loss/crossentropy": 2.008660316467285, "loss/hidden": 0.2294921875, "loss/logits": 0.029630004428327084, "loss/reg": 0.02574305608868599, "step": 510 }, { "epoch": 0.2555, "grad_norm": 2.063495635986328, "grad_norm_var": 0.07838236427375302, "learning_rate": 2e-05, "loss": 0.6291, "loss/crossentropy": 2.2193583250045776, "loss/hidden": 0.3251953125, "loss/logits": 0.04652561619877815, "loss/reg": 0.025741035118699074, "step": 511 }, { "epoch": 0.256, "grad_norm": 2.1549365520477295, "grad_norm_var": 0.10608428210922506, "learning_rate": 2e-05, "loss": 0.5281, "loss/crossentropy": 1.9776748418807983, "loss/hidden": 0.2412109375, "loss/logits": 0.029531195759773254, "loss/reg": 0.025739166885614395, "step": 512 }, { "epoch": 0.2565, "grad_norm": 2.0352017879486084, "grad_norm_var": 0.11128044423135464, "learning_rate": 2e-05, "loss": 0.5546, "loss/crossentropy": 2.393476963043213, "loss/hidden": 0.25341796875, "loss/logits": 0.043846890330314636, "loss/reg": 0.02573738433420658, "step": 513 }, { "epoch": 0.257, "grad_norm": 1.3759031295776367, "grad_norm_var": 0.10023724397306069, "learning_rate": 2e-05, "loss": 0.5094, "loss/crossentropy": 2.3285356760025024, "loss/hidden": 0.21875, "loss/logits": 0.03326238878071308, "loss/reg": 0.02573556825518608, "step": 514 }, { "epoch": 0.2575, "grad_norm": 2.0449092388153076, "grad_norm_var": 0.10444321701007618, "learning_rate": 2e-05, "loss": 0.5387, "loss/crossentropy": 2.3776673078536987, "loss/hidden": 0.22216796875, "loss/logits": 0.05914916470646858, "loss/reg": 0.025733835995197296, "step": 515 }, { "epoch": 0.258, "grad_norm": 1.2532458305358887, "grad_norm_var": 0.10497457736165051, "learning_rate": 2e-05, "loss": 0.4725, "loss/crossentropy": 2.5611300468444824, "loss/hidden": 0.1796875, "loss/logits": 0.035542636178433895, "loss/reg": 0.025731824338436127, "step": 516 }, { "epoch": 0.2585, "grad_norm": 1.166143774986267, "grad_norm_var": 0.11685692171310862, "learning_rate": 2e-05, "loss": 0.4871, "loss/crossentropy": 2.292641520500183, "loss/hidden": 0.19873046875, "loss/logits": 0.031022757291793823, "loss/reg": 0.025729816406965256, "step": 517 }, { "epoch": 0.259, "grad_norm": 0.9319448471069336, "grad_norm_var": 0.14400094830482596, "learning_rate": 2e-05, "loss": 0.4359, "loss/crossentropy": 2.2908148765563965, "loss/hidden": 0.15673828125, "loss/logits": 0.021897392347455025, "loss/reg": 0.025727812200784683, "step": 518 }, { "epoch": 0.2595, "grad_norm": 1.3351777791976929, "grad_norm_var": 0.146771754161323, "learning_rate": 2e-05, "loss": 0.5675, "loss/crossentropy": 2.117431879043579, "loss/hidden": 0.26708984375, "loss/logits": 0.04313970357179642, "loss/reg": 0.025725772604346275, "step": 519 }, { "epoch": 0.26, "grad_norm": 1.2591443061828613, "grad_norm_var": 0.1509895364147709, "learning_rate": 2e-05, "loss": 0.472, "loss/crossentropy": 2.2097796201705933, "loss/hidden": 0.1875, "loss/logits": 0.027306508272886276, "loss/reg": 0.025723854079842567, "step": 520 }, { "epoch": 0.2605, "grad_norm": 1.4675482511520386, "grad_norm_var": 0.15135031036683486, "learning_rate": 2e-05, "loss": 0.5158, "loss/crossentropy": 2.1767526865005493, "loss/hidden": 0.22119140625, "loss/logits": 0.037375250831246376, "loss/reg": 0.025722013786435127, "step": 521 }, { "epoch": 0.261, "grad_norm": 1.318777322769165, "grad_norm_var": 0.1477635335278175, "learning_rate": 2e-05, "loss": 0.4433, "loss/crossentropy": 2.3265219926834106, "loss/hidden": 0.16064453125, "loss/logits": 0.025505591183900833, "loss/reg": 0.025719961151480675, "step": 522 }, { "epoch": 0.2615, "grad_norm": 1.4309393167495728, "grad_norm_var": 0.13884665705610644, "learning_rate": 2e-05, "loss": 0.4473, "loss/crossentropy": 2.297981858253479, "loss/hidden": 0.1650390625, "loss/logits": 0.025057895109057426, "loss/reg": 0.025717932730913162, "step": 523 }, { "epoch": 0.262, "grad_norm": 2.0628879070281982, "grad_norm_var": 0.14995613654657897, "learning_rate": 2e-05, "loss": 0.4881, "loss/crossentropy": 2.3760178089141846, "loss/hidden": 0.1982421875, "loss/logits": 0.032686688005924225, "loss/reg": 0.025715861469507217, "step": 524 }, { "epoch": 0.2625, "grad_norm": 1.75223970413208, "grad_norm_var": 0.1517218258554711, "learning_rate": 2e-05, "loss": 0.4799, "loss/crossentropy": 2.3368886709213257, "loss/hidden": 0.19287109375, "loss/logits": 0.02990109659731388, "loss/reg": 0.02571384236216545, "step": 525 }, { "epoch": 0.263, "grad_norm": 1.1534109115600586, "grad_norm_var": 0.16160742489911778, "learning_rate": 2e-05, "loss": 0.4834, "loss/crossentropy": 2.2142513394355774, "loss/hidden": 0.19482421875, "loss/logits": 0.031417591497302055, "loss/reg": 0.025711748749017715, "step": 526 }, { "epoch": 0.2635, "grad_norm": 1.425850510597229, "grad_norm_var": 0.143393700633121, "learning_rate": 2e-05, "loss": 0.4661, "loss/crossentropy": 2.439908742904663, "loss/hidden": 0.1796875, "loss/logits": 0.029349423944950104, "loss/reg": 0.02570977620780468, "step": 527 }, { "epoch": 0.264, "grad_norm": 1.4983434677124023, "grad_norm_var": 0.11392210677285745, "learning_rate": 2e-05, "loss": 0.508, "loss/crossentropy": 2.310701370239258, "loss/hidden": 0.1728515625, "loss/logits": 0.078089265152812, "loss/reg": 0.025707799941301346, "step": 528 }, { "epoch": 0.2645, "grad_norm": 1.6121326684951782, "grad_norm_var": 0.09319685976795024, "learning_rate": 2e-05, "loss": 0.5862, "loss/crossentropy": 2.195641279220581, "loss/hidden": 0.28857421875, "loss/logits": 0.04055267106741667, "loss/reg": 0.025705868378281593, "step": 529 }, { "epoch": 0.265, "grad_norm": 1.4942004680633545, "grad_norm_var": 0.09301259307607192, "learning_rate": 2e-05, "loss": 0.5006, "loss/crossentropy": 2.2726430892944336, "loss/hidden": 0.2060546875, "loss/logits": 0.037457194179296494, "loss/reg": 0.025703880935907364, "step": 530 }, { "epoch": 0.2655, "grad_norm": 2.3016085624694824, "grad_norm_var": 0.11747795625703147, "learning_rate": 2e-05, "loss": 0.5752, "loss/crossentropy": 2.360868453979492, "loss/hidden": 0.28515625, "loss/logits": 0.03298346884548664, "loss/reg": 0.02570200525224209, "step": 531 }, { "epoch": 0.266, "grad_norm": 1.9155231714248657, "grad_norm_var": 0.12606227216750904, "learning_rate": 2e-05, "loss": 0.5386, "loss/crossentropy": 2.1607614755630493, "loss/hidden": 0.23974609375, "loss/logits": 0.04186772648245096, "loss/reg": 0.025700142607092857, "step": 532 }, { "epoch": 0.2665, "grad_norm": 1.9601225852966309, "grad_norm_var": 0.12928627941643545, "learning_rate": 2e-05, "loss": 0.5628, "loss/crossentropy": 2.4702740907669067, "loss/hidden": 0.2607421875, "loss/logits": 0.04509196989238262, "loss/reg": 0.025698326528072357, "step": 533 }, { "epoch": 0.267, "grad_norm": 1.6414953470230103, "grad_norm_var": 0.10157179579757945, "learning_rate": 2e-05, "loss": 0.435, "loss/crossentropy": 2.3161516189575195, "loss/hidden": 0.15576171875, "loss/logits": 0.022296501323580742, "loss/reg": 0.025696277618408203, "step": 534 }, { "epoch": 0.2675, "grad_norm": 1.7865321636199951, "grad_norm_var": 0.09825659810908008, "learning_rate": 2e-05, "loss": 0.4618, "loss/crossentropy": 2.3330507278442383, "loss/hidden": 0.17724609375, "loss/logits": 0.02764590922743082, "loss/reg": 0.025694238021969795, "step": 535 }, { "epoch": 0.268, "grad_norm": 1.8025976419448853, "grad_norm_var": 0.08983964833530009, "learning_rate": 2e-05, "loss": 0.5996, "loss/crossentropy": 2.1311851739883423, "loss/hidden": 0.2939453125, "loss/logits": 0.04878038726747036, "loss/reg": 0.025692163035273552, "step": 536 }, { "epoch": 0.2685, "grad_norm": 1.3388440608978271, "grad_norm_var": 0.09424639337241937, "learning_rate": 2e-05, "loss": 0.4705, "loss/crossentropy": 2.276059627532959, "loss/hidden": 0.17919921875, "loss/logits": 0.03437704313546419, "loss/reg": 0.02569023333489895, "step": 537 }, { "epoch": 0.269, "grad_norm": 1.1625409126281738, "grad_norm_var": 0.10279622484355831, "learning_rate": 2e-05, "loss": 0.4461, "loss/crossentropy": 2.515538215637207, "loss/hidden": 0.1630859375, "loss/logits": 0.02611909992992878, "loss/reg": 0.025688180699944496, "step": 538 }, { "epoch": 0.2695, "grad_norm": 1.3560576438903809, "grad_norm_var": 0.10529593288305386, "learning_rate": 2e-05, "loss": 0.4713, "loss/crossentropy": 2.333776354789734, "loss/hidden": 0.1796875, "loss/logits": 0.03480132482945919, "loss/reg": 0.02568606473505497, "step": 539 }, { "epoch": 0.27, "grad_norm": 1.033890724182129, "grad_norm_var": 0.11366219521287153, "learning_rate": 2e-05, "loss": 0.4471, "loss/crossentropy": 2.451215624809265, "loss/hidden": 0.16259765625, "loss/logits": 0.02763993013650179, "loss/reg": 0.02568388171494007, "step": 540 }, { "epoch": 0.2705, "grad_norm": 1.3294425010681152, "grad_norm_var": 0.11496770242969863, "learning_rate": 2e-05, "loss": 0.4625, "loss/crossentropy": 2.5572686195373535, "loss/hidden": 0.1748046875, "loss/logits": 0.03091136459261179, "loss/reg": 0.025681814178824425, "step": 541 }, { "epoch": 0.271, "grad_norm": 1.6161302328109741, "grad_norm_var": 0.10383304121057577, "learning_rate": 2e-05, "loss": 0.5046, "loss/crossentropy": 2.4156856536865234, "loss/hidden": 0.21923828125, "loss/logits": 0.02856369875371456, "loss/reg": 0.02568003162741661, "step": 542 }, { "epoch": 0.2715, "grad_norm": 1.6249600648880005, "grad_norm_var": 0.10222625558776764, "learning_rate": 2e-05, "loss": 0.5214, "loss/crossentropy": 2.5546233654022217, "loss/hidden": 0.2255859375, "loss/logits": 0.03906646929681301, "loss/reg": 0.025678148493170738, "step": 543 }, { "epoch": 0.272, "grad_norm": 1.2705844640731812, "grad_norm_var": 0.10831713729850012, "learning_rate": 2e-05, "loss": 0.5189, "loss/crossentropy": 2.3757272958755493, "loss/hidden": 0.21728515625, "loss/logits": 0.04489796422421932, "loss/reg": 0.025676140561699867, "step": 544 }, { "epoch": 0.2725, "grad_norm": 1.05636727809906, "grad_norm_var": 0.1250863434263256, "learning_rate": 2e-05, "loss": 0.46, "loss/crossentropy": 2.383628726005554, "loss/hidden": 0.17529296875, "loss/logits": 0.02797263953834772, "loss/reg": 0.025674104690551758, "step": 545 }, { "epoch": 0.273, "grad_norm": 1.2423522472381592, "grad_norm_var": 0.13069532228992856, "learning_rate": 2e-05, "loss": 0.4639, "loss/crossentropy": 2.595247983932495, "loss/hidden": 0.1767578125, "loss/logits": 0.030470484867691994, "loss/reg": 0.02567211352288723, "step": 546 }, { "epoch": 0.2735, "grad_norm": 1.1715264320373535, "grad_norm_var": 0.09386338960438909, "learning_rate": 2e-05, "loss": 0.4501, "loss/crossentropy": 2.321129322052002, "loss/hidden": 0.16552734375, "loss/logits": 0.027902510948479176, "loss/reg": 0.025670204311609268, "step": 547 }, { "epoch": 0.274, "grad_norm": 1.5972819328308105, "grad_norm_var": 0.08072905924476359, "learning_rate": 2e-05, "loss": 0.5185, "loss/crossentropy": 2.2606377601623535, "loss/hidden": 0.22021484375, "loss/logits": 0.04160183481872082, "loss/reg": 0.025668160989880562, "step": 548 }, { "epoch": 0.2745, "grad_norm": 1.0867489576339722, "grad_norm_var": 0.067476102626288, "learning_rate": 2e-05, "loss": 0.4364, "loss/crossentropy": 2.463867425918579, "loss/hidden": 0.15625, "loss/logits": 0.02346113882958889, "loss/reg": 0.02566620334982872, "step": 549 }, { "epoch": 0.275, "grad_norm": 2.4649062156677246, "grad_norm_var": 0.13830422072727325, "learning_rate": 2e-05, "loss": 0.5387, "loss/crossentropy": 2.7201980352401733, "loss/hidden": 0.20458984375, "loss/logits": 0.07749359030276537, "loss/reg": 0.02566409669816494, "step": 550 }, { "epoch": 0.2755, "grad_norm": 1.4529809951782227, "grad_norm_var": 0.1295704130285588, "learning_rate": 2e-05, "loss": 0.4755, "loss/crossentropy": 2.4272106885910034, "loss/hidden": 0.1943359375, "loss/logits": 0.024497310630977154, "loss/reg": 0.025662219151854515, "step": 551 }, { "epoch": 0.276, "grad_norm": 1.18130362033844, "grad_norm_var": 0.12141776800456393, "learning_rate": 2e-05, "loss": 0.4905, "loss/crossentropy": 2.2826067209243774, "loss/hidden": 0.19677734375, "loss/logits": 0.03717024438083172, "loss/reg": 0.025659961625933647, "step": 552 }, { "epoch": 0.2765, "grad_norm": 1.4119619131088257, "grad_norm_var": 0.12140800103305664, "learning_rate": 2e-05, "loss": 0.4373, "loss/crossentropy": 2.6987099647521973, "loss/hidden": 0.15673828125, "loss/logits": 0.024006612598896027, "loss/reg": 0.02565770410001278, "step": 553 }, { "epoch": 0.277, "grad_norm": 1.4704711437225342, "grad_norm_var": 0.11845981336057979, "learning_rate": 2e-05, "loss": 0.506, "loss/crossentropy": 2.258659243583679, "loss/hidden": 0.21728515625, "loss/logits": 0.032203953713178635, "loss/reg": 0.025655701756477356, "step": 554 }, { "epoch": 0.2775, "grad_norm": 1.6067429780960083, "grad_norm_var": 0.12098775757429862, "learning_rate": 2e-05, "loss": 0.4385, "loss/crossentropy": 2.5011746883392334, "loss/hidden": 0.1572265625, "loss/logits": 0.024782009422779083, "loss/reg": 0.025653747841715813, "step": 555 }, { "epoch": 0.278, "grad_norm": 1.1298900842666626, "grad_norm_var": 0.1167034622019452, "learning_rate": 2e-05, "loss": 0.4405, "loss/crossentropy": 2.2035679817199707, "loss/hidden": 0.15966796875, "loss/logits": 0.024293298833072186, "loss/reg": 0.02565157227218151, "step": 556 }, { "epoch": 0.2785, "grad_norm": 0.9485200047492981, "grad_norm_var": 0.13035156532443523, "learning_rate": 2e-05, "loss": 0.4479, "loss/crossentropy": 2.5580928325653076, "loss/hidden": 0.16357421875, "loss/logits": 0.027810726314783096, "loss/reg": 0.025649361312389374, "step": 557 }, { "epoch": 0.279, "grad_norm": 1.709061622619629, "grad_norm_var": 0.13362146514691972, "learning_rate": 2e-05, "loss": 0.4703, "loss/crossentropy": 2.4233322143554688, "loss/hidden": 0.1826171875, "loss/logits": 0.031242147088050842, "loss/reg": 0.025647401809692383, "step": 558 }, { "epoch": 0.2795, "grad_norm": 1.1522104740142822, "grad_norm_var": 0.13351084508299657, "learning_rate": 2e-05, "loss": 0.4706, "loss/crossentropy": 2.5604687929153442, "loss/hidden": 0.18798828125, "loss/logits": 0.026114785112440586, "loss/reg": 0.025645434856414795, "step": 559 }, { "epoch": 0.28, "grad_norm": 1.5035618543624878, "grad_norm_var": 0.13375114473651117, "learning_rate": 2e-05, "loss": 0.49, "loss/crossentropy": 2.5127099752426147, "loss/hidden": 0.2001953125, "loss/logits": 0.03332594968378544, "loss/reg": 0.02564323879778385, "step": 560 }, { "epoch": 0.2805, "grad_norm": 1.3412765264511108, "grad_norm_var": 0.12627894398200917, "learning_rate": 2e-05, "loss": 0.4533, "loss/crossentropy": 2.3233593702316284, "loss/hidden": 0.158203125, "loss/logits": 0.038642819970846176, "loss/reg": 0.02564125321805477, "step": 561 }, { "epoch": 0.281, "grad_norm": 1.3613826036453247, "grad_norm_var": 0.124592250727938, "learning_rate": 2e-05, "loss": 0.4388, "loss/crossentropy": 2.6328701972961426, "loss/hidden": 0.158203125, "loss/logits": 0.02416001632809639, "loss/reg": 0.025639118626713753, "step": 562 }, { "epoch": 0.2815, "grad_norm": 1.4453518390655518, "grad_norm_var": 0.12050377751008766, "learning_rate": 2e-05, "loss": 0.467, "loss/crossentropy": 2.379120349884033, "loss/hidden": 0.1748046875, "loss/logits": 0.03583723120391369, "loss/reg": 0.02563699148595333, "step": 563 }, { "epoch": 0.282, "grad_norm": 1.1511297225952148, "grad_norm_var": 0.12293264284763053, "learning_rate": 2e-05, "loss": 0.4638, "loss/crossentropy": 2.211379051208496, "loss/hidden": 0.1806640625, "loss/logits": 0.026775190606713295, "loss/reg": 0.025634942576289177, "step": 564 }, { "epoch": 0.2825, "grad_norm": 1.229429841041565, "grad_norm_var": 0.11822487448682713, "learning_rate": 2e-05, "loss": 0.465, "loss/crossentropy": 2.3449004888534546, "loss/hidden": 0.162109375, "loss/logits": 0.04653145559132099, "loss/reg": 0.02563273347914219, "step": 565 }, { "epoch": 0.283, "grad_norm": 1.7746120691299438, "grad_norm_var": 0.05091479897566722, "learning_rate": 2e-05, "loss": 0.5206, "loss/crossentropy": 2.4769328832626343, "loss/hidden": 0.2275390625, "loss/logits": 0.036771247163414955, "loss/reg": 0.025630656629800797, "step": 566 }, { "epoch": 0.2835, "grad_norm": 1.0254523754119873, "grad_norm_var": 0.0574298221699075, "learning_rate": 2e-05, "loss": 0.4352, "loss/crossentropy": 2.4587230682373047, "loss/hidden": 0.15625, "loss/logits": 0.022636396810412407, "loss/reg": 0.025628428906202316, "step": 567 }, { "epoch": 0.284, "grad_norm": 1.4086933135986328, "grad_norm_var": 0.05584552607974088, "learning_rate": 2e-05, "loss": 0.5428, "loss/crossentropy": 2.3397552967071533, "loss/hidden": 0.25244140625, "loss/logits": 0.03410719987004995, "loss/reg": 0.025626273825764656, "step": 568 }, { "epoch": 0.2845, "grad_norm": 1.1368968486785889, "grad_norm_var": 0.058461728907536765, "learning_rate": 2e-05, "loss": 0.4463, "loss/crossentropy": 2.4085217714309692, "loss/hidden": 0.16259765625, "loss/logits": 0.027470089495182037, "loss/reg": 0.025624196976423264, "step": 569 }, { "epoch": 0.285, "grad_norm": 1.3466085195541382, "grad_norm_var": 0.0572190922863477, "learning_rate": 2e-05, "loss": 0.4488, "loss/crossentropy": 2.454616904258728, "loss/hidden": 0.16259765625, "loss/logits": 0.029985230416059494, "loss/reg": 0.025622138753533363, "step": 570 }, { "epoch": 0.2855, "grad_norm": 1.1087514162063599, "grad_norm_var": 0.05430530108740682, "learning_rate": 2e-05, "loss": 0.4377, "loss/crossentropy": 2.381610155105591, "loss/hidden": 0.1572265625, "loss/logits": 0.024281597696244717, "loss/reg": 0.025620009750127792, "step": 571 }, { "epoch": 0.286, "grad_norm": 2.252387046813965, "grad_norm_var": 0.10784971065445176, "learning_rate": 2e-05, "loss": 0.5819, "loss/crossentropy": 2.284385323524475, "loss/hidden": 0.26611328125, "loss/logits": 0.05961132235825062, "loss/reg": 0.025618063285946846, "step": 572 }, { "epoch": 0.2865, "grad_norm": 1.2841176986694336, "grad_norm_var": 0.09609813291731933, "learning_rate": 2e-05, "loss": 0.4665, "loss/crossentropy": 2.3138378858566284, "loss/hidden": 0.18310546875, "loss/logits": 0.027231371961534023, "loss/reg": 0.025616133585572243, "step": 573 }, { "epoch": 0.287, "grad_norm": 0.9297242164611816, "grad_norm_var": 0.10084539110608777, "learning_rate": 2e-05, "loss": 0.4129, "loss/crossentropy": 2.41566002368927, "loss/hidden": 0.13671875, "loss/logits": 0.020072663202881813, "loss/reg": 0.02561403624713421, "step": 574 }, { "epoch": 0.2875, "grad_norm": 1.3601016998291016, "grad_norm_var": 0.09832118521838087, "learning_rate": 2e-05, "loss": 0.4644, "loss/crossentropy": 2.2336788177490234, "loss/hidden": 0.1787109375, "loss/logits": 0.029539520852267742, "loss/reg": 0.025612102821469307, "step": 575 }, { "epoch": 0.288, "grad_norm": 1.6692289113998413, "grad_norm_var": 0.10334643999860299, "learning_rate": 2e-05, "loss": 0.4483, "loss/crossentropy": 2.3380844593048096, "loss/hidden": 0.16552734375, "loss/logits": 0.02669445425271988, "loss/reg": 0.02561003342270851, "step": 576 }, { "epoch": 0.2885, "grad_norm": 2.4334895610809326, "grad_norm_var": 0.17458492052786573, "learning_rate": 2e-05, "loss": 0.558, "loss/crossentropy": 2.2851526737213135, "loss/hidden": 0.2548828125, "loss/logits": 0.0470340047031641, "loss/reg": 0.02560798078775406, "step": 577 }, { "epoch": 0.289, "grad_norm": 1.154819130897522, "grad_norm_var": 0.17920585225899094, "learning_rate": 2e-05, "loss": 0.4925, "loss/crossentropy": 2.4699753522872925, "loss/hidden": 0.20703125, "loss/logits": 0.02939967904239893, "loss/reg": 0.025605909526348114, "step": 578 }, { "epoch": 0.2895, "grad_norm": 1.2029677629470825, "grad_norm_var": 0.18203981769593008, "learning_rate": 2e-05, "loss": 0.4405, "loss/crossentropy": 2.388526439666748, "loss/hidden": 0.15771484375, "loss/logits": 0.026735836640000343, "loss/reg": 0.025603823363780975, "step": 579 }, { "epoch": 0.29, "grad_norm": 1.2292884588241577, "grad_norm_var": 0.17978354168637148, "learning_rate": 2e-05, "loss": 0.4722, "loss/crossentropy": 2.2643179893493652, "loss/hidden": 0.1845703125, "loss/logits": 0.03162453696131706, "loss/reg": 0.025601672008633614, "step": 580 }, { "epoch": 0.2905, "grad_norm": 1.381611704826355, "grad_norm_var": 0.1775840985068174, "learning_rate": 2e-05, "loss": 0.5298, "loss/crossentropy": 2.317778706550598, "loss/hidden": 0.22802734375, "loss/logits": 0.04581563360989094, "loss/reg": 0.025599613785743713, "step": 581 }, { "epoch": 0.291, "grad_norm": 1.9058457612991333, "grad_norm_var": 0.18488866977521237, "learning_rate": 2e-05, "loss": 0.5568, "loss/crossentropy": 2.329615592956543, "loss/hidden": 0.265625, "loss/logits": 0.03523416444659233, "loss/reg": 0.02559736929833889, "step": 582 }, { "epoch": 0.2915, "grad_norm": 2.325834035873413, "grad_norm_var": 0.22097551825223125, "learning_rate": 2e-05, "loss": 0.5317, "loss/crossentropy": 2.537761688232422, "loss/hidden": 0.2373046875, "loss/logits": 0.038454240188002586, "loss/reg": 0.025595253333449364, "step": 583 }, { "epoch": 0.292, "grad_norm": 1.53029203414917, "grad_norm_var": 0.22028718572734055, "learning_rate": 2e-05, "loss": 0.5085, "loss/crossentropy": 2.4244139194488525, "loss/hidden": 0.208984375, "loss/logits": 0.04360722564160824, "loss/reg": 0.025593377649784088, "step": 584 }, { "epoch": 0.2925, "grad_norm": 1.2639271020889282, "grad_norm_var": 0.21487899090266935, "learning_rate": 2e-05, "loss": 0.4995, "loss/crossentropy": 2.2803802490234375, "loss/hidden": 0.19921875, "loss/logits": 0.04435891658067703, "loss/reg": 0.025591382756829262, "step": 585 }, { "epoch": 0.293, "grad_norm": 1.27842116355896, "grad_norm_var": 0.21677952247985388, "learning_rate": 2e-05, "loss": 0.4548, "loss/crossentropy": 2.3663710355758667, "loss/hidden": 0.16845703125, "loss/logits": 0.030416646972298622, "loss/reg": 0.02558933198451996, "step": 586 }, { "epoch": 0.2935, "grad_norm": 1.7024108171463013, "grad_norm_var": 0.20629975430104253, "learning_rate": 2e-05, "loss": 0.5827, "loss/crossentropy": 2.059163510799408, "loss/hidden": 0.29248046875, "loss/logits": 0.03431258723139763, "loss/reg": 0.02558741346001625, "step": 587 }, { "epoch": 0.294, "grad_norm": 1.4503967761993408, "grad_norm_var": 0.1720895319235313, "learning_rate": 2e-05, "loss": 0.4796, "loss/crossentropy": 2.4850029945373535, "loss/hidden": 0.19873046875, "loss/logits": 0.02503114379942417, "loss/reg": 0.025585299357771873, "step": 588 }, { "epoch": 0.2945, "grad_norm": 1.933565378189087, "grad_norm_var": 0.1792024124736713, "learning_rate": 2e-05, "loss": 0.4693, "loss/crossentropy": 2.4364657402038574, "loss/hidden": 0.18408203125, "loss/logits": 0.029408352449536324, "loss/reg": 0.025583306327462196, "step": 589 }, { "epoch": 0.295, "grad_norm": 1.8611115217208862, "grad_norm_var": 0.15676426573083635, "learning_rate": 2e-05, "loss": 0.4429, "loss/crossentropy": 2.441853404045105, "loss/hidden": 0.16259765625, "loss/logits": 0.02446013130247593, "loss/reg": 0.025581372901797295, "step": 590 }, { "epoch": 0.2955, "grad_norm": 1.1267725229263306, "grad_norm_var": 0.1677922843229851, "learning_rate": 2e-05, "loss": 0.441, "loss/crossentropy": 2.560555934906006, "loss/hidden": 0.1591796875, "loss/logits": 0.026059484109282494, "loss/reg": 0.025579283013939857, "step": 591 }, { "epoch": 0.296, "grad_norm": 9.252140998840332, "grad_norm_var": 3.841050987302196, "learning_rate": 2e-05, "loss": 0.5664, "loss/crossentropy": 2.2949132919311523, "loss/hidden": 0.2783203125, "loss/logits": 0.032294947654008865, "loss/reg": 0.0255771204829216, "step": 592 }, { "epoch": 0.2965, "grad_norm": 1.3007549047470093, "grad_norm_var": 3.8655234521870527, "learning_rate": 2e-05, "loss": 0.441, "loss/crossentropy": 2.31974720954895, "loss/hidden": 0.1611328125, "loss/logits": 0.02411063387989998, "loss/reg": 0.025574954226613045, "step": 593 }, { "epoch": 0.297, "grad_norm": 1.7131131887435913, "grad_norm_var": 3.822554124166938, "learning_rate": 2e-05, "loss": 0.4643, "loss/crossentropy": 2.39312207698822, "loss/hidden": 0.18212890625, "loss/logits": 0.026430404745042324, "loss/reg": 0.0255727581679821, "step": 594 }, { "epoch": 0.2975, "grad_norm": 1.6008955240249634, "grad_norm_var": 3.7886423499076054, "learning_rate": 2e-05, "loss": 0.512, "loss/crossentropy": 2.3966288566589355, "loss/hidden": 0.2109375, "loss/logits": 0.045352160930633545, "loss/reg": 0.025570496916770935, "step": 595 }, { "epoch": 0.298, "grad_norm": 1.7445118427276611, "grad_norm_var": 3.748611248289865, "learning_rate": 2e-05, "loss": 0.538, "loss/crossentropy": 1.9948007464408875, "loss/hidden": 0.23388671875, "loss/logits": 0.0484439916908741, "loss/reg": 0.025568410754203796, "step": 596 }, { "epoch": 0.2985, "grad_norm": 1.8626893758773804, "grad_norm_var": 3.7179115354234606, "learning_rate": 2e-05, "loss": 0.5166, "loss/crossentropy": 2.4233922958374023, "loss/hidden": 0.22119140625, "loss/logits": 0.03974040970206261, "loss/reg": 0.025566227734088898, "step": 597 }, { "epoch": 0.299, "grad_norm": 1.2042471170425415, "grad_norm_var": 3.7683163733932923, "learning_rate": 2e-05, "loss": 0.4547, "loss/crossentropy": 2.1347005367279053, "loss/hidden": 0.17041015625, "loss/logits": 0.028620691038668156, "loss/reg": 0.025564009323716164, "step": 598 }, { "epoch": 0.2995, "grad_norm": 1.5764883756637573, "grad_norm_var": 3.778044329930853, "learning_rate": 2e-05, "loss": 0.4981, "loss/crossentropy": 2.4556859731674194, "loss/hidden": 0.2021484375, "loss/logits": 0.0403362512588501, "loss/reg": 0.025561654940247536, "step": 599 }, { "epoch": 0.3, "grad_norm": 1.096451997756958, "grad_norm_var": 3.8184307388690244, "learning_rate": 2e-05, "loss": 0.5136, "loss/crossentropy": 2.321175456047058, "loss/hidden": 0.21875, "loss/logits": 0.03927676286548376, "loss/reg": 0.02555953338742256, "step": 600 }, { "epoch": 0.3005, "grad_norm": 1.2970784902572632, "grad_norm_var": 3.8152547172108693, "learning_rate": 2e-05, "loss": 0.4625, "loss/crossentropy": 2.479397773742676, "loss/hidden": 0.166015625, "loss/logits": 0.040870534256100655, "loss/reg": 0.02555713802576065, "step": 601 }, { "epoch": 0.301, "grad_norm": 1.2297303676605225, "grad_norm_var": 3.8200878842337733, "learning_rate": 2e-05, "loss": 0.4696, "loss/crossentropy": 2.384745955467224, "loss/hidden": 0.1826171875, "loss/logits": 0.031467003747820854, "loss/reg": 0.02555503323674202, "step": 602 }, { "epoch": 0.3015, "grad_norm": 0.9617077112197876, "grad_norm_var": 3.883473919292651, "learning_rate": 2e-05, "loss": 0.4283, "loss/crossentropy": 2.3142151832580566, "loss/hidden": 0.14990234375, "loss/logits": 0.022820310667157173, "loss/reg": 0.02555287443101406, "step": 603 }, { "epoch": 0.302, "grad_norm": 1.0868256092071533, "grad_norm_var": 3.9159895776620384, "learning_rate": 2e-05, "loss": 0.445, "loss/crossentropy": 2.4487764835357666, "loss/hidden": 0.16064453125, "loss/logits": 0.02884063497185707, "loss/reg": 0.02555077336728573, "step": 604 }, { "epoch": 0.3025, "grad_norm": 1.2197123765945435, "grad_norm_var": 3.94730949969078, "learning_rate": 2e-05, "loss": 0.4717, "loss/crossentropy": 2.4810107946395874, "loss/hidden": 0.1865234375, "loss/logits": 0.02972548082470894, "loss/reg": 0.02554868534207344, "step": 605 }, { "epoch": 0.303, "grad_norm": 1.2248950004577637, "grad_norm_var": 3.974497531375912, "learning_rate": 2e-05, "loss": 0.4324, "loss/crossentropy": 2.4361231327056885, "loss/hidden": 0.15380859375, "loss/logits": 0.023081100545823574, "loss/reg": 0.025546491146087646, "step": 606 }, { "epoch": 0.3035, "grad_norm": 1.5527586936950684, "grad_norm_var": 3.945123091404479, "learning_rate": 2e-05, "loss": 0.4697, "loss/crossentropy": 2.5113409757614136, "loss/hidden": 0.18896484375, "loss/logits": 0.02533858921378851, "loss/reg": 0.02554413489997387, "step": 607 }, { "epoch": 0.304, "grad_norm": 2.6096105575561523, "grad_norm_var": 0.16489908848412535, "learning_rate": 2e-05, "loss": 0.442, "loss/crossentropy": 2.345840811729431, "loss/hidden": 0.15771484375, "loss/logits": 0.028883887454867363, "loss/reg": 0.02554202266037464, "step": 608 }, { "epoch": 0.3045, "grad_norm": 1.2581703662872314, "grad_norm_var": 0.1658887448879168, "learning_rate": 2e-05, "loss": 0.4535, "loss/crossentropy": 2.4540570974349976, "loss/hidden": 0.16845703125, "loss/logits": 0.029606305062770844, "loss/reg": 0.025539804250001907, "step": 609 }, { "epoch": 0.305, "grad_norm": 1.0741705894470215, "grad_norm_var": 0.16919604526549956, "learning_rate": 2e-05, "loss": 0.4519, "loss/crossentropy": 2.4377275705337524, "loss/hidden": 0.1650390625, "loss/logits": 0.031461406499147415, "loss/reg": 0.02553771249949932, "step": 610 }, { "epoch": 0.3055, "grad_norm": 1.2582398653030396, "grad_norm_var": 0.1679268859736533, "learning_rate": 2e-05, "loss": 0.4509, "loss/crossentropy": 2.414643406867981, "loss/hidden": 0.166015625, "loss/logits": 0.029505026526749134, "loss/reg": 0.025535589084029198, "step": 611 }, { "epoch": 0.306, "grad_norm": 1.128620982170105, "grad_norm_var": 0.16261113353333864, "learning_rate": 2e-05, "loss": 0.4456, "loss/crossentropy": 2.4645986557006836, "loss/hidden": 0.16357421875, "loss/logits": 0.02667510323226452, "loss/reg": 0.025533363223075867, "step": 612 }, { "epoch": 0.3065, "grad_norm": 1.2573778629302979, "grad_norm_var": 0.14434184243498857, "learning_rate": 2e-05, "loss": 0.4809, "loss/crossentropy": 2.4240217208862305, "loss/hidden": 0.1953125, "loss/logits": 0.030296322889626026, "loss/reg": 0.025531131774187088, "step": 613 }, { "epoch": 0.307, "grad_norm": 7.996622562408447, "grad_norm_var": 2.9277827960594167, "learning_rate": 2e-05, "loss": 0.9676, "loss/crossentropy": 2.0657594203948975, "loss/hidden": 0.5224609375, "loss/logits": 0.1897994950413704, "loss/reg": 0.02552902325987816, "step": 614 }, { "epoch": 0.3075, "grad_norm": 0.9756619930267334, "grad_norm_var": 2.963385991390479, "learning_rate": 2e-05, "loss": 0.4192, "loss/crossentropy": 2.511542320251465, "loss/hidden": 0.142578125, "loss/logits": 0.02132318541407585, "loss/reg": 0.025526810437440872, "step": 615 }, { "epoch": 0.308, "grad_norm": 1.9326781034469604, "grad_norm_var": 2.939604367144011, "learning_rate": 2e-05, "loss": 0.4602, "loss/crossentropy": 2.2573784589767456, "loss/hidden": 0.18212890625, "loss/logits": 0.02284115180373192, "loss/reg": 0.025524748489260674, "step": 616 }, { "epoch": 0.3085, "grad_norm": 1.3322287797927856, "grad_norm_var": 2.9375401728012682, "learning_rate": 2e-05, "loss": 0.4746, "loss/crossentropy": 2.328918933868408, "loss/hidden": 0.1884765625, "loss/logits": 0.03090812638401985, "loss/reg": 0.02552272193133831, "step": 617 }, { "epoch": 0.309, "grad_norm": 1.368570327758789, "grad_norm_var": 2.92899917136145, "learning_rate": 2e-05, "loss": 0.4906, "loss/crossentropy": 2.449865460395813, "loss/hidden": 0.189453125, "loss/logits": 0.04598201438784599, "loss/reg": 0.025520512834191322, "step": 618 }, { "epoch": 0.3095, "grad_norm": 1.2598553895950317, "grad_norm_var": 2.902626964663748, "learning_rate": 2e-05, "loss": 0.4644, "loss/crossentropy": 2.4811675548553467, "loss/hidden": 0.1748046875, "loss/logits": 0.03438819758594036, "loss/reg": 0.025518309324979782, "step": 619 }, { "epoch": 0.31, "grad_norm": 1.5200271606445312, "grad_norm_var": 2.8741158851437234, "learning_rate": 2e-05, "loss": 0.4648, "loss/crossentropy": 2.443149447441101, "loss/hidden": 0.1796875, "loss/logits": 0.02990366704761982, "loss/reg": 0.025516200810670853, "step": 620 }, { "epoch": 0.3105, "grad_norm": 1.1189664602279663, "grad_norm_var": 2.882687177244717, "learning_rate": 2e-05, "loss": 0.4401, "loss/crossentropy": 2.3359590768814087, "loss/hidden": 0.1591796875, "loss/logits": 0.025752616114914417, "loss/reg": 0.02551414631307125, "step": 621 }, { "epoch": 0.311, "grad_norm": 1.1328538656234741, "grad_norm_var": 2.8903269313735427, "learning_rate": 2e-05, "loss": 0.4446, "loss/crossentropy": 2.3448485136032104, "loss/hidden": 0.16357421875, "loss/logits": 0.025948218069970608, "loss/reg": 0.025512101128697395, "step": 622 }, { "epoch": 0.3115, "grad_norm": 1.3280351161956787, "grad_norm_var": 2.9008471808041234, "learning_rate": 2e-05, "loss": 0.502, "loss/crossentropy": 2.3840510845184326, "loss/hidden": 0.203125, "loss/logits": 0.04378024488687515, "loss/reg": 0.025509938597679138, "step": 623 }, { "epoch": 0.312, "grad_norm": 1.2119669914245605, "grad_norm_var": 2.8691701461931562, "learning_rate": 2e-05, "loss": 0.4384, "loss/crossentropy": 2.5495107173919678, "loss/hidden": 0.158203125, "loss/logits": 0.025127064436674118, "loss/reg": 0.025507742539048195, "step": 624 }, { "epoch": 0.3125, "grad_norm": 1.3906071186065674, "grad_norm_var": 2.862515149821022, "learning_rate": 2e-05, "loss": 0.4877, "loss/crossentropy": 2.418786406517029, "loss/hidden": 0.1953125, "loss/logits": 0.037290943786501884, "loss/reg": 0.02550552599132061, "step": 625 }, { "epoch": 0.313, "grad_norm": 1.1530770063400269, "grad_norm_var": 2.8562631605775035, "learning_rate": 2e-05, "loss": 0.4523, "loss/crossentropy": 2.5153443813323975, "loss/hidden": 0.16796875, "loss/logits": 0.029278968460857868, "loss/reg": 0.02550341933965683, "step": 626 }, { "epoch": 0.3135, "grad_norm": 1.9171541929244995, "grad_norm_var": 2.843679575594864, "learning_rate": 2e-05, "loss": 0.4798, "loss/crossentropy": 2.592397689819336, "loss/hidden": 0.19482421875, "loss/logits": 0.029941866174340248, "loss/reg": 0.025501396507024765, "step": 627 }, { "epoch": 0.314, "grad_norm": 1.4067423343658447, "grad_norm_var": 2.8254152118389118, "learning_rate": 2e-05, "loss": 0.4421, "loss/crossentropy": 2.360334277153015, "loss/hidden": 0.1650390625, "loss/logits": 0.02205614186823368, "loss/reg": 0.02549940161406994, "step": 628 }, { "epoch": 0.3145, "grad_norm": 1.271565318107605, "grad_norm_var": 2.8244601627756833, "learning_rate": 2e-05, "loss": 0.492, "loss/crossentropy": 2.323120355606079, "loss/hidden": 0.20556640625, "loss/logits": 0.031410202383995056, "loss/reg": 0.025497442111372948, "step": 629 }, { "epoch": 0.315, "grad_norm": 1.192052960395813, "grad_norm_var": 0.06888867322267149, "learning_rate": 2e-05, "loss": 0.5017, "loss/crossentropy": 2.176342010498047, "loss/hidden": 0.2138671875, "loss/logits": 0.032856905832886696, "loss/reg": 0.025495316833257675, "step": 630 }, { "epoch": 0.3155, "grad_norm": 1.0660690069198608, "grad_norm_var": 0.06495340762153295, "learning_rate": 2e-05, "loss": 0.4197, "loss/crossentropy": 2.4988861083984375, "loss/hidden": 0.1416015625, "loss/logits": 0.023143235594034195, "loss/reg": 0.025493212044239044, "step": 631 }, { "epoch": 0.316, "grad_norm": 1.3349320888519287, "grad_norm_var": 0.04085774566783152, "learning_rate": 2e-05, "loss": 0.4506, "loss/crossentropy": 2.455591082572937, "loss/hidden": 0.169921875, "loss/logits": 0.0257937153801322, "loss/reg": 0.025491099804639816, "step": 632 }, { "epoch": 0.3165, "grad_norm": 3.0417232513427734, "grad_norm_var": 0.22793577307115717, "learning_rate": 2e-05, "loss": 0.5239, "loss/crossentropy": 2.37486732006073, "loss/hidden": 0.2314453125, "loss/logits": 0.03756898641586304, "loss/reg": 0.025489188730716705, "step": 633 }, { "epoch": 0.317, "grad_norm": 1.2681235074996948, "grad_norm_var": 0.22925030763110257, "learning_rate": 2e-05, "loss": 0.5084, "loss/crossentropy": 2.259597897529602, "loss/hidden": 0.21337890625, "loss/logits": 0.040193804539740086, "loss/reg": 0.025486983358860016, "step": 634 }, { "epoch": 0.3175, "grad_norm": 1.7327830791473389, "grad_norm_var": 0.2335495834433952, "learning_rate": 2e-05, "loss": 0.464, "loss/crossentropy": 2.754118800163269, "loss/hidden": 0.18115234375, "loss/logits": 0.027960547246038914, "loss/reg": 0.025484783574938774, "step": 635 }, { "epoch": 0.318, "grad_norm": 1.1379330158233643, "grad_norm_var": 0.23874590770987894, "learning_rate": 2e-05, "loss": 0.4394, "loss/crossentropy": 2.275562047958374, "loss/hidden": 0.1630859375, "loss/logits": 0.021530453115701675, "loss/reg": 0.02548276260495186, "step": 636 }, { "epoch": 0.3185, "grad_norm": 2.230278253555298, "grad_norm_var": 0.2714714145474554, "learning_rate": 2e-05, "loss": 0.5173, "loss/crossentropy": 2.338230013847351, "loss/hidden": 0.23681640625, "loss/logits": 0.025674378499388695, "loss/reg": 0.025480857118964195, "step": 637 }, { "epoch": 0.319, "grad_norm": 1.439009428024292, "grad_norm_var": 0.26281213986055435, "learning_rate": 2e-05, "loss": 0.4592, "loss/crossentropy": 2.2542585134506226, "loss/hidden": 0.177734375, "loss/logits": 0.026685651391744614, "loss/reg": 0.025478988885879517, "step": 638 }, { "epoch": 0.3195, "grad_norm": 1.2269906997680664, "grad_norm_var": 0.26586984825830745, "learning_rate": 2e-05, "loss": 0.484, "loss/crossentropy": 2.4006763696670532, "loss/hidden": 0.19189453125, "loss/logits": 0.03735353797674179, "loss/reg": 0.02547682449221611, "step": 639 }, { "epoch": 0.32, "grad_norm": 1.4380499124526978, "grad_norm_var": 0.2603422819560449, "learning_rate": 2e-05, "loss": 0.4904, "loss/crossentropy": 2.2535301446914673, "loss/hidden": 0.2001953125, "loss/logits": 0.035440364852547646, "loss/reg": 0.02547490783035755, "step": 640 }, { "epoch": 0.3205, "grad_norm": 1.1311625242233276, "grad_norm_var": 0.268867656697473, "learning_rate": 2e-05, "loss": 0.4509, "loss/crossentropy": 2.3959745168685913, "loss/hidden": 0.1689453125, "loss/logits": 0.027200866490602493, "loss/reg": 0.025472737848758698, "step": 641 }, { "epoch": 0.321, "grad_norm": 1.5106732845306396, "grad_norm_var": 0.2603555469624775, "learning_rate": 2e-05, "loss": 0.4666, "loss/crossentropy": 2.3994356393814087, "loss/hidden": 0.17919921875, "loss/logits": 0.03274068981409073, "loss/reg": 0.025470787659287453, "step": 642 }, { "epoch": 0.3215, "grad_norm": 1.823555588722229, "grad_norm_var": 0.25596636935256256, "learning_rate": 2e-05, "loss": 0.5589, "loss/crossentropy": 2.1762577295303345, "loss/hidden": 0.255859375, "loss/logits": 0.04833154007792473, "loss/reg": 0.025468602776527405, "step": 643 }, { "epoch": 0.322, "grad_norm": 1.4282046556472778, "grad_norm_var": 0.2556832814253122, "learning_rate": 2e-05, "loss": 0.4487, "loss/crossentropy": 2.3608391284942627, "loss/hidden": 0.1650390625, "loss/logits": 0.02895598392933607, "loss/reg": 0.0254666730761528, "step": 644 }, { "epoch": 0.3225, "grad_norm": 1.1915183067321777, "grad_norm_var": 0.2587039981971661, "learning_rate": 2e-05, "loss": 0.4567, "loss/crossentropy": 2.160835921764374, "loss/hidden": 0.17529296875, "loss/logits": 0.026723448187112808, "loss/reg": 0.02546459622681141, "step": 645 }, { "epoch": 0.323, "grad_norm": 1.0989381074905396, "grad_norm_var": 0.2632189617332703, "learning_rate": 2e-05, "loss": 0.4694, "loss/crossentropy": 2.429106831550598, "loss/hidden": 0.181640625, "loss/logits": 0.03316484112292528, "loss/reg": 0.025462418794631958, "step": 646 }, { "epoch": 0.3235, "grad_norm": 2.257662296295166, "grad_norm_var": 0.28202735887923397, "learning_rate": 2e-05, "loss": 0.5176, "loss/crossentropy": 2.431147336959839, "loss/hidden": 0.22216796875, "loss/logits": 0.04080248158425093, "loss/reg": 0.02546020597219467, "step": 647 }, { "epoch": 0.324, "grad_norm": 1.9271612167358398, "grad_norm_var": 0.28453986075382054, "learning_rate": 2e-05, "loss": 0.4812, "loss/crossentropy": 2.2328370809555054, "loss/hidden": 0.19580078125, "loss/logits": 0.03081681113690138, "loss/reg": 0.02545810490846634, "step": 648 }, { "epoch": 0.3245, "grad_norm": 1.57036554813385, "grad_norm_var": 0.14048631360791, "learning_rate": 2e-05, "loss": 0.4572, "loss/crossentropy": 2.3384969234466553, "loss/hidden": 0.1748046875, "loss/logits": 0.027865654788911343, "loss/reg": 0.02545584924519062, "step": 649 }, { "epoch": 0.325, "grad_norm": 3.972613573074341, "grad_norm_var": 0.5047189714591601, "learning_rate": 2e-05, "loss": 0.8007, "loss/crossentropy": 2.171482264995575, "loss/hidden": 0.39306640625, "loss/logits": 0.1530690910294652, "loss/reg": 0.0254536010324955, "step": 650 }, { "epoch": 0.3255, "grad_norm": 1.2306694984436035, "grad_norm_var": 0.5179338564331883, "learning_rate": 2e-05, "loss": 0.4915, "loss/crossentropy": 2.2794147729873657, "loss/hidden": 0.203125, "loss/logits": 0.033877959474921227, "loss/reg": 0.025451431050896645, "step": 651 }, { "epoch": 0.326, "grad_norm": 6.3861775398254395, "grad_norm_var": 1.8717174937138472, "learning_rate": 2e-05, "loss": 0.6826, "loss/crossentropy": 2.2695876359939575, "loss/hidden": 0.35986328125, "loss/logits": 0.06826404109597206, "loss/reg": 0.02544919028878212, "step": 652 }, { "epoch": 0.3265, "grad_norm": 0.9534096717834473, "grad_norm_var": 1.932954969588551, "learning_rate": 2e-05, "loss": 0.4184, "loss/crossentropy": 2.4821490049362183, "loss/hidden": 0.14208984375, "loss/logits": 0.02188246138393879, "loss/reg": 0.02544700726866722, "step": 653 }, { "epoch": 0.327, "grad_norm": 1.8388314247131348, "grad_norm_var": 1.917750585249983, "learning_rate": 2e-05, "loss": 0.5184, "loss/crossentropy": 2.622692823410034, "loss/hidden": 0.21337890625, "loss/logits": 0.05053009279072285, "loss/reg": 0.025444859638810158, "step": 654 }, { "epoch": 0.3275, "grad_norm": 1.0860503911972046, "grad_norm_var": 1.9323275539076297, "learning_rate": 2e-05, "loss": 0.4528, "loss/crossentropy": 2.4405031204223633, "loss/hidden": 0.17041015625, "loss/logits": 0.027923785150051117, "loss/reg": 0.02544267661869526, "step": 655 }, { "epoch": 0.328, "grad_norm": 1.8346869945526123, "grad_norm_var": 1.9162589037726556, "learning_rate": 2e-05, "loss": 0.4467, "loss/crossentropy": 2.561371684074402, "loss/hidden": 0.16552734375, "loss/logits": 0.02681200671941042, "loss/reg": 0.025440504774451256, "step": 656 }, { "epoch": 0.3285, "grad_norm": 1.7516165971755981, "grad_norm_var": 1.8723634601240877, "learning_rate": 2e-05, "loss": 0.5473, "loss/crossentropy": 2.1868897676467896, "loss/hidden": 0.24951171875, "loss/logits": 0.04336274042725563, "loss/reg": 0.025438381358981133, "step": 657 }, { "epoch": 0.329, "grad_norm": 1.1200268268585205, "grad_norm_var": 1.906939612518704, "learning_rate": 2e-05, "loss": 0.4269, "loss/crossentropy": 2.5193029642105103, "loss/hidden": 0.14892578125, "loss/logits": 0.023628379218280315, "loss/reg": 0.025436177849769592, "step": 658 }, { "epoch": 0.3295, "grad_norm": 1.2578015327453613, "grad_norm_var": 1.9377626206597995, "learning_rate": 2e-05, "loss": 0.4864, "loss/crossentropy": 2.4258992671966553, "loss/hidden": 0.19140625, "loss/logits": 0.040623242035508156, "loss/reg": 0.02543400041759014, "step": 659 }, { "epoch": 0.33, "grad_norm": 1.0507436990737915, "grad_norm_var": 1.9720027861822638, "learning_rate": 2e-05, "loss": 0.4556, "loss/crossentropy": 2.3461010456085205, "loss/hidden": 0.16796875, "loss/logits": 0.033318827860057354, "loss/reg": 0.02543184906244278, "step": 660 }, { "epoch": 0.3305, "grad_norm": 1.2176129817962646, "grad_norm_var": 1.969552437425464, "learning_rate": 2e-05, "loss": 0.469, "loss/crossentropy": 2.3208965063095093, "loss/hidden": 0.1787109375, "loss/logits": 0.03600373677909374, "loss/reg": 0.025429651141166687, "step": 661 }, { "epoch": 0.331, "grad_norm": 1.1113396883010864, "grad_norm_var": 1.9682215053201066, "learning_rate": 2e-05, "loss": 0.4615, "loss/crossentropy": 1.9688389897346497, "loss/hidden": 0.1826171875, "loss/logits": 0.024597243405878544, "loss/reg": 0.025427548214793205, "step": 662 }, { "epoch": 0.3315, "grad_norm": 2.15281343460083, "grad_norm_var": 1.9640542341909926, "learning_rate": 2e-05, "loss": 0.5739, "loss/crossentropy": 2.476504325866699, "loss/hidden": 0.2392578125, "loss/logits": 0.08039886690676212, "loss/reg": 0.025425344705581665, "step": 663 }, { "epoch": 0.332, "grad_norm": 1.142524003982544, "grad_norm_var": 2.0000960230816474, "learning_rate": 2e-05, "loss": 0.4834, "loss/crossentropy": 2.45276939868927, "loss/hidden": 0.1865234375, "loss/logits": 0.042642902582883835, "loss/reg": 0.025423116981983185, "step": 664 }, { "epoch": 0.3325, "grad_norm": 1.3945834636688232, "grad_norm_var": 2.0086944041381836, "learning_rate": 2e-05, "loss": 0.5291, "loss/crossentropy": 2.2395424842834473, "loss/hidden": 0.22265625, "loss/logits": 0.05220697447657585, "loss/reg": 0.025420982390642166, "step": 665 }, { "epoch": 0.333, "grad_norm": 1.2921602725982666, "grad_norm_var": 1.6969372224034807, "learning_rate": 2e-05, "loss": 0.5112, "loss/crossentropy": 2.4718152284622192, "loss/hidden": 0.2158203125, "loss/logits": 0.041200825944542885, "loss/reg": 0.025418834760785103, "step": 666 }, { "epoch": 0.3335, "grad_norm": 1.8263285160064697, "grad_norm_var": 1.6837190851105295, "learning_rate": 2e-05, "loss": 0.6125, "loss/crossentropy": 2.3179105520248413, "loss/hidden": 0.30029296875, "loss/logits": 0.058010220527648926, "loss/reg": 0.02541666105389595, "step": 667 }, { "epoch": 0.334, "grad_norm": 0.9793453216552734, "grad_norm_var": 0.14228114450421098, "learning_rate": 2e-05, "loss": 0.421, "loss/crossentropy": 2.375541925430298, "loss/hidden": 0.1435546875, "loss/logits": 0.023268045857548714, "loss/reg": 0.025414319708943367, "step": 668 }, { "epoch": 0.3345, "grad_norm": 1.0925960540771484, "grad_norm_var": 0.13565654288369539, "learning_rate": 2e-05, "loss": 0.4561, "loss/crossentropy": 2.314823031425476, "loss/hidden": 0.173828125, "loss/logits": 0.028134356252849102, "loss/reg": 0.025411993265151978, "step": 669 }, { "epoch": 0.335, "grad_norm": 1.2090929746627808, "grad_norm_var": 0.12227878219406557, "learning_rate": 2e-05, "loss": 0.4747, "loss/crossentropy": 2.471633553504944, "loss/hidden": 0.19189453125, "loss/logits": 0.02871632482856512, "loss/reg": 0.025409623980522156, "step": 670 }, { "epoch": 0.3355, "grad_norm": 1.2346880435943604, "grad_norm_var": 0.11852848812049478, "learning_rate": 2e-05, "loss": 0.4636, "loss/crossentropy": 2.414598226547241, "loss/hidden": 0.1748046875, "loss/logits": 0.034768104553222656, "loss/reg": 0.02540736459195614, "step": 671 }, { "epoch": 0.336, "grad_norm": 1.069199800491333, "grad_norm_var": 0.10611561855972672, "learning_rate": 2e-05, "loss": 0.4507, "loss/crossentropy": 2.2887717485427856, "loss/hidden": 0.1689453125, "loss/logits": 0.027695579454302788, "loss/reg": 0.025404995307326317, "step": 672 }, { "epoch": 0.3365, "grad_norm": 1.3578476905822754, "grad_norm_var": 0.09243173709434505, "learning_rate": 2e-05, "loss": 0.4806, "loss/crossentropy": 2.090680956840515, "loss/hidden": 0.2001953125, "loss/logits": 0.026400449685752392, "loss/reg": 0.025402558967471123, "step": 673 }, { "epoch": 0.337, "grad_norm": 1.222834587097168, "grad_norm_var": 0.09087487045602523, "learning_rate": 2e-05, "loss": 0.4371, "loss/crossentropy": 2.3926587104797363, "loss/hidden": 0.15234375, "loss/logits": 0.03075406327843666, "loss/reg": 0.025400325655937195, "step": 674 }, { "epoch": 0.3375, "grad_norm": 1.2310668230056763, "grad_norm_var": 0.09102797075126906, "learning_rate": 2e-05, "loss": 0.487, "loss/crossentropy": 2.3810113668441772, "loss/hidden": 0.19970703125, "loss/logits": 0.03329848870635033, "loss/reg": 0.0253978930413723, "step": 675 }, { "epoch": 0.338, "grad_norm": 1.1485071182250977, "grad_norm_var": 0.08855158055118005, "learning_rate": 2e-05, "loss": 0.4242, "loss/crossentropy": 2.513722538948059, "loss/hidden": 0.1455078125, "loss/logits": 0.02471320889890194, "loss/reg": 0.025395726785063744, "step": 676 }, { "epoch": 0.3385, "grad_norm": 1.302976369857788, "grad_norm_var": 0.08815285694720097, "learning_rate": 2e-05, "loss": 0.408, "loss/crossentropy": 2.3934881687164307, "loss/hidden": 0.132568359375, "loss/logits": 0.0214870385825634, "loss/reg": 0.025393173098564148, "step": 677 }, { "epoch": 0.339, "grad_norm": 1.2492976188659668, "grad_norm_var": 0.08590898883028307, "learning_rate": 2e-05, "loss": 0.4409, "loss/crossentropy": 2.4734569787979126, "loss/hidden": 0.16064453125, "loss/logits": 0.02638374548405409, "loss/reg": 0.025390924885869026, "step": 678 }, { "epoch": 0.3395, "grad_norm": 1.4173915386199951, "grad_norm_var": 0.03673691192539176, "learning_rate": 2e-05, "loss": 0.423, "loss/crossentropy": 2.4406436681747437, "loss/hidden": 0.14453125, "loss/logits": 0.024627392180263996, "loss/reg": 0.025388652458786964, "step": 679 }, { "epoch": 0.34, "grad_norm": 0.9957833290100098, "grad_norm_var": 0.04039394780387108, "learning_rate": 2e-05, "loss": 0.4216, "loss/crossentropy": 2.472551703453064, "loss/hidden": 0.14453125, "loss/logits": 0.023214499466121197, "loss/reg": 0.025386210530996323, "step": 680 }, { "epoch": 0.3405, "grad_norm": 1.3958747386932373, "grad_norm_var": 0.040418689929556946, "learning_rate": 2e-05, "loss": 0.4807, "loss/crossentropy": 2.636582612991333, "loss/hidden": 0.18798828125, "loss/logits": 0.038843123242259026, "loss/reg": 0.02538374997675419, "step": 681 }, { "epoch": 0.341, "grad_norm": 1.2559229135513306, "grad_norm_var": 0.04030460464576505, "learning_rate": 2e-05, "loss": 0.4735, "loss/crossentropy": 2.2572195529937744, "loss/hidden": 0.1865234375, "loss/logits": 0.03318110667169094, "loss/reg": 0.02538151666522026, "step": 682 }, { "epoch": 0.3415, "grad_norm": 1.437334418296814, "grad_norm_var": 0.019833637621310865, "learning_rate": 2e-05, "loss": 0.4508, "loss/crossentropy": 2.4471691846847534, "loss/hidden": 0.17041015625, "loss/logits": 0.026614676229655743, "loss/reg": 0.0253791194409132, "step": 683 }, { "epoch": 0.342, "grad_norm": 1.7899738550186157, "grad_norm_var": 0.03435388481276878, "learning_rate": 2e-05, "loss": 0.4697, "loss/crossentropy": 2.3129884004592896, "loss/hidden": 0.1865234375, "loss/logits": 0.029374102130532265, "loss/reg": 0.025376921519637108, "step": 684 }, { "epoch": 0.3425, "grad_norm": 1.0163391828536987, "grad_norm_var": 0.03657853766482973, "learning_rate": 2e-05, "loss": 0.4382, "loss/crossentropy": 2.441192150115967, "loss/hidden": 0.16015625, "loss/logits": 0.024306317791342735, "loss/reg": 0.025374585762619972, "step": 685 }, { "epoch": 0.343, "grad_norm": 1.1746463775634766, "grad_norm_var": 0.03693649317759663, "learning_rate": 2e-05, "loss": 0.4359, "loss/crossentropy": 2.3794326782226562, "loss/hidden": 0.15576171875, "loss/logits": 0.026386510580778122, "loss/reg": 0.025372277945280075, "step": 686 }, { "epoch": 0.3435, "grad_norm": 1.0302844047546387, "grad_norm_var": 0.04047557695632419, "learning_rate": 2e-05, "loss": 0.4222, "loss/crossentropy": 2.421720266342163, "loss/hidden": 0.146484375, "loss/logits": 0.022049223072826862, "loss/reg": 0.025370018556714058, "step": 687 }, { "epoch": 0.344, "grad_norm": 1.4138187170028687, "grad_norm_var": 0.039316962171863895, "learning_rate": 2e-05, "loss": 0.4613, "loss/crossentropy": 2.4710036516189575, "loss/hidden": 0.17626953125, "loss/logits": 0.03138366714119911, "loss/reg": 0.02536788582801819, "step": 688 }, { "epoch": 0.3445, "grad_norm": 1.39634108543396, "grad_norm_var": 0.03982198390903117, "learning_rate": 2e-05, "loss": 0.4409, "loss/crossentropy": 2.50797963142395, "loss/hidden": 0.158203125, "loss/logits": 0.029052263125777245, "loss/reg": 0.02536572329699993, "step": 689 }, { "epoch": 0.345, "grad_norm": 1.4006764888763428, "grad_norm_var": 0.040445578503683306, "learning_rate": 2e-05, "loss": 0.4874, "loss/crossentropy": 2.327502489089966, "loss/hidden": 0.20947265625, "loss/logits": 0.0243146987631917, "loss/reg": 0.025363536551594734, "step": 690 }, { "epoch": 0.3455, "grad_norm": 1.3401939868927002, "grad_norm_var": 0.04031761591638811, "learning_rate": 2e-05, "loss": 0.4645, "loss/crossentropy": 2.4942984580993652, "loss/hidden": 0.18017578125, "loss/logits": 0.03075546585023403, "loss/reg": 0.02536129206418991, "step": 691 }, { "epoch": 0.346, "grad_norm": 1.1368129253387451, "grad_norm_var": 0.040558999133186016, "learning_rate": 2e-05, "loss": 0.4513, "loss/crossentropy": 2.5324333906173706, "loss/hidden": 0.1708984375, "loss/logits": 0.02676941640675068, "loss/reg": 0.025359032675623894, "step": 692 }, { "epoch": 0.3465, "grad_norm": 10.904756546020508, "grad_norm_var": 5.81021311974864, "learning_rate": 2e-05, "loss": 0.6749, "loss/crossentropy": 2.5305880308151245, "loss/hidden": 0.39306640625, "loss/logits": 0.028291589580476284, "loss/reg": 0.02535676583647728, "step": 693 }, { "epoch": 0.347, "grad_norm": 1.0383360385894775, "grad_norm_var": 5.831219439922715, "learning_rate": 2e-05, "loss": 0.4407, "loss/crossentropy": 2.347463846206665, "loss/hidden": 0.1640625, "loss/logits": 0.02308377344161272, "loss/reg": 0.025354566052556038, "step": 694 }, { "epoch": 0.3475, "grad_norm": 1.1074702739715576, "grad_norm_var": 5.856505480500767, "learning_rate": 2e-05, "loss": 0.4515, "loss/crossentropy": 2.6167062520980835, "loss/hidden": 0.1689453125, "loss/logits": 0.029017897322773933, "loss/reg": 0.025352245196700096, "step": 695 }, { "epoch": 0.348, "grad_norm": 1.6335835456848145, "grad_norm_var": 5.808040474999217, "learning_rate": 2e-05, "loss": 0.4665, "loss/crossentropy": 2.2225993871688843, "loss/hidden": 0.181640625, "loss/logits": 0.031376788392663, "loss/reg": 0.025349974632263184, "step": 696 }, { "epoch": 0.3485, "grad_norm": 2.073458194732666, "grad_norm_var": 5.790781894960122, "learning_rate": 2e-05, "loss": 0.513, "loss/crossentropy": 2.1957470178604126, "loss/hidden": 0.23193359375, "loss/logits": 0.027595311403274536, "loss/reg": 0.02534763514995575, "step": 697 }, { "epoch": 0.349, "grad_norm": 4.3756818771362305, "grad_norm_var": 6.1116753594536, "learning_rate": 2e-05, "loss": 0.6104, "loss/crossentropy": 2.120497226715088, "loss/hidden": 0.32080078125, "loss/logits": 0.03617184329777956, "loss/reg": 0.025345396250486374, "step": 698 }, { "epoch": 0.3495, "grad_norm": 2.3373966217041016, "grad_norm_var": 6.07775883522955, "learning_rate": 2e-05, "loss": 0.574, "loss/crossentropy": 2.3245344161987305, "loss/hidden": 0.2373046875, "loss/logits": 0.08329359069466591, "loss/reg": 0.025343157351017, "step": 699 }, { "epoch": 0.35, "grad_norm": 1.0438388586044312, "grad_norm_var": 6.153157025860973, "learning_rate": 2e-05, "loss": 0.4362, "loss/crossentropy": 2.362974166870117, "loss/hidden": 0.15576171875, "loss/logits": 0.026995157822966576, "loss/reg": 0.025340832769870758, "step": 700 }, { "epoch": 0.3505, "grad_norm": 1.6430028676986694, "grad_norm_var": 6.082854600769767, "learning_rate": 2e-05, "loss": 0.5277, "loss/crossentropy": 2.102017641067505, "loss/hidden": 0.2333984375, "loss/logits": 0.04095187783241272, "loss/reg": 0.025338461622595787, "step": 701 }, { "epoch": 0.351, "grad_norm": 1.5996311902999878, "grad_norm_var": 6.036571733599795, "learning_rate": 2e-05, "loss": 0.566, "loss/crossentropy": 2.1797362565994263, "loss/hidden": 0.271484375, "loss/logits": 0.04117584228515625, "loss/reg": 0.025335904210805893, "step": 702 }, { "epoch": 0.3515, "grad_norm": 1.128166913986206, "grad_norm_var": 6.021680040096112, "learning_rate": 2e-05, "loss": 0.4406, "loss/crossentropy": 2.4110106229782104, "loss/hidden": 0.1611328125, "loss/logits": 0.026095453649759293, "loss/reg": 0.025333648547530174, "step": 703 }, { "epoch": 0.352, "grad_norm": 1.8349699974060059, "grad_norm_var": 5.987309069676893, "learning_rate": 2e-05, "loss": 0.5493, "loss/crossentropy": 2.0857229232788086, "loss/hidden": 0.2529296875, "loss/logits": 0.04306299611926079, "loss/reg": 0.0253314059227705, "step": 704 }, { "epoch": 0.3525, "grad_norm": 1.496748924255371, "grad_norm_var": 5.9765153933005, "learning_rate": 2e-05, "loss": 0.4725, "loss/crossentropy": 2.4090970754623413, "loss/hidden": 0.18701171875, "loss/logits": 0.0321922991424799, "loss/reg": 0.0253291055560112, "step": 705 }, { "epoch": 0.353, "grad_norm": 1.9091578722000122, "grad_norm_var": 5.9346915662249025, "learning_rate": 2e-05, "loss": 0.4712, "loss/crossentropy": 2.4798312187194824, "loss/hidden": 0.18115234375, "loss/logits": 0.03674683719873428, "loss/reg": 0.025326747447252274, "step": 706 }, { "epoch": 0.3535, "grad_norm": 1.3746347427368164, "grad_norm_var": 5.930414656573596, "learning_rate": 2e-05, "loss": 0.4387, "loss/crossentropy": 2.332283616065979, "loss/hidden": 0.15673828125, "loss/logits": 0.02867988497018814, "loss/reg": 0.025324523448944092, "step": 707 }, { "epoch": 0.354, "grad_norm": 1.2437435388565063, "grad_norm_var": 5.914689920860994, "learning_rate": 2e-05, "loss": 0.4413, "loss/crossentropy": 2.4120808839797974, "loss/hidden": 0.15869140625, "loss/logits": 0.029423246160149574, "loss/reg": 0.025322169065475464, "step": 708 }, { "epoch": 0.3545, "grad_norm": 1.5218226909637451, "grad_norm_var": 0.6477736948298792, "learning_rate": 2e-05, "loss": 0.4785, "loss/crossentropy": 2.4414559602737427, "loss/hidden": 0.1923828125, "loss/logits": 0.032884806394577026, "loss/reg": 0.02531973458826542, "step": 709 }, { "epoch": 0.355, "grad_norm": 1.0013576745986938, "grad_norm_var": 0.651171268534646, "learning_rate": 2e-05, "loss": 0.4448, "loss/crossentropy": 2.2079886198043823, "loss/hidden": 0.158203125, "loss/logits": 0.033450678922235966, "loss/reg": 0.025317512452602386, "step": 710 }, { "epoch": 0.3555, "grad_norm": 1.558840274810791, "grad_norm_var": 0.6277757593685663, "learning_rate": 2e-05, "loss": 0.5398, "loss/crossentropy": 2.2513808012008667, "loss/hidden": 0.23974609375, "loss/logits": 0.046865444630384445, "loss/reg": 0.025314999744296074, "step": 711 }, { "epoch": 0.356, "grad_norm": 1.1995527744293213, "grad_norm_var": 0.6454767272231472, "learning_rate": 2e-05, "loss": 0.5245, "loss/crossentropy": 2.1171988248825073, "loss/hidden": 0.2294921875, "loss/logits": 0.041836922988295555, "loss/reg": 0.025312749668955803, "step": 712 }, { "epoch": 0.3565, "grad_norm": 1.0955619812011719, "grad_norm_var": 0.6577077274442764, "learning_rate": 2e-05, "loss": 0.4366, "loss/crossentropy": 2.375182032585144, "loss/hidden": 0.15576171875, "loss/logits": 0.027697966434061527, "loss/reg": 0.02531055547297001, "step": 713 }, { "epoch": 0.357, "grad_norm": 1.287891149520874, "grad_norm_var": 0.13050938322241734, "learning_rate": 2e-05, "loss": 0.4445, "loss/crossentropy": 2.4279476404190063, "loss/hidden": 0.1640625, "loss/logits": 0.027320224791765213, "loss/reg": 0.025308314710855484, "step": 714 }, { "epoch": 0.3575, "grad_norm": 1.1665476560592651, "grad_norm_var": 0.07840015841878997, "learning_rate": 2e-05, "loss": 0.4761, "loss/crossentropy": 2.3419547080993652, "loss/hidden": 0.1904296875, "loss/logits": 0.03259772714227438, "loss/reg": 0.02530606836080551, "step": 715 }, { "epoch": 0.358, "grad_norm": 1.0555628538131714, "grad_norm_var": 0.07788077396049188, "learning_rate": 2e-05, "loss": 0.4214, "loss/crossentropy": 2.2978007793426514, "loss/hidden": 0.142578125, "loss/logits": 0.025796832516789436, "loss/reg": 0.025303872302174568, "step": 716 }, { "epoch": 0.3585, "grad_norm": 0.9452884793281555, "grad_norm_var": 0.084055576139029, "learning_rate": 2e-05, "loss": 0.4439, "loss/crossentropy": 2.2497235536575317, "loss/hidden": 0.1630859375, "loss/logits": 0.027815911918878555, "loss/reg": 0.025301622226834297, "step": 717 }, { "epoch": 0.359, "grad_norm": 1.4938298463821411, "grad_norm_var": 0.08107452606832148, "learning_rate": 2e-05, "loss": 0.4895, "loss/crossentropy": 2.2590330839157104, "loss/hidden": 0.203125, "loss/logits": 0.0333606218919158, "loss/reg": 0.025299306958913803, "step": 718 }, { "epoch": 0.3595, "grad_norm": 1.3809986114501953, "grad_norm_var": 0.07819483831579542, "learning_rate": 2e-05, "loss": 0.4289, "loss/crossentropy": 2.3282105922698975, "loss/hidden": 0.1474609375, "loss/logits": 0.028477998450398445, "loss/reg": 0.025296946987509727, "step": 719 }, { "epoch": 0.36, "grad_norm": 1.4700795412063599, "grad_norm_var": 0.062819776138249, "learning_rate": 2e-05, "loss": 0.4491, "loss/crossentropy": 2.4214909076690674, "loss/hidden": 0.171875, "loss/logits": 0.024313151836395264, "loss/reg": 0.025294575840234756, "step": 720 }, { "epoch": 0.3605, "grad_norm": 1.8786181211471558, "grad_norm_var": 0.08067338037888813, "learning_rate": 2e-05, "loss": 0.4645, "loss/crossentropy": 2.574108123779297, "loss/hidden": 0.177734375, "loss/logits": 0.03380656335502863, "loss/reg": 0.025292182341217995, "step": 721 }, { "epoch": 0.361, "grad_norm": 1.4755817651748657, "grad_norm_var": 0.06003798552636786, "learning_rate": 2e-05, "loss": 0.4538, "loss/crossentropy": 2.327579617500305, "loss/hidden": 0.171875, "loss/logits": 0.029062069952487946, "loss/reg": 0.025289788842201233, "step": 722 }, { "epoch": 0.3615, "grad_norm": 1.5997651815414429, "grad_norm_var": 0.06478959320761259, "learning_rate": 2e-05, "loss": 0.4824, "loss/crossentropy": 2.248897910118103, "loss/hidden": 0.19189453125, "loss/logits": 0.037585300393402576, "loss/reg": 0.025287389755249023, "step": 723 }, { "epoch": 0.362, "grad_norm": 0.9326217770576477, "grad_norm_var": 0.07466397239676793, "learning_rate": 2e-05, "loss": 0.4426, "loss/crossentropy": 2.4139727354049683, "loss/hidden": 0.16162109375, "loss/logits": 0.028111821971833706, "loss/reg": 0.025284940376877785, "step": 724 }, { "epoch": 0.3625, "grad_norm": 1.4016598463058472, "grad_norm_var": 0.07227671584546869, "learning_rate": 2e-05, "loss": 0.4875, "loss/crossentropy": 2.2318572402000427, "loss/hidden": 0.19140625, "loss/logits": 0.04322698712348938, "loss/reg": 0.02528252638876438, "step": 725 }, { "epoch": 0.363, "grad_norm": 2.276989698410034, "grad_norm_var": 0.12165648929605381, "learning_rate": 2e-05, "loss": 0.5242, "loss/crossentropy": 2.2202149629592896, "loss/hidden": 0.23095703125, "loss/logits": 0.040427614003419876, "loss/reg": 0.0252800602465868, "step": 726 }, { "epoch": 0.3635, "grad_norm": 1.3967205286026, "grad_norm_var": 0.11962167472225668, "learning_rate": 2e-05, "loss": 0.4749, "loss/crossentropy": 2.4266481399536133, "loss/hidden": 0.189453125, "loss/logits": 0.03263301961123943, "loss/reg": 0.025277448818087578, "step": 727 }, { "epoch": 0.364, "grad_norm": 1.777940273284912, "grad_norm_var": 0.12672369877618006, "learning_rate": 2e-05, "loss": 0.4827, "loss/crossentropy": 2.5460067987442017, "loss/hidden": 0.19384765625, "loss/logits": 0.03612758591771126, "loss/reg": 0.025274960324168205, "step": 728 }, { "epoch": 0.3645, "grad_norm": 1.3306031227111816, "grad_norm_var": 0.12017416562564118, "learning_rate": 2e-05, "loss": 0.4589, "loss/crossentropy": 2.3280850648880005, "loss/hidden": 0.17236328125, "loss/logits": 0.033771621994674206, "loss/reg": 0.025272710248827934, "step": 729 }, { "epoch": 0.365, "grad_norm": 1.8370397090911865, "grad_norm_var": 0.12865930776388695, "learning_rate": 2e-05, "loss": 0.477, "loss/crossentropy": 2.3396809101104736, "loss/hidden": 0.1923828125, "loss/logits": 0.03194649703800678, "loss/reg": 0.025270242244005203, "step": 730 }, { "epoch": 0.3655, "grad_norm": 1.231892704963684, "grad_norm_var": 0.12633683764280407, "learning_rate": 2e-05, "loss": 0.4522, "loss/crossentropy": 2.5270928144454956, "loss/hidden": 0.169921875, "loss/logits": 0.029576458036899567, "loss/reg": 0.0252678282558918, "step": 731 }, { "epoch": 0.366, "grad_norm": 1.4518746137619019, "grad_norm_var": 0.11436872382724275, "learning_rate": 2e-05, "loss": 0.4706, "loss/crossentropy": 2.5901981592178345, "loss/hidden": 0.1640625, "loss/logits": 0.05386000592261553, "loss/reg": 0.02526557259261608, "step": 732 }, { "epoch": 0.3665, "grad_norm": 1.6061288118362427, "grad_norm_var": 0.09343888808112574, "learning_rate": 2e-05, "loss": 0.4847, "loss/crossentropy": 2.3777267932891846, "loss/hidden": 0.19482421875, "loss/logits": 0.03727641887962818, "loss/reg": 0.02526322938501835, "step": 733 }, { "epoch": 0.367, "grad_norm": 1.2036224603652954, "grad_norm_var": 0.10025301072385275, "learning_rate": 2e-05, "loss": 0.4686, "loss/crossentropy": 2.4308606386184692, "loss/hidden": 0.18359375, "loss/logits": 0.0324308592826128, "loss/reg": 0.02526094578206539, "step": 734 }, { "epoch": 0.3675, "grad_norm": 1.1550683975219727, "grad_norm_var": 0.10750280174214168, "learning_rate": 2e-05, "loss": 0.4347, "loss/crossentropy": 2.320576786994934, "loss/hidden": 0.1533203125, "loss/logits": 0.028841860592365265, "loss/reg": 0.025258498266339302, "step": 735 }, { "epoch": 0.368, "grad_norm": 1.1622178554534912, "grad_norm_var": 0.1147218928368229, "learning_rate": 2e-05, "loss": 0.4227, "loss/crossentropy": 2.37722384929657, "loss/hidden": 0.14892578125, "loss/logits": 0.021224712021648884, "loss/reg": 0.025255965068936348, "step": 736 }, { "epoch": 0.3685, "grad_norm": 1.2075239419937134, "grad_norm_var": 0.1074162568700674, "learning_rate": 2e-05, "loss": 0.4463, "loss/crossentropy": 2.341481566429138, "loss/hidden": 0.16259765625, "loss/logits": 0.031207844614982605, "loss/reg": 0.025253457948565483, "step": 737 }, { "epoch": 0.369, "grad_norm": 1.5584073066711426, "grad_norm_var": 0.10823295060967611, "learning_rate": 2e-05, "loss": 0.4797, "loss/crossentropy": 2.2698925733566284, "loss/hidden": 0.18701171875, "loss/logits": 0.04022688418626785, "loss/reg": 0.025251101702451706, "step": 738 }, { "epoch": 0.3695, "grad_norm": 1.0440956354141235, "grad_norm_var": 0.11611120991532643, "learning_rate": 2e-05, "loss": 0.4302, "loss/crossentropy": 2.3013094663619995, "loss/hidden": 0.1533203125, "loss/logits": 0.024398976005613804, "loss/reg": 0.025248851627111435, "step": 739 }, { "epoch": 0.37, "grad_norm": 1.4074509143829346, "grad_norm_var": 0.09992254468936514, "learning_rate": 2e-05, "loss": 0.4871, "loss/crossentropy": 2.5492948293685913, "loss/hidden": 0.2041015625, "loss/logits": 0.03054051846265793, "loss/reg": 0.025246579200029373, "step": 740 }, { "epoch": 0.3705, "grad_norm": 1.2850230932235718, "grad_norm_var": 0.10137802938979425, "learning_rate": 2e-05, "loss": 0.463, "loss/crossentropy": 2.3693546056747437, "loss/hidden": 0.17919921875, "loss/logits": 0.031334346160292625, "loss/reg": 0.02524430677294731, "step": 741 }, { "epoch": 0.371, "grad_norm": 1.5320541858673096, "grad_norm_var": 0.05226058368684695, "learning_rate": 2e-05, "loss": 0.4494, "loss/crossentropy": 2.404141068458557, "loss/hidden": 0.16748046875, "loss/logits": 0.029474626295268536, "loss/reg": 0.025241872295737267, "step": 742 }, { "epoch": 0.3715, "grad_norm": 1.2663581371307373, "grad_norm_var": 0.05314906099788974, "learning_rate": 2e-05, "loss": 0.4418, "loss/crossentropy": 2.3754160404205322, "loss/hidden": 0.16455078125, "loss/logits": 0.0248889597132802, "loss/reg": 0.0252396073192358, "step": 743 }, { "epoch": 0.372, "grad_norm": 1.8194047212600708, "grad_norm_var": 0.05546441039958623, "learning_rate": 2e-05, "loss": 0.4775, "loss/crossentropy": 2.3306996822357178, "loss/hidden": 0.173828125, "loss/logits": 0.051343479193747044, "loss/reg": 0.02523711882531643, "step": 744 }, { "epoch": 0.3725, "grad_norm": 1.1723297834396362, "grad_norm_var": 0.05809724214908408, "learning_rate": 2e-05, "loss": 0.4121, "loss/crossentropy": 2.492545485496521, "loss/hidden": 0.13623046875, "loss/logits": 0.02352056372910738, "loss/reg": 0.025234658271074295, "step": 745 }, { "epoch": 0.373, "grad_norm": 1.085463047027588, "grad_norm_var": 0.04672765278327275, "learning_rate": 2e-05, "loss": 0.4431, "loss/crossentropy": 2.5141403675079346, "loss/hidden": 0.1611328125, "loss/logits": 0.02963507827371359, "loss/reg": 0.02523215487599373, "step": 746 }, { "epoch": 0.3735, "grad_norm": 1.266335129737854, "grad_norm_var": 0.04637739796541395, "learning_rate": 2e-05, "loss": 0.4577, "loss/crossentropy": 2.527552366256714, "loss/hidden": 0.177734375, "loss/logits": 0.02770281210541725, "loss/reg": 0.025229567661881447, "step": 747 }, { "epoch": 0.374, "grad_norm": 1.704702377319336, "grad_norm_var": 0.05460029232385371, "learning_rate": 2e-05, "loss": 0.448, "loss/crossentropy": 2.5581319332122803, "loss/hidden": 0.16943359375, "loss/logits": 0.026291027665138245, "loss/reg": 0.0252272579818964, "step": 748 }, { "epoch": 0.3745, "grad_norm": 2.3668906688690186, "grad_norm_var": 0.11753805177080157, "learning_rate": 2e-05, "loss": 0.5337, "loss/crossentropy": 2.3304221630096436, "loss/hidden": 0.24169921875, "loss/logits": 0.03977209888398647, "loss/reg": 0.025224953889846802, "step": 749 }, { "epoch": 0.375, "grad_norm": 1.3969782590866089, "grad_norm_var": 0.11507466699231461, "learning_rate": 2e-05, "loss": 0.4742, "loss/crossentropy": 2.3295921087265015, "loss/hidden": 0.19140625, "loss/logits": 0.030590247362852097, "loss/reg": 0.02522265538573265, "step": 750 }, { "epoch": 0.3755, "grad_norm": 1.4511960744857788, "grad_norm_var": 0.11080980776827473, "learning_rate": 2e-05, "loss": 0.5357, "loss/crossentropy": 2.506491780281067, "loss/hidden": 0.2373046875, "loss/logits": 0.04617682471871376, "loss/reg": 0.025220239534974098, "step": 751 }, { "epoch": 0.376, "grad_norm": 0.9766618609428406, "grad_norm_var": 0.1193494277132064, "learning_rate": 2e-05, "loss": 0.4495, "loss/crossentropy": 2.248973250389099, "loss/hidden": 0.169921875, "loss/logits": 0.02738242596387863, "loss/reg": 0.025217954069375992, "step": 752 }, { "epoch": 0.3765, "grad_norm": 1.1288150548934937, "grad_norm_var": 0.12184896532288716, "learning_rate": 2e-05, "loss": 0.4228, "loss/crossentropy": 2.373740792274475, "loss/hidden": 0.1474609375, "loss/logits": 0.023226436227560043, "loss/reg": 0.025215715169906616, "step": 753 }, { "epoch": 0.377, "grad_norm": 1.3548938035964966, "grad_norm_var": 0.12024460158514286, "learning_rate": 2e-05, "loss": 0.4838, "loss/crossentropy": 2.304950475692749, "loss/hidden": 0.1845703125, "loss/logits": 0.04713786952197552, "loss/reg": 0.02521336078643799, "step": 754 }, { "epoch": 0.3775, "grad_norm": 1.3141090869903564, "grad_norm_var": 0.1123061572966031, "learning_rate": 2e-05, "loss": 0.469, "loss/crossentropy": 2.4102286100387573, "loss/hidden": 0.185546875, "loss/logits": 0.03135187551379204, "loss/reg": 0.025211207568645477, "step": 755 }, { "epoch": 0.378, "grad_norm": 2.194099187850952, "grad_norm_var": 0.1509201675997546, "learning_rate": 2e-05, "loss": 0.5182, "loss/crossentropy": 2.5629695653915405, "loss/hidden": 0.21923828125, "loss/logits": 0.04687961935997009, "loss/reg": 0.02520875632762909, "step": 756 }, { "epoch": 0.3785, "grad_norm": 1.8557016849517822, "grad_norm_var": 0.15817322836116407, "learning_rate": 2e-05, "loss": 0.4778, "loss/crossentropy": 2.405009627342224, "loss/hidden": 0.19677734375, "loss/logits": 0.028977664187550545, "loss/reg": 0.025206197053194046, "step": 757 }, { "epoch": 0.379, "grad_norm": 1.1612073183059692, "grad_norm_var": 0.1648314055929359, "learning_rate": 2e-05, "loss": 0.4358, "loss/crossentropy": 2.4666056632995605, "loss/hidden": 0.1591796875, "loss/logits": 0.024547006003558636, "loss/reg": 0.02520374022424221, "step": 758 }, { "epoch": 0.3795, "grad_norm": 1.2368805408477783, "grad_norm_var": 0.16568490433094543, "learning_rate": 2e-05, "loss": 0.4821, "loss/crossentropy": 2.499003052711487, "loss/hidden": 0.193359375, "loss/logits": 0.03675047680735588, "loss/reg": 0.025201212614774704, "step": 759 }, { "epoch": 0.38, "grad_norm": 1.1964080333709717, "grad_norm_var": 0.16074074145114683, "learning_rate": 2e-05, "loss": 0.4833, "loss/crossentropy": 2.2424347400665283, "loss/hidden": 0.19921875, "loss/logits": 0.032082391902804375, "loss/reg": 0.025198953226208687, "step": 760 }, { "epoch": 0.3805, "grad_norm": 1.2416514158248901, "grad_norm_var": 0.15866947858677752, "learning_rate": 2e-05, "loss": 0.4837, "loss/crossentropy": 2.1305224299430847, "loss/hidden": 0.20068359375, "loss/logits": 0.031024353578686714, "loss/reg": 0.02519652061164379, "step": 761 }, { "epoch": 0.381, "grad_norm": 1.4174950122833252, "grad_norm_var": 0.15016297167369203, "learning_rate": 2e-05, "loss": 0.4513, "loss/crossentropy": 2.610305905342102, "loss/hidden": 0.16796875, "loss/logits": 0.03143086936324835, "loss/reg": 0.025194261223077774, "step": 762 }, { "epoch": 0.3815, "grad_norm": 1.2875245809555054, "grad_norm_var": 0.149660827140138, "learning_rate": 2e-05, "loss": 0.4411, "loss/crossentropy": 2.418062686920166, "loss/hidden": 0.16162109375, "loss/logits": 0.027532209642231464, "loss/reg": 0.02519218809902668, "step": 763 }, { "epoch": 0.382, "grad_norm": 2.1845688819885254, "grad_norm_var": 0.180008472094818, "learning_rate": 2e-05, "loss": 0.5501, "loss/crossentropy": 2.4159024953842163, "loss/hidden": 0.25244140625, "loss/logits": 0.04572839289903641, "loss/reg": 0.025189923122525215, "step": 764 }, { "epoch": 0.3825, "grad_norm": 1.8935918807983398, "grad_norm_var": 0.13837621014211632, "learning_rate": 2e-05, "loss": 0.4812, "loss/crossentropy": 2.5697638988494873, "loss/hidden": 0.1962890625, "loss/logits": 0.03301386162638664, "loss/reg": 0.025187674909830093, "step": 765 }, { "epoch": 0.383, "grad_norm": 1.5216825008392334, "grad_norm_var": 0.13837117134393406, "learning_rate": 2e-05, "loss": 0.4664, "loss/crossentropy": 2.4129964113235474, "loss/hidden": 0.18212890625, "loss/logits": 0.032372357323765755, "loss/reg": 0.025185411795973778, "step": 766 }, { "epoch": 0.3835, "grad_norm": 1.0873256921768188, "grad_norm_var": 0.14724468912793848, "learning_rate": 2e-05, "loss": 0.4436, "loss/crossentropy": 2.3542500734329224, "loss/hidden": 0.1650390625, "loss/logits": 0.026738815940916538, "loss/reg": 0.025183262303471565, "step": 767 }, { "epoch": 0.384, "grad_norm": 1.8608894348144531, "grad_norm_var": 0.14139169238716037, "learning_rate": 2e-05, "loss": 0.4817, "loss/crossentropy": 2.070446014404297, "loss/hidden": 0.19677734375, "loss/logits": 0.03307647071778774, "loss/reg": 0.025181252509355545, "step": 768 }, { "epoch": 0.3845, "grad_norm": 1.2290267944335938, "grad_norm_var": 0.1371124714077353, "learning_rate": 2e-05, "loss": 0.4697, "loss/crossentropy": 2.607566475868225, "loss/hidden": 0.18359375, "loss/logits": 0.03431819751858711, "loss/reg": 0.025178972631692886, "step": 769 }, { "epoch": 0.385, "grad_norm": 1.6517506837844849, "grad_norm_var": 0.13678511646327815, "learning_rate": 2e-05, "loss": 0.5167, "loss/crossentropy": 2.214709520339966, "loss/hidden": 0.22412109375, "loss/logits": 0.04081333614885807, "loss/reg": 0.0251768808811903, "step": 770 }, { "epoch": 0.3855, "grad_norm": 1.0256072282791138, "grad_norm_var": 0.14994063600200108, "learning_rate": 2e-05, "loss": 0.4487, "loss/crossentropy": 2.3049023151397705, "loss/hidden": 0.1689453125, "loss/logits": 0.02800673432648182, "loss/reg": 0.02517460659146309, "step": 771 }, { "epoch": 0.386, "grad_norm": 1.1300290822982788, "grad_norm_var": 0.12263260379390548, "learning_rate": 2e-05, "loss": 0.4688, "loss/crossentropy": 2.4035372734069824, "loss/hidden": 0.18701171875, "loss/logits": 0.030027078464627266, "loss/reg": 0.025172380730509758, "step": 772 }, { "epoch": 0.3865, "grad_norm": 1.5945173501968384, "grad_norm_var": 0.11229187265839163, "learning_rate": 2e-05, "loss": 0.509, "loss/crossentropy": 2.2906605005264282, "loss/hidden": 0.2138671875, "loss/logits": 0.04339625872671604, "loss/reg": 0.02517029643058777, "step": 773 }, { "epoch": 0.387, "grad_norm": 1.2455098628997803, "grad_norm_var": 0.1098270276560114, "learning_rate": 2e-05, "loss": 0.506, "loss/crossentropy": 2.1885640621185303, "loss/hidden": 0.220703125, "loss/logits": 0.033652519807219505, "loss/reg": 0.025167938321828842, "step": 774 }, { "epoch": 0.3875, "grad_norm": 1.943253755569458, "grad_norm_var": 0.12326830210349768, "learning_rate": 2e-05, "loss": 0.4517, "loss/crossentropy": 2.537282109260559, "loss/hidden": 0.17041015625, "loss/logits": 0.029665526933968067, "loss/reg": 0.0251656174659729, "step": 775 }, { "epoch": 0.388, "grad_norm": 1.6714816093444824, "grad_norm_var": 0.12008035318969133, "learning_rate": 2e-05, "loss": 0.4953, "loss/crossentropy": 2.1838293075561523, "loss/hidden": 0.208984375, "loss/logits": 0.034706905484199524, "loss/reg": 0.025163283571600914, "step": 776 }, { "epoch": 0.3885, "grad_norm": 1.2149651050567627, "grad_norm_var": 0.12104097819330283, "learning_rate": 2e-05, "loss": 0.445, "loss/crossentropy": 2.3864688873291016, "loss/hidden": 0.16064453125, "loss/logits": 0.03277465607970953, "loss/reg": 0.025161121040582657, "step": 777 }, { "epoch": 0.389, "grad_norm": 1.1848781108856201, "grad_norm_var": 0.12690278069956282, "learning_rate": 2e-05, "loss": 0.4562, "loss/crossentropy": 2.383737087249756, "loss/hidden": 0.17138671875, "loss/logits": 0.033218057826161385, "loss/reg": 0.025158870965242386, "step": 778 }, { "epoch": 0.3895, "grad_norm": 2.212529420852661, "grad_norm_var": 0.1562819136879937, "learning_rate": 2e-05, "loss": 0.564, "loss/crossentropy": 2.454702615737915, "loss/hidden": 0.27734375, "loss/logits": 0.03507992811501026, "loss/reg": 0.025156671181321144, "step": 779 }, { "epoch": 0.39, "grad_norm": 1.3409082889556885, "grad_norm_var": 0.12834240393133164, "learning_rate": 2e-05, "loss": 0.4781, "loss/crossentropy": 2.3675941228866577, "loss/hidden": 0.1875, "loss/logits": 0.03905305452644825, "loss/reg": 0.02515433356165886, "step": 780 }, { "epoch": 0.3905, "grad_norm": 1.649703025817871, "grad_norm_var": 0.1188706614056916, "learning_rate": 2e-05, "loss": 0.5103, "loss/crossentropy": 1.9899500608444214, "loss/hidden": 0.20458984375, "loss/logits": 0.05418789014220238, "loss/reg": 0.025151889771223068, "step": 781 }, { "epoch": 0.391, "grad_norm": 1.539289951324463, "grad_norm_var": 0.11900490617594, "learning_rate": 2e-05, "loss": 0.4399, "loss/crossentropy": 2.3202253580093384, "loss/hidden": 0.16015625, "loss/logits": 0.02825088147073984, "loss/reg": 0.025149622932076454, "step": 782 }, { "epoch": 0.3915, "grad_norm": 2.8615036010742188, "grad_norm_var": 0.22430059081523435, "learning_rate": 2e-05, "loss": 0.5087, "loss/crossentropy": 2.3681873083114624, "loss/hidden": 0.1845703125, "loss/logits": 0.07264281064271927, "loss/reg": 0.025147197768092155, "step": 783 }, { "epoch": 0.392, "grad_norm": 1.4059878587722778, "grad_norm_var": 0.22048462683969675, "learning_rate": 2e-05, "loss": 0.4234, "loss/crossentropy": 2.4366742372512817, "loss/hidden": 0.14990234375, "loss/logits": 0.022082606330513954, "loss/reg": 0.025144780054688454, "step": 784 }, { "epoch": 0.3925, "grad_norm": 1.011724591255188, "grad_norm_var": 0.23291844077479976, "learning_rate": 2e-05, "loss": 0.4233, "loss/crossentropy": 2.2704352140426636, "loss/hidden": 0.1435546875, "loss/logits": 0.028314979746937752, "loss/reg": 0.02514229156076908, "step": 785 }, { "epoch": 0.393, "grad_norm": 1.193475365638733, "grad_norm_var": 0.23938277110281755, "learning_rate": 2e-05, "loss": 0.4774, "loss/crossentropy": 2.372236728668213, "loss/hidden": 0.19091796875, "loss/logits": 0.0351157495751977, "loss/reg": 0.0251397043466568, "step": 786 }, { "epoch": 0.3935, "grad_norm": 1.1133381128311157, "grad_norm_var": 0.23414986734980137, "learning_rate": 2e-05, "loss": 0.4323, "loss/crossentropy": 2.2562466859817505, "loss/hidden": 0.15283203125, "loss/logits": 0.028081120923161507, "loss/reg": 0.025137118995189667, "step": 787 }, { "epoch": 0.394, "grad_norm": 1.4887886047363281, "grad_norm_var": 0.22356068135045598, "learning_rate": 2e-05, "loss": 0.4772, "loss/crossentropy": 2.218737244606018, "loss/hidden": 0.1708984375, "loss/logits": 0.0549413226544857, "loss/reg": 0.025134827941656113, "step": 788 }, { "epoch": 0.3945, "grad_norm": 1.6351099014282227, "grad_norm_var": 0.22394795699470554, "learning_rate": 2e-05, "loss": 0.4897, "loss/crossentropy": 2.6010366678237915, "loss/hidden": 0.2099609375, "loss/logits": 0.02844669111073017, "loss/reg": 0.02513228729367256, "step": 789 }, { "epoch": 0.395, "grad_norm": 1.6281384229660034, "grad_norm_var": 0.21784319752439665, "learning_rate": 2e-05, "loss": 0.4864, "loss/crossentropy": 2.3249675035476685, "loss/hidden": 0.20703125, "loss/logits": 0.028079986572265625, "loss/reg": 0.02512998878955841, "step": 790 }, { "epoch": 0.3955, "grad_norm": 1.526131510734558, "grad_norm_var": 0.20787200314066634, "learning_rate": 2e-05, "loss": 0.53, "loss/crossentropy": 2.134896695613861, "loss/hidden": 0.2412109375, "loss/logits": 0.037468770518898964, "loss/reg": 0.025127559900283813, "step": 791 }, { "epoch": 0.396, "grad_norm": 1.2636619806289673, "grad_norm_var": 0.211246353547789, "learning_rate": 2e-05, "loss": 0.4303, "loss/crossentropy": 2.412594199180603, "loss/hidden": 0.154296875, "loss/logits": 0.024777178652584553, "loss/reg": 0.02512528747320175, "step": 792 }, { "epoch": 0.3965, "grad_norm": 1.5792723894119263, "grad_norm_var": 0.2048758713311332, "learning_rate": 2e-05, "loss": 0.4862, "loss/crossentropy": 2.2064541578292847, "loss/hidden": 0.20068359375, "loss/logits": 0.034297335892915726, "loss/reg": 0.02512306347489357, "step": 793 }, { "epoch": 0.397, "grad_norm": 1.3188270330429077, "grad_norm_var": 0.199661045066693, "learning_rate": 2e-05, "loss": 0.4927, "loss/crossentropy": 2.26226544380188, "loss/hidden": 0.19873046875, "loss/logits": 0.04274392127990723, "loss/reg": 0.025120839476585388, "step": 794 }, { "epoch": 0.3975, "grad_norm": 1.6991007328033447, "grad_norm_var": 0.1706464817422428, "learning_rate": 2e-05, "loss": 0.4816, "loss/crossentropy": 2.36691677570343, "loss/hidden": 0.193359375, "loss/logits": 0.03702061250805855, "loss/reg": 0.025118518620729446, "step": 795 }, { "epoch": 0.398, "grad_norm": 1.7001802921295166, "grad_norm_var": 0.1703294579580552, "learning_rate": 2e-05, "loss": 0.4837, "loss/crossentropy": 2.2683218717575073, "loss/hidden": 0.16943359375, "loss/logits": 0.06313092540949583, "loss/reg": 0.025116167962551117, "step": 796 }, { "epoch": 0.3985, "grad_norm": 1.6777490377426147, "grad_norm_var": 0.1707948722071741, "learning_rate": 2e-05, "loss": 0.4669, "loss/crossentropy": 2.364277482032776, "loss/hidden": 0.181640625, "loss/logits": 0.03413047455251217, "loss/reg": 0.02511376328766346, "step": 797 }, { "epoch": 0.399, "grad_norm": 1.1140098571777344, "grad_norm_var": 0.18214716036864212, "learning_rate": 2e-05, "loss": 0.4658, "loss/crossentropy": 2.187538802623749, "loss/hidden": 0.1826171875, "loss/logits": 0.03205987066030502, "loss/reg": 0.025110801681876183, "step": 798 }, { "epoch": 0.3995, "grad_norm": 1.129773497581482, "grad_norm_var": 0.058341697787046044, "learning_rate": 2e-05, "loss": 0.4369, "loss/crossentropy": 2.3475732803344727, "loss/hidden": 0.1611328125, "loss/logits": 0.024723156355321407, "loss/reg": 0.025107914581894875, "step": 799 }, { "epoch": 0.4, "grad_norm": 1.3154513835906982, "grad_norm_var": 0.05884605160209707, "learning_rate": 2e-05, "loss": 0.4572, "loss/crossentropy": 2.398823618888855, "loss/hidden": 0.17431640625, "loss/logits": 0.0318829407915473, "loss/reg": 0.025105012580752373, "step": 800 }, { "epoch": 0.4005, "grad_norm": 1.3138872385025024, "grad_norm_var": 0.048922729616520874, "learning_rate": 2e-05, "loss": 0.4692, "loss/crossentropy": 2.3344963788986206, "loss/hidden": 0.181640625, "loss/logits": 0.036580765619874, "loss/reg": 0.025102730840444565, "step": 801 }, { "epoch": 0.401, "grad_norm": 1.0770680904388428, "grad_norm_var": 0.05326311463356787, "learning_rate": 2e-05, "loss": 0.4599, "loss/crossentropy": 2.337261915206909, "loss/hidden": 0.17431640625, "loss/logits": 0.03461520001292229, "loss/reg": 0.025099987164139748, "step": 802 }, { "epoch": 0.4015, "grad_norm": 0.9409591555595398, "grad_norm_var": 0.06196813771724311, "learning_rate": 2e-05, "loss": 0.4278, "loss/crossentropy": 2.4044833183288574, "loss/hidden": 0.146484375, "loss/logits": 0.030327575281262398, "loss/reg": 0.025096973404288292, "step": 803 }, { "epoch": 0.402, "grad_norm": 1.4571963548660278, "grad_norm_var": 0.06165864774989793, "learning_rate": 2e-05, "loss": 0.4391, "loss/crossentropy": 2.3775731325149536, "loss/hidden": 0.1591796875, "loss/logits": 0.028936855494976044, "loss/reg": 0.025093907490372658, "step": 804 }, { "epoch": 0.4025, "grad_norm": 1.3221757411956787, "grad_norm_var": 0.05790803967387448, "learning_rate": 2e-05, "loss": 0.4374, "loss/crossentropy": 2.535359501838684, "loss/hidden": 0.15869140625, "loss/logits": 0.027763372287154198, "loss/reg": 0.02509160339832306, "step": 805 }, { "epoch": 0.403, "grad_norm": 0.944514811038971, "grad_norm_var": 0.064405569007729, "learning_rate": 2e-05, "loss": 0.4569, "loss/crossentropy": 2.209794282913208, "loss/hidden": 0.17578125, "loss/logits": 0.030202921479940414, "loss/reg": 0.0250887181609869, "step": 806 }, { "epoch": 0.4035, "grad_norm": 1.182153344154358, "grad_norm_var": 0.06309183378906127, "learning_rate": 2e-05, "loss": 0.4521, "loss/crossentropy": 2.2558337450027466, "loss/hidden": 0.17333984375, "loss/logits": 0.027920391410589218, "loss/reg": 0.02508593164384365, "step": 807 }, { "epoch": 0.404, "grad_norm": 1.3775771856307983, "grad_norm_var": 0.06312693371007896, "learning_rate": 2e-05, "loss": 0.4939, "loss/crossentropy": 2.2632880210876465, "loss/hidden": 0.208984375, "loss/logits": 0.03405469283461571, "loss/reg": 0.025083083659410477, "step": 808 }, { "epoch": 0.4045, "grad_norm": 1.5316611528396606, "grad_norm_var": 0.06163456830326434, "learning_rate": 2e-05, "loss": 0.5141, "loss/crossentropy": 2.223612070083618, "loss/hidden": 0.22705078125, "loss/logits": 0.036264341324567795, "loss/reg": 0.02508021518588066, "step": 809 }, { "epoch": 0.405, "grad_norm": 1.149705171585083, "grad_norm_var": 0.06342368922468508, "learning_rate": 2e-05, "loss": 0.4366, "loss/crossentropy": 2.47035813331604, "loss/hidden": 0.1591796875, "loss/logits": 0.02665360551327467, "loss/reg": 0.02507762797176838, "step": 810 }, { "epoch": 0.4055, "grad_norm": 1.2824203968048096, "grad_norm_var": 0.052564492158762674, "learning_rate": 2e-05, "loss": 0.4829, "loss/crossentropy": 2.478409767150879, "loss/hidden": 0.19775390625, "loss/logits": 0.034352305345237255, "loss/reg": 0.02507534809410572, "step": 811 }, { "epoch": 0.406, "grad_norm": 2.6002371311187744, "grad_norm_var": 0.15334706854063704, "learning_rate": 2e-05, "loss": 0.5122, "loss/crossentropy": 2.588177442550659, "loss/hidden": 0.2373046875, "loss/logits": 0.024176809936761856, "loss/reg": 0.025072963908314705, "step": 812 }, { "epoch": 0.4065, "grad_norm": 1.5694222450256348, "grad_norm_var": 0.149181005955631, "learning_rate": 2e-05, "loss": 0.5268, "loss/crossentropy": 2.154956102371216, "loss/hidden": 0.232421875, "loss/logits": 0.04367602989077568, "loss/reg": 0.025070277974009514, "step": 813 }, { "epoch": 0.407, "grad_norm": 1.2066580057144165, "grad_norm_var": 0.1470275588442864, "learning_rate": 2e-05, "loss": 0.4439, "loss/crossentropy": 2.4592679738998413, "loss/hidden": 0.16064453125, "loss/logits": 0.03256369009613991, "loss/reg": 0.025067761540412903, "step": 814 }, { "epoch": 0.4075, "grad_norm": 1.298493504524231, "grad_norm_var": 0.1441324853666197, "learning_rate": 2e-05, "loss": 0.4939, "loss/crossentropy": 2.1921013593673706, "loss/hidden": 0.20263671875, "loss/logits": 0.040627798065543175, "loss/reg": 0.02506544440984726, "step": 815 }, { "epoch": 0.408, "grad_norm": 1.0179194211959839, "grad_norm_var": 0.15096046825236584, "learning_rate": 2e-05, "loss": 0.4628, "loss/crossentropy": 2.2625861167907715, "loss/hidden": 0.18212890625, "loss/logits": 0.03002795670181513, "loss/reg": 0.025062717497348785, "step": 816 }, { "epoch": 0.4085, "grad_norm": 1.279781460762024, "grad_norm_var": 0.1511041804692482, "learning_rate": 2e-05, "loss": 0.4842, "loss/crossentropy": 2.138561725616455, "loss/hidden": 0.19775390625, "loss/logits": 0.035851323045790195, "loss/reg": 0.025060279294848442, "step": 817 }, { "epoch": 0.409, "grad_norm": 1.0807183980941772, "grad_norm_var": 0.15098318869743482, "learning_rate": 2e-05, "loss": 0.4273, "loss/crossentropy": 2.3973305225372314, "loss/hidden": 0.150390625, "loss/logits": 0.026333114132285118, "loss/reg": 0.02505759336054325, "step": 818 }, { "epoch": 0.4095, "grad_norm": 1.2695621252059937, "grad_norm_var": 0.1407917737407074, "learning_rate": 2e-05, "loss": 0.4683, "loss/crossentropy": 2.253230392932892, "loss/hidden": 0.169921875, "loss/logits": 0.047788072377443314, "loss/reg": 0.025055285543203354, "step": 819 }, { "epoch": 0.41, "grad_norm": 1.209682822227478, "grad_norm_var": 0.14102156172926023, "learning_rate": 2e-05, "loss": 0.4417, "loss/crossentropy": 2.182424545288086, "loss/hidden": 0.1611328125, "loss/logits": 0.030013758689165115, "loss/reg": 0.02505277469754219, "step": 820 }, { "epoch": 0.4105, "grad_norm": 1.6707624197006226, "grad_norm_var": 0.1481284569685306, "learning_rate": 2e-05, "loss": 0.5191, "loss/crossentropy": 2.2835570573806763, "loss/hidden": 0.22802734375, "loss/logits": 0.04061476141214371, "loss/reg": 0.025050263851881027, "step": 821 }, { "epoch": 0.411, "grad_norm": 1.1448094844818115, "grad_norm_var": 0.1396880017606003, "learning_rate": 2e-05, "loss": 0.4704, "loss/crossentropy": 2.227652668952942, "loss/hidden": 0.1787109375, "loss/logits": 0.041171809658408165, "loss/reg": 0.02504797838628292, "step": 822 }, { "epoch": 0.4115, "grad_norm": 1.3101500272750854, "grad_norm_var": 0.13755777894761198, "learning_rate": 2e-05, "loss": 0.4425, "loss/crossentropy": 2.4227746725082397, "loss/hidden": 0.1650390625, "loss/logits": 0.02695902157574892, "loss/reg": 0.02504545822739601, "step": 823 }, { "epoch": 0.412, "grad_norm": 1.7020496129989624, "grad_norm_var": 0.14425061011981716, "learning_rate": 2e-05, "loss": 0.4481, "loss/crossentropy": 2.5413230657577515, "loss/hidden": 0.1572265625, "loss/logits": 0.040394325740635395, "loss/reg": 0.025042949244379997, "step": 824 }, { "epoch": 0.4125, "grad_norm": 1.0356205701828003, "grad_norm_var": 0.15060720196286131, "learning_rate": 2e-05, "loss": 0.4672, "loss/crossentropy": 2.308974862098694, "loss/hidden": 0.18310546875, "loss/logits": 0.033710891380906105, "loss/reg": 0.025040656328201294, "step": 825 }, { "epoch": 0.413, "grad_norm": 1.0203226804733276, "grad_norm_var": 0.1553545460901887, "learning_rate": 2e-05, "loss": 0.4106, "loss/crossentropy": 2.477281332015991, "loss/hidden": 0.13671875, "loss/logits": 0.023489448241889477, "loss/reg": 0.025038165971636772, "step": 826 }, { "epoch": 0.4135, "grad_norm": 1.1076934337615967, "grad_norm_var": 0.15898062007053398, "learning_rate": 2e-05, "loss": 0.4373, "loss/crossentropy": 2.4935485124588013, "loss/hidden": 0.1591796875, "loss/logits": 0.027745064347982407, "loss/reg": 0.02503584697842598, "step": 827 }, { "epoch": 0.414, "grad_norm": 11.625944137573242, "grad_norm_var": 6.76073723206649, "learning_rate": 2e-05, "loss": 0.5717, "loss/crossentropy": 2.347122311592102, "loss/hidden": 0.29052734375, "loss/logits": 0.030792713165283203, "loss/reg": 0.025033539161086082, "step": 828 }, { "epoch": 0.4145, "grad_norm": 1.3114373683929443, "grad_norm_var": 6.776589802928325, "learning_rate": 2e-05, "loss": 0.4495, "loss/crossentropy": 2.408790349960327, "loss/hidden": 0.1708984375, "loss/logits": 0.028319708071649075, "loss/reg": 0.02503122203052044, "step": 829 }, { "epoch": 0.415, "grad_norm": 1.2084417343139648, "grad_norm_var": 6.776426715144699, "learning_rate": 2e-05, "loss": 0.4981, "loss/crossentropy": 2.4079878330230713, "loss/hidden": 0.21044921875, "loss/logits": 0.037383945658802986, "loss/reg": 0.0250290185213089, "step": 830 }, { "epoch": 0.4155, "grad_norm": 1.5227446556091309, "grad_norm_var": 6.761783844737624, "learning_rate": 2e-05, "loss": 0.5666, "loss/crossentropy": 2.316787838935852, "loss/hidden": 0.2734375, "loss/logits": 0.0429159477353096, "loss/reg": 0.025026634335517883, "step": 831 }, { "epoch": 0.416, "grad_norm": 1.3830286264419556, "grad_norm_var": 6.7268166954643736, "learning_rate": 2e-05, "loss": 0.5173, "loss/crossentropy": 2.6073665618896484, "loss/hidden": 0.22900390625, "loss/logits": 0.03809538949280977, "loss/reg": 0.025024237111210823, "step": 832 }, { "epoch": 0.4165, "grad_norm": 0.9845668077468872, "grad_norm_var": 6.757864312480587, "learning_rate": 2e-05, "loss": 0.4286, "loss/crossentropy": 2.1612448692321777, "loss/hidden": 0.15234375, "loss/logits": 0.026042289100587368, "loss/reg": 0.025022020563483238, "step": 833 }, { "epoch": 0.417, "grad_norm": 1.03498113155365, "grad_norm_var": 6.763062760659847, "learning_rate": 2e-05, "loss": 0.4259, "loss/crossentropy": 2.3577685356140137, "loss/hidden": 0.1455078125, "loss/logits": 0.030183385126292706, "loss/reg": 0.025019681081175804, "step": 834 }, { "epoch": 0.4175, "grad_norm": 1.4572676420211792, "grad_norm_var": 6.749264821786343, "learning_rate": 2e-05, "loss": 0.443, "loss/crossentropy": 2.2105389833450317, "loss/hidden": 0.162109375, "loss/logits": 0.03076254576444626, "loss/reg": 0.025017455220222473, "step": 835 }, { "epoch": 0.418, "grad_norm": 1.650352954864502, "grad_norm_var": 6.719631400519071, "learning_rate": 2e-05, "loss": 0.4355, "loss/crossentropy": 2.2510547637939453, "loss/hidden": 0.1572265625, "loss/logits": 0.028081734664738178, "loss/reg": 0.025015119463205338, "step": 836 }, { "epoch": 0.4185, "grad_norm": 1.5497808456420898, "grad_norm_var": 6.725020460592711, "learning_rate": 2e-05, "loss": 0.4611, "loss/crossentropy": 2.534460186958313, "loss/hidden": 0.17578125, "loss/logits": 0.03517400100827217, "loss/reg": 0.02501281537115574, "step": 837 }, { "epoch": 0.419, "grad_norm": 1.1171302795410156, "grad_norm_var": 6.728005163235623, "learning_rate": 2e-05, "loss": 0.446, "loss/crossentropy": 2.4289716482162476, "loss/hidden": 0.1669921875, "loss/logits": 0.028871508315205574, "loss/reg": 0.025010673329234123, "step": 838 }, { "epoch": 0.4195, "grad_norm": 1.721587061882019, "grad_norm_var": 6.70409609664535, "learning_rate": 2e-05, "loss": 0.434, "loss/crossentropy": 2.530004143714905, "loss/hidden": 0.1591796875, "loss/logits": 0.024708636105060577, "loss/reg": 0.02500857040286064, "step": 839 }, { "epoch": 0.42, "grad_norm": 1.422298550605774, "grad_norm_var": 6.718779037944629, "learning_rate": 2e-05, "loss": 0.4845, "loss/crossentropy": 2.1376953125, "loss/hidden": 0.2001953125, "loss/logits": 0.03421156480908394, "loss/reg": 0.025006268173456192, "step": 840 }, { "epoch": 0.4205, "grad_norm": 1.2492433786392212, "grad_norm_var": 6.695670215657393, "learning_rate": 2e-05, "loss": 0.4417, "loss/crossentropy": 2.392310380935669, "loss/hidden": 0.15576171875, "loss/logits": 0.035860566422343254, "loss/reg": 0.02500392496585846, "step": 841 }, { "epoch": 0.421, "grad_norm": 1.8433566093444824, "grad_norm_var": 6.634841808570999, "learning_rate": 2e-05, "loss": 0.5097, "loss/crossentropy": 2.4924964904785156, "loss/hidden": 0.21728515625, "loss/logits": 0.042372843250632286, "loss/reg": 0.025001544505357742, "step": 842 }, { "epoch": 0.4215, "grad_norm": 1.458393931388855, "grad_norm_var": 6.600249569106912, "learning_rate": 2e-05, "loss": 0.4652, "loss/crossentropy": 2.4191232919692993, "loss/hidden": 0.18505859375, "loss/logits": 0.030173558741807938, "loss/reg": 0.02499937266111374, "step": 843 }, { "epoch": 0.422, "grad_norm": 1.5181694030761719, "grad_norm_var": 0.05830908442588125, "learning_rate": 2e-05, "loss": 0.4305, "loss/crossentropy": 2.543475866317749, "loss/hidden": 0.1552734375, "loss/logits": 0.02529764547944069, "loss/reg": 0.024997074156999588, "step": 844 }, { "epoch": 0.4225, "grad_norm": 1.4709956645965576, "grad_norm_var": 0.057972554883919496, "learning_rate": 2e-05, "loss": 0.4453, "loss/crossentropy": 2.40644907951355, "loss/hidden": 0.171875, "loss/logits": 0.023440631106495857, "loss/reg": 0.02499477192759514, "step": 845 }, { "epoch": 0.423, "grad_norm": 1.2264574766159058, "grad_norm_var": 0.0575038222824185, "learning_rate": 2e-05, "loss": 0.453, "loss/crossentropy": 2.4405059814453125, "loss/hidden": 0.17529296875, "loss/logits": 0.027810130268335342, "loss/reg": 0.024992434307932854, "step": 846 }, { "epoch": 0.4235, "grad_norm": 1.2261029481887817, "grad_norm_var": 0.05866876723294444, "learning_rate": 2e-05, "loss": 0.4206, "loss/crossentropy": 2.530023455619812, "loss/hidden": 0.14697265625, "loss/logits": 0.023748058825731277, "loss/reg": 0.024990031495690346, "step": 847 }, { "epoch": 0.424, "grad_norm": 1.2650319337844849, "grad_norm_var": 0.05972113104539645, "learning_rate": 2e-05, "loss": 0.4805, "loss/crossentropy": 2.4802552461624146, "loss/hidden": 0.1787109375, "loss/logits": 0.05189700424671173, "loss/reg": 0.02498767152428627, "step": 848 }, { "epoch": 0.4245, "grad_norm": 1.5666638612747192, "grad_norm_var": 0.049646390274153636, "learning_rate": 2e-05, "loss": 0.4839, "loss/crossentropy": 2.2816847562789917, "loss/hidden": 0.19189453125, "loss/logits": 0.04210854321718216, "loss/reg": 0.024985330179333687, "step": 849 }, { "epoch": 0.425, "grad_norm": 1.3735864162445068, "grad_norm_var": 0.03926651318212458, "learning_rate": 2e-05, "loss": 0.4633, "loss/crossentropy": 2.288491129875183, "loss/hidden": 0.18017578125, "loss/logits": 0.03332236781716347, "loss/reg": 0.024982422590255737, "step": 850 }, { "epoch": 0.4255, "grad_norm": 1.2534009218215942, "grad_norm_var": 0.04152457000375349, "learning_rate": 2e-05, "loss": 0.505, "loss/crossentropy": 2.300741195678711, "loss/hidden": 0.20703125, "loss/logits": 0.04814612679183483, "loss/reg": 0.02498042583465576, "step": 851 }, { "epoch": 0.426, "grad_norm": 1.2829217910766602, "grad_norm_var": 0.03926682396235221, "learning_rate": 2e-05, "loss": 0.4858, "loss/crossentropy": 2.076082229614258, "loss/hidden": 0.19873046875, "loss/logits": 0.037287235260009766, "loss/reg": 0.02497800998389721, "step": 852 }, { "epoch": 0.4265, "grad_norm": 1.0537066459655762, "grad_norm_var": 0.04534035977341985, "learning_rate": 2e-05, "loss": 0.4146, "loss/crossentropy": 2.2343058586120605, "loss/hidden": 0.14208984375, "loss/logits": 0.022780392318964005, "loss/reg": 0.024975987151265144, "step": 853 }, { "epoch": 0.427, "grad_norm": 1.25690758228302, "grad_norm_var": 0.04169842414173767, "learning_rate": 2e-05, "loss": 0.4457, "loss/crossentropy": 2.2924128770828247, "loss/hidden": 0.1689453125, "loss/logits": 0.027037952095270157, "loss/reg": 0.024973342195153236, "step": 854 }, { "epoch": 0.4275, "grad_norm": 1.1707593202590942, "grad_norm_var": 0.03607373501481727, "learning_rate": 2e-05, "loss": 0.4284, "loss/crossentropy": 2.1742767095565796, "loss/hidden": 0.15087890625, "loss/logits": 0.027766499668359756, "loss/reg": 0.024970991536974907, "step": 855 }, { "epoch": 0.428, "grad_norm": 1.244330883026123, "grad_norm_var": 0.03639404290223063, "learning_rate": 2e-05, "loss": 0.4646, "loss/crossentropy": 2.283990740776062, "loss/hidden": 0.181640625, "loss/logits": 0.03325035236775875, "loss/reg": 0.024968596175312996, "step": 856 }, { "epoch": 0.4285, "grad_norm": 1.376844048500061, "grad_norm_var": 0.0358462854612099, "learning_rate": 2e-05, "loss": 0.4368, "loss/crossentropy": 2.340665102005005, "loss/hidden": 0.1591796875, "loss/logits": 0.027914387173950672, "loss/reg": 0.024966033175587654, "step": 857 }, { "epoch": 0.429, "grad_norm": 1.1170494556427002, "grad_norm_var": 0.020964417363066385, "learning_rate": 2e-05, "loss": 0.4231, "loss/crossentropy": 2.44227135181427, "loss/hidden": 0.146484375, "loss/logits": 0.027010299265384674, "loss/reg": 0.024963244795799255, "step": 858 }, { "epoch": 0.4295, "grad_norm": 1.2896698713302612, "grad_norm_var": 0.019266560970768804, "learning_rate": 2e-05, "loss": 0.4338, "loss/crossentropy": 2.437178373336792, "loss/hidden": 0.15673828125, "loss/logits": 0.027423975989222527, "loss/reg": 0.024961121380329132, "step": 859 }, { "epoch": 0.43, "grad_norm": 1.5006057024002075, "grad_norm_var": 0.018759206476876972, "learning_rate": 2e-05, "loss": 0.4984, "loss/crossentropy": 2.36915385723114, "loss/hidden": 0.20703125, "loss/logits": 0.04180637001991272, "loss/reg": 0.02495899423956871, "step": 860 }, { "epoch": 0.4305, "grad_norm": 1.2376413345336914, "grad_norm_var": 0.0165992425597094, "learning_rate": 2e-05, "loss": 0.4467, "loss/crossentropy": 2.1004234552383423, "loss/hidden": 0.17041015625, "loss/logits": 0.026734575629234314, "loss/reg": 0.0249563567340374, "step": 861 }, { "epoch": 0.431, "grad_norm": 1.1335351467132568, "grad_norm_var": 0.017772602276823986, "learning_rate": 2e-05, "loss": 0.4597, "loss/crossentropy": 2.2476999759674072, "loss/hidden": 0.1806640625, "loss/logits": 0.029479091055691242, "loss/reg": 0.02495376206934452, "step": 862 }, { "epoch": 0.4315, "grad_norm": 1.1975138187408447, "grad_norm_var": 0.017997867740444682, "learning_rate": 2e-05, "loss": 0.454, "loss/crossentropy": 2.114013433456421, "loss/hidden": 0.17529296875, "loss/logits": 0.029184110462665558, "loss/reg": 0.024951165542006493, "step": 863 }, { "epoch": 0.432, "grad_norm": 1.684401512145996, "grad_norm_var": 0.028711411651534922, "learning_rate": 2e-05, "loss": 0.4371, "loss/crossentropy": 2.5192021131515503, "loss/hidden": 0.16015625, "loss/logits": 0.027418741025030613, "loss/reg": 0.02494893968105316, "step": 864 }, { "epoch": 0.4325, "grad_norm": 1.327570915222168, "grad_norm_var": 0.023662792002424264, "learning_rate": 2e-05, "loss": 0.4681, "loss/crossentropy": 2.387048840522766, "loss/hidden": 0.18212890625, "loss/logits": 0.036548664793372154, "loss/reg": 0.024946413934230804, "step": 865 }, { "epoch": 0.433, "grad_norm": 1.168529987335205, "grad_norm_var": 0.02376700496530641, "learning_rate": 2e-05, "loss": 0.4656, "loss/crossentropy": 2.4732731580734253, "loss/hidden": 0.181640625, "loss/logits": 0.034491341561079025, "loss/reg": 0.02494383417069912, "step": 866 }, { "epoch": 0.4335, "grad_norm": 1.0916374921798706, "grad_norm_var": 0.02572730800574637, "learning_rate": 2e-05, "loss": 0.4188, "loss/crossentropy": 2.4050283432006836, "loss/hidden": 0.14404296875, "loss/logits": 0.025381820276379585, "loss/reg": 0.02494126372039318, "step": 867 }, { "epoch": 0.434, "grad_norm": 1.173865795135498, "grad_norm_var": 0.026113363341109638, "learning_rate": 2e-05, "loss": 0.4418, "loss/crossentropy": 2.3439362049102783, "loss/hidden": 0.16455078125, "loss/logits": 0.02782224863767624, "loss/reg": 0.024939002469182014, "step": 868 }, { "epoch": 0.4345, "grad_norm": 1.1083062887191772, "grad_norm_var": 0.02485949808100442, "learning_rate": 2e-05, "loss": 0.4429, "loss/crossentropy": 2.08747261762619, "loss/hidden": 0.16357421875, "loss/logits": 0.029917718842625618, "loss/reg": 0.024936381727457047, "step": 869 }, { "epoch": 0.435, "grad_norm": 2.1887571811676025, "grad_norm_var": 0.0793744402641759, "learning_rate": 2e-05, "loss": 0.5581, "loss/crossentropy": 2.1568849086761475, "loss/hidden": 0.26513671875, "loss/logits": 0.04361843876540661, "loss/reg": 0.024933794513344765, "step": 870 }, { "epoch": 0.4355, "grad_norm": 1.2723170518875122, "grad_norm_var": 0.07809042331594848, "learning_rate": 2e-05, "loss": 0.4427, "loss/crossentropy": 2.4057594537734985, "loss/hidden": 0.16162109375, "loss/logits": 0.03172140009701252, "loss/reg": 0.024931542575359344, "step": 871 }, { "epoch": 0.436, "grad_norm": 1.2788238525390625, "grad_norm_var": 0.07781891044481218, "learning_rate": 2e-05, "loss": 0.4932, "loss/crossentropy": 2.2258142232894897, "loss/hidden": 0.2109375, "loss/logits": 0.032939719036221504, "loss/reg": 0.024928996339440346, "step": 872 }, { "epoch": 0.4365, "grad_norm": 1.833802342414856, "grad_norm_var": 0.09422989175293613, "learning_rate": 2e-05, "loss": 0.4469, "loss/crossentropy": 2.3266918659210205, "loss/hidden": 0.17236328125, "loss/logits": 0.02525283396244049, "loss/reg": 0.024926558136940002, "step": 873 }, { "epoch": 0.437, "grad_norm": 1.3627578020095825, "grad_norm_var": 0.09036321255378343, "learning_rate": 2e-05, "loss": 0.4308, "loss/crossentropy": 2.6098272800445557, "loss/hidden": 0.15478515625, "loss/logits": 0.026725860312581062, "loss/reg": 0.02492396906018257, "step": 874 }, { "epoch": 0.4375, "grad_norm": 1.3417320251464844, "grad_norm_var": 0.09000547961185816, "learning_rate": 2e-05, "loss": 0.4348, "loss/crossentropy": 2.27658474445343, "loss/hidden": 0.15673828125, "loss/logits": 0.028818843886256218, "loss/reg": 0.02492145262658596, "step": 875 }, { "epoch": 0.438, "grad_norm": 1.6484942436218262, "grad_norm_var": 0.09397019522889086, "learning_rate": 2e-05, "loss": 0.4471, "loss/crossentropy": 2.509611129760742, "loss/hidden": 0.1689453125, "loss/logits": 0.028927761130034924, "loss/reg": 0.024918843060731888, "step": 876 }, { "epoch": 0.4385, "grad_norm": 1.3224067687988281, "grad_norm_var": 0.09283173563057918, "learning_rate": 2e-05, "loss": 0.4616, "loss/crossentropy": 2.326986074447632, "loss/hidden": 0.17822265625, "loss/logits": 0.034166223369538784, "loss/reg": 0.02491624280810356, "step": 877 }, { "epoch": 0.439, "grad_norm": 1.520644187927246, "grad_norm_var": 0.08930074610143818, "learning_rate": 2e-05, "loss": 0.4886, "loss/crossentropy": 2.3941385746002197, "loss/hidden": 0.19970703125, "loss/logits": 0.039785370230674744, "loss/reg": 0.0249137245118618, "step": 878 }, { "epoch": 0.4395, "grad_norm": 1.2307255268096924, "grad_norm_var": 0.0884393859627858, "learning_rate": 2e-05, "loss": 0.4364, "loss/crossentropy": 2.3979439735412598, "loss/hidden": 0.158203125, "loss/logits": 0.029046453535556793, "loss/reg": 0.024911358952522278, "step": 879 }, { "epoch": 0.44, "grad_norm": 1.3595565557479858, "grad_norm_var": 0.08313544190789533, "learning_rate": 2e-05, "loss": 0.4471, "loss/crossentropy": 2.423276662826538, "loss/hidden": 0.16796875, "loss/logits": 0.03002047911286354, "loss/reg": 0.024909034371376038, "step": 880 }, { "epoch": 0.4405, "grad_norm": 1.374289870262146, "grad_norm_var": 0.08288689659587992, "learning_rate": 2e-05, "loss": 0.4752, "loss/crossentropy": 2.37721049785614, "loss/hidden": 0.193359375, "loss/logits": 0.032731397077441216, "loss/reg": 0.024906881153583527, "step": 881 }, { "epoch": 0.441, "grad_norm": 1.8934530019760132, "grad_norm_var": 0.0941036028269572, "learning_rate": 2e-05, "loss": 0.4598, "loss/crossentropy": 2.418339967727661, "loss/hidden": 0.18212890625, "loss/logits": 0.028664090670645237, "loss/reg": 0.02490459941327572, "step": 882 }, { "epoch": 0.4415, "grad_norm": 1.4956854581832886, "grad_norm_var": 0.08566906663214482, "learning_rate": 2e-05, "loss": 0.4619, "loss/crossentropy": 2.519649028778076, "loss/hidden": 0.1826171875, "loss/logits": 0.030213934369385242, "loss/reg": 0.024902526289224625, "step": 883 }, { "epoch": 0.442, "grad_norm": 1.841424822807312, "grad_norm_var": 0.0877992890859374, "learning_rate": 2e-05, "loss": 0.455, "loss/crossentropy": 2.4970178604125977, "loss/hidden": 0.17626953125, "loss/logits": 0.02974709589034319, "loss/reg": 0.024900225922465324, "step": 884 }, { "epoch": 0.4425, "grad_norm": 1.140735387802124, "grad_norm_var": 0.0861516049042431, "learning_rate": 2e-05, "loss": 0.4212, "loss/crossentropy": 2.385036587715149, "loss/hidden": 0.146484375, "loss/logits": 0.0257627060636878, "loss/reg": 0.024898122996091843, "step": 885 }, { "epoch": 0.443, "grad_norm": 1.5280400514602661, "grad_norm_var": 0.05334077575196729, "learning_rate": 2e-05, "loss": 0.4512, "loss/crossentropy": 2.461831569671631, "loss/hidden": 0.1669921875, "loss/logits": 0.0352974608540535, "loss/reg": 0.024895787239074707, "step": 886 }, { "epoch": 0.4435, "grad_norm": 1.0629712343215942, "grad_norm_var": 0.06146672512662115, "learning_rate": 2e-05, "loss": 0.4218, "loss/crossentropy": 2.366239547729492, "loss/hidden": 0.14794921875, "loss/logits": 0.02493153791874647, "loss/reg": 0.024893587455153465, "step": 887 }, { "epoch": 0.444, "grad_norm": 1.745954155921936, "grad_norm_var": 0.06430499243878576, "learning_rate": 2e-05, "loss": 0.4751, "loss/crossentropy": 2.4183106422424316, "loss/hidden": 0.1826171875, "loss/logits": 0.043571919202804565, "loss/reg": 0.024891452863812447, "step": 888 }, { "epoch": 0.4445, "grad_norm": 1.5373462438583374, "grad_norm_var": 0.055868980125863034, "learning_rate": 2e-05, "loss": 0.4825, "loss/crossentropy": 2.448971748352051, "loss/hidden": 0.1875, "loss/logits": 0.046090008690953255, "loss/reg": 0.024889154359698296, "step": 889 }, { "epoch": 0.445, "grad_norm": 1.2213661670684814, "grad_norm_var": 0.05900614209897312, "learning_rate": 2e-05, "loss": 0.4548, "loss/crossentropy": 2.500189185142517, "loss/hidden": 0.17578125, "loss/logits": 0.030193179845809937, "loss/reg": 0.02488705888390541, "step": 890 }, { "epoch": 0.4455, "grad_norm": 1.2715861797332764, "grad_norm_var": 0.06036416983983243, "learning_rate": 2e-05, "loss": 0.4079, "loss/crossentropy": 2.4891607761383057, "loss/hidden": 0.13671875, "loss/logits": 0.022357992827892303, "loss/reg": 0.02488500438630581, "step": 891 }, { "epoch": 0.446, "grad_norm": 1.2065671682357788, "grad_norm_var": 0.06085480104910346, "learning_rate": 2e-05, "loss": 0.4132, "loss/crossentropy": 2.3212687969207764, "loss/hidden": 0.1416015625, "loss/logits": 0.02274497877806425, "loss/reg": 0.024882985278964043, "step": 892 }, { "epoch": 0.4465, "grad_norm": 2.286463975906372, "grad_norm_var": 0.10613483736873922, "learning_rate": 2e-05, "loss": 0.6098, "loss/crossentropy": 1.9856956601142883, "loss/hidden": 0.29248046875, "loss/logits": 0.06851914338767529, "loss/reg": 0.02488100528717041, "step": 893 }, { "epoch": 0.447, "grad_norm": 1.3317387104034424, "grad_norm_var": 0.10739939277282436, "learning_rate": 2e-05, "loss": 0.4361, "loss/crossentropy": 2.180716037750244, "loss/hidden": 0.1611328125, "loss/logits": 0.026132527738809586, "loss/reg": 0.024878744035959244, "step": 894 }, { "epoch": 0.4475, "grad_norm": 1.1505863666534424, "grad_norm_var": 0.11036276513544672, "learning_rate": 2e-05, "loss": 0.4076, "loss/crossentropy": 2.4193174839019775, "loss/hidden": 0.134765625, "loss/logits": 0.02407541684806347, "loss/reg": 0.024876724928617477, "step": 895 }, { "epoch": 0.448, "grad_norm": 1.2850412130355835, "grad_norm_var": 0.11176224122004706, "learning_rate": 2e-05, "loss": 0.4134, "loss/crossentropy": 2.3620327711105347, "loss/hidden": 0.13916015625, "loss/logits": 0.025503816083073616, "loss/reg": 0.024874389171600342, "step": 896 }, { "epoch": 0.4485, "grad_norm": 2.1535191535949707, "grad_norm_var": 0.1407210477913499, "learning_rate": 2e-05, "loss": 0.523, "loss/crossentropy": 2.0216793417930603, "loss/hidden": 0.2353515625, "loss/logits": 0.03891510330140591, "loss/reg": 0.024871978908777237, "step": 897 }, { "epoch": 0.449, "grad_norm": 1.4914774894714355, "grad_norm_var": 0.1302430455595032, "learning_rate": 2e-05, "loss": 0.439, "loss/crossentropy": 2.452531099319458, "loss/hidden": 0.16455078125, "loss/logits": 0.025756201706826687, "loss/reg": 0.024869605898857117, "step": 898 }, { "epoch": 0.4495, "grad_norm": 1.766234278678894, "grad_norm_var": 0.13522470542034715, "learning_rate": 2e-05, "loss": 0.4881, "loss/crossentropy": 2.4874242544174194, "loss/hidden": 0.203125, "loss/logits": 0.03627724573016167, "loss/reg": 0.02486717328429222, "step": 899 }, { "epoch": 0.45, "grad_norm": 1.790714979171753, "grad_norm_var": 0.13308583996840462, "learning_rate": 2e-05, "loss": 0.4733, "loss/crossentropy": 2.4922057390213013, "loss/hidden": 0.19140625, "loss/logits": 0.033293405547738075, "loss/reg": 0.02486467733979225, "step": 900 }, { "epoch": 0.4505, "grad_norm": 1.8885260820388794, "grad_norm_var": 0.13239945321149568, "learning_rate": 2e-05, "loss": 0.4617, "loss/crossentropy": 2.575096845626831, "loss/hidden": 0.18603515625, "loss/logits": 0.02701327670365572, "loss/reg": 0.024862412363290787, "step": 901 }, { "epoch": 0.451, "grad_norm": 1.5798112154006958, "grad_norm_var": 0.13245070282555294, "learning_rate": 2e-05, "loss": 0.4422, "loss/crossentropy": 2.324281692504883, "loss/hidden": 0.166015625, "loss/logits": 0.027567077428102493, "loss/reg": 0.024859966710209846, "step": 902 }, { "epoch": 0.4515, "grad_norm": 1.364610195159912, "grad_norm_var": 0.11862540114961077, "learning_rate": 2e-05, "loss": 0.4362, "loss/crossentropy": 2.336674928665161, "loss/hidden": 0.1591796875, "loss/logits": 0.028398605063557625, "loss/reg": 0.024857668206095695, "step": 903 }, { "epoch": 0.452, "grad_norm": 1.5987074375152588, "grad_norm_var": 0.11646655255089418, "learning_rate": 2e-05, "loss": 0.4476, "loss/crossentropy": 2.581295609474182, "loss/hidden": 0.1640625, "loss/logits": 0.03499746974557638, "loss/reg": 0.024855423718690872, "step": 904 }, { "epoch": 0.4525, "grad_norm": 1.2477660179138184, "grad_norm_var": 0.12249611635972564, "learning_rate": 2e-05, "loss": 0.471, "loss/crossentropy": 2.3965861797332764, "loss/hidden": 0.1923828125, "loss/logits": 0.03011870291084051, "loss/reg": 0.024853060021996498, "step": 905 }, { "epoch": 0.453, "grad_norm": 1.091818928718567, "grad_norm_var": 0.1290430691585063, "learning_rate": 2e-05, "loss": 0.4421, "loss/crossentropy": 2.3112945556640625, "loss/hidden": 0.16552734375, "loss/logits": 0.02809662837535143, "loss/reg": 0.024850843474268913, "step": 906 }, { "epoch": 0.4535, "grad_norm": 1.2797828912734985, "grad_norm_var": 0.1287631299307894, "learning_rate": 2e-05, "loss": 0.4194, "loss/crossentropy": 2.327611804008484, "loss/hidden": 0.146484375, "loss/logits": 0.024378618225455284, "loss/reg": 0.024848705157637596, "step": 907 }, { "epoch": 0.454, "grad_norm": 1.0900261402130127, "grad_norm_var": 0.13467015675929944, "learning_rate": 2e-05, "loss": 0.4585, "loss/crossentropy": 2.3121442794799805, "loss/hidden": 0.17529296875, "loss/logits": 0.03474980313330889, "loss/reg": 0.024846620857715607, "step": 908 }, { "epoch": 0.4545, "grad_norm": 1.530750036239624, "grad_norm_var": 0.09361760922795549, "learning_rate": 2e-05, "loss": 0.4755, "loss/crossentropy": 2.254515528678894, "loss/hidden": 0.19189453125, "loss/logits": 0.035164170898497105, "loss/reg": 0.024844245985150337, "step": 909 }, { "epoch": 0.455, "grad_norm": 1.4343830347061157, "grad_norm_var": 0.09228027400132052, "learning_rate": 2e-05, "loss": 0.4977, "loss/crossentropy": 2.3030155897140503, "loss/hidden": 0.220703125, "loss/logits": 0.028591503389179707, "loss/reg": 0.02484210580587387, "step": 910 }, { "epoch": 0.4555, "grad_norm": 1.2215298414230347, "grad_norm_var": 0.0894411767193444, "learning_rate": 2e-05, "loss": 0.4869, "loss/crossentropy": 2.1451609134674072, "loss/hidden": 0.203125, "loss/logits": 0.03534893877804279, "loss/reg": 0.024839749559760094, "step": 911 }, { "epoch": 0.456, "grad_norm": 1.1733628511428833, "grad_norm_var": 0.09324906194983575, "learning_rate": 2e-05, "loss": 0.4387, "loss/crossentropy": 2.298704981803894, "loss/hidden": 0.16015625, "loss/logits": 0.030184932053089142, "loss/reg": 0.024837518110871315, "step": 912 }, { "epoch": 0.4565, "grad_norm": 1.3525742292404175, "grad_norm_var": 0.06157036227700316, "learning_rate": 2e-05, "loss": 0.4353, "loss/crossentropy": 2.3784111738204956, "loss/hidden": 0.15576171875, "loss/logits": 0.031202757731080055, "loss/reg": 0.02483524940907955, "step": 913 }, { "epoch": 0.457, "grad_norm": 1.6027723550796509, "grad_norm_var": 0.06323633110931534, "learning_rate": 2e-05, "loss": 0.5053, "loss/crossentropy": 2.2770267724990845, "loss/hidden": 0.21728515625, "loss/logits": 0.03967934101819992, "loss/reg": 0.024832794442772865, "step": 914 }, { "epoch": 0.4575, "grad_norm": 1.939664602279663, "grad_norm_var": 0.07269855280353182, "learning_rate": 2e-05, "loss": 0.5217, "loss/crossentropy": 2.3569631576538086, "loss/hidden": 0.2294921875, "loss/logits": 0.04393378458917141, "loss/reg": 0.024830317124724388, "step": 915 }, { "epoch": 0.458, "grad_norm": 1.4609216451644897, "grad_norm_var": 0.06447793501211076, "learning_rate": 2e-05, "loss": 0.457, "loss/crossentropy": 2.481472373008728, "loss/hidden": 0.17138671875, "loss/logits": 0.03733105957508087, "loss/reg": 0.02482791244983673, "step": 916 }, { "epoch": 0.4585, "grad_norm": 2.2184019088745117, "grad_norm_var": 0.09150982546446039, "learning_rate": 2e-05, "loss": 0.4721, "loss/crossentropy": 2.2963072061538696, "loss/hidden": 0.189453125, "loss/logits": 0.034421585500240326, "loss/reg": 0.024825412780046463, "step": 917 }, { "epoch": 0.459, "grad_norm": 1.441645622253418, "grad_norm_var": 0.0902964389133101, "learning_rate": 2e-05, "loss": 0.4577, "loss/crossentropy": 2.3010048866271973, "loss/hidden": 0.17236328125, "loss/logits": 0.03706255368888378, "loss/reg": 0.024822838604450226, "step": 918 }, { "epoch": 0.4595, "grad_norm": 2.116910219192505, "grad_norm_var": 0.11805189358334474, "learning_rate": 2e-05, "loss": 0.5238, "loss/crossentropy": 2.357789158821106, "loss/hidden": 0.23095703125, "loss/logits": 0.04468147084116936, "loss/reg": 0.024820242077112198, "step": 919 }, { "epoch": 0.46, "grad_norm": 1.6940172910690308, "grad_norm_var": 0.12003205518374636, "learning_rate": 2e-05, "loss": 0.462, "loss/crossentropy": 2.6455941200256348, "loss/hidden": 0.18115234375, "loss/logits": 0.03263464197516441, "loss/reg": 0.024817565456032753, "step": 920 }, { "epoch": 0.4605, "grad_norm": 1.2647062540054321, "grad_norm_var": 0.11949490577010594, "learning_rate": 2e-05, "loss": 0.4935, "loss/crossentropy": 2.5739123821258545, "loss/hidden": 0.21044921875, "loss/logits": 0.03494640905410051, "loss/reg": 0.02481519803404808, "step": 921 }, { "epoch": 0.461, "grad_norm": 1.8925144672393799, "grad_norm_var": 0.11656603854064347, "learning_rate": 2e-05, "loss": 0.4917, "loss/crossentropy": 2.2864513397216797, "loss/hidden": 0.2021484375, "loss/logits": 0.04141218215227127, "loss/reg": 0.024812612682580948, "step": 922 }, { "epoch": 0.4615, "grad_norm": 1.816635251045227, "grad_norm_var": 0.11562187436851393, "learning_rate": 2e-05, "loss": 0.4829, "loss/crossentropy": 2.3441028594970703, "loss/hidden": 0.18994140625, "loss/logits": 0.044871050864458084, "loss/reg": 0.02481023781001568, "step": 923 }, { "epoch": 0.462, "grad_norm": 1.319472074508667, "grad_norm_var": 0.10397834789190098, "learning_rate": 2e-05, "loss": 0.4607, "loss/crossentropy": 2.5088049173355103, "loss/hidden": 0.18017578125, "loss/logits": 0.03244396485388279, "loss/reg": 0.02480742521584034, "step": 924 }, { "epoch": 0.4625, "grad_norm": 1.5024747848510742, "grad_norm_var": 0.10426117709982438, "learning_rate": 2e-05, "loss": 0.4199, "loss/crossentropy": 2.44161593914032, "loss/hidden": 0.14404296875, "loss/logits": 0.027814405038952827, "loss/reg": 0.024804776534438133, "step": 925 }, { "epoch": 0.463, "grad_norm": 1.0549204349517822, "grad_norm_var": 0.12117201442257676, "learning_rate": 2e-05, "loss": 0.4289, "loss/crossentropy": 2.441314697265625, "loss/hidden": 0.15185546875, "loss/logits": 0.029008976183831692, "loss/reg": 0.024802392348647118, "step": 926 }, { "epoch": 0.4635, "grad_norm": 1.2234230041503906, "grad_norm_var": 0.12108502599879684, "learning_rate": 2e-05, "loss": 0.4649, "loss/crossentropy": 2.360079288482666, "loss/hidden": 0.18505859375, "loss/logits": 0.03184010460972786, "loss/reg": 0.024800008162856102, "step": 927 }, { "epoch": 0.464, "grad_norm": 1.3127866983413696, "grad_norm_var": 0.11497950175635048, "learning_rate": 2e-05, "loss": 0.417, "loss/crossentropy": 2.2916054725646973, "loss/hidden": 0.1484375, "loss/logits": 0.02055790089070797, "loss/reg": 0.024797627702355385, "step": 928 }, { "epoch": 0.4645, "grad_norm": 1.418331503868103, "grad_norm_var": 0.11329202015476666, "learning_rate": 2e-05, "loss": 0.4367, "loss/crossentropy": 2.228062152862549, "loss/hidden": 0.15966796875, "loss/logits": 0.02912633679807186, "loss/reg": 0.024795077741146088, "step": 929 }, { "epoch": 0.465, "grad_norm": 1.2717900276184082, "grad_norm_var": 0.11913277672642243, "learning_rate": 2e-05, "loss": 0.4317, "loss/crossentropy": 2.266680121421814, "loss/hidden": 0.1552734375, "loss/logits": 0.028470346704125404, "loss/reg": 0.02479269914329052, "step": 930 }, { "epoch": 0.4655, "grad_norm": 1.396073341369629, "grad_norm_var": 0.11003177528165793, "learning_rate": 2e-05, "loss": 0.483, "loss/crossentropy": 2.5918630361557007, "loss/hidden": 0.1953125, "loss/logits": 0.03977473732084036, "loss/reg": 0.02479018084704876, "step": 931 }, { "epoch": 0.466, "grad_norm": 1.1711387634277344, "grad_norm_var": 0.11776813258662025, "learning_rate": 2e-05, "loss": 0.4211, "loss/crossentropy": 2.1843584775924683, "loss/hidden": 0.1494140625, "loss/logits": 0.023802118375897408, "loss/reg": 0.024787776172161102, "step": 932 }, { "epoch": 0.4665, "grad_norm": 1.4844838380813599, "grad_norm_var": 0.08183792965829349, "learning_rate": 2e-05, "loss": 0.4591, "loss/crossentropy": 2.2599565982818604, "loss/hidden": 0.17041015625, "loss/logits": 0.040790168568491936, "loss/reg": 0.024785393849015236, "step": 933 }, { "epoch": 0.467, "grad_norm": 1.6613248586654663, "grad_norm_var": 0.08427746877438451, "learning_rate": 2e-05, "loss": 0.4759, "loss/crossentropy": 2.3885433673858643, "loss/hidden": 0.1943359375, "loss/logits": 0.033717614598572254, "loss/reg": 0.02478303201496601, "step": 934 }, { "epoch": 0.4675, "grad_norm": 2.4827864170074463, "grad_norm_var": 0.12395562095072604, "learning_rate": 2e-05, "loss": 0.5939, "loss/crossentropy": 2.383415699005127, "loss/hidden": 0.302734375, "loss/logits": 0.04333702102303505, "loss/reg": 0.02478056028485298, "step": 935 }, { "epoch": 0.468, "grad_norm": 1.4659557342529297, "grad_norm_var": 0.12124371062594505, "learning_rate": 2e-05, "loss": 0.485, "loss/crossentropy": 2.1564711332321167, "loss/hidden": 0.19921875, "loss/logits": 0.03798619005829096, "loss/reg": 0.024778055027127266, "step": 936 }, { "epoch": 0.4685, "grad_norm": 1.528003454208374, "grad_norm_var": 0.11788932977424474, "learning_rate": 2e-05, "loss": 0.4102, "loss/crossentropy": 2.308253049850464, "loss/hidden": 0.13623046875, "loss/logits": 0.026196792721748352, "loss/reg": 0.024775685742497444, "step": 937 }, { "epoch": 0.469, "grad_norm": 1.1551241874694824, "grad_norm_var": 0.11329483698475963, "learning_rate": 2e-05, "loss": 0.4444, "loss/crossentropy": 2.2042760848999023, "loss/hidden": 0.1689453125, "loss/logits": 0.027680596336722374, "loss/reg": 0.024773309007287025, "step": 938 }, { "epoch": 0.4695, "grad_norm": 2.032935857772827, "grad_norm_var": 0.1266760833029648, "learning_rate": 2e-05, "loss": 0.4807, "loss/crossentropy": 2.7006815671920776, "loss/hidden": 0.19091796875, "loss/logits": 0.042035577818751335, "loss/reg": 0.02477095276117325, "step": 939 }, { "epoch": 0.47, "grad_norm": 1.141130805015564, "grad_norm_var": 0.1321853888846779, "learning_rate": 2e-05, "loss": 0.4271, "loss/crossentropy": 2.4339696168899536, "loss/hidden": 0.15673828125, "loss/logits": 0.02268486563116312, "loss/reg": 0.024768613278865814, "step": 940 }, { "epoch": 0.4705, "grad_norm": 1.7656772136688232, "grad_norm_var": 0.13813141921850866, "learning_rate": 2e-05, "loss": 0.4583, "loss/crossentropy": 2.327541947364807, "loss/hidden": 0.1767578125, "loss/logits": 0.03389530163258314, "loss/reg": 0.024766255170106888, "step": 941 }, { "epoch": 0.471, "grad_norm": 1.3216570615768433, "grad_norm_var": 0.12771394362122404, "learning_rate": 2e-05, "loss": 0.4448, "loss/crossentropy": 2.4096368551254272, "loss/hidden": 0.1650390625, "loss/logits": 0.03215141408145428, "loss/reg": 0.024764133617281914, "step": 942 }, { "epoch": 0.4715, "grad_norm": 1.3881388902664185, "grad_norm_var": 0.12356518206842436, "learning_rate": 2e-05, "loss": 0.416, "loss/crossentropy": 2.585834264755249, "loss/hidden": 0.1416015625, "loss/logits": 0.02676891814917326, "loss/reg": 0.024761632084846497, "step": 943 }, { "epoch": 0.472, "grad_norm": 1.2373863458633423, "grad_norm_var": 0.1258009621939289, "learning_rate": 2e-05, "loss": 0.4321, "loss/crossentropy": 2.3134829998016357, "loss/hidden": 0.15576171875, "loss/logits": 0.02871276345103979, "loss/reg": 0.024759074673056602, "step": 944 }, { "epoch": 0.4725, "grad_norm": 1.1650878190994263, "grad_norm_var": 0.1324021004505103, "learning_rate": 2e-05, "loss": 0.4674, "loss/crossentropy": 2.1889017820358276, "loss/hidden": 0.19140625, "loss/logits": 0.028469436801970005, "loss/reg": 0.02475649118423462, "step": 945 }, { "epoch": 0.473, "grad_norm": 3.083178997039795, "grad_norm_var": 0.28735681279515096, "learning_rate": 2e-05, "loss": 0.4476, "loss/crossentropy": 2.484034538269043, "loss/hidden": 0.17529296875, "loss/logits": 0.024776030331850052, "loss/reg": 0.02475435845553875, "step": 946 }, { "epoch": 0.4735, "grad_norm": 2.8552777767181396, "grad_norm_var": 0.38221875854398485, "learning_rate": 2e-05, "loss": 0.6484, "loss/crossentropy": 2.2809172868728638, "loss/hidden": 0.328125, "loss/logits": 0.07271300628781319, "loss/reg": 0.024751881137490273, "step": 947 }, { "epoch": 0.474, "grad_norm": 1.3637315034866333, "grad_norm_var": 0.3713747885972831, "learning_rate": 2e-05, "loss": 0.4537, "loss/crossentropy": 2.368937849998474, "loss/hidden": 0.17724609375, "loss/logits": 0.02893682010471821, "loss/reg": 0.02474971115589142, "step": 948 }, { "epoch": 0.4745, "grad_norm": 1.571547269821167, "grad_norm_var": 0.36939615340519977, "learning_rate": 2e-05, "loss": 0.435, "loss/crossentropy": 2.5506834983825684, "loss/hidden": 0.15869140625, "loss/logits": 0.028869743458926678, "loss/reg": 0.024747245013713837, "step": 949 }, { "epoch": 0.475, "grad_norm": 1.5900770425796509, "grad_norm_var": 0.37009206946137085, "learning_rate": 2e-05, "loss": 0.4676, "loss/crossentropy": 2.4405782222747803, "loss/hidden": 0.18701171875, "loss/logits": 0.033153336495161057, "loss/reg": 0.02474481612443924, "step": 950 }, { "epoch": 0.4755, "grad_norm": 1.2171446084976196, "grad_norm_var": 0.3375590343649016, "learning_rate": 2e-05, "loss": 0.4351, "loss/crossentropy": 2.6292362213134766, "loss/hidden": 0.1572265625, "loss/logits": 0.030438624322414398, "loss/reg": 0.02474270388484001, "step": 951 }, { "epoch": 0.476, "grad_norm": 1.5484012365341187, "grad_norm_var": 0.3363165658381873, "learning_rate": 2e-05, "loss": 0.4397, "loss/crossentropy": 2.4467194080352783, "loss/hidden": 0.16259765625, "loss/logits": 0.029697156511247158, "loss/reg": 0.02474055252969265, "step": 952 }, { "epoch": 0.4765, "grad_norm": 1.3582558631896973, "grad_norm_var": 0.34026256323006543, "learning_rate": 2e-05, "loss": 0.449, "loss/crossentropy": 2.439231514930725, "loss/hidden": 0.173828125, "loss/logits": 0.027837133966386318, "loss/reg": 0.024738363921642303, "step": 953 }, { "epoch": 0.477, "grad_norm": 1.969158411026001, "grad_norm_var": 0.33207128414330966, "learning_rate": 2e-05, "loss": 0.4533, "loss/crossentropy": 2.452765464782715, "loss/hidden": 0.17138671875, "loss/logits": 0.03453033231198788, "loss/reg": 0.024736056104302406, "step": 954 }, { "epoch": 0.4775, "grad_norm": 1.4187953472137451, "grad_norm_var": 0.32535599956763966, "learning_rate": 2e-05, "loss": 0.4805, "loss/crossentropy": 2.3570865392684937, "loss/hidden": 0.1953125, "loss/logits": 0.0378948412835598, "loss/reg": 0.024733752012252808, "step": 955 }, { "epoch": 0.478, "grad_norm": 1.6787301301956177, "grad_norm_var": 0.30875959889143295, "learning_rate": 2e-05, "loss": 0.4978, "loss/crossentropy": 2.316788911819458, "loss/hidden": 0.197265625, "loss/logits": 0.0531964972615242, "loss/reg": 0.02473163791000843, "step": 956 }, { "epoch": 0.4785, "grad_norm": 1.398138403892517, "grad_norm_var": 0.311938660042613, "learning_rate": 2e-05, "loss": 0.464, "loss/crossentropy": 2.5198220014572144, "loss/hidden": 0.177734375, "loss/logits": 0.03894750215113163, "loss/reg": 0.024729417636990547, "step": 957 }, { "epoch": 0.479, "grad_norm": 1.1664695739746094, "grad_norm_var": 0.3199335312784062, "learning_rate": 2e-05, "loss": 0.4793, "loss/crossentropy": 2.3437399864196777, "loss/hidden": 0.20068359375, "loss/logits": 0.031378373503685, "loss/reg": 0.024727249518036842, "step": 958 }, { "epoch": 0.4795, "grad_norm": 1.1036415100097656, "grad_norm_var": 0.33399962070790534, "learning_rate": 2e-05, "loss": 0.4159, "loss/crossentropy": 2.387402892112732, "loss/hidden": 0.14306640625, "loss/logits": 0.025623535737395287, "loss/reg": 0.02472485415637493, "step": 959 }, { "epoch": 0.48, "grad_norm": 1.7331931591033936, "grad_norm_var": 0.32487558042496256, "learning_rate": 2e-05, "loss": 0.5184, "loss/crossentropy": 2.3020901679992676, "loss/hidden": 0.220703125, "loss/logits": 0.05047208443284035, "loss/reg": 0.024722406640648842, "step": 960 }, { "epoch": 0.4805, "grad_norm": 1.6267915964126587, "grad_norm_var": 0.3090366583706251, "learning_rate": 2e-05, "loss": 0.4403, "loss/crossentropy": 2.4366101026535034, "loss/hidden": 0.15576171875, "loss/logits": 0.03732542134821415, "loss/reg": 0.024719906970858574, "step": 961 }, { "epoch": 0.481, "grad_norm": 1.4028695821762085, "grad_norm_var": 0.16836660240098808, "learning_rate": 2e-05, "loss": 0.4703, "loss/crossentropy": 2.271396040916443, "loss/hidden": 0.19287109375, "loss/logits": 0.030256666243076324, "loss/reg": 0.024717407301068306, "step": 962 }, { "epoch": 0.4815, "grad_norm": 1.1878552436828613, "grad_norm_var": 0.054751871241501014, "learning_rate": 2e-05, "loss": 0.4214, "loss/crossentropy": 2.3253756761550903, "loss/hidden": 0.14697265625, "loss/logits": 0.02732379548251629, "loss/reg": 0.024714868515729904, "step": 963 }, { "epoch": 0.482, "grad_norm": 1.1242592334747314, "grad_norm_var": 0.06135958658490489, "learning_rate": 2e-05, "loss": 0.4465, "loss/crossentropy": 2.239442467689514, "loss/hidden": 0.16650390625, "loss/logits": 0.03284657001495361, "loss/reg": 0.024712176993489265, "step": 964 }, { "epoch": 0.4825, "grad_norm": 1.3463644981384277, "grad_norm_var": 0.06068299245025669, "learning_rate": 2e-05, "loss": 0.4428, "loss/crossentropy": 2.316848874092102, "loss/hidden": 0.16357421875, "loss/logits": 0.03217571787536144, "loss/reg": 0.024709584191441536, "step": 965 }, { "epoch": 0.483, "grad_norm": 1.765031099319458, "grad_norm_var": 0.0663445679323234, "learning_rate": 2e-05, "loss": 0.4263, "loss/crossentropy": 2.5315778255462646, "loss/hidden": 0.14892578125, "loss/logits": 0.030336866155266762, "loss/reg": 0.024706894531846046, "step": 966 }, { "epoch": 0.4835, "grad_norm": 1.2559092044830322, "grad_norm_var": 0.06528498573976828, "learning_rate": 2e-05, "loss": 0.4667, "loss/crossentropy": 2.440574526786804, "loss/hidden": 0.18359375, "loss/logits": 0.036081746220588684, "loss/reg": 0.024704458191990852, "step": 967 }, { "epoch": 0.484, "grad_norm": 1.1820833683013916, "grad_norm_var": 0.06851111155043531, "learning_rate": 2e-05, "loss": 0.413, "loss/crossentropy": 2.4466443061828613, "loss/hidden": 0.14453125, "loss/logits": 0.02141994796693325, "loss/reg": 0.024701889604330063, "step": 968 }, { "epoch": 0.4845, "grad_norm": 2.0894601345062256, "grad_norm_var": 0.09592261683346047, "learning_rate": 2e-05, "loss": 0.4239, "loss/crossentropy": 2.4341933727264404, "loss/hidden": 0.156005859375, "loss/logits": 0.02092854119837284, "loss/reg": 0.02469906210899353, "step": 969 }, { "epoch": 0.485, "grad_norm": 1.430828332901001, "grad_norm_var": 0.07788717528373278, "learning_rate": 2e-05, "loss": 0.4388, "loss/crossentropy": 2.356964588165283, "loss/hidden": 0.1591796875, "loss/logits": 0.03270021267235279, "loss/reg": 0.02469666488468647, "step": 970 }, { "epoch": 0.4855, "grad_norm": 1.776854395866394, "grad_norm_var": 0.08527437507114347, "learning_rate": 2e-05, "loss": 0.5232, "loss/crossentropy": 2.171905517578125, "loss/hidden": 0.2265625, "loss/logits": 0.049691107124090195, "loss/reg": 0.02469424158334732, "step": 971 }, { "epoch": 0.486, "grad_norm": 1.2763676643371582, "grad_norm_var": 0.08335147102312987, "learning_rate": 2e-05, "loss": 0.4638, "loss/crossentropy": 2.0069618225097656, "loss/hidden": 0.1845703125, "loss/logits": 0.032350869849324226, "loss/reg": 0.02469182200729847, "step": 972 }, { "epoch": 0.4865, "grad_norm": 1.5993913412094116, "grad_norm_var": 0.08505121055133316, "learning_rate": 2e-05, "loss": 0.4726, "loss/crossentropy": 2.4825299978256226, "loss/hidden": 0.18701171875, "loss/logits": 0.03873500041663647, "loss/reg": 0.024689404293894768, "step": 973 }, { "epoch": 0.487, "grad_norm": 1.3979259729385376, "grad_norm_var": 0.07990529104096797, "learning_rate": 2e-05, "loss": 0.4285, "loss/crossentropy": 2.3328219652175903, "loss/hidden": 0.15283203125, "loss/logits": 0.028818014077842236, "loss/reg": 0.024686843156814575, "step": 974 }, { "epoch": 0.4875, "grad_norm": 2.5152621269226074, "grad_norm_var": 0.138094556758349, "learning_rate": 2e-05, "loss": 0.5242, "loss/crossentropy": 2.279319643974304, "loss/hidden": 0.23974609375, "loss/logits": 0.037576699629426, "loss/reg": 0.024684444069862366, "step": 975 }, { "epoch": 0.488, "grad_norm": 1.4693434238433838, "grad_norm_var": 0.13580396599954264, "learning_rate": 2e-05, "loss": 0.4357, "loss/crossentropy": 2.2661033868789673, "loss/hidden": 0.162109375, "loss/logits": 0.026757996529340744, "loss/reg": 0.024681907147169113, "step": 976 }, { "epoch": 0.4885, "grad_norm": 2.0209670066833496, "grad_norm_var": 0.15071162713445574, "learning_rate": 2e-05, "loss": 0.4782, "loss/crossentropy": 2.4691094160079956, "loss/hidden": 0.18505859375, "loss/logits": 0.04630833398550749, "loss/reg": 0.024679280817508698, "step": 977 }, { "epoch": 0.489, "grad_norm": 1.5368741750717163, "grad_norm_var": 0.1491596028383583, "learning_rate": 2e-05, "loss": 0.4838, "loss/crossentropy": 2.272148370742798, "loss/hidden": 0.2041015625, "loss/logits": 0.032892788760364056, "loss/reg": 0.02467675693333149, "step": 978 }, { "epoch": 0.4895, "grad_norm": 1.4713010787963867, "grad_norm_var": 0.14008166049740395, "learning_rate": 2e-05, "loss": 0.4665, "loss/crossentropy": 2.202664375305176, "loss/hidden": 0.18798828125, "loss/logits": 0.03179653640836477, "loss/reg": 0.024674372747540474, "step": 979 }, { "epoch": 0.49, "grad_norm": 1.870495080947876, "grad_norm_var": 0.12967598326321478, "learning_rate": 2e-05, "loss": 0.462, "loss/crossentropy": 2.598837971687317, "loss/hidden": 0.171875, "loss/logits": 0.04344309400767088, "loss/reg": 0.024671973660588264, "step": 980 }, { "epoch": 0.4905, "grad_norm": 1.2552647590637207, "grad_norm_var": 0.1335825488275977, "learning_rate": 2e-05, "loss": 0.4263, "loss/crossentropy": 2.2683433294296265, "loss/hidden": 0.15234375, "loss/logits": 0.027212919667363167, "loss/reg": 0.024669544771313667, "step": 981 }, { "epoch": 0.491, "grad_norm": 1.5247058868408203, "grad_norm_var": 0.13253172817718994, "learning_rate": 2e-05, "loss": 0.4795, "loss/crossentropy": 2.3193823099136353, "loss/hidden": 0.20166015625, "loss/logits": 0.031176569871604443, "loss/reg": 0.02466718479990959, "step": 982 }, { "epoch": 0.4915, "grad_norm": 1.1023645401000977, "grad_norm_var": 0.14114311646802283, "learning_rate": 2e-05, "loss": 0.4124, "loss/crossentropy": 2.534896492958069, "loss/hidden": 0.14111328125, "loss/logits": 0.024660163559019566, "loss/reg": 0.02466486021876335, "step": 983 }, { "epoch": 0.492, "grad_norm": 1.2959052324295044, "grad_norm_var": 0.13568678899981698, "learning_rate": 2e-05, "loss": 0.4442, "loss/crossentropy": 2.2339383363723755, "loss/hidden": 0.16748046875, "loss/logits": 0.030090173706412315, "loss/reg": 0.024662485346198082, "step": 984 }, { "epoch": 0.4925, "grad_norm": 1.5475845336914062, "grad_norm_var": 0.11882549883375754, "learning_rate": 2e-05, "loss": 0.4666, "loss/crossentropy": 2.548925042152405, "loss/hidden": 0.1875, "loss/logits": 0.03245330601930618, "loss/reg": 0.024660129100084305, "step": 985 }, { "epoch": 0.493, "grad_norm": 1.518269658088684, "grad_norm_var": 0.11770160652835292, "learning_rate": 2e-05, "loss": 0.4389, "loss/crossentropy": 2.379398465156555, "loss/hidden": 0.16552734375, "loss/logits": 0.026743890717625618, "loss/reg": 0.024657921865582466, "step": 986 }, { "epoch": 0.4935, "grad_norm": 1.570279836654663, "grad_norm_var": 0.11477257851482622, "learning_rate": 2e-05, "loss": 0.4243, "loss/crossentropy": 2.4684702157974243, "loss/hidden": 0.150390625, "loss/logits": 0.027326886542141438, "loss/reg": 0.02465582638978958, "step": 987 }, { "epoch": 0.494, "grad_norm": 1.4916634559631348, "grad_norm_var": 0.109505544141344, "learning_rate": 2e-05, "loss": 0.4294, "loss/crossentropy": 2.4013638496398926, "loss/hidden": 0.15673828125, "loss/logits": 0.026141656562685966, "loss/reg": 0.02465374581515789, "step": 988 }, { "epoch": 0.4945, "grad_norm": 1.7180440425872803, "grad_norm_var": 0.11078359056482606, "learning_rate": 2e-05, "loss": 0.5258, "loss/crossentropy": 2.3340543508529663, "loss/hidden": 0.22998046875, "loss/logits": 0.049306683242321014, "loss/reg": 0.024651547893881798, "step": 989 }, { "epoch": 0.495, "grad_norm": 1.192015290260315, "grad_norm_var": 0.11847738378988476, "learning_rate": 2e-05, "loss": 0.4235, "loss/crossentropy": 2.4830812215805054, "loss/hidden": 0.1513671875, "loss/logits": 0.02562696486711502, "loss/reg": 0.024649281054735184, "step": 990 }, { "epoch": 0.4955, "grad_norm": 2.068011522293091, "grad_norm_var": 0.07453697096159431, "learning_rate": 2e-05, "loss": 0.4613, "loss/crossentropy": 2.6523276567459106, "loss/hidden": 0.18115234375, "loss/logits": 0.033670464530587196, "loss/reg": 0.02464720420539379, "step": 991 }, { "epoch": 0.496, "grad_norm": 1.2752137184143066, "grad_norm_var": 0.07874241495605147, "learning_rate": 2e-05, "loss": 0.4984, "loss/crossentropy": 2.0650646686553955, "loss/hidden": 0.2119140625, "loss/logits": 0.04002711549401283, "loss/reg": 0.024645155295729637, "step": 992 }, { "epoch": 0.4965, "grad_norm": 1.5579633712768555, "grad_norm_var": 0.06175023932142167, "learning_rate": 2e-05, "loss": 0.4986, "loss/crossentropy": 2.3349034786224365, "loss/hidden": 0.197265625, "loss/logits": 0.054928943514823914, "loss/reg": 0.02464275248348713, "step": 993 }, { "epoch": 0.497, "grad_norm": 1.2338091135025024, "grad_norm_var": 0.06599051690941451, "learning_rate": 2e-05, "loss": 0.4429, "loss/crossentropy": 2.4117237329483032, "loss/hidden": 0.1650390625, "loss/logits": 0.031464939936995506, "loss/reg": 0.024640321731567383, "step": 994 }, { "epoch": 0.4975, "grad_norm": 1.5982106924057007, "grad_norm_var": 0.0668363147457848, "learning_rate": 2e-05, "loss": 0.4563, "loss/crossentropy": 2.387059211730957, "loss/hidden": 0.17236328125, "loss/logits": 0.03751287795603275, "loss/reg": 0.024637887254357338, "step": 995 }, { "epoch": 0.498, "grad_norm": 1.4983510971069336, "grad_norm_var": 0.056549508629934485, "learning_rate": 2e-05, "loss": 0.4948, "loss/crossentropy": 2.186620593070984, "loss/hidden": 0.21337890625, "loss/logits": 0.03502054139971733, "loss/reg": 0.024635281413793564, "step": 996 }, { "epoch": 0.4985, "grad_norm": 1.8561230897903442, "grad_norm_var": 0.062272768724757795, "learning_rate": 2e-05, "loss": 0.5759, "loss/crossentropy": 2.4618980884552, "loss/hidden": 0.27197265625, "loss/logits": 0.05761981941759586, "loss/reg": 0.024632660672068596, "step": 997 }, { "epoch": 0.499, "grad_norm": 1.5297044515609741, "grad_norm_var": 0.06228877530962974, "learning_rate": 2e-05, "loss": 0.4652, "loss/crossentropy": 2.2573466300964355, "loss/hidden": 0.18505859375, "loss/logits": 0.03383258357644081, "loss/reg": 0.024630188941955566, "step": 998 }, { "epoch": 0.4995, "grad_norm": 1.9509611129760742, "grad_norm_var": 0.06192666000230999, "learning_rate": 2e-05, "loss": 0.4608, "loss/crossentropy": 2.4582537412643433, "loss/hidden": 0.18115234375, "loss/logits": 0.03336348757147789, "loss/reg": 0.024627676233649254, "step": 999 }, { "epoch": 0.5, "grad_norm": 1.263331413269043, "grad_norm_var": 0.06312427179109174, "learning_rate": 2e-05, "loss": 0.4127, "loss/crossentropy": 2.5180909633636475, "loss/hidden": 0.1435546875, "loss/logits": 0.022916819900274277, "loss/reg": 0.024625113233923912, "step": 1000 }, { "epoch": 0.5005, "grad_norm": 2.315190553665161, "grad_norm_var": 0.09925843788652339, "learning_rate": 2e-05, "loss": 0.5383, "loss/crossentropy": 2.3459049463272095, "loss/hidden": 0.244140625, "loss/logits": 0.04792695306241512, "loss/reg": 0.024622488766908646, "step": 1001 }, { "epoch": 0.501, "grad_norm": 1.533280372619629, "grad_norm_var": 0.09910429692047741, "learning_rate": 2e-05, "loss": 0.4648, "loss/crossentropy": 2.3168352842330933, "loss/hidden": 0.18896484375, "loss/logits": 0.029631631448864937, "loss/reg": 0.02461997978389263, "step": 1002 }, { "epoch": 0.5015, "grad_norm": 2.5686206817626953, "grad_norm_var": 0.1570070725080583, "learning_rate": 2e-05, "loss": 0.5058, "loss/crossentropy": 2.1777498722076416, "loss/hidden": 0.2099609375, "loss/logits": 0.0496145635843277, "loss/reg": 0.024617573246359825, "step": 1003 }, { "epoch": 0.502, "grad_norm": 1.3942785263061523, "grad_norm_var": 0.15985904345598664, "learning_rate": 2e-05, "loss": 0.4515, "loss/crossentropy": 2.30439692735672, "loss/hidden": 0.18017578125, "loss/logits": 0.025181924924254417, "loss/reg": 0.024615149945020676, "step": 1004 }, { "epoch": 0.5025, "grad_norm": 1.6636312007904053, "grad_norm_var": 0.15961985398144515, "learning_rate": 2e-05, "loss": 0.4799, "loss/crossentropy": 2.387961268424988, "loss/hidden": 0.193359375, "loss/logits": 0.04039803333580494, "loss/reg": 0.02461281418800354, "step": 1005 }, { "epoch": 0.503, "grad_norm": 1.175167202949524, "grad_norm_var": 0.16068027431229595, "learning_rate": 2e-05, "loss": 0.4205, "loss/crossentropy": 2.2776483297348022, "loss/hidden": 0.14599609375, "loss/logits": 0.028357837349176407, "loss/reg": 0.024610213935375214, "step": 1006 }, { "epoch": 0.5035, "grad_norm": 1.2402100563049316, "grad_norm_var": 0.15793593833079214, "learning_rate": 2e-05, "loss": 0.4417, "loss/crossentropy": 2.4232317209243774, "loss/hidden": 0.16845703125, "loss/logits": 0.027130945585668087, "loss/reg": 0.024607809260487556, "step": 1007 }, { "epoch": 0.504, "grad_norm": 1.4888067245483398, "grad_norm_var": 0.15144150127088754, "learning_rate": 2e-05, "loss": 0.4199, "loss/crossentropy": 2.17998468875885, "loss/hidden": 0.15234375, "loss/logits": 0.021496030502021313, "loss/reg": 0.02460542693734169, "step": 1008 }, { "epoch": 0.5045, "grad_norm": 1.750985026359558, "grad_norm_var": 0.15225772018986655, "learning_rate": 2e-05, "loss": 0.4642, "loss/crossentropy": 2.200040578842163, "loss/hidden": 0.18701171875, "loss/logits": 0.031146997585892677, "loss/reg": 0.02460303343832493, "step": 1009 }, { "epoch": 0.505, "grad_norm": 1.1058796644210815, "grad_norm_var": 0.16001790603834795, "learning_rate": 2e-05, "loss": 0.4448, "loss/crossentropy": 2.094850778579712, "loss/hidden": 0.16796875, "loss/logits": 0.030812044627964497, "loss/reg": 0.024600572884082794, "step": 1010 }, { "epoch": 0.5055, "grad_norm": 1.4799710512161255, "grad_norm_var": 0.16124775408475406, "learning_rate": 2e-05, "loss": 0.4294, "loss/crossentropy": 2.5933122634887695, "loss/hidden": 0.15185546875, "loss/logits": 0.03153660800307989, "loss/reg": 0.024597788229584694, "step": 1011 }, { "epoch": 0.506, "grad_norm": 2.6447713375091553, "grad_norm_var": 0.22580341469373647, "learning_rate": 2e-05, "loss": 0.4916, "loss/crossentropy": 2.423816442489624, "loss/hidden": 0.212890625, "loss/logits": 0.03276214189827442, "loss/reg": 0.024594949558377266, "step": 1012 }, { "epoch": 0.5065, "grad_norm": 1.3123342990875244, "grad_norm_var": 0.23188188108190289, "learning_rate": 2e-05, "loss": 0.4473, "loss/crossentropy": 2.268904685974121, "loss/hidden": 0.16845703125, "loss/logits": 0.03294616658240557, "loss/reg": 0.024592256173491478, "step": 1013 }, { "epoch": 0.507, "grad_norm": 1.6381093263626099, "grad_norm_var": 0.23086213820556947, "learning_rate": 2e-05, "loss": 0.483, "loss/crossentropy": 2.5634379386901855, "loss/hidden": 0.1953125, "loss/logits": 0.04177115485072136, "loss/reg": 0.024589471518993378, "step": 1014 }, { "epoch": 0.5075, "grad_norm": 1.9902760982513428, "grad_norm_var": 0.2324952537472744, "learning_rate": 2e-05, "loss": 0.5418, "loss/crossentropy": 2.2787784934043884, "loss/hidden": 0.24169921875, "loss/logits": 0.05426573008298874, "loss/reg": 0.024587033316493034, "step": 1015 }, { "epoch": 0.508, "grad_norm": 1.2062731981277466, "grad_norm_var": 0.2357187944792192, "learning_rate": 2e-05, "loss": 0.4281, "loss/crossentropy": 2.5111724138259888, "loss/hidden": 0.15234375, "loss/logits": 0.02992274332791567, "loss/reg": 0.024584423750638962, "step": 1016 }, { "epoch": 0.5085, "grad_norm": 1.755979299545288, "grad_norm_var": 0.20616830501869762, "learning_rate": 2e-05, "loss": 0.475, "loss/crossentropy": 2.331393003463745, "loss/hidden": 0.19921875, "loss/logits": 0.029994547367095947, "loss/reg": 0.024581963196396828, "step": 1017 }, { "epoch": 0.509, "grad_norm": 1.800107479095459, "grad_norm_var": 0.2074693433041612, "learning_rate": 2e-05, "loss": 0.4413, "loss/crossentropy": 2.3108561038970947, "loss/hidden": 0.16650390625, "loss/logits": 0.029039999470114708, "loss/reg": 0.024579644203186035, "step": 1018 }, { "epoch": 0.5095, "grad_norm": 1.8739287853240967, "grad_norm_var": 0.15147520519188878, "learning_rate": 2e-05, "loss": 0.4338, "loss/crossentropy": 2.204409599304199, "loss/hidden": 0.16259765625, "loss/logits": 0.025386362336575985, "loss/reg": 0.024577105417847633, "step": 1019 }, { "epoch": 0.51, "grad_norm": 2.2447474002838135, "grad_norm_var": 0.17391527788569666, "learning_rate": 2e-05, "loss": 0.4625, "loss/crossentropy": 2.5994725227355957, "loss/hidden": 0.189453125, "loss/logits": 0.0273160170763731, "loss/reg": 0.024574514478445053, "step": 1020 }, { "epoch": 0.5105, "grad_norm": 1.2325525283813477, "grad_norm_var": 0.18464255921690906, "learning_rate": 2e-05, "loss": 0.4429, "loss/crossentropy": 2.3073863983154297, "loss/hidden": 0.16796875, "loss/logits": 0.029254252091050148, "loss/reg": 0.024571970105171204, "step": 1021 }, { "epoch": 0.511, "grad_norm": 1.2389066219329834, "grad_norm_var": 0.18110535153355295, "learning_rate": 2e-05, "loss": 0.4033, "loss/crossentropy": 2.50894033908844, "loss/hidden": 0.13671875, "loss/logits": 0.020873015746474266, "loss/reg": 0.024569377303123474, "step": 1022 }, { "epoch": 0.5115, "grad_norm": 1.7655569314956665, "grad_norm_var": 0.17138478636465398, "learning_rate": 2e-05, "loss": 0.5674, "loss/crossentropy": 1.9970663189888, "loss/hidden": 0.2744140625, "loss/logits": 0.04734223149716854, "loss/reg": 0.02456682361662388, "step": 1023 }, { "epoch": 0.512, "grad_norm": 1.2552883625030518, "grad_norm_var": 0.18006323532261087, "learning_rate": 2e-05, "loss": 0.4555, "loss/crossentropy": 2.412826180458069, "loss/hidden": 0.177734375, "loss/logits": 0.03207558020949364, "loss/reg": 0.024564214050769806, "step": 1024 }, { "epoch": 0.5125, "grad_norm": 1.5279215574264526, "grad_norm_var": 0.1799756513199552, "learning_rate": 2e-05, "loss": 0.4414, "loss/crossentropy": 2.2349936962127686, "loss/hidden": 0.166015625, "loss/logits": 0.029770507477223873, "loss/reg": 0.024561790749430656, "step": 1025 }, { "epoch": 0.513, "grad_norm": 1.204811930656433, "grad_norm_var": 0.17367981846485894, "learning_rate": 2e-05, "loss": 0.4652, "loss/crossentropy": 2.3194793462753296, "loss/hidden": 0.1865234375, "loss/logits": 0.03311028238385916, "loss/reg": 0.02455941028892994, "step": 1026 }, { "epoch": 0.5135, "grad_norm": 1.3728058338165283, "grad_norm_var": 0.17662305625485236, "learning_rate": 2e-05, "loss": 0.4484, "loss/crossentropy": 2.6432000398635864, "loss/hidden": 0.171875, "loss/logits": 0.030997256748378277, "loss/reg": 0.02455691620707512, "step": 1027 }, { "epoch": 0.514, "grad_norm": 1.755271553993225, "grad_norm_var": 0.10560597146194155, "learning_rate": 2e-05, "loss": 0.5098, "loss/crossentropy": 2.364680051803589, "loss/hidden": 0.23779296875, "loss/logits": 0.026502804830670357, "loss/reg": 0.024554504081606865, "step": 1028 }, { "epoch": 0.5145, "grad_norm": 1.0920544862747192, "grad_norm_var": 0.11630720334852303, "learning_rate": 2e-05, "loss": 0.4003, "loss/crossentropy": 2.2761380672454834, "loss/hidden": 0.1337890625, "loss/logits": 0.020970601588487625, "loss/reg": 0.024552173912525177, "step": 1029 }, { "epoch": 0.515, "grad_norm": 6.667808532714844, "grad_norm_var": 1.750033221105651, "learning_rate": 2e-05, "loss": 0.8699, "loss/crossentropy": 2.0262590050697327, "loss/hidden": 0.54443359375, "loss/logits": 0.0799819864332676, "loss/reg": 0.02454986795783043, "step": 1030 }, { "epoch": 0.5155, "grad_norm": 1.4751737117767334, "grad_norm_var": 1.7586317433690974, "learning_rate": 2e-05, "loss": 0.4788, "loss/crossentropy": 2.3618550300598145, "loss/hidden": 0.19677734375, "loss/logits": 0.03654679283499718, "loss/reg": 0.024547545239329338, "step": 1031 }, { "epoch": 0.516, "grad_norm": 1.1540484428405762, "grad_norm_var": 1.763227740616036, "learning_rate": 2e-05, "loss": 0.4253, "loss/crossentropy": 2.479053497314453, "loss/hidden": 0.1533203125, "loss/logits": 0.026503758504986763, "loss/reg": 0.024545062333345413, "step": 1032 }, { "epoch": 0.5165, "grad_norm": 1.0233707427978516, "grad_norm_var": 1.8048390448531781, "learning_rate": 2e-05, "loss": 0.4207, "loss/crossentropy": 2.4981950521469116, "loss/hidden": 0.14697265625, "loss/logits": 0.028342297300696373, "loss/reg": 0.024542683735489845, "step": 1033 }, { "epoch": 0.517, "grad_norm": 1.6611769199371338, "grad_norm_var": 1.8059095215172836, "learning_rate": 2e-05, "loss": 0.5209, "loss/crossentropy": 2.3910595178604126, "loss/hidden": 0.232421875, "loss/logits": 0.04302603006362915, "loss/reg": 0.024540260434150696, "step": 1034 }, { "epoch": 0.5175, "grad_norm": 1.7571359872817993, "grad_norm_var": 1.805363038051151, "learning_rate": 2e-05, "loss": 0.4273, "loss/crossentropy": 2.403917074203491, "loss/hidden": 0.15478515625, "loss/logits": 0.02708614058792591, "loss/reg": 0.024537930265069008, "step": 1035 }, { "epoch": 0.518, "grad_norm": 1.377044677734375, "grad_norm_var": 1.798280006459376, "learning_rate": 2e-05, "loss": 0.4691, "loss/crossentropy": 2.070719838142395, "loss/hidden": 0.19091796875, "loss/logits": 0.03284468129277229, "loss/reg": 0.02453547529876232, "step": 1036 }, { "epoch": 0.5185, "grad_norm": 1.4872187376022339, "grad_norm_var": 1.78569505647099, "learning_rate": 2e-05, "loss": 0.4843, "loss/crossentropy": 2.3953222036361694, "loss/hidden": 0.1982421875, "loss/logits": 0.04068641737103462, "loss/reg": 0.02453303523361683, "step": 1037 }, { "epoch": 0.519, "grad_norm": 2.055389881134033, "grad_norm_var": 1.7729751683139976, "learning_rate": 2e-05, "loss": 0.533, "loss/crossentropy": 2.5933210849761963, "loss/hidden": 0.23583984375, "loss/logits": 0.05187349207699299, "loss/reg": 0.024530693888664246, "step": 1038 }, { "epoch": 0.5195, "grad_norm": 1.2173277139663696, "grad_norm_var": 1.7935104026338773, "learning_rate": 2e-05, "loss": 0.4539, "loss/crossentropy": 2.4251633882522583, "loss/hidden": 0.17724609375, "loss/logits": 0.03137340396642685, "loss/reg": 0.024528371170163155, "step": 1039 }, { "epoch": 0.52, "grad_norm": 1.4174593687057495, "grad_norm_var": 1.7843437503956898, "learning_rate": 2e-05, "loss": 0.4481, "loss/crossentropy": 2.562455415725708, "loss/hidden": 0.16650390625, "loss/logits": 0.036311980336904526, "loss/reg": 0.024526001885533333, "step": 1040 }, { "epoch": 0.5205, "grad_norm": 1.6313014030456543, "grad_norm_var": 1.7817386417632997, "learning_rate": 2e-05, "loss": 0.4698, "loss/crossentropy": 2.3674226999282837, "loss/hidden": 0.193359375, "loss/logits": 0.03119245171546936, "loss/reg": 0.02452370524406433, "step": 1041 }, { "epoch": 0.521, "grad_norm": 1.5067142248153687, "grad_norm_var": 1.7646103614574096, "learning_rate": 2e-05, "loss": 0.4274, "loss/crossentropy": 2.3603265285491943, "loss/hidden": 0.1591796875, "loss/logits": 0.02301643881946802, "loss/reg": 0.024521449580788612, "step": 1042 }, { "epoch": 0.5215, "grad_norm": 1.2168174982070923, "grad_norm_var": 1.7748228156101766, "learning_rate": 2e-05, "loss": 0.4298, "loss/crossentropy": 2.5000079870224, "loss/hidden": 0.1572265625, "loss/logits": 0.027365175541490316, "loss/reg": 0.024519138038158417, "step": 1043 }, { "epoch": 0.522, "grad_norm": 1.4697620868682861, "grad_norm_var": 1.780895340312144, "learning_rate": 2e-05, "loss": 0.4368, "loss/crossentropy": 2.4227681159973145, "loss/hidden": 0.16455078125, "loss/logits": 0.02705656923353672, "loss/reg": 0.02451668120920658, "step": 1044 }, { "epoch": 0.5225, "grad_norm": 1.3264552354812622, "grad_norm_var": 1.7633564468147955, "learning_rate": 2e-05, "loss": 0.4162, "loss/crossentropy": 2.35932993888855, "loss/hidden": 0.1474609375, "loss/logits": 0.023632820695638657, "loss/reg": 0.02451416663825512, "step": 1045 }, { "epoch": 0.523, "grad_norm": 2.0581815242767334, "grad_norm_var": 0.08589286553724325, "learning_rate": 2e-05, "loss": 0.4888, "loss/crossentropy": 2.554602861404419, "loss/hidden": 0.19384765625, "loss/logits": 0.04979093559086323, "loss/reg": 0.024511631578207016, "step": 1046 }, { "epoch": 0.5235, "grad_norm": 3.9287054538726807, "grad_norm_var": 0.45739211083615405, "learning_rate": 2e-05, "loss": 0.589, "loss/crossentropy": 2.350398898124695, "loss/hidden": 0.302734375, "loss/logits": 0.041149744763970375, "loss/reg": 0.024509234353899956, "step": 1047 }, { "epoch": 0.524, "grad_norm": 1.8000636100769043, "grad_norm_var": 0.44135897770784704, "learning_rate": 2e-05, "loss": 0.4479, "loss/crossentropy": 2.401803970336914, "loss/hidden": 0.173828125, "loss/logits": 0.029028436169028282, "loss/reg": 0.024506855756044388, "step": 1048 }, { "epoch": 0.5245, "grad_norm": 1.3291693925857544, "grad_norm_var": 0.4202927551272635, "learning_rate": 2e-05, "loss": 0.481, "loss/crossentropy": 2.194493293762207, "loss/hidden": 0.19921875, "loss/logits": 0.03670147806406021, "loss/reg": 0.02450430393218994, "step": 1049 }, { "epoch": 0.525, "grad_norm": 1.5405762195587158, "grad_norm_var": 0.42186619050553964, "learning_rate": 2e-05, "loss": 0.4794, "loss/crossentropy": 2.147883892059326, "loss/hidden": 0.20458984375, "loss/logits": 0.02983129769563675, "loss/reg": 0.024501901119947433, "step": 1050 }, { "epoch": 0.5255, "grad_norm": 1.8504911661148071, "grad_norm_var": 0.42318484533822676, "learning_rate": 2e-05, "loss": 0.5108, "loss/crossentropy": 2.149976372718811, "loss/hidden": 0.2236328125, "loss/logits": 0.04220755770802498, "loss/reg": 0.0244994405657053, "step": 1051 }, { "epoch": 0.526, "grad_norm": 1.2600603103637695, "grad_norm_var": 0.42908996868910637, "learning_rate": 2e-05, "loss": 0.4532, "loss/crossentropy": 2.4726301431655884, "loss/hidden": 0.1748046875, "loss/logits": 0.03347236476838589, "loss/reg": 0.024497076869010925, "step": 1052 }, { "epoch": 0.5265, "grad_norm": 1.9423667192459106, "grad_norm_var": 0.42952014360100654, "learning_rate": 2e-05, "loss": 0.4753, "loss/crossentropy": 2.3846495151519775, "loss/hidden": 0.193359375, "loss/logits": 0.03703247010707855, "loss/reg": 0.024494826793670654, "step": 1053 }, { "epoch": 0.527, "grad_norm": 1.3845815658569336, "grad_norm_var": 0.4278188958703671, "learning_rate": 2e-05, "loss": 0.4507, "loss/crossentropy": 2.388631224632263, "loss/hidden": 0.17138671875, "loss/logits": 0.03437050245702267, "loss/reg": 0.024492528289556503, "step": 1054 }, { "epoch": 0.5275, "grad_norm": 1.567394733428955, "grad_norm_var": 0.41388247279120466, "learning_rate": 2e-05, "loss": 0.4495, "loss/crossentropy": 2.368739128112793, "loss/hidden": 0.16845703125, "loss/logits": 0.03617890737950802, "loss/reg": 0.024490313604474068, "step": 1055 }, { "epoch": 0.528, "grad_norm": 2.2830867767333984, "grad_norm_var": 0.42788727790428427, "learning_rate": 2e-05, "loss": 0.5231, "loss/crossentropy": 2.152646243572235, "loss/hidden": 0.234375, "loss/logits": 0.043883830308914185, "loss/reg": 0.02448788657784462, "step": 1056 }, { "epoch": 0.5285, "grad_norm": 1.6639432907104492, "grad_norm_var": 0.427411225536909, "learning_rate": 2e-05, "loss": 0.5485, "loss/crossentropy": 2.2220189571380615, "loss/hidden": 0.2431640625, "loss/logits": 0.06049743480980396, "loss/reg": 0.02448536455631256, "step": 1057 }, { "epoch": 0.529, "grad_norm": 1.5924744606018066, "grad_norm_var": 0.4249972603969434, "learning_rate": 2e-05, "loss": 0.4682, "loss/crossentropy": 2.237455129623413, "loss/hidden": 0.1796875, "loss/logits": 0.04367717728018761, "loss/reg": 0.02448287233710289, "step": 1058 }, { "epoch": 0.5295, "grad_norm": 2.2733936309814453, "grad_norm_var": 0.4177709041128878, "learning_rate": 2e-05, "loss": 0.4764, "loss/crossentropy": 2.3304353952407837, "loss/hidden": 0.1923828125, "loss/logits": 0.03922894597053528, "loss/reg": 0.02448027953505516, "step": 1059 }, { "epoch": 0.53, "grad_norm": 1.09116530418396, "grad_norm_var": 0.44488470791259543, "learning_rate": 2e-05, "loss": 0.4262, "loss/crossentropy": 2.195927619934082, "loss/hidden": 0.15234375, "loss/logits": 0.029037375934422016, "loss/reg": 0.02447788044810295, "step": 1060 }, { "epoch": 0.5305, "grad_norm": 1.77937650680542, "grad_norm_var": 0.42876102735321137, "learning_rate": 2e-05, "loss": 0.4547, "loss/crossentropy": 2.4653072357177734, "loss/hidden": 0.177734375, "loss/logits": 0.0321984738111496, "loss/reg": 0.02447550557553768, "step": 1061 }, { "epoch": 0.531, "grad_norm": 1.846907138824463, "grad_norm_var": 0.42523747091545416, "learning_rate": 2e-05, "loss": 0.5053, "loss/crossentropy": 2.340217351913452, "loss/hidden": 0.21435546875, "loss/logits": 0.046232474967837334, "loss/reg": 0.02447315864264965, "step": 1062 }, { "epoch": 0.5315, "grad_norm": 1.659089207649231, "grad_norm_var": 0.10931806474138266, "learning_rate": 2e-05, "loss": 0.4724, "loss/crossentropy": 2.350934624671936, "loss/hidden": 0.1865234375, "loss/logits": 0.04117522016167641, "loss/reg": 0.024470685049891472, "step": 1063 }, { "epoch": 0.532, "grad_norm": 1.4331797361373901, "grad_norm_var": 0.11180905743439092, "learning_rate": 2e-05, "loss": 0.4513, "loss/crossentropy": 2.3989791870117188, "loss/hidden": 0.17578125, "loss/logits": 0.03085092268884182, "loss/reg": 0.02446819841861725, "step": 1064 }, { "epoch": 0.5325, "grad_norm": 1.4166337251663208, "grad_norm_var": 0.10847479799077456, "learning_rate": 2e-05, "loss": 0.4312, "loss/crossentropy": 2.507182240486145, "loss/hidden": 0.15771484375, "loss/logits": 0.02880854159593582, "loss/reg": 0.02446584217250347, "step": 1065 }, { "epoch": 0.533, "grad_norm": 1.7280267477035522, "grad_norm_var": 0.10764748193198746, "learning_rate": 2e-05, "loss": 0.4635, "loss/crossentropy": 2.478935956954956, "loss/hidden": 0.1865234375, "loss/logits": 0.03238129895180464, "loss/reg": 0.0244633499532938, "step": 1066 }, { "epoch": 0.5335, "grad_norm": 1.2200838327407837, "grad_norm_var": 0.11758883412413562, "learning_rate": 2e-05, "loss": 0.4332, "loss/crossentropy": 2.245327651500702, "loss/hidden": 0.15673828125, "loss/logits": 0.03184494376182556, "loss/reg": 0.024460740387439728, "step": 1067 }, { "epoch": 0.534, "grad_norm": 1.8010295629501343, "grad_norm_var": 0.10891741560488928, "learning_rate": 2e-05, "loss": 0.5088, "loss/crossentropy": 2.45032274723053, "loss/hidden": 0.22021484375, "loss/logits": 0.04397309757769108, "loss/reg": 0.024458307772874832, "step": 1068 }, { "epoch": 0.5345, "grad_norm": 3.2257442474365234, "grad_norm_var": 0.2588636742482642, "learning_rate": 2e-05, "loss": 0.5897, "loss/crossentropy": 2.250162959098816, "loss/hidden": 0.2841796875, "loss/logits": 0.06098415516316891, "loss/reg": 0.024455880746245384, "step": 1069 }, { "epoch": 0.535, "grad_norm": 1.5339529514312744, "grad_norm_var": 0.2530226057684303, "learning_rate": 2e-05, "loss": 0.4264, "loss/crossentropy": 2.2610918283462524, "loss/hidden": 0.154296875, "loss/logits": 0.02759288903325796, "loss/reg": 0.024453405290842056, "step": 1070 }, { "epoch": 0.5355, "grad_norm": 1.972740888595581, "grad_norm_var": 0.2530325031228223, "learning_rate": 2e-05, "loss": 0.521, "loss/crossentropy": 2.1504000425338745, "loss/hidden": 0.23681640625, "loss/logits": 0.03967903181910515, "loss/reg": 0.02445101924240589, "step": 1071 }, { "epoch": 0.536, "grad_norm": 1.547863245010376, "grad_norm_var": 0.23774975509498403, "learning_rate": 2e-05, "loss": 0.4831, "loss/crossentropy": 2.2970356941223145, "loss/hidden": 0.19873046875, "loss/logits": 0.03990238159894943, "loss/reg": 0.024448538199067116, "step": 1072 }, { "epoch": 0.5365, "grad_norm": 1.686294674873352, "grad_norm_var": 0.23756444788163353, "learning_rate": 2e-05, "loss": 0.436, "loss/crossentropy": 2.4303818941116333, "loss/hidden": 0.16357421875, "loss/logits": 0.02797577064484358, "loss/reg": 0.024446075782179832, "step": 1073 }, { "epoch": 0.537, "grad_norm": 2.996263027191162, "grad_norm_var": 0.3334905820123376, "learning_rate": 2e-05, "loss": 0.4512, "loss/crossentropy": 2.3784351348876953, "loss/hidden": 0.17724609375, "loss/logits": 0.02949346974492073, "loss/reg": 0.02444363757967949, "step": 1074 }, { "epoch": 0.5375, "grad_norm": 1.6075915098190308, "grad_norm_var": 0.32145599917045425, "learning_rate": 2e-05, "loss": 0.4564, "loss/crossentropy": 2.3008534908294678, "loss/hidden": 0.1806640625, "loss/logits": 0.03131491877138615, "loss/reg": 0.02444116212427616, "step": 1075 }, { "epoch": 0.538, "grad_norm": 1.6602723598480225, "grad_norm_var": 0.28911651671163174, "learning_rate": 2e-05, "loss": 0.5333, "loss/crossentropy": 2.380239248275757, "loss/hidden": 0.24072265625, "loss/logits": 0.04818672500550747, "loss/reg": 0.02443861961364746, "step": 1076 }, { "epoch": 0.5385, "grad_norm": 1.4191992282867432, "grad_norm_var": 0.2991605248784346, "learning_rate": 2e-05, "loss": 0.5074, "loss/crossentropy": 1.901290237903595, "loss/hidden": 0.22509765625, "loss/logits": 0.03790563438087702, "loss/reg": 0.02443600259721279, "step": 1077 }, { "epoch": 0.539, "grad_norm": 3.096097230911255, "grad_norm_var": 0.4049728367226398, "learning_rate": 2e-05, "loss": 0.5158, "loss/crossentropy": 2.3517009019851685, "loss/hidden": 0.23388671875, "loss/logits": 0.037621984258294106, "loss/reg": 0.024433549493551254, "step": 1078 }, { "epoch": 0.5395, "grad_norm": 2.404075860977173, "grad_norm_var": 0.4181886829541852, "learning_rate": 2e-05, "loss": 0.481, "loss/crossentropy": 2.174505352973938, "loss/hidden": 0.19873046875, "loss/logits": 0.03792595863342285, "loss/reg": 0.02443109266459942, "step": 1079 }, { "epoch": 0.54, "grad_norm": 2.0042619705200195, "grad_norm_var": 0.40136528423351076, "learning_rate": 2e-05, "loss": 0.4407, "loss/crossentropy": 2.4950019121170044, "loss/hidden": 0.165283203125, "loss/logits": 0.031170199625194073, "loss/reg": 0.024428587406873703, "step": 1080 }, { "epoch": 0.5405, "grad_norm": 1.8745791912078857, "grad_norm_var": 0.38144694441163024, "learning_rate": 2e-05, "loss": 0.5041, "loss/crossentropy": 2.4403117895126343, "loss/hidden": 0.22119140625, "loss/logits": 0.03862900286912918, "loss/reg": 0.024426110088825226, "step": 1081 }, { "epoch": 0.541, "grad_norm": 1.9030897617340088, "grad_norm_var": 0.3773378128842729, "learning_rate": 2e-05, "loss": 0.5108, "loss/crossentropy": 2.2553837299346924, "loss/hidden": 0.22802734375, "loss/logits": 0.03854364529252052, "loss/reg": 0.024423446506261826, "step": 1082 }, { "epoch": 0.5415, "grad_norm": 1.5374236106872559, "grad_norm_var": 0.350755978913394, "learning_rate": 2e-05, "loss": 0.426, "loss/crossentropy": 2.549424886703491, "loss/hidden": 0.1533203125, "loss/logits": 0.02845953404903412, "loss/reg": 0.024420736357569695, "step": 1083 }, { "epoch": 0.542, "grad_norm": 1.7184265851974487, "grad_norm_var": 0.3535600255487106, "learning_rate": 2e-05, "loss": 0.4644, "loss/crossentropy": 2.3825089931488037, "loss/hidden": 0.1884765625, "loss/logits": 0.031747978180646896, "loss/reg": 0.02441803179681301, "step": 1084 }, { "epoch": 0.5425, "grad_norm": 1.4469131231307983, "grad_norm_var": 0.26339110279265016, "learning_rate": 2e-05, "loss": 0.4765, "loss/crossentropy": 2.2089916467666626, "loss/hidden": 0.1962890625, "loss/logits": 0.03606886602938175, "loss/reg": 0.02441529557108879, "step": 1085 }, { "epoch": 0.543, "grad_norm": 1.1277079582214355, "grad_norm_var": 0.293563715509962, "learning_rate": 2e-05, "loss": 0.4419, "loss/crossentropy": 2.1233601570129395, "loss/hidden": 0.166015625, "loss/logits": 0.03171114809811115, "loss/reg": 0.02441273257136345, "step": 1086 }, { "epoch": 0.5435, "grad_norm": 1.922126054763794, "grad_norm_var": 0.2930653944445924, "learning_rate": 2e-05, "loss": 0.4729, "loss/crossentropy": 2.306940197944641, "loss/hidden": 0.193359375, "loss/logits": 0.035489412024617195, "loss/reg": 0.02441009320318699, "step": 1087 }, { "epoch": 0.544, "grad_norm": 1.4455621242523193, "grad_norm_var": 0.29814092122534225, "learning_rate": 2e-05, "loss": 0.4431, "loss/crossentropy": 2.4753963947296143, "loss/hidden": 0.17041015625, "loss/logits": 0.028606380335986614, "loss/reg": 0.02440747246146202, "step": 1088 }, { "epoch": 0.5445, "grad_norm": 1.202568769454956, "grad_norm_var": 0.3243311065439721, "learning_rate": 2e-05, "loss": 0.4296, "loss/crossentropy": 2.5858423709869385, "loss/hidden": 0.15673828125, "loss/logits": 0.028857764787971973, "loss/reg": 0.024404924362897873, "step": 1089 }, { "epoch": 0.545, "grad_norm": 1.676564335823059, "grad_norm_var": 0.228913483216607, "learning_rate": 2e-05, "loss": 0.4523, "loss/crossentropy": 2.392747402191162, "loss/hidden": 0.18017578125, "loss/logits": 0.028122087940573692, "loss/reg": 0.024402471259236336, "step": 1090 }, { "epoch": 0.5455, "grad_norm": 1.7542563676834106, "grad_norm_var": 0.2274162683570199, "learning_rate": 2e-05, "loss": 0.455, "loss/crossentropy": 2.146073818206787, "loss/hidden": 0.18115234375, "loss/logits": 0.02988947369158268, "loss/reg": 0.024400051683187485, "step": 1091 }, { "epoch": 0.546, "grad_norm": 1.9361008405685425, "grad_norm_var": 0.22842751723861923, "learning_rate": 2e-05, "loss": 0.482, "loss/crossentropy": 2.789412260055542, "loss/hidden": 0.208740234375, "loss/logits": 0.029331857338547707, "loss/reg": 0.02439761720597744, "step": 1092 }, { "epoch": 0.5465, "grad_norm": 1.5437133312225342, "grad_norm_var": 0.2234179936427338, "learning_rate": 2e-05, "loss": 0.4217, "loss/crossentropy": 2.3370405435562134, "loss/hidden": 0.14794921875, "loss/logits": 0.029778199270367622, "loss/reg": 0.024395201355218887, "step": 1093 }, { "epoch": 0.547, "grad_norm": 1.5581791400909424, "grad_norm_var": 0.1028233910206414, "learning_rate": 2e-05, "loss": 0.4635, "loss/crossentropy": 2.2354471683502197, "loss/hidden": 0.185546875, "loss/logits": 0.0340447872877121, "loss/reg": 0.024392733350396156, "step": 1094 }, { "epoch": 0.5475, "grad_norm": 1.944296956062317, "grad_norm_var": 0.07231965473971678, "learning_rate": 2e-05, "loss": 0.4836, "loss/crossentropy": 2.2027004957199097, "loss/hidden": 0.2060546875, "loss/logits": 0.03367648273706436, "loss/reg": 0.024390380829572678, "step": 1095 }, { "epoch": 0.548, "grad_norm": 1.5446677207946777, "grad_norm_var": 0.0645622226297327, "learning_rate": 2e-05, "loss": 0.4729, "loss/crossentropy": 2.347012758255005, "loss/hidden": 0.19189453125, "loss/logits": 0.03708443604409695, "loss/reg": 0.024387938901782036, "step": 1096 }, { "epoch": 0.5485, "grad_norm": 1.898376703262329, "grad_norm_var": 0.06536252751224628, "learning_rate": 2e-05, "loss": 0.4712, "loss/crossentropy": 2.2295031547546387, "loss/hidden": 0.1904296875, "loss/logits": 0.03696603327989578, "loss/reg": 0.02438538894057274, "step": 1097 }, { "epoch": 0.549, "grad_norm": 2.1913044452667236, "grad_norm_var": 0.08085664370673058, "learning_rate": 2e-05, "loss": 0.5301, "loss/crossentropy": 2.167301833629608, "loss/hidden": 0.2392578125, "loss/logits": 0.04704119265079498, "loss/reg": 0.024382859468460083, "step": 1098 }, { "epoch": 0.5495, "grad_norm": 1.253875970840454, "grad_norm_var": 0.0902515637472618, "learning_rate": 2e-05, "loss": 0.4156, "loss/crossentropy": 2.4487051963806152, "loss/hidden": 0.1435546875, "loss/logits": 0.028273213654756546, "loss/reg": 0.024380315095186234, "step": 1099 }, { "epoch": 0.55, "grad_norm": 1.5682874917984009, "grad_norm_var": 0.0899961499541573, "learning_rate": 2e-05, "loss": 0.4756, "loss/crossentropy": 2.167448401451111, "loss/hidden": 0.19580078125, "loss/logits": 0.03600800037384033, "loss/reg": 0.024377938359975815, "step": 1100 }, { "epoch": 0.5505, "grad_norm": 1.4174175262451172, "grad_norm_var": 0.09075445922031561, "learning_rate": 2e-05, "loss": 0.4288, "loss/crossentropy": 2.5550715923309326, "loss/hidden": 0.1572265625, "loss/logits": 0.02786921989172697, "loss/reg": 0.02437533624470234, "step": 1101 }, { "epoch": 0.551, "grad_norm": 1.3593388795852661, "grad_norm_var": 0.07877827873621565, "learning_rate": 2e-05, "loss": 0.4321, "loss/crossentropy": 2.7613970041275024, "loss/hidden": 0.15625, "loss/logits": 0.032078905031085014, "loss/reg": 0.024372844025492668, "step": 1102 }, { "epoch": 0.5515, "grad_norm": 1.4599738121032715, "grad_norm_var": 0.07465265183359193, "learning_rate": 2e-05, "loss": 0.4598, "loss/crossentropy": 2.228444457054138, "loss/hidden": 0.19140625, "loss/logits": 0.024730762466788292, "loss/reg": 0.024370355531573296, "step": 1103 }, { "epoch": 0.552, "grad_norm": 1.7234889268875122, "grad_norm_var": 0.07339957389561243, "learning_rate": 2e-05, "loss": 0.4523, "loss/crossentropy": 2.0966050028800964, "loss/hidden": 0.179443359375, "loss/logits": 0.029166480526328087, "loss/reg": 0.024367934092879295, "step": 1104 }, { "epoch": 0.5525, "grad_norm": 1.4158674478530884, "grad_norm_var": 0.06417161394244413, "learning_rate": 2e-05, "loss": 0.4406, "loss/crossentropy": 2.503642201423645, "loss/hidden": 0.166015625, "loss/logits": 0.030960144475102425, "loss/reg": 0.024365652352571487, "step": 1105 }, { "epoch": 0.553, "grad_norm": 1.1149797439575195, "grad_norm_var": 0.08117155153877267, "learning_rate": 2e-05, "loss": 0.4326, "loss/crossentropy": 2.4113690853118896, "loss/hidden": 0.1611328125, "loss/logits": 0.02782568149268627, "loss/reg": 0.02436315082013607, "step": 1106 }, { "epoch": 0.5535, "grad_norm": 1.5629899501800537, "grad_norm_var": 0.07965819036262284, "learning_rate": 2e-05, "loss": 0.469, "loss/crossentropy": 2.565138816833496, "loss/hidden": 0.1904296875, "loss/logits": 0.03499189019203186, "loss/reg": 0.02436099573969841, "step": 1107 }, { "epoch": 0.554, "grad_norm": 1.3787304162979126, "grad_norm_var": 0.07359921908290872, "learning_rate": 2e-05, "loss": 0.4357, "loss/crossentropy": 2.3302866220474243, "loss/hidden": 0.16015625, "loss/logits": 0.031937687657773495, "loss/reg": 0.024358561262488365, "step": 1108 }, { "epoch": 0.5545, "grad_norm": 1.5682357549667358, "grad_norm_var": 0.07358856061888677, "learning_rate": 2e-05, "loss": 0.4596, "loss/crossentropy": 2.4188989400863647, "loss/hidden": 0.18505859375, "loss/logits": 0.031012317165732384, "loss/reg": 0.024356119334697723, "step": 1109 }, { "epoch": 0.555, "grad_norm": 1.7214398384094238, "grad_norm_var": 0.0752147876360846, "learning_rate": 2e-05, "loss": 0.4862, "loss/crossentropy": 1.7631773948669434, "loss/hidden": 0.21533203125, "loss/logits": 0.027333957143127918, "loss/reg": 0.024353839457035065, "step": 1110 }, { "epoch": 0.5555, "grad_norm": 1.6694806814193726, "grad_norm_var": 0.06622747638215376, "learning_rate": 2e-05, "loss": 0.4914, "loss/crossentropy": 2.0741612911224365, "loss/hidden": 0.21142578125, "loss/logits": 0.03643079940229654, "loss/reg": 0.02435164712369442, "step": 1111 }, { "epoch": 0.556, "grad_norm": 1.240812063217163, "grad_norm_var": 0.0723367202665381, "learning_rate": 2e-05, "loss": 0.4424, "loss/crossentropy": 2.454365372657776, "loss/hidden": 0.16455078125, "loss/logits": 0.034358324483036995, "loss/reg": 0.024349192157387733, "step": 1112 }, { "epoch": 0.5565, "grad_norm": 1.4514116048812866, "grad_norm_var": 0.0631099103755652, "learning_rate": 2e-05, "loss": 0.4955, "loss/crossentropy": 2.4008172750473022, "loss/hidden": 0.2060546875, "loss/logits": 0.04597476311028004, "loss/reg": 0.024346981197595596, "step": 1113 }, { "epoch": 0.557, "grad_norm": 1.3681151866912842, "grad_norm_var": 0.030255623557352607, "learning_rate": 2e-05, "loss": 0.4851, "loss/crossentropy": 2.3843729496002197, "loss/hidden": 0.20068359375, "loss/logits": 0.040942758321762085, "loss/reg": 0.02434452809393406, "step": 1114 }, { "epoch": 0.5575, "grad_norm": 1.3606462478637695, "grad_norm_var": 0.028109850014208366, "learning_rate": 2e-05, "loss": 0.469, "loss/crossentropy": 2.2223979234695435, "loss/hidden": 0.18994140625, "loss/logits": 0.035602279007434845, "loss/reg": 0.024342484772205353, "step": 1115 }, { "epoch": 0.558, "grad_norm": 1.6001321077346802, "grad_norm_var": 0.02862738311728966, "learning_rate": 2e-05, "loss": 0.4578, "loss/crossentropy": 2.2725006341934204, "loss/hidden": 0.18017578125, "loss/logits": 0.03423896711319685, "loss/reg": 0.024340493604540825, "step": 1116 }, { "epoch": 0.5585, "grad_norm": 1.2636990547180176, "grad_norm_var": 0.031044949777116432, "learning_rate": 2e-05, "loss": 0.451, "loss/crossentropy": 2.3660874366760254, "loss/hidden": 0.17333984375, "loss/logits": 0.03424760699272156, "loss/reg": 0.02433803491294384, "step": 1117 }, { "epoch": 0.559, "grad_norm": 1.3616007566452026, "grad_norm_var": 0.03101680909424142, "learning_rate": 2e-05, "loss": 0.4927, "loss/crossentropy": 2.1571322679519653, "loss/hidden": 0.205078125, "loss/logits": 0.044297466054558754, "loss/reg": 0.024335812777280807, "step": 1118 }, { "epoch": 0.5595, "grad_norm": 2.230315685272217, "grad_norm_var": 0.06873493913916656, "learning_rate": 2e-05, "loss": 0.4905, "loss/crossentropy": 2.5267653465270996, "loss/hidden": 0.201171875, "loss/logits": 0.045963168144226074, "loss/reg": 0.024333106353878975, "step": 1119 }, { "epoch": 0.56, "grad_norm": 1.7945393323898315, "grad_norm_var": 0.071148731844346, "learning_rate": 2e-05, "loss": 0.4996, "loss/crossentropy": 2.512783646583557, "loss/hidden": 0.2177734375, "loss/logits": 0.03850918263196945, "loss/reg": 0.024330556392669678, "step": 1120 }, { "epoch": 0.5605, "grad_norm": 1.9248079061508179, "grad_norm_var": 0.08119155521753, "learning_rate": 2e-05, "loss": 0.5227, "loss/crossentropy": 2.2634390592575073, "loss/hidden": 0.23681640625, "loss/logits": 0.04262538440525532, "loss/reg": 0.024327831342816353, "step": 1121 }, { "epoch": 0.561, "grad_norm": 6.669369697570801, "grad_norm_var": 1.695929746004041, "learning_rate": 2e-05, "loss": 0.9722, "loss/crossentropy": 2.4415574073791504, "loss/hidden": 0.544921875, "loss/logits": 0.1840246431529522, "loss/reg": 0.024325016885995865, "step": 1122 }, { "epoch": 0.5615, "grad_norm": 1.315581202507019, "grad_norm_var": 1.7103908959366814, "learning_rate": 2e-05, "loss": 0.4449, "loss/crossentropy": 2.355056047439575, "loss/hidden": 0.171875, "loss/logits": 0.02983579970896244, "loss/reg": 0.024322576820850372, "step": 1123 }, { "epoch": 0.562, "grad_norm": 2.665045976638794, "grad_norm_var": 1.729558453751204, "learning_rate": 2e-05, "loss": 0.5175, "loss/crossentropy": 2.4953508377075195, "loss/hidden": 0.2314453125, "loss/logits": 0.042819553054869175, "loss/reg": 0.024320153519511223, "step": 1124 }, { "epoch": 0.5625, "grad_norm": 1.9144386053085327, "grad_norm_var": 1.71941199935234, "learning_rate": 2e-05, "loss": 0.5167, "loss/crossentropy": 2.513652205467224, "loss/hidden": 0.21826171875, "loss/logits": 0.0552450567483902, "loss/reg": 0.024317733943462372, "step": 1125 }, { "epoch": 0.563, "grad_norm": 2.0490505695343018, "grad_norm_var": 1.715176762349163, "learning_rate": 2e-05, "loss": 0.4345, "loss/crossentropy": 2.4084017276763916, "loss/hidden": 0.1591796875, "loss/logits": 0.032121747732162476, "loss/reg": 0.02431519515812397, "step": 1126 }, { "epoch": 0.5635, "grad_norm": 1.4523192644119263, "grad_norm_var": 1.7274754574344684, "learning_rate": 2e-05, "loss": 0.4548, "loss/crossentropy": 2.2959285974502563, "loss/hidden": 0.17724609375, "loss/logits": 0.03444240428507328, "loss/reg": 0.0243125818669796, "step": 1127 }, { "epoch": 0.564, "grad_norm": 1.5287126302719116, "grad_norm_var": 1.704324322007363, "learning_rate": 2e-05, "loss": 0.4667, "loss/crossentropy": 2.4687604904174805, "loss/hidden": 0.1904296875, "loss/logits": 0.03319636359810829, "loss/reg": 0.024309968575835228, "step": 1128 }, { "epoch": 0.5645, "grad_norm": 1.7322133779525757, "grad_norm_var": 1.6888306469406487, "learning_rate": 2e-05, "loss": 0.4384, "loss/crossentropy": 2.46881103515625, "loss/hidden": 0.16552734375, "loss/logits": 0.029783966951072216, "loss/reg": 0.02430731989443302, "step": 1129 }, { "epoch": 0.565, "grad_norm": 1.6788828372955322, "grad_norm_var": 1.6680869393205444, "learning_rate": 2e-05, "loss": 0.4634, "loss/crossentropy": 2.3073936700820923, "loss/hidden": 0.189453125, "loss/logits": 0.030876665376126766, "loss/reg": 0.02430490031838417, "step": 1130 }, { "epoch": 0.5655, "grad_norm": 1.89357328414917, "grad_norm_var": 1.638002930492539, "learning_rate": 2e-05, "loss": 0.4736, "loss/crossentropy": 2.3977789878845215, "loss/hidden": 0.19287109375, "loss/logits": 0.037660510279238224, "loss/reg": 0.024302346631884575, "step": 1131 }, { "epoch": 0.566, "grad_norm": 1.9893137216567993, "grad_norm_var": 1.6232357375180981, "learning_rate": 2e-05, "loss": 0.5389, "loss/crossentropy": 2.2793599367141724, "loss/hidden": 0.2470703125, "loss/logits": 0.04884297959506512, "loss/reg": 0.02429981529712677, "step": 1132 }, { "epoch": 0.5665, "grad_norm": 1.7860678434371948, "grad_norm_var": 1.5826367428758024, "learning_rate": 2e-05, "loss": 0.4758, "loss/crossentropy": 2.448815107345581, "loss/hidden": 0.19921875, "loss/logits": 0.033569784834980965, "loss/reg": 0.024297522380948067, "step": 1133 }, { "epoch": 0.567, "grad_norm": 1.0841586589813232, "grad_norm_var": 1.615654748481629, "learning_rate": 2e-05, "loss": 0.4351, "loss/crossentropy": 2.4790775775909424, "loss/hidden": 0.1630859375, "loss/logits": 0.02909463830292225, "loss/reg": 0.024295024573802948, "step": 1134 }, { "epoch": 0.5675, "grad_norm": 1.4538543224334717, "grad_norm_var": 1.6405455106021212, "learning_rate": 2e-05, "loss": 0.4484, "loss/crossentropy": 2.262555480003357, "loss/hidden": 0.17529296875, "loss/logits": 0.03016512282192707, "loss/reg": 0.024292735382914543, "step": 1135 }, { "epoch": 0.568, "grad_norm": 1.4780546426773071, "grad_norm_var": 1.657933535725162, "learning_rate": 2e-05, "loss": 0.4848, "loss/crossentropy": 2.3877243995666504, "loss/hidden": 0.20361328125, "loss/logits": 0.03827337175607681, "loss/reg": 0.024290479719638824, "step": 1136 }, { "epoch": 0.5685, "grad_norm": 1.9372481107711792, "grad_norm_var": 1.6577546853387217, "learning_rate": 2e-05, "loss": 0.5327, "loss/crossentropy": 2.5047531127929688, "loss/hidden": 0.24462890625, "loss/logits": 0.045230258256196976, "loss/reg": 0.02428818680346012, "step": 1137 }, { "epoch": 0.569, "grad_norm": 1.5927648544311523, "grad_norm_var": 0.13445619453283364, "learning_rate": 2e-05, "loss": 0.4097, "loss/crossentropy": 2.340881109237671, "loss/hidden": 0.14013671875, "loss/logits": 0.026726843789219856, "loss/reg": 0.024285737425088882, "step": 1138 }, { "epoch": 0.5695, "grad_norm": 1.7248023748397827, "grad_norm_var": 0.12274966628292004, "learning_rate": 2e-05, "loss": 0.4644, "loss/crossentropy": 2.322643995285034, "loss/hidden": 0.19189453125, "loss/logits": 0.029679549857974052, "loss/reg": 0.024283410981297493, "step": 1139 }, { "epoch": 0.57, "grad_norm": 1.2972499132156372, "grad_norm_var": 0.07234907048121096, "learning_rate": 2e-05, "loss": 0.4139, "loss/crossentropy": 2.3335630893707275, "loss/hidden": 0.14794921875, "loss/logits": 0.023186037316918373, "loss/reg": 0.024281039834022522, "step": 1140 }, { "epoch": 0.5705, "grad_norm": 1.733296513557434, "grad_norm_var": 0.06830394569533192, "learning_rate": 2e-05, "loss": 0.4458, "loss/crossentropy": 2.41671085357666, "loss/hidden": 0.17236328125, "loss/logits": 0.0306707676500082, "loss/reg": 0.024278564378619194, "step": 1141 }, { "epoch": 0.571, "grad_norm": 1.893109679222107, "grad_norm_var": 0.06154171256225324, "learning_rate": 2e-05, "loss": 0.5721, "loss/crossentropy": 2.1943784952163696, "loss/hidden": 0.2763671875, "loss/logits": 0.053009962663054466, "loss/reg": 0.02427605725824833, "step": 1142 }, { "epoch": 0.5715, "grad_norm": 1.6730619668960571, "grad_norm_var": 0.05903454724422227, "learning_rate": 2e-05, "loss": 0.5033, "loss/crossentropy": 2.3536800146102905, "loss/hidden": 0.21337890625, "loss/logits": 0.047144461423158646, "loss/reg": 0.024273628368973732, "step": 1143 }, { "epoch": 0.572, "grad_norm": 1.3953866958618164, "grad_norm_var": 0.06238648029036706, "learning_rate": 2e-05, "loss": 0.4842, "loss/crossentropy": 2.157352328300476, "loss/hidden": 0.20556640625, "loss/logits": 0.03593774512410164, "loss/reg": 0.024271195754408836, "step": 1144 }, { "epoch": 0.5725, "grad_norm": 2.192866325378418, "grad_norm_var": 0.0809172906121536, "learning_rate": 2e-05, "loss": 0.531, "loss/crossentropy": 2.570547342300415, "loss/hidden": 0.2392578125, "loss/logits": 0.0490174125880003, "loss/reg": 0.024268826469779015, "step": 1145 }, { "epoch": 0.573, "grad_norm": 1.5369664430618286, "grad_norm_var": 0.08210695127014726, "learning_rate": 2e-05, "loss": 0.4454, "loss/crossentropy": 2.435948967933655, "loss/hidden": 0.17431640625, "loss/logits": 0.028402727097272873, "loss/reg": 0.024266386404633522, "step": 1146 }, { "epoch": 0.5735, "grad_norm": 2.309359550476074, "grad_norm_var": 0.10550807519647355, "learning_rate": 2e-05, "loss": 0.4775, "loss/crossentropy": 2.4030030965805054, "loss/hidden": 0.19775390625, "loss/logits": 0.03707532212138176, "loss/reg": 0.024263978004455566, "step": 1147 }, { "epoch": 0.574, "grad_norm": 1.347505807876587, "grad_norm_var": 0.10584021840658688, "learning_rate": 2e-05, "loss": 0.442, "loss/crossentropy": 2.398911237716675, "loss/hidden": 0.1650390625, "loss/logits": 0.034335775300860405, "loss/reg": 0.02426161989569664, "step": 1148 }, { "epoch": 0.5745, "grad_norm": 1.4021785259246826, "grad_norm_var": 0.10820061974491917, "learning_rate": 2e-05, "loss": 0.4522, "loss/crossentropy": 2.36569881439209, "loss/hidden": 0.17822265625, "loss/logits": 0.031414832919836044, "loss/reg": 0.024259256199002266, "step": 1149 }, { "epoch": 0.575, "grad_norm": 1.4795446395874023, "grad_norm_var": 0.08928821772785417, "learning_rate": 2e-05, "loss": 0.4868, "loss/crossentropy": 2.4005424976348877, "loss/hidden": 0.20458984375, "loss/logits": 0.03963397815823555, "loss/reg": 0.024256786331534386, "step": 1150 }, { "epoch": 0.5755, "grad_norm": 1.2517884969711304, "grad_norm_var": 0.09720427256013545, "learning_rate": 2e-05, "loss": 0.4264, "loss/crossentropy": 2.4282373189926147, "loss/hidden": 0.15771484375, "loss/logits": 0.026178008876740932, "loss/reg": 0.02425423264503479, "step": 1151 }, { "epoch": 0.576, "grad_norm": 1.7204208374023438, "grad_norm_var": 0.0956317930189319, "learning_rate": 2e-05, "loss": 0.546, "loss/crossentropy": 2.2745801210403442, "loss/hidden": 0.25244140625, "loss/logits": 0.05107201635837555, "loss/reg": 0.024251526221632957, "step": 1152 }, { "epoch": 0.5765, "grad_norm": 1.5777848958969116, "grad_norm_var": 0.09020256568864984, "learning_rate": 2e-05, "loss": 0.5091, "loss/crossentropy": 2.4117172956466675, "loss/hidden": 0.2265625, "loss/logits": 0.040081385523080826, "loss/reg": 0.02424911968410015, "step": 1153 }, { "epoch": 0.577, "grad_norm": 1.783171534538269, "grad_norm_var": 0.09144687374157971, "learning_rate": 2e-05, "loss": 0.4786, "loss/crossentropy": 2.454118490219116, "loss/hidden": 0.1962890625, "loss/logits": 0.039798869751393795, "loss/reg": 0.024246560409665108, "step": 1154 }, { "epoch": 0.5775, "grad_norm": 2.023660659790039, "grad_norm_var": 0.10021283785235559, "learning_rate": 2e-05, "loss": 0.4482, "loss/crossentropy": 2.292509913444519, "loss/hidden": 0.1748046875, "loss/logits": 0.030983050353825092, "loss/reg": 0.0242440365254879, "step": 1155 }, { "epoch": 0.578, "grad_norm": 1.811361312866211, "grad_norm_var": 0.09162067235455892, "learning_rate": 2e-05, "loss": 0.447, "loss/crossentropy": 2.323517084121704, "loss/hidden": 0.171875, "loss/logits": 0.03270300664007664, "loss/reg": 0.024241575971245766, "step": 1156 }, { "epoch": 0.5785, "grad_norm": 1.7037845849990845, "grad_norm_var": 0.09152723245676046, "learning_rate": 2e-05, "loss": 0.4631, "loss/crossentropy": 2.222510814666748, "loss/hidden": 0.19140625, "loss/logits": 0.029271118342876434, "loss/reg": 0.024239055812358856, "step": 1157 }, { "epoch": 0.579, "grad_norm": 1.464499592781067, "grad_norm_var": 0.09162285800122252, "learning_rate": 2e-05, "loss": 0.4244, "loss/crossentropy": 2.477970004081726, "loss/hidden": 0.158203125, "loss/logits": 0.023815092630684376, "loss/reg": 0.024236636236310005, "step": 1158 }, { "epoch": 0.5795, "grad_norm": 1.6984034776687622, "grad_norm_var": 0.09168319422315055, "learning_rate": 2e-05, "loss": 0.4446, "loss/crossentropy": 2.3342912197113037, "loss/hidden": 0.1796875, "loss/logits": 0.022567021660506725, "loss/reg": 0.024234119802713394, "step": 1159 }, { "epoch": 0.58, "grad_norm": 2.0293402671813965, "grad_norm_var": 0.09370210145535408, "learning_rate": 2e-05, "loss": 0.4915, "loss/crossentropy": 2.2998613119125366, "loss/hidden": 0.20556640625, "loss/logits": 0.043666526675224304, "loss/reg": 0.024231692776083946, "step": 1160 }, { "epoch": 0.5805, "grad_norm": 1.280202865600586, "grad_norm_var": 0.08679439278309259, "learning_rate": 2e-05, "loss": 0.4539, "loss/crossentropy": 2.2879260778427124, "loss/hidden": 0.173828125, "loss/logits": 0.03776852414011955, "loss/reg": 0.024229243397712708, "step": 1161 }, { "epoch": 0.581, "grad_norm": 1.275728464126587, "grad_norm_var": 0.09504035923803802, "learning_rate": 2e-05, "loss": 0.4329, "loss/crossentropy": 2.229547381401062, "loss/hidden": 0.162109375, "loss/logits": 0.028550241142511368, "loss/reg": 0.024226877838373184, "step": 1162 }, { "epoch": 0.5815, "grad_norm": 2.9225845336914062, "grad_norm_var": 0.17368750923172507, "learning_rate": 2e-05, "loss": 0.4976, "loss/crossentropy": 2.2666972875595093, "loss/hidden": 0.22021484375, "loss/logits": 0.03513254597783089, "loss/reg": 0.024224402382969856, "step": 1163 }, { "epoch": 0.582, "grad_norm": 1.4278773069381714, "grad_norm_var": 0.1706005194348809, "learning_rate": 2e-05, "loss": 0.427, "loss/crossentropy": 2.4950149059295654, "loss/hidden": 0.15576171875, "loss/logits": 0.028986497782170773, "loss/reg": 0.024221867322921753, "step": 1164 }, { "epoch": 0.5825, "grad_norm": 2.058894395828247, "grad_norm_var": 0.17338003347078695, "learning_rate": 2e-05, "loss": 0.463, "loss/crossentropy": 2.4152116775512695, "loss/hidden": 0.185546875, "loss/logits": 0.035301932133734226, "loss/reg": 0.02421954646706581, "step": 1165 }, { "epoch": 0.583, "grad_norm": 1.8373039960861206, "grad_norm_var": 0.1699421495295475, "learning_rate": 2e-05, "loss": 0.4683, "loss/crossentropy": 2.154408037662506, "loss/hidden": 0.1943359375, "loss/logits": 0.03181672282516956, "loss/reg": 0.024217093363404274, "step": 1166 }, { "epoch": 0.5835, "grad_norm": 1.720379114151001, "grad_norm_var": 0.15305819839326673, "learning_rate": 2e-05, "loss": 0.4963, "loss/crossentropy": 2.205121636390686, "loss/hidden": 0.2060546875, "loss/logits": 0.0481159882619977, "loss/reg": 0.024214772507548332, "step": 1167 }, { "epoch": 0.584, "grad_norm": 1.5766927003860474, "grad_norm_var": 0.155317874758835, "learning_rate": 2e-05, "loss": 0.4537, "loss/crossentropy": 2.2504332065582275, "loss/hidden": 0.17626953125, "loss/logits": 0.03528860583901405, "loss/reg": 0.024212457239627838, "step": 1168 }, { "epoch": 0.5845, "grad_norm": 1.7561485767364502, "grad_norm_var": 0.15292574466172837, "learning_rate": 2e-05, "loss": 0.5043, "loss/crossentropy": 2.2984803915023804, "loss/hidden": 0.224609375, "loss/logits": 0.037589056417346, "loss/reg": 0.02421003021299839, "step": 1169 }, { "epoch": 0.585, "grad_norm": 1.8103101253509521, "grad_norm_var": 0.15300812172836042, "learning_rate": 2e-05, "loss": 0.4426, "loss/crossentropy": 2.35384738445282, "loss/hidden": 0.17236328125, "loss/logits": 0.028152812272310257, "loss/reg": 0.02420770935714245, "step": 1170 }, { "epoch": 0.5855, "grad_norm": 1.8475137948989868, "grad_norm_var": 0.1491030967858634, "learning_rate": 2e-05, "loss": 0.4867, "loss/crossentropy": 2.5606281757354736, "loss/hidden": 0.20654296875, "loss/logits": 0.03815155662596226, "loss/reg": 0.024205291643738747, "step": 1171 }, { "epoch": 0.586, "grad_norm": 1.2912832498550415, "grad_norm_var": 0.16271106748652428, "learning_rate": 2e-05, "loss": 0.4484, "loss/crossentropy": 2.3711254596710205, "loss/hidden": 0.1728515625, "loss/logits": 0.0335617596283555, "loss/reg": 0.02420296147465706, "step": 1172 }, { "epoch": 0.5865, "grad_norm": 1.606691837310791, "grad_norm_var": 0.16365658036543582, "learning_rate": 2e-05, "loss": 0.4691, "loss/crossentropy": 2.2857288122177124, "loss/hidden": 0.18408203125, "loss/logits": 0.04296381585299969, "loss/reg": 0.02420070767402649, "step": 1173 }, { "epoch": 0.587, "grad_norm": 1.4397433996200562, "grad_norm_var": 0.16455554628546304, "learning_rate": 2e-05, "loss": 0.4426, "loss/crossentropy": 2.4021177291870117, "loss/hidden": 0.16796875, "loss/logits": 0.03261144831776619, "loss/reg": 0.024198230355978012, "step": 1174 }, { "epoch": 0.5875, "grad_norm": 1.5037596225738525, "grad_norm_var": 0.1675797787548589, "learning_rate": 2e-05, "loss": 0.4517, "loss/crossentropy": 2.4898879528045654, "loss/hidden": 0.1767578125, "loss/logits": 0.03298352472484112, "loss/reg": 0.024195775389671326, "step": 1175 }, { "epoch": 0.588, "grad_norm": 1.6262377500534058, "grad_norm_var": 0.16065407055809539, "learning_rate": 2e-05, "loss": 0.4587, "loss/crossentropy": 2.2547478675842285, "loss/hidden": 0.18115234375, "loss/logits": 0.03557092510163784, "loss/reg": 0.02419334463775158, "step": 1176 }, { "epoch": 0.5885, "grad_norm": 1.2082576751708984, "grad_norm_var": 0.16487347300328276, "learning_rate": 2e-05, "loss": 0.4253, "loss/crossentropy": 2.432216763496399, "loss/hidden": 0.15380859375, "loss/logits": 0.029609275981783867, "loss/reg": 0.024190889671444893, "step": 1177 }, { "epoch": 0.589, "grad_norm": 1.4975124597549438, "grad_norm_var": 0.1559385884208209, "learning_rate": 2e-05, "loss": 0.5032, "loss/crossentropy": 2.235932469367981, "loss/hidden": 0.203125, "loss/logits": 0.058187903836369514, "loss/reg": 0.024188483133912086, "step": 1178 }, { "epoch": 0.5895, "grad_norm": 1.8868989944458008, "grad_norm_var": 0.05355658095740689, "learning_rate": 2e-05, "loss": 0.4478, "loss/crossentropy": 2.1847586631774902, "loss/hidden": 0.17626953125, "loss/logits": 0.029623565264046192, "loss/reg": 0.024185974150896072, "step": 1179 }, { "epoch": 0.59, "grad_norm": 1.7675682306289673, "grad_norm_var": 0.05156999438171397, "learning_rate": 2e-05, "loss": 0.5382, "loss/crossentropy": 2.1349618434906006, "loss/hidden": 0.2548828125, "loss/logits": 0.041456746868789196, "loss/reg": 0.024183399975299835, "step": 1180 }, { "epoch": 0.5905, "grad_norm": 1.6287496089935303, "grad_norm_var": 0.03980901680952513, "learning_rate": 2e-05, "loss": 0.4349, "loss/crossentropy": 2.344777226448059, "loss/hidden": 0.16357421875, "loss/logits": 0.02948729507625103, "loss/reg": 0.024180879816412926, "step": 1181 }, { "epoch": 0.591, "grad_norm": 1.8324546813964844, "grad_norm_var": 0.039673420153317276, "learning_rate": 2e-05, "loss": 0.4473, "loss/crossentropy": 2.274307608604431, "loss/hidden": 0.171875, "loss/logits": 0.03365413844585419, "loss/reg": 0.024178462103009224, "step": 1182 }, { "epoch": 0.5915, "grad_norm": 1.8862324953079224, "grad_norm_var": 0.04350154335947139, "learning_rate": 2e-05, "loss": 0.484, "loss/crossentropy": 2.469294786453247, "loss/hidden": 0.20361328125, "loss/logits": 0.03859470225870609, "loss/reg": 0.02417594939470291, "step": 1183 }, { "epoch": 0.592, "grad_norm": 1.8991292715072632, "grad_norm_var": 0.04747638633534521, "learning_rate": 2e-05, "loss": 0.4209, "loss/crossentropy": 2.328023672103882, "loss/hidden": 0.15185546875, "loss/logits": 0.027280107140541077, "loss/reg": 0.024173393845558167, "step": 1184 }, { "epoch": 0.5925, "grad_norm": 2.4212067127227783, "grad_norm_var": 0.08404251009467104, "learning_rate": 2e-05, "loss": 0.602, "loss/crossentropy": 2.2543612718582153, "loss/hidden": 0.314453125, "loss/logits": 0.04579521995037794, "loss/reg": 0.024170896038413048, "step": 1185 }, { "epoch": 0.593, "grad_norm": 1.5243364572525024, "grad_norm_var": 0.08483701742637818, "learning_rate": 2e-05, "loss": 0.4508, "loss/crossentropy": 2.3000658750534058, "loss/hidden": 0.17578125, "loss/logits": 0.03333883360028267, "loss/reg": 0.02416837401688099, "step": 1186 }, { "epoch": 0.5935, "grad_norm": 1.291556477546692, "grad_norm_var": 0.09168008942992945, "learning_rate": 2e-05, "loss": 0.4442, "loss/crossentropy": 2.1959608793258667, "loss/hidden": 0.1748046875, "loss/logits": 0.027781125158071518, "loss/reg": 0.024165915325284004, "step": 1187 }, { "epoch": 0.594, "grad_norm": 2.0484044551849365, "grad_norm_var": 0.09185247402589478, "learning_rate": 2e-05, "loss": 0.4147, "loss/crossentropy": 2.5615549087524414, "loss/hidden": 0.14990234375, "loss/logits": 0.023197302594780922, "loss/reg": 0.02416372112929821, "step": 1188 }, { "epoch": 0.5945, "grad_norm": 1.416138768196106, "grad_norm_var": 0.09628413301188529, "learning_rate": 2e-05, "loss": 0.4125, "loss/crossentropy": 2.387251138687134, "loss/hidden": 0.14697265625, "loss/logits": 0.023900354281067848, "loss/reg": 0.02416159212589264, "step": 1189 }, { "epoch": 0.595, "grad_norm": 1.9145230054855347, "grad_norm_var": 0.0951705943310803, "learning_rate": 2e-05, "loss": 0.4606, "loss/crossentropy": 2.314830780029297, "loss/hidden": 0.1875, "loss/logits": 0.03147210646420717, "loss/reg": 0.02415909618139267, "step": 1190 }, { "epoch": 0.5955, "grad_norm": 2.2240161895751953, "grad_norm_var": 0.10782977301944445, "learning_rate": 2e-05, "loss": 0.4566, "loss/crossentropy": 2.359019637107849, "loss/hidden": 0.18115234375, "loss/logits": 0.033831628039479256, "loss/reg": 0.024156760424375534, "step": 1191 }, { "epoch": 0.596, "grad_norm": 1.3667939901351929, "grad_norm_var": 0.11647627127392604, "learning_rate": 2e-05, "loss": 0.4461, "loss/crossentropy": 2.371762752532959, "loss/hidden": 0.17041015625, "loss/logits": 0.03416162542998791, "loss/reg": 0.024154268205165863, "step": 1192 }, { "epoch": 0.5965, "grad_norm": 1.9000991582870483, "grad_norm_var": 0.09749187838186989, "learning_rate": 2e-05, "loss": 0.4426, "loss/crossentropy": 2.3624730110168457, "loss/hidden": 0.171875, "loss/logits": 0.029253195971250534, "loss/reg": 0.024151787161827087, "step": 1193 }, { "epoch": 0.597, "grad_norm": 2.4538071155548096, "grad_norm_var": 0.11842507530329692, "learning_rate": 2e-05, "loss": 0.5535, "loss/crossentropy": 2.297299027442932, "loss/hidden": 0.2734375, "loss/logits": 0.0385602843016386, "loss/reg": 0.024149475619196892, "step": 1194 }, { "epoch": 0.5975, "grad_norm": 1.380436658859253, "grad_norm_var": 0.13138206675488526, "learning_rate": 2e-05, "loss": 0.4057, "loss/crossentropy": 2.476130962371826, "loss/hidden": 0.13916015625, "loss/logits": 0.025057541206479073, "loss/reg": 0.024147171527147293, "step": 1195 }, { "epoch": 0.598, "grad_norm": 1.3839375972747803, "grad_norm_var": 0.14273622703758423, "learning_rate": 2e-05, "loss": 0.4528, "loss/crossentropy": 2.396567940711975, "loss/hidden": 0.18212890625, "loss/logits": 0.02926408126950264, "loss/reg": 0.024144427850842476, "step": 1196 }, { "epoch": 0.5985, "grad_norm": 1.40784752368927, "grad_norm_var": 0.150409987258331, "learning_rate": 2e-05, "loss": 0.4807, "loss/crossentropy": 2.4952961206436157, "loss/hidden": 0.1962890625, "loss/logits": 0.04295238200575113, "loss/reg": 0.024141840636730194, "step": 1197 }, { "epoch": 0.599, "grad_norm": 1.2504740953445435, "grad_norm_var": 0.16688246301015586, "learning_rate": 2e-05, "loss": 0.4178, "loss/crossentropy": 2.395404577255249, "loss/hidden": 0.1484375, "loss/logits": 0.027952153235673904, "loss/reg": 0.02413935586810112, "step": 1198 }, { "epoch": 0.5995, "grad_norm": 1.3816466331481934, "grad_norm_var": 0.17265834810284506, "learning_rate": 2e-05, "loss": 0.4251, "loss/crossentropy": 2.547404170036316, "loss/hidden": 0.154296875, "loss/logits": 0.029406324960291386, "loss/reg": 0.02413680963218212, "step": 1199 }, { "epoch": 0.6, "grad_norm": 1.5290807485580444, "grad_norm_var": 0.1715902945906383, "learning_rate": 2e-05, "loss": 0.4251, "loss/crossentropy": 2.3598448038101196, "loss/hidden": 0.15283203125, "loss/logits": 0.030883144587278366, "loss/reg": 0.02413429692387581, "step": 1200 }, { "epoch": 0.6005, "grad_norm": 1.1940410137176514, "grad_norm_var": 0.1445797734401556, "learning_rate": 2e-05, "loss": 0.3971, "loss/crossentropy": 2.485979676246643, "loss/hidden": 0.13330078125, "loss/logits": 0.022492852061986923, "loss/reg": 0.024131763726472855, "step": 1201 }, { "epoch": 0.601, "grad_norm": 1.6485071182250977, "grad_norm_var": 0.14422125485489776, "learning_rate": 2e-05, "loss": 0.4846, "loss/crossentropy": 2.373944044113159, "loss/hidden": 0.20458984375, "loss/logits": 0.038744281977415085, "loss/reg": 0.024129167199134827, "step": 1202 }, { "epoch": 0.6015, "grad_norm": 1.1376245021820068, "grad_norm_var": 0.1522781785188527, "learning_rate": 2e-05, "loss": 0.4531, "loss/crossentropy": 2.4254151582717896, "loss/hidden": 0.17529296875, "loss/logits": 0.03653997741639614, "loss/reg": 0.024126648902893066, "step": 1203 }, { "epoch": 0.602, "grad_norm": 1.3931175470352173, "grad_norm_var": 0.14014204164493524, "learning_rate": 2e-05, "loss": 0.4614, "loss/crossentropy": 2.3730632066726685, "loss/hidden": 0.18017578125, "loss/logits": 0.0399714931845665, "loss/reg": 0.024123938754200935, "step": 1204 }, { "epoch": 0.6025, "grad_norm": 1.182810664176941, "grad_norm_var": 0.14806320441701076, "learning_rate": 2e-05, "loss": 0.4377, "loss/crossentropy": 2.3796987533569336, "loss/hidden": 0.1630859375, "loss/logits": 0.033448660746216774, "loss/reg": 0.02412118948996067, "step": 1205 }, { "epoch": 0.603, "grad_norm": 2.454332113265991, "grad_norm_var": 0.19274218238629567, "learning_rate": 2e-05, "loss": 0.5843, "loss/crossentropy": 2.41066837310791, "loss/hidden": 0.2548828125, "loss/logits": 0.08823728933930397, "loss/reg": 0.024118369445204735, "step": 1206 }, { "epoch": 0.6035, "grad_norm": 2.028047561645508, "grad_norm_var": 0.17832881774557302, "learning_rate": 2e-05, "loss": 0.4994, "loss/crossentropy": 2.4406535625457764, "loss/hidden": 0.21826171875, "loss/logits": 0.039943594485521317, "loss/reg": 0.024115748703479767, "step": 1207 }, { "epoch": 0.604, "grad_norm": 1.4872275590896606, "grad_norm_var": 0.17599978463784297, "learning_rate": 2e-05, "loss": 0.4413, "loss/crossentropy": 2.5775226354599, "loss/hidden": 0.16748046875, "loss/logits": 0.03267715871334076, "loss/reg": 0.02411310188472271, "step": 1208 }, { "epoch": 0.6045, "grad_norm": 1.444392204284668, "grad_norm_var": 0.16927527117337202, "learning_rate": 2e-05, "loss": 0.4386, "loss/crossentropy": 2.42952036857605, "loss/hidden": 0.169921875, "loss/logits": 0.02754312101751566, "loss/reg": 0.024110691621899605, "step": 1209 }, { "epoch": 0.605, "grad_norm": 1.3377629518508911, "grad_norm_var": 0.11223377067241286, "learning_rate": 2e-05, "loss": 0.4506, "loss/crossentropy": 2.463944435119629, "loss/hidden": 0.17822265625, "loss/logits": 0.031324658542871475, "loss/reg": 0.024108313024044037, "step": 1210 }, { "epoch": 0.6055, "grad_norm": 1.5005191564559937, "grad_norm_var": 0.11157964006104232, "learning_rate": 2e-05, "loss": 0.4304, "loss/crossentropy": 2.4867637157440186, "loss/hidden": 0.16015625, "loss/logits": 0.02921352256089449, "loss/reg": 0.024105625227093697, "step": 1211 }, { "epoch": 0.606, "grad_norm": 1.792034387588501, "grad_norm_var": 0.1164848223260717, "learning_rate": 2e-05, "loss": 0.5558, "loss/crossentropy": 2.144772946834564, "loss/hidden": 0.25341796875, "loss/logits": 0.0613440815359354, "loss/reg": 0.02410317398607731, "step": 1212 }, { "epoch": 0.6065, "grad_norm": 1.3222495317459106, "grad_norm_var": 0.11811538585086864, "learning_rate": 2e-05, "loss": 0.459, "loss/crossentropy": 2.4880030155181885, "loss/hidden": 0.17724609375, "loss/logits": 0.040756989270448685, "loss/reg": 0.02410070225596428, "step": 1213 }, { "epoch": 0.607, "grad_norm": 1.3210080862045288, "grad_norm_var": 0.11603035562697338, "learning_rate": 2e-05, "loss": 0.4262, "loss/crossentropy": 2.2644035816192627, "loss/hidden": 0.15673828125, "loss/logits": 0.02847316488623619, "loss/reg": 0.024098023772239685, "step": 1214 }, { "epoch": 0.6075, "grad_norm": 1.55643892288208, "grad_norm_var": 0.1149566743584008, "learning_rate": 2e-05, "loss": 0.472, "loss/crossentropy": 2.338138461112976, "loss/hidden": 0.19482421875, "loss/logits": 0.03619702160358429, "loss/reg": 0.024095552042126656, "step": 1215 }, { "epoch": 0.608, "grad_norm": 1.0845917463302612, "grad_norm_var": 0.12680071206606555, "learning_rate": 2e-05, "loss": 0.3949, "loss/crossentropy": 2.3922587633132935, "loss/hidden": 0.13037109375, "loss/logits": 0.023628353141248226, "loss/reg": 0.02409297414124012, "step": 1216 }, { "epoch": 0.6085, "grad_norm": 1.7885349988937378, "grad_norm_var": 0.12520873664582974, "learning_rate": 2e-05, "loss": 0.5574, "loss/crossentropy": 2.473549246788025, "loss/hidden": 0.271484375, "loss/logits": 0.045023126527667046, "loss/reg": 0.024090547114610672, "step": 1217 }, { "epoch": 0.609, "grad_norm": 1.367996096611023, "grad_norm_var": 0.12569242606033507, "learning_rate": 2e-05, "loss": 0.4443, "loss/crossentropy": 2.5233819484710693, "loss/hidden": 0.16015625, "loss/logits": 0.043247487396001816, "loss/reg": 0.024087954312562943, "step": 1218 }, { "epoch": 0.6095, "grad_norm": 2.246495485305786, "grad_norm_var": 0.14712908643756276, "learning_rate": 2e-05, "loss": 0.4786, "loss/crossentropy": 2.3473750352859497, "loss/hidden": 0.2021484375, "loss/logits": 0.035566676408052444, "loss/reg": 0.024085314944386482, "step": 1219 }, { "epoch": 0.61, "grad_norm": 1.4608843326568604, "grad_norm_var": 0.14571195454986466, "learning_rate": 2e-05, "loss": 0.5275, "loss/crossentropy": 2.1911109685897827, "loss/hidden": 0.2412109375, "loss/logits": 0.04546273872256279, "loss/reg": 0.02408267930150032, "step": 1220 }, { "epoch": 0.6105, "grad_norm": 3.359498977661133, "grad_norm_var": 0.3248317660863883, "learning_rate": 2e-05, "loss": 0.5278, "loss/crossentropy": 2.7323907613754272, "loss/hidden": 0.2412109375, "loss/logits": 0.04578916169703007, "loss/reg": 0.024080097675323486, "step": 1221 }, { "epoch": 0.611, "grad_norm": 1.410009741783142, "grad_norm_var": 0.29102285697800256, "learning_rate": 2e-05, "loss": 0.442, "loss/crossentropy": 2.259430766105652, "loss/hidden": 0.173828125, "loss/logits": 0.027424287050962448, "loss/reg": 0.02407745271921158, "step": 1222 }, { "epoch": 0.6115, "grad_norm": 1.7386364936828613, "grad_norm_var": 0.28192935324309565, "learning_rate": 2e-05, "loss": 0.4869, "loss/crossentropy": 2.1112271547317505, "loss/hidden": 0.20556640625, "loss/logits": 0.04061359539628029, "loss/reg": 0.024074768647551537, "step": 1223 }, { "epoch": 0.612, "grad_norm": 1.7512989044189453, "grad_norm_var": 0.28095646018948417, "learning_rate": 2e-05, "loss": 0.4391, "loss/crossentropy": 2.2848275899887085, "loss/hidden": 0.16796875, "loss/logits": 0.030420562252402306, "loss/reg": 0.024072324857115746, "step": 1224 }, { "epoch": 0.6125, "grad_norm": 2.1722567081451416, "grad_norm_var": 0.2936146731009558, "learning_rate": 2e-05, "loss": 0.5109, "loss/crossentropy": 2.3575836420059204, "loss/hidden": 0.22705078125, "loss/logits": 0.043132973834872246, "loss/reg": 0.024069787934422493, "step": 1225 }, { "epoch": 0.613, "grad_norm": 1.7794545888900757, "grad_norm_var": 0.28443734408106686, "learning_rate": 2e-05, "loss": 0.4502, "loss/crossentropy": 2.1393051147460938, "loss/hidden": 0.1845703125, "loss/logits": 0.024946999736130238, "loss/reg": 0.024067340418696404, "step": 1226 }, { "epoch": 0.6135, "grad_norm": 1.1268008947372437, "grad_norm_var": 0.3045137650879551, "learning_rate": 2e-05, "loss": 0.4021, "loss/crossentropy": 2.568060874938965, "loss/hidden": 0.1376953125, "loss/logits": 0.02373245358467102, "loss/reg": 0.02406480722129345, "step": 1227 }, { "epoch": 0.614, "grad_norm": 1.7132948637008667, "grad_norm_var": 0.303986332406373, "learning_rate": 2e-05, "loss": 0.4984, "loss/crossentropy": 2.4376784563064575, "loss/hidden": 0.2138671875, "loss/logits": 0.04392072185873985, "loss/reg": 0.024062197655439377, "step": 1228 }, { "epoch": 0.6145, "grad_norm": 1.6476460695266724, "grad_norm_var": 0.29421634520029155, "learning_rate": 2e-05, "loss": 0.4641, "loss/crossentropy": 2.292167067527771, "loss/hidden": 0.1865234375, "loss/logits": 0.03701779432594776, "loss/reg": 0.024059604853391647, "step": 1229 }, { "epoch": 0.615, "grad_norm": 1.1690088510513306, "grad_norm_var": 0.3037526654890541, "learning_rate": 2e-05, "loss": 0.4071, "loss/crossentropy": 2.5657061338424683, "loss/hidden": 0.13818359375, "loss/logits": 0.028342257253825665, "loss/reg": 0.024057114496827126, "step": 1230 }, { "epoch": 0.6155, "grad_norm": 1.7636140584945679, "grad_norm_var": 0.3021712089508715, "learning_rate": 2e-05, "loss": 0.484, "loss/crossentropy": 2.354526996612549, "loss/hidden": 0.19970703125, "loss/logits": 0.04370002634823322, "loss/reg": 0.02405458688735962, "step": 1231 }, { "epoch": 0.616, "grad_norm": 1.546012282371521, "grad_norm_var": 0.2761551623079915, "learning_rate": 2e-05, "loss": 0.4528, "loss/crossentropy": 2.3835500478744507, "loss/hidden": 0.17333984375, "loss/logits": 0.03897825721651316, "loss/reg": 0.02405191771686077, "step": 1232 }, { "epoch": 0.6165, "grad_norm": 2.489821434020996, "grad_norm_var": 0.3102538412663264, "learning_rate": 2e-05, "loss": 0.486, "loss/crossentropy": 2.604992389678955, "loss/hidden": 0.20751953125, "loss/logits": 0.037946032360196114, "loss/reg": 0.024049216881394386, "step": 1233 }, { "epoch": 0.617, "grad_norm": 1.2768479585647583, "grad_norm_var": 0.31597976978417625, "learning_rate": 2e-05, "loss": 0.4123, "loss/crossentropy": 2.3154995441436768, "loss/hidden": 0.14697265625, "loss/logits": 0.02490917406976223, "loss/reg": 0.02404674142599106, "step": 1234 }, { "epoch": 0.6175, "grad_norm": 4.095790386199951, "grad_norm_var": 0.6421038174808378, "learning_rate": 2e-05, "loss": 0.6366, "loss/crossentropy": 2.2328860759735107, "loss/hidden": 0.34619140625, "loss/logits": 0.049947988241910934, "loss/reg": 0.024044139310717583, "step": 1235 }, { "epoch": 0.618, "grad_norm": 1.500967264175415, "grad_norm_var": 0.6398237315745594, "learning_rate": 2e-05, "loss": 0.4614, "loss/crossentropy": 2.3703516721725464, "loss/hidden": 0.18408203125, "loss/logits": 0.03693939931690693, "loss/reg": 0.02404148131608963, "step": 1236 }, { "epoch": 0.6185, "grad_norm": 2.2723183631896973, "grad_norm_var": 0.5034082078181905, "learning_rate": 2e-05, "loss": 0.5442, "loss/crossentropy": 2.221264958381653, "loss/hidden": 0.26318359375, "loss/logits": 0.04067422728985548, "loss/reg": 0.02403891831636429, "step": 1237 }, { "epoch": 0.619, "grad_norm": 2.6256821155548096, "grad_norm_var": 0.5259378567608592, "learning_rate": 2e-05, "loss": 0.4112, "loss/crossentropy": 2.5045779943466187, "loss/hidden": 0.14794921875, "loss/logits": 0.022931482642889023, "loss/reg": 0.024036424234509468, "step": 1238 }, { "epoch": 0.6195, "grad_norm": 1.0845694541931152, "grad_norm_var": 0.5682165874069398, "learning_rate": 2e-05, "loss": 0.4171, "loss/crossentropy": 2.3470133543014526, "loss/hidden": 0.14794921875, "loss/logits": 0.028764693066477776, "loss/reg": 0.024033887311816216, "step": 1239 }, { "epoch": 0.62, "grad_norm": 1.6361056566238403, "grad_norm_var": 0.5709606356025133, "learning_rate": 2e-05, "loss": 0.5352, "loss/crossentropy": 2.156785488128662, "loss/hidden": 0.24951171875, "loss/logits": 0.04537991248071194, "loss/reg": 0.024031352251768112, "step": 1240 }, { "epoch": 0.6205, "grad_norm": 1.6204231977462769, "grad_norm_var": 0.5676626713635791, "learning_rate": 2e-05, "loss": 0.4345, "loss/crossentropy": 2.4386643171310425, "loss/hidden": 0.162109375, "loss/logits": 0.032054854556918144, "loss/reg": 0.02402876876294613, "step": 1241 }, { "epoch": 0.621, "grad_norm": 1.1746337413787842, "grad_norm_var": 0.5949463432824259, "learning_rate": 2e-05, "loss": 0.4244, "loss/crossentropy": 2.3799376487731934, "loss/hidden": 0.154296875, "loss/logits": 0.029872726649045944, "loss/reg": 0.0240262970328331, "step": 1242 }, { "epoch": 0.6215, "grad_norm": 1.6356711387634277, "grad_norm_var": 0.5656939566181675, "learning_rate": 2e-05, "loss": 0.4244, "loss/crossentropy": 2.3845585584640503, "loss/hidden": 0.15380859375, "loss/logits": 0.030385269783437252, "loss/reg": 0.02402365952730179, "step": 1243 }, { "epoch": 0.622, "grad_norm": 2.1370480060577393, "grad_norm_var": 0.5704204269581301, "learning_rate": 2e-05, "loss": 0.5108, "loss/crossentropy": 2.528768539428711, "loss/hidden": 0.22216796875, "loss/logits": 0.048394979909062386, "loss/reg": 0.024021117016673088, "step": 1244 }, { "epoch": 0.6225, "grad_norm": 2.0357038974761963, "grad_norm_var": 0.569115940961103, "learning_rate": 2e-05, "loss": 0.4838, "loss/crossentropy": 2.294468402862549, "loss/hidden": 0.20751953125, "loss/logits": 0.03613162599503994, "loss/reg": 0.024018656462430954, "step": 1245 }, { "epoch": 0.623, "grad_norm": 1.5724185705184937, "grad_norm_var": 0.5410974439999165, "learning_rate": 2e-05, "loss": 0.4462, "loss/crossentropy": 2.4699219465255737, "loss/hidden": 0.1728515625, "loss/logits": 0.033166331239044666, "loss/reg": 0.02401614561676979, "step": 1246 }, { "epoch": 0.6235, "grad_norm": 1.199653148651123, "grad_norm_var": 0.5715490275335109, "learning_rate": 2e-05, "loss": 0.4286, "loss/crossentropy": 2.542737126350403, "loss/hidden": 0.158203125, "loss/logits": 0.03024892695248127, "loss/reg": 0.024013692513108253, "step": 1247 }, { "epoch": 0.624, "grad_norm": 1.2950655221939087, "grad_norm_var": 0.5862912521386784, "learning_rate": 2e-05, "loss": 0.4532, "loss/crossentropy": 2.0858335494995117, "loss/hidden": 0.18017578125, "loss/logits": 0.03286417946219444, "loss/reg": 0.024011155590415, "step": 1248 }, { "epoch": 0.6245, "grad_norm": 1.3316272497177124, "grad_norm_var": 0.5718334494050034, "learning_rate": 2e-05, "loss": 0.4137, "loss/crossentropy": 2.394113779067993, "loss/hidden": 0.146484375, "loss/logits": 0.027119265869259834, "loss/reg": 0.02400875836610794, "step": 1249 }, { "epoch": 0.625, "grad_norm": 1.247865915298462, "grad_norm_var": 0.5738337715934628, "learning_rate": 2e-05, "loss": 0.4395, "loss/crossentropy": 2.2250888347625732, "loss/hidden": 0.16845703125, "loss/logits": 0.030949266627430916, "loss/reg": 0.024006184190511703, "step": 1250 }, { "epoch": 0.6255, "grad_norm": 1.4751828908920288, "grad_norm_var": 0.1935716929082652, "learning_rate": 2e-05, "loss": 0.4572, "loss/crossentropy": 2.395260810852051, "loss/hidden": 0.18212890625, "loss/logits": 0.03504170663654804, "loss/reg": 0.024003824219107628, "step": 1251 }, { "epoch": 0.626, "grad_norm": 1.179787516593933, "grad_norm_var": 0.20491551538845407, "learning_rate": 2e-05, "loss": 0.3969, "loss/crossentropy": 2.4494906663894653, "loss/hidden": 0.134765625, "loss/logits": 0.022070709615945816, "loss/reg": 0.02400144934654236, "step": 1252 }, { "epoch": 0.6265, "grad_norm": 1.586988925933838, "grad_norm_var": 0.17240025072078843, "learning_rate": 2e-05, "loss": 0.4791, "loss/crossentropy": 2.367736339569092, "loss/hidden": 0.20263671875, "loss/logits": 0.036459170281887054, "loss/reg": 0.023999080061912537, "step": 1253 }, { "epoch": 0.627, "grad_norm": 1.408430576324463, "grad_norm_var": 0.0908129483057038, "learning_rate": 2e-05, "loss": 0.4469, "loss/crossentropy": 2.5444475412368774, "loss/hidden": 0.17578125, "loss/logits": 0.03117147646844387, "loss/reg": 0.023996589705348015, "step": 1254 }, { "epoch": 0.6275, "grad_norm": 1.3537817001342773, "grad_norm_var": 0.08128065351453409, "learning_rate": 2e-05, "loss": 0.4433, "loss/crossentropy": 2.5234625339508057, "loss/hidden": 0.1748046875, "loss/logits": 0.028503548353910446, "loss/reg": 0.023994173854589462, "step": 1255 }, { "epoch": 0.628, "grad_norm": 1.5888077020645142, "grad_norm_var": 0.08051893249327732, "learning_rate": 2e-05, "loss": 0.5085, "loss/crossentropy": 2.2406824827194214, "loss/hidden": 0.22509765625, "loss/logits": 0.043473441153764725, "loss/reg": 0.023991703987121582, "step": 1256 }, { "epoch": 0.6285, "grad_norm": 1.1863782405853271, "grad_norm_var": 0.08475685961338304, "learning_rate": 2e-05, "loss": 0.4252, "loss/crossentropy": 2.3977235555648804, "loss/hidden": 0.16015625, "loss/logits": 0.025112398900091648, "loss/reg": 0.023989345878362656, "step": 1257 }, { "epoch": 0.629, "grad_norm": 2.7917957305908203, "grad_norm_var": 0.18601559285113065, "learning_rate": 2e-05, "loss": 0.4964, "loss/crossentropy": 2.582550048828125, "loss/hidden": 0.20849609375, "loss/logits": 0.0480042677372694, "loss/reg": 0.023987185209989548, "step": 1258 }, { "epoch": 0.6295, "grad_norm": 1.227421760559082, "grad_norm_var": 0.1925385294557105, "learning_rate": 2e-05, "loss": 0.4472, "loss/crossentropy": 2.3612314462661743, "loss/hidden": 0.1728515625, "loss/logits": 0.034451963379979134, "loss/reg": 0.023985007777810097, "step": 1259 }, { "epoch": 0.63, "grad_norm": 1.3271315097808838, "grad_norm_var": 0.1689130153916018, "learning_rate": 2e-05, "loss": 0.4098, "loss/crossentropy": 2.2717262506484985, "loss/hidden": 0.146484375, "loss/logits": 0.023538900539278984, "loss/reg": 0.023982524871826172, "step": 1260 }, { "epoch": 0.6305, "grad_norm": 1.5784635543823242, "grad_norm_var": 0.1485889910484635, "learning_rate": 2e-05, "loss": 0.4908, "loss/crossentropy": 2.5256478786468506, "loss/hidden": 0.21044921875, "loss/logits": 0.040558042004704475, "loss/reg": 0.02397996559739113, "step": 1261 }, { "epoch": 0.631, "grad_norm": 1.4419437646865845, "grad_norm_var": 0.14768726273588845, "learning_rate": 2e-05, "loss": 0.453, "loss/crossentropy": 2.2362372875213623, "loss/hidden": 0.18310546875, "loss/logits": 0.030156176537275314, "loss/reg": 0.02397749572992325, "step": 1262 }, { "epoch": 0.6315, "grad_norm": 1.3559249639511108, "grad_norm_var": 0.14397081070207676, "learning_rate": 2e-05, "loss": 0.4499, "loss/crossentropy": 2.6084084510803223, "loss/hidden": 0.1767578125, "loss/logits": 0.03335867449641228, "loss/reg": 0.023975025862455368, "step": 1263 }, { "epoch": 0.632, "grad_norm": 1.8681645393371582, "grad_norm_var": 0.1518160274302981, "learning_rate": 2e-05, "loss": 0.4988, "loss/crossentropy": 2.302277684211731, "loss/hidden": 0.2177734375, "loss/logits": 0.04134911857545376, "loss/reg": 0.023972423747181892, "step": 1264 }, { "epoch": 0.6325, "grad_norm": 1.2972159385681152, "grad_norm_var": 0.15264813462290375, "learning_rate": 2e-05, "loss": 0.4542, "loss/crossentropy": 2.126034438610077, "loss/hidden": 0.1806640625, "loss/logits": 0.03383249044418335, "loss/reg": 0.02396974340081215, "step": 1265 }, { "epoch": 0.633, "grad_norm": 1.1746324300765991, "grad_norm_var": 0.1553935858025509, "learning_rate": 2e-05, "loss": 0.4072, "loss/crossentropy": 2.2992480993270874, "loss/hidden": 0.14208984375, "loss/logits": 0.025420350953936577, "loss/reg": 0.023967038840055466, "step": 1266 }, { "epoch": 0.6335, "grad_norm": 1.0678731203079224, "grad_norm_var": 0.16657406511629347, "learning_rate": 2e-05, "loss": 0.4074, "loss/crossentropy": 2.2471213340759277, "loss/hidden": 0.14208984375, "loss/logits": 0.025705378502607346, "loss/reg": 0.02396426908671856, "step": 1267 }, { "epoch": 0.634, "grad_norm": 2.6190547943115234, "grad_norm_var": 0.24137234026006044, "learning_rate": 2e-05, "loss": 0.5394, "loss/crossentropy": 2.6888530254364014, "loss/hidden": 0.24853515625, "loss/logits": 0.05120135098695755, "loss/reg": 0.023961780592799187, "step": 1268 }, { "epoch": 0.6345, "grad_norm": 1.5800697803497314, "grad_norm_var": 0.24134547552578448, "learning_rate": 2e-05, "loss": 0.4279, "loss/crossentropy": 2.289852738380432, "loss/hidden": 0.15966796875, "loss/logits": 0.028674802742898464, "loss/reg": 0.023959312587976456, "step": 1269 }, { "epoch": 0.635, "grad_norm": 1.474374532699585, "grad_norm_var": 0.240335642083797, "learning_rate": 2e-05, "loss": 0.4526, "loss/crossentropy": 2.3954397439956665, "loss/hidden": 0.1826171875, "loss/logits": 0.030411606654524803, "loss/reg": 0.023956701159477234, "step": 1270 }, { "epoch": 0.6355, "grad_norm": 1.6741186380386353, "grad_norm_var": 0.2380131997486006, "learning_rate": 2e-05, "loss": 0.5338, "loss/crossentropy": 2.3165799379348755, "loss/hidden": 0.2578125, "loss/logits": 0.03643801715224981, "loss/reg": 0.023954056203365326, "step": 1271 }, { "epoch": 0.636, "grad_norm": 1.4911454916000366, "grad_norm_var": 0.23847295627966883, "learning_rate": 2e-05, "loss": 0.4419, "loss/crossentropy": 2.3574637174606323, "loss/hidden": 0.17431640625, "loss/logits": 0.028055937960743904, "loss/reg": 0.023951426148414612, "step": 1272 }, { "epoch": 0.6365, "grad_norm": 1.3399804830551147, "grad_norm_var": 0.23204516308295423, "learning_rate": 2e-05, "loss": 0.4296, "loss/crossentropy": 2.2917098999023438, "loss/hidden": 0.1650390625, "loss/logits": 0.02510044164955616, "loss/reg": 0.02394864708185196, "step": 1273 }, { "epoch": 0.637, "grad_norm": 1.1677303314208984, "grad_norm_var": 0.13488639573788433, "learning_rate": 2e-05, "loss": 0.4029, "loss/crossentropy": 2.272668480873108, "loss/hidden": 0.143798828125, "loss/logits": 0.019617602229118347, "loss/reg": 0.023945819586515427, "step": 1274 }, { "epoch": 0.6375, "grad_norm": 1.7385436296463013, "grad_norm_var": 0.13397879899451534, "learning_rate": 2e-05, "loss": 0.5288, "loss/crossentropy": 2.3414204120635986, "loss/hidden": 0.2470703125, "loss/logits": 0.04233134910464287, "loss/reg": 0.023943088948726654, "step": 1275 }, { "epoch": 0.638, "grad_norm": 0.977463960647583, "grad_norm_var": 0.15025223921002726, "learning_rate": 2e-05, "loss": 0.403, "loss/crossentropy": 2.4587652683258057, "loss/hidden": 0.13916015625, "loss/logits": 0.02442883513867855, "loss/reg": 0.023940427228808403, "step": 1276 }, { "epoch": 0.6385, "grad_norm": 2.247265577316284, "grad_norm_var": 0.18605952102636442, "learning_rate": 2e-05, "loss": 0.4763, "loss/crossentropy": 2.393397808074951, "loss/hidden": 0.20703125, "loss/logits": 0.029887165874242783, "loss/reg": 0.02393791824579239, "step": 1277 }, { "epoch": 0.639, "grad_norm": 1.2421715259552002, "grad_norm_var": 0.19095842498212073, "learning_rate": 2e-05, "loss": 0.4199, "loss/crossentropy": 2.3398871421813965, "loss/hidden": 0.1494140625, "loss/logits": 0.031092578545212746, "loss/reg": 0.023935388773679733, "step": 1278 }, { "epoch": 0.6395, "grad_norm": 1.630374550819397, "grad_norm_var": 0.1896718089495029, "learning_rate": 2e-05, "loss": 0.4178, "loss/crossentropy": 2.3470832109451294, "loss/hidden": 0.15234375, "loss/logits": 0.026134072802960873, "loss/reg": 0.023932764306664467, "step": 1279 }, { "epoch": 0.64, "grad_norm": 1.426863670349121, "grad_norm_var": 0.1823510070964786, "learning_rate": 2e-05, "loss": 0.4318, "loss/crossentropy": 2.281801223754883, "loss/hidden": 0.16259765625, "loss/logits": 0.029915660619735718, "loss/reg": 0.023930255323648453, "step": 1280 }, { "epoch": 0.6405, "grad_norm": 1.5419303178787231, "grad_norm_var": 0.17917366497501538, "learning_rate": 2e-05, "loss": 0.4594, "loss/crossentropy": 2.4495433568954468, "loss/hidden": 0.18359375, "loss/logits": 0.036524929106235504, "loss/reg": 0.023927820846438408, "step": 1281 }, { "epoch": 0.641, "grad_norm": 2.1934878826141357, "grad_norm_var": 0.19651069564279305, "learning_rate": 2e-05, "loss": 0.5702, "loss/crossentropy": 2.097387194633484, "loss/hidden": 0.2783203125, "loss/logits": 0.052668359130620956, "loss/reg": 0.02392534911632538, "step": 1282 }, { "epoch": 0.6415, "grad_norm": 1.5347496271133423, "grad_norm_var": 0.17773874075004978, "learning_rate": 2e-05, "loss": 0.4635, "loss/crossentropy": 2.2910990715026855, "loss/hidden": 0.19140625, "loss/logits": 0.032878163270652294, "loss/reg": 0.02392282895743847, "step": 1283 }, { "epoch": 0.642, "grad_norm": 1.4668471813201904, "grad_norm_var": 0.10683961538937149, "learning_rate": 2e-05, "loss": 0.5058, "loss/crossentropy": 2.1385812759399414, "loss/hidden": 0.220703125, "loss/logits": 0.04593625292181969, "loss/reg": 0.023920193314552307, "step": 1284 }, { "epoch": 0.6425, "grad_norm": 2.6525700092315674, "grad_norm_var": 0.1836820315419103, "learning_rate": 2e-05, "loss": 0.4432, "loss/crossentropy": 2.503835439682007, "loss/hidden": 0.173828125, "loss/logits": 0.030207850970327854, "loss/reg": 0.02391754277050495, "step": 1285 }, { "epoch": 0.643, "grad_norm": 1.9956448078155518, "grad_norm_var": 0.19106626883691427, "learning_rate": 2e-05, "loss": 0.4742, "loss/crossentropy": 2.4542654752731323, "loss/hidden": 0.1962890625, "loss/logits": 0.038763463497161865, "loss/reg": 0.023914897814393044, "step": 1286 }, { "epoch": 0.6435, "grad_norm": 1.4315065145492554, "grad_norm_var": 0.19380491573572362, "learning_rate": 2e-05, "loss": 0.4204, "loss/crossentropy": 2.544227123260498, "loss/hidden": 0.15234375, "loss/logits": 0.02888611890375614, "loss/reg": 0.023912400007247925, "step": 1287 }, { "epoch": 0.644, "grad_norm": 1.5084730386734009, "grad_norm_var": 0.19350312891037896, "learning_rate": 2e-05, "loss": 0.423, "loss/crossentropy": 2.5178849697113037, "loss/hidden": 0.15478515625, "loss/logits": 0.029108996503055096, "loss/reg": 0.023909782990813255, "step": 1288 }, { "epoch": 0.6445, "grad_norm": 1.9239797592163086, "grad_norm_var": 0.19216031597426284, "learning_rate": 2e-05, "loss": 0.4724, "loss/crossentropy": 2.4222670793533325, "loss/hidden": 0.2021484375, "loss/logits": 0.031173129566013813, "loss/reg": 0.023907171562314034, "step": 1289 }, { "epoch": 0.645, "grad_norm": 1.1635066270828247, "grad_norm_var": 0.19244286753470394, "learning_rate": 2e-05, "loss": 0.4123, "loss/crossentropy": 2.2675434350967407, "loss/hidden": 0.146484375, "loss/logits": 0.026736157946288586, "loss/reg": 0.023904629051685333, "step": 1290 }, { "epoch": 0.6455, "grad_norm": 1.2279921770095825, "grad_norm_var": 0.20387843935832747, "learning_rate": 2e-05, "loss": 0.4294, "loss/crossentropy": 2.409374237060547, "loss/hidden": 0.1591796875, "loss/logits": 0.031166162341833115, "loss/reg": 0.02390221692621708, "step": 1291 }, { "epoch": 0.646, "grad_norm": 1.5634331703186035, "grad_norm_var": 0.17394207919523214, "learning_rate": 2e-05, "loss": 0.4418, "loss/crossentropy": 2.3470332622528076, "loss/hidden": 0.16943359375, "loss/logits": 0.033411881886422634, "loss/reg": 0.023899724707007408, "step": 1292 }, { "epoch": 0.6465, "grad_norm": 1.4021070003509521, "grad_norm_var": 0.15375149805387778, "learning_rate": 2e-05, "loss": 0.4517, "loss/crossentropy": 2.4360326528549194, "loss/hidden": 0.1845703125, "loss/logits": 0.028121494688093662, "loss/reg": 0.023897258564829826, "step": 1293 }, { "epoch": 0.647, "grad_norm": 1.5751771926879883, "grad_norm_var": 0.14394628232820703, "learning_rate": 2e-05, "loss": 0.4148, "loss/crossentropy": 2.363794207572937, "loss/hidden": 0.15087890625, "loss/logits": 0.02497075777500868, "loss/reg": 0.023894891142845154, "step": 1294 }, { "epoch": 0.6475, "grad_norm": 1.5669056177139282, "grad_norm_var": 0.14427878956963974, "learning_rate": 2e-05, "loss": 0.4636, "loss/crossentropy": 2.2907302379608154, "loss/hidden": 0.1884765625, "loss/logits": 0.03620941936969757, "loss/reg": 0.023892676457762718, "step": 1295 }, { "epoch": 0.648, "grad_norm": 2.1094207763671875, "grad_norm_var": 0.154368248754838, "learning_rate": 2e-05, "loss": 0.4793, "loss/crossentropy": 2.2729530334472656, "loss/hidden": 0.20654296875, "loss/logits": 0.033849818632006645, "loss/reg": 0.023890400305390358, "step": 1296 }, { "epoch": 0.6485, "grad_norm": 1.4486722946166992, "grad_norm_var": 0.15661132320615986, "learning_rate": 2e-05, "loss": 0.4313, "loss/crossentropy": 2.376818895339966, "loss/hidden": 0.16796875, "loss/logits": 0.024429542012512684, "loss/reg": 0.02388790063560009, "step": 1297 }, { "epoch": 0.649, "grad_norm": 1.458891749382019, "grad_norm_var": 0.13933691898384565, "learning_rate": 2e-05, "loss": 0.4831, "loss/crossentropy": 2.1321340203285217, "loss/hidden": 0.2080078125, "loss/logits": 0.03623790666460991, "loss/reg": 0.023885508999228477, "step": 1298 }, { "epoch": 0.6495, "grad_norm": 1.397750735282898, "grad_norm_var": 0.1421926325690795, "learning_rate": 2e-05, "loss": 0.4592, "loss/crossentropy": 2.245489716529846, "loss/hidden": 0.181640625, "loss/logits": 0.03869971726089716, "loss/reg": 0.02388302981853485, "step": 1299 }, { "epoch": 0.65, "grad_norm": 2.3553082942962646, "grad_norm_var": 0.1735859217612727, "learning_rate": 2e-05, "loss": 0.5102, "loss/crossentropy": 2.422638416290283, "loss/hidden": 0.22021484375, "loss/logits": 0.051152704283595085, "loss/reg": 0.02388053759932518, "step": 1300 }, { "epoch": 0.6505, "grad_norm": 1.5121291875839233, "grad_norm_var": 0.10604831093349745, "learning_rate": 2e-05, "loss": 0.476, "loss/crossentropy": 2.5624797344207764, "loss/hidden": 0.2021484375, "loss/logits": 0.03510456532239914, "loss/reg": 0.023878419771790504, "step": 1301 }, { "epoch": 0.651, "grad_norm": 1.5881332159042358, "grad_norm_var": 0.09506899424586326, "learning_rate": 2e-05, "loss": 0.4744, "loss/crossentropy": 2.3333781957626343, "loss/hidden": 0.1962890625, "loss/logits": 0.03930900990962982, "loss/reg": 0.023876061663031578, "step": 1302 }, { "epoch": 0.6515, "grad_norm": 1.3511114120483398, "grad_norm_var": 0.0970334796528732, "learning_rate": 2e-05, "loss": 0.4549, "loss/crossentropy": 2.1691489219665527, "loss/hidden": 0.185546875, "loss/logits": 0.030610281974077225, "loss/reg": 0.02387375757098198, "step": 1303 }, { "epoch": 0.652, "grad_norm": 2.0355160236358643, "grad_norm_var": 0.10992582401275346, "learning_rate": 2e-05, "loss": 0.5417, "loss/crossentropy": 2.3282041549682617, "loss/hidden": 0.27099609375, "loss/logits": 0.03201697859913111, "loss/reg": 0.02387123927474022, "step": 1304 }, { "epoch": 0.6525, "grad_norm": 1.1647405624389648, "grad_norm_var": 0.11366288198163511, "learning_rate": 2e-05, "loss": 0.3925, "loss/crossentropy": 2.465924859046936, "loss/hidden": 0.13232421875, "loss/logits": 0.021514427848160267, "loss/reg": 0.02386898547410965, "step": 1305 }, { "epoch": 0.653, "grad_norm": 1.921985387802124, "grad_norm_var": 0.10976873004807407, "learning_rate": 2e-05, "loss": 0.5147, "loss/crossentropy": 2.435685157775879, "loss/hidden": 0.23681640625, "loss/logits": 0.03921514190733433, "loss/reg": 0.02386675402522087, "step": 1306 }, { "epoch": 0.6535, "grad_norm": 1.6400461196899414, "grad_norm_var": 0.09966999048148933, "learning_rate": 2e-05, "loss": 0.4302, "loss/crossentropy": 2.16571307182312, "loss/hidden": 0.16748046875, "loss/logits": 0.024049567990005016, "loss/reg": 0.023864606395363808, "step": 1307 }, { "epoch": 0.654, "grad_norm": 1.3085359334945679, "grad_norm_var": 0.10601720206320617, "learning_rate": 2e-05, "loss": 0.4506, "loss/crossentropy": 2.4350894689559937, "loss/hidden": 0.1787109375, "loss/logits": 0.03322593308985233, "loss/reg": 0.023862628266215324, "step": 1308 }, { "epoch": 0.6545, "grad_norm": 1.7091984748840332, "grad_norm_var": 0.10320339085501071, "learning_rate": 2e-05, "loss": 0.4515, "loss/crossentropy": 2.418308198451996, "loss/hidden": 0.18359375, "loss/logits": 0.02934916317462921, "loss/reg": 0.023860609158873558, "step": 1309 }, { "epoch": 0.655, "grad_norm": 1.2501323223114014, "grad_norm_var": 0.11235482446353538, "learning_rate": 2e-05, "loss": 0.417, "loss/crossentropy": 2.599029541015625, "loss/hidden": 0.1494140625, "loss/logits": 0.0289985379204154, "loss/reg": 0.023858599364757538, "step": 1310 }, { "epoch": 0.6555, "grad_norm": 1.5810778141021729, "grad_norm_var": 0.11227903902704757, "learning_rate": 2e-05, "loss": 0.435, "loss/crossentropy": 2.4957447052001953, "loss/hidden": 0.16748046875, "loss/logits": 0.028951111249625683, "loss/reg": 0.02385612390935421, "step": 1311 }, { "epoch": 0.656, "grad_norm": 1.6085758209228516, "grad_norm_var": 0.09490913098407905, "learning_rate": 2e-05, "loss": 0.4276, "loss/crossentropy": 2.525179862976074, "loss/hidden": 0.1630859375, "loss/logits": 0.02599877305328846, "loss/reg": 0.023853624239563942, "step": 1312 }, { "epoch": 0.6565, "grad_norm": 1.3025941848754883, "grad_norm_var": 0.09886375082411777, "learning_rate": 2e-05, "loss": 0.4164, "loss/crossentropy": 2.2159218788146973, "loss/hidden": 0.15380859375, "loss/logits": 0.02411420363932848, "loss/reg": 0.023851484060287476, "step": 1313 }, { "epoch": 0.657, "grad_norm": 1.7414854764938354, "grad_norm_var": 0.09951370157159824, "learning_rate": 2e-05, "loss": 0.4552, "loss/crossentropy": 2.6034278869628906, "loss/hidden": 0.18359375, "loss/logits": 0.033163596875965595, "loss/reg": 0.023848969489336014, "step": 1314 }, { "epoch": 0.6575, "grad_norm": 1.6796448230743408, "grad_norm_var": 0.0971878321131148, "learning_rate": 2e-05, "loss": 0.5352, "loss/crossentropy": 2.3006917238235474, "loss/hidden": 0.2548828125, "loss/logits": 0.04180280677974224, "loss/reg": 0.02384648472070694, "step": 1315 }, { "epoch": 0.658, "grad_norm": 1.5615240335464478, "grad_norm_var": 0.057622080975028626, "learning_rate": 2e-05, "loss": 0.4302, "loss/crossentropy": 2.188043475151062, "loss/hidden": 0.16455078125, "loss/logits": 0.02720883209258318, "loss/reg": 0.023843981325626373, "step": 1316 }, { "epoch": 0.6585, "grad_norm": 1.1154263019561768, "grad_norm_var": 0.06997817065805308, "learning_rate": 2e-05, "loss": 0.4081, "loss/crossentropy": 2.592913031578064, "loss/hidden": 0.14453125, "loss/logits": 0.025116360746324062, "loss/reg": 0.023841451853513718, "step": 1317 }, { "epoch": 0.659, "grad_norm": 1.5203436613082886, "grad_norm_var": 0.06978498065926123, "learning_rate": 2e-05, "loss": 0.5075, "loss/crossentropy": 2.2861050367355347, "loss/hidden": 0.22412109375, "loss/logits": 0.04499981366097927, "loss/reg": 0.023838885128498077, "step": 1318 }, { "epoch": 0.6595, "grad_norm": 1.2238833904266357, "grad_norm_var": 0.07384394251173394, "learning_rate": 2e-05, "loss": 0.4196, "loss/crossentropy": 2.538287878036499, "loss/hidden": 0.15283203125, "loss/logits": 0.02840554341673851, "loss/reg": 0.023836364969611168, "step": 1319 }, { "epoch": 0.66, "grad_norm": 1.191012978553772, "grad_norm_var": 0.06068536610595358, "learning_rate": 2e-05, "loss": 0.4069, "loss/crossentropy": 2.4569714069366455, "loss/hidden": 0.1455078125, "loss/logits": 0.02309222426265478, "loss/reg": 0.023833919316530228, "step": 1320 }, { "epoch": 0.6605, "grad_norm": 1.649925708770752, "grad_norm_var": 0.0556496711602169, "learning_rate": 2e-05, "loss": 0.4694, "loss/crossentropy": 2.3524088859558105, "loss/hidden": 0.19189453125, "loss/logits": 0.0391565915197134, "loss/reg": 0.023831605911254883, "step": 1321 }, { "epoch": 0.661, "grad_norm": 1.4207836389541626, "grad_norm_var": 0.043172417948901656, "learning_rate": 2e-05, "loss": 0.4519, "loss/crossentropy": 2.5140554904937744, "loss/hidden": 0.1806640625, "loss/logits": 0.03295655734837055, "loss/reg": 0.023829326033592224, "step": 1322 }, { "epoch": 0.6615, "grad_norm": 1.3647384643554688, "grad_norm_var": 0.04163129199955975, "learning_rate": 2e-05, "loss": 0.4128, "loss/crossentropy": 2.530665874481201, "loss/hidden": 0.14990234375, "loss/logits": 0.02467129472643137, "loss/reg": 0.023827021941542625, "step": 1323 }, { "epoch": 0.662, "grad_norm": 1.5273334980010986, "grad_norm_var": 0.0404437201587351, "learning_rate": 2e-05, "loss": 0.4635, "loss/crossentropy": 2.2921979427337646, "loss/hidden": 0.189453125, "loss/logits": 0.03580437693744898, "loss/reg": 0.023824498057365417, "step": 1324 }, { "epoch": 0.6625, "grad_norm": 3.4632437229156494, "grad_norm_var": 0.28973497995353714, "learning_rate": 2e-05, "loss": 0.4845, "loss/crossentropy": 2.3375691175460815, "loss/hidden": 0.2119140625, "loss/logits": 0.03432004339993, "loss/reg": 0.02382197044789791, "step": 1325 }, { "epoch": 0.663, "grad_norm": 1.2499359846115112, "grad_norm_var": 0.28974348968956176, "learning_rate": 2e-05, "loss": 0.4141, "loss/crossentropy": 2.437517523765564, "loss/hidden": 0.15185546875, "loss/logits": 0.024037906900048256, "loss/reg": 0.023819534108042717, "step": 1326 }, { "epoch": 0.6635, "grad_norm": 2.2203030586242676, "grad_norm_var": 0.31579141158708024, "learning_rate": 2e-05, "loss": 0.4227, "loss/crossentropy": 2.547585964202881, "loss/hidden": 0.1591796875, "loss/logits": 0.025345077738165855, "loss/reg": 0.02381698414683342, "step": 1327 }, { "epoch": 0.664, "grad_norm": 1.4417054653167725, "grad_norm_var": 0.317675752358493, "learning_rate": 2e-05, "loss": 0.4383, "loss/crossentropy": 2.5056179761886597, "loss/hidden": 0.16845703125, "loss/logits": 0.03173685912042856, "loss/reg": 0.023814348503947258, "step": 1328 }, { "epoch": 0.6645, "grad_norm": 1.6603224277496338, "grad_norm_var": 0.3112681967737764, "learning_rate": 2e-05, "loss": 0.4843, "loss/crossentropy": 2.1647554636001587, "loss/hidden": 0.2021484375, "loss/logits": 0.043993281200528145, "loss/reg": 0.023811759427189827, "step": 1329 }, { "epoch": 0.665, "grad_norm": 1.6948206424713135, "grad_norm_var": 0.31069182045736876, "learning_rate": 2e-05, "loss": 0.4089, "loss/crossentropy": 2.37821888923645, "loss/hidden": 0.150390625, "loss/logits": 0.020369217731058598, "loss/reg": 0.023809220641851425, "step": 1330 }, { "epoch": 0.6655, "grad_norm": 1.418535828590393, "grad_norm_var": 0.3130177534653304, "learning_rate": 2e-05, "loss": 0.4358, "loss/crossentropy": 2.3562744855880737, "loss/hidden": 0.16552734375, "loss/logits": 0.03218572027981281, "loss/reg": 0.023806730285286903, "step": 1331 }, { "epoch": 0.666, "grad_norm": 2.405161142349243, "grad_norm_var": 0.35230188449184957, "learning_rate": 2e-05, "loss": 0.4954, "loss/crossentropy": 2.5449503660202026, "loss/hidden": 0.212158203125, "loss/logits": 0.04522665124386549, "loss/reg": 0.02380412258207798, "step": 1332 }, { "epoch": 0.6665, "grad_norm": 2.3597934246063232, "grad_norm_var": 0.3586491765370226, "learning_rate": 2e-05, "loss": 0.4877, "loss/crossentropy": 2.3041821718215942, "loss/hidden": 0.208984375, "loss/logits": 0.040694585070014, "loss/reg": 0.02380160056054592, "step": 1333 }, { "epoch": 0.667, "grad_norm": 1.4537216424942017, "grad_norm_var": 0.36086214325715854, "learning_rate": 2e-05, "loss": 0.4747, "loss/crossentropy": 2.504698157310486, "loss/hidden": 0.19921875, "loss/logits": 0.03751707915216684, "loss/reg": 0.023799141868948936, "step": 1334 }, { "epoch": 0.6675, "grad_norm": 1.2887414693832397, "grad_norm_var": 0.3567130361876489, "learning_rate": 2e-05, "loss": 0.4786, "loss/crossentropy": 2.0967178344726562, "loss/hidden": 0.20263671875, "loss/logits": 0.03796843905001879, "loss/reg": 0.023796530440449715, "step": 1335 }, { "epoch": 0.668, "grad_norm": 1.360039234161377, "grad_norm_var": 0.3461683691160814, "learning_rate": 2e-05, "loss": 0.4422, "loss/crossentropy": 2.2293606996536255, "loss/hidden": 0.173828125, "loss/logits": 0.030479850247502327, "loss/reg": 0.02379394881427288, "step": 1336 }, { "epoch": 0.6685, "grad_norm": 1.5283000469207764, "grad_norm_var": 0.34869462176112187, "learning_rate": 2e-05, "loss": 0.4369, "loss/crossentropy": 2.55380380153656, "loss/hidden": 0.16650390625, "loss/logits": 0.032473089173436165, "loss/reg": 0.023791363462805748, "step": 1337 }, { "epoch": 0.669, "grad_norm": 1.3858225345611572, "grad_norm_var": 0.3502641276347217, "learning_rate": 2e-05, "loss": 0.4403, "loss/crossentropy": 2.364560842514038, "loss/hidden": 0.17041015625, "loss/logits": 0.03198127821087837, "loss/reg": 0.02378905564546585, "step": 1338 }, { "epoch": 0.6695, "grad_norm": 1.4333000183105469, "grad_norm_var": 0.3471374399560941, "learning_rate": 2e-05, "loss": 0.4355, "loss/crossentropy": 2.514798641204834, "loss/hidden": 0.1611328125, "loss/logits": 0.03649984207004309, "loss/reg": 0.023786714300513268, "step": 1339 }, { "epoch": 0.67, "grad_norm": 1.49425208568573, "grad_norm_var": 0.34815796148798006, "learning_rate": 2e-05, "loss": 0.4127, "loss/crossentropy": 2.451537013053894, "loss/hidden": 0.14892578125, "loss/logits": 0.025915359146893024, "loss/reg": 0.02378448285162449, "step": 1340 }, { "epoch": 0.6705, "grad_norm": 1.364202618598938, "grad_norm_var": 0.14155822181354907, "learning_rate": 2e-05, "loss": 0.4458, "loss/crossentropy": 2.2742252349853516, "loss/hidden": 0.17822265625, "loss/logits": 0.02972761169075966, "loss/reg": 0.023781999945640564, "step": 1341 }, { "epoch": 0.671, "grad_norm": 1.3675141334533691, "grad_norm_var": 0.1367785272504178, "learning_rate": 2e-05, "loss": 0.4329, "loss/crossentropy": 2.4399064779281616, "loss/hidden": 0.1650390625, "loss/logits": 0.03008042648434639, "loss/reg": 0.02377980761229992, "step": 1342 }, { "epoch": 0.6715, "grad_norm": 1.739666223526001, "grad_norm_var": 0.11257230684105075, "learning_rate": 2e-05, "loss": 0.4296, "loss/crossentropy": 2.3073580265045166, "loss/hidden": 0.1533203125, "loss/logits": 0.0385186281055212, "loss/reg": 0.02377736195921898, "step": 1343 }, { "epoch": 0.672, "grad_norm": 1.364190936088562, "grad_norm_var": 0.11445201509484164, "learning_rate": 2e-05, "loss": 0.4681, "loss/crossentropy": 2.3126569986343384, "loss/hidden": 0.19189453125, "loss/logits": 0.03849446773529053, "loss/reg": 0.02377496473491192, "step": 1344 }, { "epoch": 0.6725, "grad_norm": 1.7589856386184692, "grad_norm_var": 0.11608550666011386, "learning_rate": 2e-05, "loss": 0.4438, "loss/crossentropy": 2.2252254486083984, "loss/hidden": 0.17578125, "loss/logits": 0.030335014685988426, "loss/reg": 0.02377244643867016, "step": 1345 }, { "epoch": 0.673, "grad_norm": 1.5148929357528687, "grad_norm_var": 0.1155597806029616, "learning_rate": 2e-05, "loss": 0.4564, "loss/crossentropy": 2.173453211784363, "loss/hidden": 0.18896484375, "loss/logits": 0.02973311860114336, "loss/reg": 0.023769889026880264, "step": 1346 }, { "epoch": 0.6735, "grad_norm": 1.3687435388565063, "grad_norm_var": 0.11676889873661077, "learning_rate": 2e-05, "loss": 0.4483, "loss/crossentropy": 2.3290340900421143, "loss/hidden": 0.171875, "loss/logits": 0.03875895403325558, "loss/reg": 0.023767419159412384, "step": 1347 }, { "epoch": 0.674, "grad_norm": 3.605093479156494, "grad_norm_var": 0.3397037594268179, "learning_rate": 2e-05, "loss": 0.4573, "loss/crossentropy": 2.528464674949646, "loss/hidden": 0.18505859375, "loss/logits": 0.03462876006960869, "loss/reg": 0.02376495860517025, "step": 1348 }, { "epoch": 0.6745, "grad_norm": 1.2763408422470093, "grad_norm_var": 0.31041857364604836, "learning_rate": 2e-05, "loss": 0.4117, "loss/crossentropy": 2.3279651403427124, "loss/hidden": 0.14892578125, "loss/logits": 0.025109270587563515, "loss/reg": 0.023762483149766922, "step": 1349 }, { "epoch": 0.675, "grad_norm": 1.2447208166122437, "grad_norm_var": 0.3167090932037666, "learning_rate": 2e-05, "loss": 0.4339, "loss/crossentropy": 2.317818284034729, "loss/hidden": 0.16357421875, "loss/logits": 0.03267843183130026, "loss/reg": 0.023760035634040833, "step": 1350 }, { "epoch": 0.6755, "grad_norm": 1.469759225845337, "grad_norm_var": 0.3120066895490725, "learning_rate": 2e-05, "loss": 0.4657, "loss/crossentropy": 2.6747782230377197, "loss/hidden": 0.189453125, "loss/logits": 0.03863661177456379, "loss/reg": 0.023757578805088997, "step": 1351 }, { "epoch": 0.676, "grad_norm": 1.188594937324524, "grad_norm_var": 0.3188659312546353, "learning_rate": 2e-05, "loss": 0.4495, "loss/crossentropy": 2.6325184106826782, "loss/hidden": 0.17578125, "loss/logits": 0.03611903823912144, "loss/reg": 0.0237550251185894, "step": 1352 }, { "epoch": 0.6765, "grad_norm": 1.4543743133544922, "grad_norm_var": 0.3196088985917853, "learning_rate": 2e-05, "loss": 0.4425, "loss/crossentropy": 2.461496353149414, "loss/hidden": 0.17236328125, "loss/logits": 0.032596323639154434, "loss/reg": 0.02375258132815361, "step": 1353 }, { "epoch": 0.677, "grad_norm": 1.183428406715393, "grad_norm_var": 0.32698827229071603, "learning_rate": 2e-05, "loss": 0.4135, "loss/crossentropy": 2.411842107772827, "loss/hidden": 0.14892578125, "loss/logits": 0.02709823753684759, "loss/reg": 0.023749923333525658, "step": 1354 }, { "epoch": 0.6775, "grad_norm": 1.2251843214035034, "grad_norm_var": 0.3329822256302141, "learning_rate": 2e-05, "loss": 0.4623, "loss/crossentropy": 2.385651111602783, "loss/hidden": 0.18603515625, "loss/logits": 0.03881765343248844, "loss/reg": 0.023747442290186882, "step": 1355 }, { "epoch": 0.678, "grad_norm": 1.8396114110946655, "grad_norm_var": 0.3383879160154535, "learning_rate": 2e-05, "loss": 0.5556, "loss/crossentropy": 2.159119963645935, "loss/hidden": 0.27587890625, "loss/logits": 0.042316026985645294, "loss/reg": 0.023744840174913406, "step": 1356 }, { "epoch": 0.6785, "grad_norm": 1.6769040822982788, "grad_norm_var": 0.33632199932469997, "learning_rate": 2e-05, "loss": 0.4362, "loss/crossentropy": 2.420010805130005, "loss/hidden": 0.17041015625, "loss/logits": 0.028409303165972233, "loss/reg": 0.023742124438285828, "step": 1357 }, { "epoch": 0.679, "grad_norm": 1.4979515075683594, "grad_norm_var": 0.3336920570721417, "learning_rate": 2e-05, "loss": 0.4697, "loss/crossentropy": 2.002126097679138, "loss/hidden": 0.197265625, "loss/logits": 0.03505042381584644, "loss/reg": 0.02373962290585041, "step": 1358 }, { "epoch": 0.6795, "grad_norm": 1.339608907699585, "grad_norm_var": 0.3356063743636861, "learning_rate": 2e-05, "loss": 0.4591, "loss/crossentropy": 2.4204870462417603, "loss/hidden": 0.18408203125, "loss/logits": 0.03763877786695957, "loss/reg": 0.02373688668012619, "step": 1359 }, { "epoch": 0.68, "grad_norm": 2.6153082847595215, "grad_norm_var": 0.4002688084625047, "learning_rate": 2e-05, "loss": 0.5644, "loss/crossentropy": 1.8696808218955994, "loss/hidden": 0.27783203125, "loss/logits": 0.049255505204200745, "loss/reg": 0.023734180256724358, "step": 1360 }, { "epoch": 0.6805, "grad_norm": 1.7975633144378662, "grad_norm_var": 0.4009675788079647, "learning_rate": 2e-05, "loss": 0.4502, "loss/crossentropy": 2.283734917640686, "loss/hidden": 0.1806640625, "loss/logits": 0.03226998262107372, "loss/reg": 0.02373143844306469, "step": 1361 }, { "epoch": 0.681, "grad_norm": 1.3946635723114014, "grad_norm_var": 0.40393475291140984, "learning_rate": 2e-05, "loss": 0.4437, "loss/crossentropy": 2.3782224655151367, "loss/hidden": 0.1728515625, "loss/logits": 0.03351980075240135, "loss/reg": 0.02372862957417965, "step": 1362 }, { "epoch": 0.6815, "grad_norm": 1.5255178213119507, "grad_norm_var": 0.39988194537197613, "learning_rate": 2e-05, "loss": 0.4427, "loss/crossentropy": 2.39896559715271, "loss/hidden": 0.171875, "loss/logits": 0.03352793958038092, "loss/reg": 0.023726122453808784, "step": 1363 }, { "epoch": 0.682, "grad_norm": 1.2733867168426514, "grad_norm_var": 0.13058789078406388, "learning_rate": 2e-05, "loss": 0.4295, "loss/crossentropy": 2.261076331138611, "loss/hidden": 0.162109375, "loss/logits": 0.03012457862496376, "loss/reg": 0.02372356690466404, "step": 1364 }, { "epoch": 0.6825, "grad_norm": 1.837705135345459, "grad_norm_var": 0.1335292862046036, "learning_rate": 2e-05, "loss": 0.4557, "loss/crossentropy": 2.247922897338867, "loss/hidden": 0.1865234375, "loss/logits": 0.03198765777051449, "loss/reg": 0.02372095361351967, "step": 1365 }, { "epoch": 0.683, "grad_norm": 1.5753334760665894, "grad_norm_var": 0.1275530359959636, "learning_rate": 2e-05, "loss": 0.4808, "loss/crossentropy": 2.2536725997924805, "loss/hidden": 0.2060546875, "loss/logits": 0.03755863197147846, "loss/reg": 0.02371850237250328, "step": 1366 }, { "epoch": 0.6835, "grad_norm": 1.447576642036438, "grad_norm_var": 0.12783865842738631, "learning_rate": 2e-05, "loss": 0.4403, "loss/crossentropy": 2.3656728267669678, "loss/hidden": 0.16357421875, "loss/logits": 0.039546214044094086, "loss/reg": 0.023716144263744354, "step": 1367 }, { "epoch": 0.684, "grad_norm": 1.3603750467300415, "grad_norm_var": 0.12130121846223171, "learning_rate": 2e-05, "loss": 0.4643, "loss/crossentropy": 2.415152430534363, "loss/hidden": 0.1865234375, "loss/logits": 0.04059493914246559, "loss/reg": 0.02371359057724476, "step": 1368 }, { "epoch": 0.6845, "grad_norm": 1.0393022298812866, "grad_norm_var": 0.13820691270152227, "learning_rate": 2e-05, "loss": 0.3977, "loss/crossentropy": 2.226056694984436, "loss/hidden": 0.1357421875, "loss/logits": 0.024835828691720963, "loss/reg": 0.02371094562113285, "step": 1369 }, { "epoch": 0.685, "grad_norm": 1.7829720973968506, "grad_norm_var": 0.1322215247018124, "learning_rate": 2e-05, "loss": 0.5041, "loss/crossentropy": 2.510174036026001, "loss/hidden": 0.21533203125, "loss/logits": 0.0516891460865736, "loss/reg": 0.02370813861489296, "step": 1370 }, { "epoch": 0.6855, "grad_norm": 1.4974333047866821, "grad_norm_var": 0.12409001917904922, "learning_rate": 2e-05, "loss": 0.4361, "loss/crossentropy": 2.45763623714447, "loss/hidden": 0.16943359375, "loss/logits": 0.029570632614195347, "loss/reg": 0.02370576746761799, "step": 1371 }, { "epoch": 0.686, "grad_norm": 2.463162660598755, "grad_norm_var": 0.16882568198071363, "learning_rate": 2e-05, "loss": 0.5276, "loss/crossentropy": 2.1458094120025635, "loss/hidden": 0.2373046875, "loss/logits": 0.05326741002500057, "loss/reg": 0.02370315231382847, "step": 1372 }, { "epoch": 0.6865, "grad_norm": 2.5386321544647217, "grad_norm_var": 0.2203043192597228, "learning_rate": 2e-05, "loss": 0.5752, "loss/crossentropy": 2.3038665056228638, "loss/hidden": 0.2705078125, "loss/logits": 0.06766052544116974, "loss/reg": 0.023700760677456856, "step": 1373 }, { "epoch": 0.687, "grad_norm": 1.279981255531311, "grad_norm_var": 0.2287580151051623, "learning_rate": 2e-05, "loss": 0.4214, "loss/crossentropy": 2.436690330505371, "loss/hidden": 0.14892578125, "loss/logits": 0.035530680790543556, "loss/reg": 0.02369816228747368, "step": 1374 }, { "epoch": 0.6875, "grad_norm": 1.230238676071167, "grad_norm_var": 0.2343678483688691, "learning_rate": 2e-05, "loss": 0.4142, "loss/crossentropy": 2.427309036254883, "loss/hidden": 0.14892578125, "loss/logits": 0.028325392864644527, "loss/reg": 0.02369537763297558, "step": 1375 }, { "epoch": 0.688, "grad_norm": 2.1449315547943115, "grad_norm_var": 0.18867092664802806, "learning_rate": 2e-05, "loss": 0.4724, "loss/crossentropy": 2.393447160720825, "loss/hidden": 0.20361328125, "loss/logits": 0.0318912947550416, "loss/reg": 0.02369256503880024, "step": 1376 }, { "epoch": 0.6885, "grad_norm": 1.614142894744873, "grad_norm_var": 0.18684194347558922, "learning_rate": 2e-05, "loss": 0.5406, "loss/crossentropy": 2.102261245250702, "loss/hidden": 0.24853515625, "loss/logits": 0.05518599599599838, "loss/reg": 0.02369013801217079, "step": 1377 }, { "epoch": 0.689, "grad_norm": 1.2378525733947754, "grad_norm_var": 0.1932017017733111, "learning_rate": 2e-05, "loss": 0.4366, "loss/crossentropy": 2.2186710834503174, "loss/hidden": 0.16943359375, "loss/logits": 0.0302474033087492, "loss/reg": 0.02368772216141224, "step": 1378 }, { "epoch": 0.6895, "grad_norm": 1.3566957712173462, "grad_norm_var": 0.1970092361753761, "learning_rate": 2e-05, "loss": 0.3982, "loss/crossentropy": 2.546470046043396, "loss/hidden": 0.13525390625, "loss/logits": 0.026141813024878502, "loss/reg": 0.02368505485355854, "step": 1379 }, { "epoch": 0.69, "grad_norm": 1.2005629539489746, "grad_norm_var": 0.2005604341405349, "learning_rate": 2e-05, "loss": 0.4418, "loss/crossentropy": 2.2552963495254517, "loss/hidden": 0.17333984375, "loss/logits": 0.03166076820343733, "loss/reg": 0.023682620376348495, "step": 1380 }, { "epoch": 0.6905, "grad_norm": 1.9562398195266724, "grad_norm_var": 0.20518861482912196, "learning_rate": 2e-05, "loss": 0.5098, "loss/crossentropy": 2.2495819330215454, "loss/hidden": 0.23095703125, "loss/logits": 0.0420466773211956, "loss/reg": 0.023680146783590317, "step": 1381 }, { "epoch": 0.691, "grad_norm": 1.4204621315002441, "grad_norm_var": 0.20735892064978187, "learning_rate": 2e-05, "loss": 0.4234, "loss/crossentropy": 2.263777256011963, "loss/hidden": 0.15966796875, "loss/logits": 0.026978014037013054, "loss/reg": 0.02367776446044445, "step": 1382 }, { "epoch": 0.6915, "grad_norm": 2.1433403491973877, "grad_norm_var": 0.22364496503635584, "learning_rate": 2e-05, "loss": 0.443, "loss/crossentropy": 2.3404159545898438, "loss/hidden": 0.183349609375, "loss/logits": 0.0229120384901762, "loss/reg": 0.023675233125686646, "step": 1383 }, { "epoch": 0.692, "grad_norm": 1.4612318277359009, "grad_norm_var": 0.22049831846723483, "learning_rate": 2e-05, "loss": 0.435, "loss/crossentropy": 2.3787938356399536, "loss/hidden": 0.16943359375, "loss/logits": 0.028827445581555367, "loss/reg": 0.023672768846154213, "step": 1384 }, { "epoch": 0.6925, "grad_norm": 1.356377124786377, "grad_norm_var": 0.20105030555048078, "learning_rate": 2e-05, "loss": 0.4449, "loss/crossentropy": 2.473434090614319, "loss/hidden": 0.1767578125, "loss/logits": 0.0314208772033453, "loss/reg": 0.02367040514945984, "step": 1385 }, { "epoch": 0.693, "grad_norm": 1.1685643196105957, "grad_norm_var": 0.21520606580289575, "learning_rate": 2e-05, "loss": 0.4541, "loss/crossentropy": 2.138159155845642, "loss/hidden": 0.185546875, "loss/logits": 0.03185593895614147, "loss/reg": 0.023668091744184494, "step": 1386 }, { "epoch": 0.6935, "grad_norm": 1.520918846130371, "grad_norm_var": 0.21482740549679208, "learning_rate": 2e-05, "loss": 0.3933, "loss/crossentropy": 2.549217104911804, "loss/hidden": 0.13720703125, "loss/logits": 0.0194573812186718, "loss/reg": 0.02366561070084572, "step": 1387 }, { "epoch": 0.694, "grad_norm": 1.3178868293762207, "grad_norm_var": 0.16970641122309568, "learning_rate": 2e-05, "loss": 0.4825, "loss/crossentropy": 2.2156635522842407, "loss/hidden": 0.20849609375, "loss/logits": 0.03734987787902355, "loss/reg": 0.02366327866911888, "step": 1388 }, { "epoch": 0.6945, "grad_norm": 1.2805134057998657, "grad_norm_var": 0.10434541468175282, "learning_rate": 2e-05, "loss": 0.4346, "loss/crossentropy": 2.1902356147766113, "loss/hidden": 0.169921875, "loss/logits": 0.028107551857829094, "loss/reg": 0.023660695180296898, "step": 1389 }, { "epoch": 0.695, "grad_norm": 1.4917412996292114, "grad_norm_var": 0.1014830543172114, "learning_rate": 2e-05, "loss": 0.3961, "loss/crossentropy": 2.46234929561615, "loss/hidden": 0.1357421875, "loss/logits": 0.023821561597287655, "loss/reg": 0.023658404126763344, "step": 1390 }, { "epoch": 0.6955, "grad_norm": 1.2825431823730469, "grad_norm_var": 0.09981558763132382, "learning_rate": 2e-05, "loss": 0.4386, "loss/crossentropy": 2.4950649738311768, "loss/hidden": 0.16748046875, "loss/logits": 0.034576233476400375, "loss/reg": 0.02365582063794136, "step": 1391 }, { "epoch": 0.696, "grad_norm": 1.0627645254135132, "grad_norm_var": 0.07953715480548619, "learning_rate": 2e-05, "loss": 0.385, "loss/crossentropy": 2.324121117591858, "loss/hidden": 0.12939453125, "loss/logits": 0.019065213855355978, "loss/reg": 0.0236531812697649, "step": 1392 }, { "epoch": 0.6965, "grad_norm": 1.2363553047180176, "grad_norm_var": 0.079156088219622, "learning_rate": 2e-05, "loss": 0.4086, "loss/crossentropy": 2.692628264427185, "loss/hidden": 0.146240234375, "loss/logits": 0.02587859146296978, "loss/reg": 0.023650668561458588, "step": 1393 }, { "epoch": 0.697, "grad_norm": 1.3195236921310425, "grad_norm_var": 0.07774326246347835, "learning_rate": 2e-05, "loss": 0.4268, "loss/crossentropy": 2.2705594301223755, "loss/hidden": 0.158203125, "loss/logits": 0.0320826917886734, "loss/reg": 0.02364785596728325, "step": 1394 }, { "epoch": 0.6975, "grad_norm": 1.3812922239303589, "grad_norm_var": 0.07760303897853754, "learning_rate": 2e-05, "loss": 0.4493, "loss/crossentropy": 2.4334908723831177, "loss/hidden": 0.17138671875, "loss/logits": 0.04147607646882534, "loss/reg": 0.023645086213946342, "step": 1395 }, { "epoch": 0.698, "grad_norm": 1.3648511171340942, "grad_norm_var": 0.07464701664065293, "learning_rate": 2e-05, "loss": 0.4789, "loss/crossentropy": 2.5334564447402954, "loss/hidden": 0.201171875, "loss/logits": 0.04130409471690655, "loss/reg": 0.02364257536828518, "step": 1396 }, { "epoch": 0.6985, "grad_norm": 1.8778526782989502, "grad_norm_var": 0.0694556142458523, "learning_rate": 2e-05, "loss": 0.4686, "loss/crossentropy": 2.253718376159668, "loss/hidden": 0.19873046875, "loss/logits": 0.033460862934589386, "loss/reg": 0.023639997467398643, "step": 1397 }, { "epoch": 0.699, "grad_norm": 1.0649257898330688, "grad_norm_var": 0.07723400074944091, "learning_rate": 2e-05, "loss": 0.3847, "loss/crossentropy": 2.403126835823059, "loss/hidden": 0.128662109375, "loss/logits": 0.019640752114355564, "loss/reg": 0.0236373171210289, "step": 1398 }, { "epoch": 0.6995, "grad_norm": 0.9858599901199341, "grad_norm_var": 0.04558018881049334, "learning_rate": 2e-05, "loss": 0.3982, "loss/crossentropy": 2.2231950759887695, "loss/hidden": 0.138671875, "loss/logits": 0.02315397746860981, "loss/reg": 0.02363484352827072, "step": 1399 }, { "epoch": 0.7, "grad_norm": 1.3760892152786255, "grad_norm_var": 0.04446770302423217, "learning_rate": 2e-05, "loss": 0.4886, "loss/crossentropy": 2.3370308876037598, "loss/hidden": 0.20654296875, "loss/logits": 0.045770518481731415, "loss/reg": 0.023632274940609932, "step": 1400 }, { "epoch": 0.7005, "grad_norm": 1.633719563484192, "grad_norm_var": 0.050694139558335634, "learning_rate": 2e-05, "loss": 0.4001, "loss/crossentropy": 2.4175291061401367, "loss/hidden": 0.14208984375, "loss/logits": 0.021711762994527817, "loss/reg": 0.023629970848560333, "step": 1401 }, { "epoch": 0.701, "grad_norm": 1.5971498489379883, "grad_norm_var": 0.052644270149189036, "learning_rate": 2e-05, "loss": 0.4603, "loss/crossentropy": 2.2261587381362915, "loss/hidden": 0.19140625, "loss/logits": 0.032638235948979855, "loss/reg": 0.023627305403351784, "step": 1402 }, { "epoch": 0.7015, "grad_norm": 1.2570019960403442, "grad_norm_var": 0.05140971627937218, "learning_rate": 2e-05, "loss": 0.3887, "loss/crossentropy": 2.4443479776382446, "loss/hidden": 0.12939453125, "loss/logits": 0.0230065593495965, "loss/reg": 0.023624898865818977, "step": 1403 }, { "epoch": 0.702, "grad_norm": 1.5167655944824219, "grad_norm_var": 0.05314610912009237, "learning_rate": 2e-05, "loss": 0.4748, "loss/crossentropy": 2.462609887123108, "loss/hidden": 0.20068359375, "loss/logits": 0.037887776270508766, "loss/reg": 0.02362249046564102, "step": 1404 }, { "epoch": 0.7025, "grad_norm": 1.3424351215362549, "grad_norm_var": 0.052745515833931715, "learning_rate": 2e-05, "loss": 0.4595, "loss/crossentropy": 2.2617905139923096, "loss/hidden": 0.18505859375, "loss/logits": 0.03824649378657341, "loss/reg": 0.02361990138888359, "step": 1405 }, { "epoch": 0.703, "grad_norm": 1.2809338569641113, "grad_norm_var": 0.05187429464569006, "learning_rate": 2e-05, "loss": 0.4088, "loss/crossentropy": 2.3717641830444336, "loss/hidden": 0.14306640625, "loss/logits": 0.02956732176244259, "loss/reg": 0.02361760474741459, "step": 1406 }, { "epoch": 0.7035, "grad_norm": 1.7771258354187012, "grad_norm_var": 0.06279631634374751, "learning_rate": 2e-05, "loss": 0.4556, "loss/crossentropy": 2.469625473022461, "loss/hidden": 0.18017578125, "loss/logits": 0.03924528695642948, "loss/reg": 0.02361505851149559, "step": 1407 }, { "epoch": 0.704, "grad_norm": 1.414624571800232, "grad_norm_var": 0.05566685888704838, "learning_rate": 2e-05, "loss": 0.4175, "loss/crossentropy": 2.7455949783325195, "loss/hidden": 0.1552734375, "loss/logits": 0.026056132279336452, "loss/reg": 0.023612603545188904, "step": 1408 }, { "epoch": 0.7045, "grad_norm": 1.3036192655563354, "grad_norm_var": 0.054467126651640524, "learning_rate": 2e-05, "loss": 0.4093, "loss/crossentropy": 2.5232421159744263, "loss/hidden": 0.14501953125, "loss/logits": 0.028133532963693142, "loss/reg": 0.023610040545463562, "step": 1409 }, { "epoch": 0.705, "grad_norm": 1.2797057628631592, "grad_norm_var": 0.05502458620776493, "learning_rate": 2e-05, "loss": 0.4161, "loss/crossentropy": 2.2470325231552124, "loss/hidden": 0.15283203125, "loss/logits": 0.027143074199557304, "loss/reg": 0.023607581853866577, "step": 1410 }, { "epoch": 0.7055, "grad_norm": 1.2794984579086304, "grad_norm_var": 0.05597188755687809, "learning_rate": 2e-05, "loss": 0.4053, "loss/crossentropy": 2.288419008255005, "loss/hidden": 0.14794921875, "loss/logits": 0.021314891055226326, "loss/reg": 0.0236049797385931, "step": 1411 }, { "epoch": 0.706, "grad_norm": 2.200571060180664, "grad_norm_var": 0.0960401931657619, "learning_rate": 2e-05, "loss": 0.6026, "loss/crossentropy": 2.12148916721344, "loss/hidden": 0.30029296875, "loss/logits": 0.06627136748284101, "loss/reg": 0.02360256016254425, "step": 1412 }, { "epoch": 0.7065, "grad_norm": 2.5475215911865234, "grad_norm_var": 0.16233898418936382, "learning_rate": 2e-05, "loss": 0.4245, "loss/crossentropy": 2.7688039541244507, "loss/hidden": 0.15966796875, "loss/logits": 0.02882098313421011, "loss/reg": 0.02360014244914055, "step": 1413 }, { "epoch": 0.707, "grad_norm": 1.3649111986160278, "grad_norm_var": 0.15091742893504806, "learning_rate": 2e-05, "loss": 0.3992, "loss/crossentropy": 2.421576499938965, "loss/hidden": 0.13916015625, "loss/logits": 0.024032247252762318, "loss/reg": 0.023597724735736847, "step": 1414 }, { "epoch": 0.7075, "grad_norm": 1.353563904762268, "grad_norm_var": 0.13367826295360388, "learning_rate": 2e-05, "loss": 0.4107, "loss/crossentropy": 2.5319186449050903, "loss/hidden": 0.14892578125, "loss/logits": 0.02581237070262432, "loss/reg": 0.023595217615365982, "step": 1415 }, { "epoch": 0.708, "grad_norm": 1.1511586904525757, "grad_norm_var": 0.1415410624712725, "learning_rate": 2e-05, "loss": 0.3948, "loss/crossentropy": 2.396964430809021, "loss/hidden": 0.13525390625, "loss/logits": 0.02357430011034012, "loss/reg": 0.023592684417963028, "step": 1416 }, { "epoch": 0.7085, "grad_norm": 1.4777796268463135, "grad_norm_var": 0.1406708433314444, "learning_rate": 2e-05, "loss": 0.4242, "loss/crossentropy": 2.25082266330719, "loss/hidden": 0.15185546875, "loss/logits": 0.03641578182578087, "loss/reg": 0.02359013259410858, "step": 1417 }, { "epoch": 0.709, "grad_norm": 1.4813765287399292, "grad_norm_var": 0.14014819307293463, "learning_rate": 2e-05, "loss": 0.4335, "loss/crossentropy": 2.433822274208069, "loss/hidden": 0.169921875, "loss/logits": 0.027691357769072056, "loss/reg": 0.02358764037489891, "step": 1418 }, { "epoch": 0.7095, "grad_norm": 3.4135758876800537, "grad_norm_var": 0.3604375985303822, "learning_rate": 2e-05, "loss": 0.5497, "loss/crossentropy": 2.3843711614608765, "loss/hidden": 0.2734375, "loss/logits": 0.0403892807662487, "loss/reg": 0.02358505129814148, "step": 1419 }, { "epoch": 0.71, "grad_norm": 1.213165521621704, "grad_norm_var": 0.37104821359083356, "learning_rate": 2e-05, "loss": 0.3982, "loss/crossentropy": 2.5343793630599976, "loss/hidden": 0.13623046875, "loss/logits": 0.02614509966224432, "loss/reg": 0.023582441732287407, "step": 1420 }, { "epoch": 0.7105, "grad_norm": 1.5525851249694824, "grad_norm_var": 0.3660983405644202, "learning_rate": 2e-05, "loss": 0.4927, "loss/crossentropy": 2.1852606534957886, "loss/hidden": 0.21630859375, "loss/logits": 0.04058670625090599, "loss/reg": 0.023579921573400497, "step": 1421 }, { "epoch": 0.711, "grad_norm": 1.3050885200500488, "grad_norm_var": 0.36500823755956063, "learning_rate": 2e-05, "loss": 0.4211, "loss/crossentropy": 2.417192816734314, "loss/hidden": 0.162109375, "loss/logits": 0.02324726153165102, "loss/reg": 0.023577282205224037, "step": 1422 }, { "epoch": 0.7115, "grad_norm": 1.6903064250946045, "grad_norm_var": 0.36380217397103765, "learning_rate": 2e-05, "loss": 0.4785, "loss/crossentropy": 2.4316320419311523, "loss/hidden": 0.208984375, "loss/logits": 0.0337921567261219, "loss/reg": 0.023574667051434517, "step": 1423 }, { "epoch": 0.712, "grad_norm": 1.1675231456756592, "grad_norm_var": 0.3746094012963262, "learning_rate": 2e-05, "loss": 0.4142, "loss/crossentropy": 2.2177504301071167, "loss/hidden": 0.1494140625, "loss/logits": 0.02909334283322096, "loss/reg": 0.023572128266096115, "step": 1424 }, { "epoch": 0.7125, "grad_norm": 1.718462586402893, "grad_norm_var": 0.3683427865372977, "learning_rate": 2e-05, "loss": 0.5107, "loss/crossentropy": 2.356824278831482, "loss/hidden": 0.2265625, "loss/logits": 0.048446234315633774, "loss/reg": 0.023569492623209953, "step": 1425 }, { "epoch": 0.713, "grad_norm": 2.538555145263672, "grad_norm_var": 0.4073657383302283, "learning_rate": 2e-05, "loss": 0.4659, "loss/crossentropy": 2.4271206855773926, "loss/hidden": 0.197265625, "loss/logits": 0.03294919244945049, "loss/reg": 0.023566963151097298, "step": 1426 }, { "epoch": 0.7135, "grad_norm": 1.6605249643325806, "grad_norm_var": 0.3942648744597231, "learning_rate": 2e-05, "loss": 0.4182, "loss/crossentropy": 2.5462480783462524, "loss/hidden": 0.15283203125, "loss/logits": 0.029766596853733063, "loss/reg": 0.023564644157886505, "step": 1427 }, { "epoch": 0.714, "grad_norm": 1.6154025793075562, "grad_norm_var": 0.3797151310753638, "learning_rate": 2e-05, "loss": 0.474, "loss/crossentropy": 2.253910183906555, "loss/hidden": 0.2021484375, "loss/logits": 0.03623027540743351, "loss/reg": 0.023562012240290642, "step": 1428 }, { "epoch": 0.7145, "grad_norm": 2.050323963165283, "grad_norm_var": 0.3391940969651538, "learning_rate": 2e-05, "loss": 0.4883, "loss/crossentropy": 2.319291830062866, "loss/hidden": 0.203125, "loss/logits": 0.049589984118938446, "loss/reg": 0.023559633642435074, "step": 1429 }, { "epoch": 0.715, "grad_norm": 1.2968723773956299, "grad_norm_var": 0.3422705946198695, "learning_rate": 2e-05, "loss": 0.4308, "loss/crossentropy": 2.3565025329589844, "loss/hidden": 0.166015625, "loss/logits": 0.029243918135762215, "loss/reg": 0.023557225242257118, "step": 1430 }, { "epoch": 0.7155, "grad_norm": 1.465996265411377, "grad_norm_var": 0.3383485792832592, "learning_rate": 2e-05, "loss": 0.4574, "loss/crossentropy": 2.4091076850891113, "loss/hidden": 0.18603515625, "loss/logits": 0.03585449419915676, "loss/reg": 0.023554889485239983, "step": 1431 }, { "epoch": 0.716, "grad_norm": 1.6185139417648315, "grad_norm_var": 0.31936229587676096, "learning_rate": 2e-05, "loss": 0.4477, "loss/crossentropy": 1.988387107849121, "loss/hidden": 0.1845703125, "loss/logits": 0.027648674324154854, "loss/reg": 0.02355222962796688, "step": 1432 }, { "epoch": 0.7165, "grad_norm": 1.5127618312835693, "grad_norm_var": 0.3183830238570701, "learning_rate": 2e-05, "loss": 0.4341, "loss/crossentropy": 2.4324188232421875, "loss/hidden": 0.16650390625, "loss/logits": 0.03210682421922684, "loss/reg": 0.02354956604540348, "step": 1433 }, { "epoch": 0.717, "grad_norm": 1.5678179264068604, "grad_norm_var": 0.3162575020195957, "learning_rate": 2e-05, "loss": 0.4676, "loss/crossentropy": 2.4026317596435547, "loss/hidden": 0.19921875, "loss/logits": 0.03290037252008915, "loss/reg": 0.023546863347291946, "step": 1434 }, { "epoch": 0.7175, "grad_norm": 1.6551094055175781, "grad_norm_var": 0.11049876185942271, "learning_rate": 2e-05, "loss": 0.4312, "loss/crossentropy": 2.351606845855713, "loss/hidden": 0.1650390625, "loss/logits": 0.030725182965397835, "loss/reg": 0.02354429103434086, "step": 1435 }, { "epoch": 0.718, "grad_norm": 1.3460294008255005, "grad_norm_var": 0.10471709905145345, "learning_rate": 2e-05, "loss": 0.4204, "loss/crossentropy": 2.403334140777588, "loss/hidden": 0.15771484375, "loss/logits": 0.02723412588238716, "loss/reg": 0.023541752249002457, "step": 1436 }, { "epoch": 0.7185, "grad_norm": 1.1729974746704102, "grad_norm_var": 0.1166343133725992, "learning_rate": 2e-05, "loss": 0.4015, "loss/crossentropy": 2.3932619094848633, "loss/hidden": 0.14111328125, "loss/logits": 0.02496551349759102, "loss/reg": 0.02353922463953495, "step": 1437 }, { "epoch": 0.719, "grad_norm": 1.7645087242126465, "grad_norm_var": 0.11259440907935142, "learning_rate": 2e-05, "loss": 0.5111, "loss/crossentropy": 2.400877833366394, "loss/hidden": 0.23486328125, "loss/logits": 0.0408332534134388, "loss/reg": 0.02353672869503498, "step": 1438 }, { "epoch": 0.7195, "grad_norm": 1.3634532690048218, "grad_norm_var": 0.11599423217601744, "learning_rate": 2e-05, "loss": 0.4493, "loss/crossentropy": 2.424346089363098, "loss/hidden": 0.1796875, "loss/logits": 0.03430754691362381, "loss/reg": 0.023534253239631653, "step": 1439 }, { "epoch": 0.72, "grad_norm": 1.3123286962509155, "grad_norm_var": 0.10905751409417323, "learning_rate": 2e-05, "loss": 0.4399, "loss/crossentropy": 2.2828234434127808, "loss/hidden": 0.169921875, "loss/logits": 0.03468863479793072, "loss/reg": 0.023531882092356682, "step": 1440 }, { "epoch": 0.7205, "grad_norm": 3.1446728706359863, "grad_norm_var": 0.2580052108983352, "learning_rate": 2e-05, "loss": 0.6787, "loss/crossentropy": 1.8992632031440735, "loss/hidden": 0.33447265625, "loss/logits": 0.10889805294573307, "loss/reg": 0.023529645055532455, "step": 1441 }, { "epoch": 0.721, "grad_norm": 1.5229130983352661, "grad_norm_var": 0.20795354022681156, "learning_rate": 2e-05, "loss": 0.4591, "loss/crossentropy": 2.284720540046692, "loss/hidden": 0.1875, "loss/logits": 0.036282142624258995, "loss/reg": 0.02352738194167614, "step": 1442 }, { "epoch": 0.7215, "grad_norm": 1.6657564640045166, "grad_norm_var": 0.20797696901367027, "learning_rate": 2e-05, "loss": 0.4496, "loss/crossentropy": 2.7669016122817993, "loss/hidden": 0.18310546875, "loss/logits": 0.031232742592692375, "loss/reg": 0.023524843156337738, "step": 1443 }, { "epoch": 0.722, "grad_norm": 2.9521846771240234, "grad_norm_var": 0.3171124021500166, "learning_rate": 2e-05, "loss": 0.5604, "loss/crossentropy": 2.3520604372024536, "loss/hidden": 0.2763671875, "loss/logits": 0.04880333133041859, "loss/reg": 0.023522403091192245, "step": 1444 }, { "epoch": 0.7225, "grad_norm": 1.5790318250656128, "grad_norm_var": 0.3098142392085926, "learning_rate": 2e-05, "loss": 0.4899, "loss/crossentropy": 2.0901917219161987, "loss/hidden": 0.21533203125, "loss/logits": 0.039392558857798576, "loss/reg": 0.023519445210695267, "step": 1445 }, { "epoch": 0.723, "grad_norm": 1.3354227542877197, "grad_norm_var": 0.3079182473817125, "learning_rate": 2e-05, "loss": 0.4233, "loss/crossentropy": 2.3203498125076294, "loss/hidden": 0.16064453125, "loss/logits": 0.02744780946522951, "loss/reg": 0.02351679466664791, "step": 1446 }, { "epoch": 0.7235, "grad_norm": 1.3747113943099976, "grad_norm_var": 0.3111194517989119, "learning_rate": 2e-05, "loss": 0.4114, "loss/crossentropy": 2.5399714708328247, "loss/hidden": 0.14990234375, "loss/logits": 0.026368978433310986, "loss/reg": 0.02351376973092556, "step": 1447 }, { "epoch": 0.724, "grad_norm": 1.1484705209732056, "grad_norm_var": 0.3288139086814922, "learning_rate": 2e-05, "loss": 0.4122, "loss/crossentropy": 2.4500149488449097, "loss/hidden": 0.15087890625, "loss/logits": 0.026164425536990166, "loss/reg": 0.023511258885264397, "step": 1448 }, { "epoch": 0.7245, "grad_norm": 1.3708717823028564, "grad_norm_var": 0.3326900567825229, "learning_rate": 2e-05, "loss": 0.3965, "loss/crossentropy": 2.305969476699829, "loss/hidden": 0.138671875, "loss/logits": 0.022724819369614124, "loss/reg": 0.02350870706140995, "step": 1449 }, { "epoch": 0.725, "grad_norm": 2.349400520324707, "grad_norm_var": 0.3631110489319557, "learning_rate": 2e-05, "loss": 0.443, "loss/crossentropy": 2.395747423171997, "loss/hidden": 0.1796875, "loss/logits": 0.028218965977430344, "loss/reg": 0.023506123572587967, "step": 1450 }, { "epoch": 0.7255, "grad_norm": 1.7106391191482544, "grad_norm_var": 0.3630371761170198, "learning_rate": 2e-05, "loss": 0.4614, "loss/crossentropy": 2.6804983615875244, "loss/hidden": 0.1904296875, "loss/logits": 0.03596752695739269, "loss/reg": 0.02350357361137867, "step": 1451 }, { "epoch": 0.726, "grad_norm": 2.972860813140869, "grad_norm_var": 0.4528425190093097, "learning_rate": 2e-05, "loss": 0.4555, "loss/crossentropy": 2.1960572004318237, "loss/hidden": 0.181396484375, "loss/logits": 0.0390651635825634, "loss/reg": 0.023501023650169373, "step": 1452 }, { "epoch": 0.7265, "grad_norm": 1.1931060552597046, "grad_norm_var": 0.45119672384324666, "learning_rate": 2e-05, "loss": 0.409, "loss/crossentropy": 2.4346343278884888, "loss/hidden": 0.14501953125, "loss/logits": 0.02896373998373747, "loss/reg": 0.023498453199863434, "step": 1453 }, { "epoch": 0.727, "grad_norm": 1.793229579925537, "grad_norm_var": 0.45112186135817844, "learning_rate": 2e-05, "loss": 0.4728, "loss/crossentropy": 2.494977831840515, "loss/hidden": 0.20068359375, "loss/logits": 0.037164075300097466, "loss/reg": 0.023496052250266075, "step": 1454 }, { "epoch": 0.7275, "grad_norm": 1.9371393918991089, "grad_norm_var": 0.4383518223702936, "learning_rate": 2e-05, "loss": 0.5224, "loss/crossentropy": 2.0521376729011536, "loss/hidden": 0.2529296875, "loss/logits": 0.034543922170996666, "loss/reg": 0.02349347248673439, "step": 1455 }, { "epoch": 0.728, "grad_norm": 1.477908968925476, "grad_norm_var": 0.4285223862932327, "learning_rate": 2e-05, "loss": 0.4217, "loss/crossentropy": 2.2566416263580322, "loss/hidden": 0.1572265625, "loss/logits": 0.029540160670876503, "loss/reg": 0.02349095791578293, "step": 1456 }, { "epoch": 0.7285, "grad_norm": 1.43665611743927, "grad_norm_var": 0.3149916450445355, "learning_rate": 2e-05, "loss": 0.435, "loss/crossentropy": 2.3300145864486694, "loss/hidden": 0.16650390625, "loss/logits": 0.03365709260106087, "loss/reg": 0.023488519713282585, "step": 1457 }, { "epoch": 0.729, "grad_norm": 2.223034381866455, "grad_norm_var": 0.32547722216857267, "learning_rate": 2e-05, "loss": 0.5049, "loss/crossentropy": 2.456981062889099, "loss/hidden": 0.22509765625, "loss/logits": 0.04493347555398941, "loss/reg": 0.023486167192459106, "step": 1458 }, { "epoch": 0.7295, "grad_norm": 1.679583191871643, "grad_norm_var": 0.3252738977751884, "learning_rate": 2e-05, "loss": 0.4379, "loss/crossentropy": 2.432957887649536, "loss/hidden": 0.16943359375, "loss/logits": 0.03366055339574814, "loss/reg": 0.023483600467443466, "step": 1459 }, { "epoch": 0.73, "grad_norm": 1.673349380493164, "grad_norm_var": 0.22819496323031288, "learning_rate": 2e-05, "loss": 0.492, "loss/crossentropy": 2.410443902015686, "loss/hidden": 0.2109375, "loss/logits": 0.046233994886279106, "loss/reg": 0.023481376469135284, "step": 1460 }, { "epoch": 0.7305, "grad_norm": 1.5115046501159668, "grad_norm_var": 0.22960029400701293, "learning_rate": 2e-05, "loss": 0.4361, "loss/crossentropy": 2.6036850214004517, "loss/hidden": 0.17041015625, "loss/logits": 0.030860383063554764, "loss/reg": 0.023478906601667404, "step": 1461 }, { "epoch": 0.731, "grad_norm": 1.3442504405975342, "grad_norm_var": 0.22917693899710986, "learning_rate": 2e-05, "loss": 0.4744, "loss/crossentropy": 2.410821318626404, "loss/hidden": 0.19384765625, "loss/logits": 0.04574625752866268, "loss/reg": 0.02347634732723236, "step": 1462 }, { "epoch": 0.7315, "grad_norm": 1.2325595617294312, "grad_norm_var": 0.23660137846549864, "learning_rate": 2e-05, "loss": 0.4353, "loss/crossentropy": 2.428195834159851, "loss/hidden": 0.1669921875, "loss/logits": 0.03353757597506046, "loss/reg": 0.023473726585507393, "step": 1463 }, { "epoch": 0.732, "grad_norm": 1.4386786222457886, "grad_norm_var": 0.22087578651664874, "learning_rate": 2e-05, "loss": 0.4601, "loss/crossentropy": 2.128316283226013, "loss/hidden": 0.19189453125, "loss/logits": 0.03346416354179382, "loss/reg": 0.023471109569072723, "step": 1464 }, { "epoch": 0.7325, "grad_norm": 1.5255026817321777, "grad_norm_var": 0.2153978679484633, "learning_rate": 2e-05, "loss": 0.426, "loss/crossentropy": 2.4801331758499146, "loss/hidden": 0.162109375, "loss/logits": 0.029225386679172516, "loss/reg": 0.023468641564249992, "step": 1465 }, { "epoch": 0.733, "grad_norm": 1.558826208114624, "grad_norm_var": 0.18798010841390062, "learning_rate": 2e-05, "loss": 0.4271, "loss/crossentropy": 2.345631241798401, "loss/hidden": 0.1591796875, "loss/logits": 0.03320986311882734, "loss/reg": 0.02346622571349144, "step": 1466 }, { "epoch": 0.7335, "grad_norm": 1.4616813659667969, "grad_norm_var": 0.1904816907030834, "learning_rate": 2e-05, "loss": 0.4534, "loss/crossentropy": 2.260239005088806, "loss/hidden": 0.1826171875, "loss/logits": 0.03611057437956333, "loss/reg": 0.023463619872927666, "step": 1467 }, { "epoch": 0.734, "grad_norm": 1.2021178007125854, "grad_norm_var": 0.07500963522951735, "learning_rate": 2e-05, "loss": 0.441, "loss/crossentropy": 2.4786767959594727, "loss/hidden": 0.17041015625, "loss/logits": 0.03600460663437843, "loss/reg": 0.023461153730750084, "step": 1468 }, { "epoch": 0.7345, "grad_norm": 1.321462869644165, "grad_norm_var": 0.07004997562031713, "learning_rate": 2e-05, "loss": 0.4248, "loss/crossentropy": 2.4473639726638794, "loss/hidden": 0.16064453125, "loss/logits": 0.029547326266765594, "loss/reg": 0.023458639159798622, "step": 1469 }, { "epoch": 0.735, "grad_norm": 1.302802324295044, "grad_norm_var": 0.06924901126378126, "learning_rate": 2e-05, "loss": 0.4392, "loss/crossentropy": 2.320843458175659, "loss/hidden": 0.1708984375, "loss/logits": 0.03376789018511772, "loss/reg": 0.02345600537955761, "step": 1470 }, { "epoch": 0.7355, "grad_norm": 1.744510293006897, "grad_norm_var": 0.06086570608285336, "learning_rate": 2e-05, "loss": 0.4287, "loss/crossentropy": 2.4008067846298218, "loss/hidden": 0.16259765625, "loss/logits": 0.03158361464738846, "loss/reg": 0.02345338650047779, "step": 1471 }, { "epoch": 0.736, "grad_norm": 1.1985650062561035, "grad_norm_var": 0.06687850358083645, "learning_rate": 2e-05, "loss": 0.4133, "loss/crossentropy": 2.5214314460754395, "loss/hidden": 0.14794921875, "loss/logits": 0.03085363283753395, "loss/reg": 0.023450734093785286, "step": 1472 }, { "epoch": 0.7365, "grad_norm": 2.1167149543762207, "grad_norm_var": 0.090861085965173, "learning_rate": 2e-05, "loss": 0.4895, "loss/crossentropy": 1.9878064393997192, "loss/hidden": 0.22265625, "loss/logits": 0.03239255491644144, "loss/reg": 0.023448146879673004, "step": 1473 }, { "epoch": 0.737, "grad_norm": 1.5386013984680176, "grad_norm_var": 0.057208890733344654, "learning_rate": 2e-05, "loss": 0.4493, "loss/crossentropy": 2.1566672325134277, "loss/hidden": 0.1826171875, "loss/logits": 0.03220840450376272, "loss/reg": 0.0234454907476902, "step": 1474 }, { "epoch": 0.7375, "grad_norm": 1.3006364107131958, "grad_norm_var": 0.05663883015899618, "learning_rate": 2e-05, "loss": 0.4451, "loss/crossentropy": 2.2811367511749268, "loss/hidden": 0.1748046875, "loss/logits": 0.03587420843541622, "loss/reg": 0.02344280481338501, "step": 1475 }, { "epoch": 0.738, "grad_norm": 2.268388271331787, "grad_norm_var": 0.09514090985834489, "learning_rate": 2e-05, "loss": 0.4557, "loss/crossentropy": 2.2090927362442017, "loss/hidden": 0.18994140625, "loss/logits": 0.03136050421744585, "loss/reg": 0.02344009466469288, "step": 1476 }, { "epoch": 0.7385, "grad_norm": 1.514344334602356, "grad_norm_var": 0.09514418896451105, "learning_rate": 2e-05, "loss": 0.4319, "loss/crossentropy": 2.2778546810150146, "loss/hidden": 0.16943359375, "loss/logits": 0.028109371662139893, "loss/reg": 0.023437298834323883, "step": 1477 }, { "epoch": 0.739, "grad_norm": 1.1976886987686157, "grad_norm_var": 0.09961535847471854, "learning_rate": 2e-05, "loss": 0.4028, "loss/crossentropy": 2.5273643732070923, "loss/hidden": 0.14794921875, "loss/logits": 0.02050770726054907, "loss/reg": 0.02343466505408287, "step": 1478 }, { "epoch": 0.7395, "grad_norm": 1.8187288045883179, "grad_norm_var": 0.10056368997682572, "learning_rate": 2e-05, "loss": 0.4755, "loss/crossentropy": 2.3583481311798096, "loss/hidden": 0.21044921875, "loss/logits": 0.030701249837875366, "loss/reg": 0.023432079702615738, "step": 1479 }, { "epoch": 0.74, "grad_norm": 2.092109441757202, "grad_norm_var": 0.11913386201947915, "learning_rate": 2e-05, "loss": 0.57, "loss/crossentropy": 2.2834445238113403, "loss/hidden": 0.27978515625, "loss/logits": 0.05595431476831436, "loss/reg": 0.023429367691278458, "step": 1480 }, { "epoch": 0.7405, "grad_norm": 1.6731066703796387, "grad_norm_var": 0.11956731584117103, "learning_rate": 2e-05, "loss": 0.47, "loss/crossentropy": 2.29840624332428, "loss/hidden": 0.19775390625, "loss/logits": 0.03799319267272949, "loss/reg": 0.023426661267876625, "step": 1481 }, { "epoch": 0.741, "grad_norm": 1.7056869268417358, "grad_norm_var": 0.12046364336034585, "learning_rate": 2e-05, "loss": 0.4905, "loss/crossentropy": 2.46663236618042, "loss/hidden": 0.2119140625, "loss/logits": 0.04434940032660961, "loss/reg": 0.02342418022453785, "step": 1482 }, { "epoch": 0.7415, "grad_norm": 1.5112969875335693, "grad_norm_var": 0.11976152998951234, "learning_rate": 2e-05, "loss": 0.4492, "loss/crossentropy": 2.5104438066482544, "loss/hidden": 0.181640625, "loss/logits": 0.03338887542486191, "loss/reg": 0.023421762511134148, "step": 1483 }, { "epoch": 0.742, "grad_norm": 2.16302227973938, "grad_norm_var": 0.1272398268385037, "learning_rate": 2e-05, "loss": 0.4431, "loss/crossentropy": 2.4524015188217163, "loss/hidden": 0.1708984375, "loss/logits": 0.03798994794487953, "loss/reg": 0.023419423028826714, "step": 1484 }, { "epoch": 0.7425, "grad_norm": 2.6009202003479004, "grad_norm_var": 0.1727849916739044, "learning_rate": 2e-05, "loss": 0.4471, "loss/crossentropy": 2.459054470062256, "loss/hidden": 0.18017578125, "loss/logits": 0.032748810946941376, "loss/reg": 0.023416871204972267, "step": 1485 }, { "epoch": 0.743, "grad_norm": 1.2313926219940186, "grad_norm_var": 0.177211118899447, "learning_rate": 2e-05, "loss": 0.4193, "loss/crossentropy": 2.4845768213272095, "loss/hidden": 0.15576171875, "loss/logits": 0.02941302303224802, "loss/reg": 0.023414650931954384, "step": 1486 }, { "epoch": 0.7435, "grad_norm": 1.3175305128097534, "grad_norm_var": 0.18776426918117484, "learning_rate": 2e-05, "loss": 0.4491, "loss/crossentropy": 2.5155017375946045, "loss/hidden": 0.18310546875, "loss/logits": 0.03185183368623257, "loss/reg": 0.023412445560097694, "step": 1487 }, { "epoch": 0.744, "grad_norm": 1.3121087551116943, "grad_norm_var": 0.18093261119129972, "learning_rate": 2e-05, "loss": 0.4527, "loss/crossentropy": 2.266068696975708, "loss/hidden": 0.18359375, "loss/logits": 0.03500186279416084, "loss/reg": 0.02340994030237198, "step": 1488 }, { "epoch": 0.7445, "grad_norm": 2.539462089538574, "grad_norm_var": 0.2150192957888348, "learning_rate": 2e-05, "loss": 0.5812, "loss/crossentropy": 2.3054516315460205, "loss/hidden": 0.29296875, "loss/logits": 0.05416359752416611, "loss/reg": 0.023407652974128723, "step": 1489 }, { "epoch": 0.745, "grad_norm": 1.8638911247253418, "grad_norm_var": 0.21304660583959933, "learning_rate": 2e-05, "loss": 0.4883, "loss/crossentropy": 2.217733383178711, "loss/hidden": 0.2177734375, "loss/logits": 0.03648427501320839, "loss/reg": 0.023405244573950768, "step": 1490 }, { "epoch": 0.7455, "grad_norm": 1.4341673851013184, "grad_norm_var": 0.2060377327406276, "learning_rate": 2e-05, "loss": 0.4652, "loss/crossentropy": 2.259947180747986, "loss/hidden": 0.1962890625, "loss/logits": 0.03487166576087475, "loss/reg": 0.02340288832783699, "step": 1491 }, { "epoch": 0.746, "grad_norm": 1.286834478378296, "grad_norm_var": 0.20040431914197107, "learning_rate": 2e-05, "loss": 0.4281, "loss/crossentropy": 2.373807907104492, "loss/hidden": 0.16259765625, "loss/logits": 0.03151876013725996, "loss/reg": 0.02340046875178814, "step": 1492 }, { "epoch": 0.7465, "grad_norm": 2.74072527885437, "grad_norm_var": 0.263410407901218, "learning_rate": 2e-05, "loss": 0.5302, "loss/crossentropy": 2.263616144657135, "loss/hidden": 0.2568359375, "loss/logits": 0.039356544613838196, "loss/reg": 0.02339823544025421, "step": 1493 }, { "epoch": 0.747, "grad_norm": 1.9128124713897705, "grad_norm_var": 0.23979806512014498, "learning_rate": 2e-05, "loss": 0.5741, "loss/crossentropy": 2.0934388637542725, "loss/hidden": 0.2841796875, "loss/logits": 0.055920008569955826, "loss/reg": 0.023396024480462074, "step": 1494 }, { "epoch": 0.7475, "grad_norm": 1.9756958484649658, "grad_norm_var": 0.24120176602785268, "learning_rate": 2e-05, "loss": 0.4228, "loss/crossentropy": 2.3994463682174683, "loss/hidden": 0.16064453125, "loss/logits": 0.028181973844766617, "loss/reg": 0.02339351177215576, "step": 1495 }, { "epoch": 0.748, "grad_norm": 1.7724146842956543, "grad_norm_var": 0.23663205632003587, "learning_rate": 2e-05, "loss": 0.461, "loss/crossentropy": 2.1939653158187866, "loss/hidden": 0.18994140625, "loss/logits": 0.03715855535119772, "loss/reg": 0.023391004651784897, "step": 1496 }, { "epoch": 0.7485, "grad_norm": 1.5231564044952393, "grad_norm_var": 0.24087563457875186, "learning_rate": 2e-05, "loss": 0.4467, "loss/crossentropy": 2.418076753616333, "loss/hidden": 0.17919921875, "loss/logits": 0.03366350382566452, "loss/reg": 0.02338848076760769, "step": 1497 }, { "epoch": 0.749, "grad_norm": 1.3560765981674194, "grad_norm_var": 0.2531766876431429, "learning_rate": 2e-05, "loss": 0.4376, "loss/crossentropy": 2.092953681945801, "loss/hidden": 0.16796875, "loss/logits": 0.035764566622674465, "loss/reg": 0.023386115208268166, "step": 1498 }, { "epoch": 0.7495, "grad_norm": 2.6173150539398193, "grad_norm_var": 0.28943914508452623, "learning_rate": 2e-05, "loss": 0.5938, "loss/crossentropy": 2.3732458353042603, "loss/hidden": 0.291015625, "loss/logits": 0.06896837241947651, "loss/reg": 0.023383593186736107, "step": 1499 }, { "epoch": 0.75, "grad_norm": 2.2990541458129883, "grad_norm_var": 0.2962192790031487, "learning_rate": 2e-05, "loss": 0.5086, "loss/crossentropy": 2.3469722270965576, "loss/hidden": 0.24072265625, "loss/logits": 0.03404225967824459, "loss/reg": 0.02338109351694584, "step": 1500 }, { "epoch": 0.7505, "grad_norm": 6.053563594818115, "grad_norm_var": 1.3816725595266346, "learning_rate": 2e-05, "loss": 0.8284, "loss/crossentropy": 2.2309868335723877, "loss/hidden": 0.4892578125, "loss/logits": 0.1053722184151411, "loss/reg": 0.023378517478704453, "step": 1501 }, { "epoch": 0.751, "grad_norm": 1.4381011724472046, "grad_norm_var": 1.361029946092843, "learning_rate": 2e-05, "loss": 0.4233, "loss/crossentropy": 2.3353075981140137, "loss/hidden": 0.15869140625, "loss/logits": 0.030879972502589226, "loss/reg": 0.023375999182462692, "step": 1502 }, { "epoch": 0.7515, "grad_norm": 1.7041223049163818, "grad_norm_var": 1.330544016606859, "learning_rate": 2e-05, "loss": 0.4519, "loss/crossentropy": 2.3510212898254395, "loss/hidden": 0.18310546875, "loss/logits": 0.035089364275336266, "loss/reg": 0.023373527452349663, "step": 1503 }, { "epoch": 0.752, "grad_norm": 1.176741600036621, "grad_norm_var": 1.346168787370407, "learning_rate": 2e-05, "loss": 0.4271, "loss/crossentropy": 2.301971435546875, "loss/hidden": 0.162109375, "loss/logits": 0.03127031493932009, "loss/reg": 0.023370975628495216, "step": 1504 }, { "epoch": 0.7525, "grad_norm": 1.3868812322616577, "grad_norm_var": 1.362565183966303, "learning_rate": 2e-05, "loss": 0.4299, "loss/crossentropy": 2.300473690032959, "loss/hidden": 0.1650390625, "loss/logits": 0.031166426837444305, "loss/reg": 0.023368434980511665, "step": 1505 }, { "epoch": 0.753, "grad_norm": 1.5717381238937378, "grad_norm_var": 1.3745201891776084, "learning_rate": 2e-05, "loss": 0.434, "loss/crossentropy": 2.2145345211029053, "loss/hidden": 0.1650390625, "loss/logits": 0.03533552121371031, "loss/reg": 0.02336590550839901, "step": 1506 }, { "epoch": 0.7535, "grad_norm": 1.5625638961791992, "grad_norm_var": 1.3655969008810318, "learning_rate": 2e-05, "loss": 0.4732, "loss/crossentropy": 2.4806735515594482, "loss/hidden": 0.19384765625, "loss/logits": 0.045676751993596554, "loss/reg": 0.02336341328918934, "step": 1507 }, { "epoch": 0.754, "grad_norm": 1.5724704265594482, "grad_norm_var": 1.3426361132111952, "learning_rate": 2e-05, "loss": 0.458, "loss/crossentropy": 2.5147154331207275, "loss/hidden": 0.189453125, "loss/logits": 0.03498086519539356, "loss/reg": 0.02336088940501213, "step": 1508 }, { "epoch": 0.7545, "grad_norm": 1.2432793378829956, "grad_norm_var": 1.3431686166198147, "learning_rate": 2e-05, "loss": 0.3892, "loss/crossentropy": 2.441771388053894, "loss/hidden": 0.13134765625, "loss/logits": 0.024279465898871422, "loss/reg": 0.023358337581157684, "step": 1509 }, { "epoch": 0.755, "grad_norm": 1.1907211542129517, "grad_norm_var": 1.3791328093235484, "learning_rate": 2e-05, "loss": 0.4286, "loss/crossentropy": 2.4157201051712036, "loss/hidden": 0.1650390625, "loss/logits": 0.029976122081279755, "loss/reg": 0.023355863988399506, "step": 1510 }, { "epoch": 0.7555, "grad_norm": 1.7555755376815796, "grad_norm_var": 1.3800200121858432, "learning_rate": 2e-05, "loss": 0.4285, "loss/crossentropy": 2.5669732093811035, "loss/hidden": 0.16259765625, "loss/logits": 0.032341357320547104, "loss/reg": 0.02335333824157715, "step": 1511 }, { "epoch": 0.756, "grad_norm": 1.4463238716125488, "grad_norm_var": 1.3917343393376849, "learning_rate": 2e-05, "loss": 0.4156, "loss/crossentropy": 2.3183934688568115, "loss/hidden": 0.1494140625, "loss/logits": 0.032720635645091534, "loss/reg": 0.02335066720843315, "step": 1512 }, { "epoch": 0.7565, "grad_norm": 1.8817647695541382, "grad_norm_var": 1.3832543893532858, "learning_rate": 2e-05, "loss": 0.4843, "loss/crossentropy": 2.045651853084564, "loss/hidden": 0.21044921875, "loss/logits": 0.040373530238866806, "loss/reg": 0.023348016664385796, "step": 1513 }, { "epoch": 0.757, "grad_norm": 2.0054848194122314, "grad_norm_var": 1.3632931739013794, "learning_rate": 2e-05, "loss": 0.6077, "loss/crossentropy": 2.3150511980056763, "loss/hidden": 0.30810546875, "loss/logits": 0.066120695322752, "loss/reg": 0.02334539033472538, "step": 1514 }, { "epoch": 0.7575, "grad_norm": 1.4316192865371704, "grad_norm_var": 1.3427547339581412, "learning_rate": 2e-05, "loss": 0.4183, "loss/crossentropy": 2.3082213401794434, "loss/hidden": 0.1552734375, "loss/logits": 0.02960424032062292, "loss/reg": 0.023342687636613846, "step": 1515 }, { "epoch": 0.758, "grad_norm": 1.4792301654815674, "grad_norm_var": 1.3364955062616057, "learning_rate": 2e-05, "loss": 0.4542, "loss/crossentropy": 2.229594111442566, "loss/hidden": 0.18701171875, "loss/logits": 0.03382623475044966, "loss/reg": 0.023340150713920593, "step": 1516 }, { "epoch": 0.7585, "grad_norm": 1.378159523010254, "grad_norm_var": 0.05499430187053349, "learning_rate": 2e-05, "loss": 0.4331, "loss/crossentropy": 2.3700714111328125, "loss/hidden": 0.16455078125, "loss/logits": 0.035162342712283134, "loss/reg": 0.02333764359354973, "step": 1517 }, { "epoch": 0.759, "grad_norm": 1.4192622900009155, "grad_norm_var": 0.05520725190068181, "learning_rate": 2e-05, "loss": 0.4744, "loss/crossentropy": 2.1727020740509033, "loss/hidden": 0.19970703125, "loss/logits": 0.04138432815670967, "loss/reg": 0.023334944620728493, "step": 1518 }, { "epoch": 0.7595, "grad_norm": 1.5111662149429321, "grad_norm_var": 0.052613845086676686, "learning_rate": 2e-05, "loss": 0.4544, "loss/crossentropy": 2.2911019325256348, "loss/hidden": 0.18505859375, "loss/logits": 0.035999225452542305, "loss/reg": 0.02333231456577778, "step": 1519 }, { "epoch": 0.76, "grad_norm": 1.5904415845870972, "grad_norm_var": 0.04543488593400274, "learning_rate": 2e-05, "loss": 0.411, "loss/crossentropy": 2.3194239139556885, "loss/hidden": 0.15380859375, "loss/logits": 0.023918326012790203, "loss/reg": 0.023329300805926323, "step": 1520 }, { "epoch": 0.7605, "grad_norm": 1.1604093313217163, "grad_norm_var": 0.0528615068401732, "learning_rate": 2e-05, "loss": 0.4225, "loss/crossentropy": 2.206283152103424, "loss/hidden": 0.1591796875, "loss/logits": 0.03008684329688549, "loss/reg": 0.023326555266976357, "step": 1521 }, { "epoch": 0.761, "grad_norm": 1.8183667659759521, "grad_norm_var": 0.05861065574009997, "learning_rate": 2e-05, "loss": 0.4382, "loss/crossentropy": 2.4522966146469116, "loss/hidden": 0.17578125, "loss/logits": 0.02915840595960617, "loss/reg": 0.023324020206928253, "step": 1522 }, { "epoch": 0.7615, "grad_norm": 1.9318912029266357, "grad_norm_var": 0.06884144736975527, "learning_rate": 2e-05, "loss": 0.4305, "loss/crossentropy": 2.4032152891159058, "loss/hidden": 0.16259765625, "loss/logits": 0.03465164825320244, "loss/reg": 0.023321056738495827, "step": 1523 }, { "epoch": 0.762, "grad_norm": 1.3856819868087769, "grad_norm_var": 0.07048760261173755, "learning_rate": 2e-05, "loss": 0.4574, "loss/crossentropy": 2.4888235330581665, "loss/hidden": 0.1904296875, "loss/logits": 0.03376224543899298, "loss/reg": 0.023318205028772354, "step": 1524 }, { "epoch": 0.7625, "grad_norm": 2.0698816776275635, "grad_norm_var": 0.08056257023113833, "learning_rate": 2e-05, "loss": 0.4702, "loss/crossentropy": 2.4298810958862305, "loss/hidden": 0.19677734375, "loss/logits": 0.04024036321789026, "loss/reg": 0.02331569977104664, "step": 1525 }, { "epoch": 0.763, "grad_norm": 1.7094788551330566, "grad_norm_var": 0.06969563841946425, "learning_rate": 2e-05, "loss": 0.4486, "loss/crossentropy": 2.4382940530776978, "loss/hidden": 0.1845703125, "loss/logits": 0.03088864777237177, "loss/reg": 0.023313157260417938, "step": 1526 }, { "epoch": 0.7635, "grad_norm": 1.7802170515060425, "grad_norm_var": 0.07016778667789912, "learning_rate": 2e-05, "loss": 0.4732, "loss/crossentropy": 2.4365394115448, "loss/hidden": 0.2001953125, "loss/logits": 0.039911434054374695, "loss/reg": 0.023310648277401924, "step": 1527 }, { "epoch": 0.764, "grad_norm": 2.6927785873413086, "grad_norm_var": 0.13758242415195784, "learning_rate": 2e-05, "loss": 0.645, "loss/crossentropy": 2.0314077138900757, "loss/hidden": 0.35546875, "loss/logits": 0.056443119421601295, "loss/reg": 0.02330797351896763, "step": 1528 }, { "epoch": 0.7645, "grad_norm": 2.23351788520813, "grad_norm_var": 0.15370605581981486, "learning_rate": 2e-05, "loss": 0.4534, "loss/crossentropy": 2.5266857147216797, "loss/hidden": 0.18896484375, "loss/logits": 0.03136393055319786, "loss/reg": 0.023305490612983704, "step": 1529 }, { "epoch": 0.765, "grad_norm": 1.495396375656128, "grad_norm_var": 0.1508814132006193, "learning_rate": 2e-05, "loss": 0.4171, "loss/crossentropy": 2.4519113302230835, "loss/hidden": 0.15283203125, "loss/logits": 0.031247646547853947, "loss/reg": 0.02330303005874157, "step": 1530 }, { "epoch": 0.7655, "grad_norm": 2.117763042449951, "grad_norm_var": 0.15639622485214394, "learning_rate": 2e-05, "loss": 0.4553, "loss/crossentropy": 2.728012442588806, "loss/hidden": 0.18408203125, "loss/logits": 0.03820735961198807, "loss/reg": 0.023300379514694214, "step": 1531 }, { "epoch": 0.766, "grad_norm": 1.2518669366836548, "grad_norm_var": 0.16740663803541475, "learning_rate": 2e-05, "loss": 0.4509, "loss/crossentropy": 2.3151432275772095, "loss/hidden": 0.18408203125, "loss/logits": 0.033854938112199306, "loss/reg": 0.023297840729355812, "step": 1532 }, { "epoch": 0.7665, "grad_norm": 1.6661626100540161, "grad_norm_var": 0.15940086312676746, "learning_rate": 2e-05, "loss": 0.4237, "loss/crossentropy": 2.624950885772705, "loss/hidden": 0.15966796875, "loss/logits": 0.03104830253869295, "loss/reg": 0.023295121267437935, "step": 1533 }, { "epoch": 0.767, "grad_norm": 1.2690476179122925, "grad_norm_var": 0.1672279185359154, "learning_rate": 2e-05, "loss": 0.4322, "loss/crossentropy": 2.2488889694213867, "loss/hidden": 0.17041015625, "loss/logits": 0.02886138390749693, "loss/reg": 0.02329253777861595, "step": 1534 }, { "epoch": 0.7675, "grad_norm": 1.4908874034881592, "grad_norm_var": 0.16784599970408365, "learning_rate": 2e-05, "loss": 0.4199, "loss/crossentropy": 2.32344913482666, "loss/hidden": 0.1572265625, "loss/logits": 0.029767291620373726, "loss/reg": 0.023290077224373817, "step": 1535 }, { "epoch": 0.768, "grad_norm": 1.5248539447784424, "grad_norm_var": 0.1693264389141717, "learning_rate": 2e-05, "loss": 0.4806, "loss/crossentropy": 2.282503128051758, "loss/hidden": 0.20556640625, "loss/logits": 0.042184172198176384, "loss/reg": 0.02328774333000183, "step": 1536 }, { "epoch": 0.7685, "grad_norm": 1.3552334308624268, "grad_norm_var": 0.1570355202480712, "learning_rate": 2e-05, "loss": 0.4453, "loss/crossentropy": 2.2128005027770996, "loss/hidden": 0.1787109375, "loss/logits": 0.03375644236803055, "loss/reg": 0.02328518033027649, "step": 1537 }, { "epoch": 0.769, "grad_norm": 1.7702122926712036, "grad_norm_var": 0.15665843688097023, "learning_rate": 2e-05, "loss": 0.4218, "loss/crossentropy": 2.2681163549423218, "loss/hidden": 0.16259765625, "loss/logits": 0.026351372711360455, "loss/reg": 0.02328294701874256, "step": 1538 }, { "epoch": 0.7695, "grad_norm": 2.065890073776245, "grad_norm_var": 0.161315321835504, "learning_rate": 2e-05, "loss": 0.5305, "loss/crossentropy": 2.223512649536133, "loss/hidden": 0.25048828125, "loss/logits": 0.047157226130366325, "loss/reg": 0.023280519992113113, "step": 1539 }, { "epoch": 0.77, "grad_norm": 1.4778271913528442, "grad_norm_var": 0.15746298503991624, "learning_rate": 2e-05, "loss": 0.4345, "loss/crossentropy": 2.3824329376220703, "loss/hidden": 0.1669921875, "loss/logits": 0.03470621630549431, "loss/reg": 0.023278141394257545, "step": 1540 }, { "epoch": 0.7705, "grad_norm": 1.643192172050476, "grad_norm_var": 0.15054023023162647, "learning_rate": 2e-05, "loss": 0.4223, "loss/crossentropy": 2.65035343170166, "loss/hidden": 0.15673828125, "loss/logits": 0.032792385667562485, "loss/reg": 0.023275921121239662, "step": 1541 }, { "epoch": 0.771, "grad_norm": 1.5637279748916626, "grad_norm_var": 0.15210194531488597, "learning_rate": 2e-05, "loss": 0.4743, "loss/crossentropy": 2.5058376789093018, "loss/hidden": 0.2060546875, "loss/logits": 0.03547433018684387, "loss/reg": 0.023273425176739693, "step": 1542 }, { "epoch": 0.7715, "grad_norm": 5.894736289978027, "grad_norm_var": 1.2473798526593487, "learning_rate": 2e-05, "loss": 0.6759, "loss/crossentropy": 2.7393654584884644, "loss/hidden": 0.3525390625, "loss/logits": 0.09067841898649931, "loss/reg": 0.023270903155207634, "step": 1543 }, { "epoch": 0.772, "grad_norm": 1.4357421398162842, "grad_norm_var": 1.2249250941187129, "learning_rate": 2e-05, "loss": 0.4196, "loss/crossentropy": 2.1938605308532715, "loss/hidden": 0.158203125, "loss/logits": 0.028688468039035797, "loss/reg": 0.023268546909093857, "step": 1544 }, { "epoch": 0.7725, "grad_norm": 1.4271492958068848, "grad_norm_var": 1.2287387850562255, "learning_rate": 2e-05, "loss": 0.4829, "loss/crossentropy": 2.1993138790130615, "loss/hidden": 0.21240234375, "loss/logits": 0.03782237879931927, "loss/reg": 0.023266203701496124, "step": 1545 }, { "epoch": 0.773, "grad_norm": 1.6962809562683105, "grad_norm_var": 1.2220146551281785, "learning_rate": 2e-05, "loss": 0.4143, "loss/crossentropy": 2.3468743562698364, "loss/hidden": 0.15380859375, "loss/logits": 0.027830702252686024, "loss/reg": 0.023263977840542793, "step": 1546 }, { "epoch": 0.7735, "grad_norm": 1.708454966545105, "grad_norm_var": 1.2180449645963336, "learning_rate": 2e-05, "loss": 0.437, "loss/crossentropy": 2.366321086883545, "loss/hidden": 0.17626953125, "loss/logits": 0.028090238571166992, "loss/reg": 0.023261502385139465, "step": 1547 }, { "epoch": 0.774, "grad_norm": 2.553924083709717, "grad_norm_var": 1.2240565005172073, "learning_rate": 2e-05, "loss": 0.6236, "loss/crossentropy": 2.093143939971924, "loss/hidden": 0.32373046875, "loss/logits": 0.06730393506586552, "loss/reg": 0.023259302601218224, "step": 1548 }, { "epoch": 0.7745, "grad_norm": 1.2026230096817017, "grad_norm_var": 1.2524918261951337, "learning_rate": 2e-05, "loss": 0.4183, "loss/crossentropy": 2.4064027070999146, "loss/hidden": 0.1572265625, "loss/logits": 0.028545232489705086, "loss/reg": 0.02325684390962124, "step": 1549 }, { "epoch": 0.775, "grad_norm": 1.5950380563735962, "grad_norm_var": 1.2325789918369907, "learning_rate": 2e-05, "loss": 0.3969, "loss/crossentropy": 2.371984839439392, "loss/hidden": 0.140625, "loss/logits": 0.02372877486050129, "loss/reg": 0.02325470745563507, "step": 1550 }, { "epoch": 0.7755, "grad_norm": 1.067559003829956, "grad_norm_var": 1.2668916559295922, "learning_rate": 2e-05, "loss": 0.4115, "loss/crossentropy": 2.2676392793655396, "loss/hidden": 0.15478515625, "loss/logits": 0.0241701677441597, "loss/reg": 0.023252317681908607, "step": 1551 }, { "epoch": 0.776, "grad_norm": 2.6447858810424805, "grad_norm_var": 1.2931606651566094, "learning_rate": 2e-05, "loss": 0.4562, "loss/crossentropy": 2.527026653289795, "loss/hidden": 0.19482421875, "loss/logits": 0.028835158795118332, "loss/reg": 0.0232497937977314, "step": 1552 }, { "epoch": 0.7765, "grad_norm": 1.3525029420852661, "grad_norm_var": 1.2933754435969356, "learning_rate": 2e-05, "loss": 0.4655, "loss/crossentropy": 2.107416331768036, "loss/hidden": 0.1982421875, "loss/logits": 0.0347739988937974, "loss/reg": 0.023247426375746727, "step": 1553 }, { "epoch": 0.777, "grad_norm": 1.3164350986480713, "grad_norm_var": 1.3167433755836309, "learning_rate": 2e-05, "loss": 0.4165, "loss/crossentropy": 2.509757399559021, "loss/hidden": 0.15673828125, "loss/logits": 0.027274997904896736, "loss/reg": 0.023244967684149742, "step": 1554 }, { "epoch": 0.7775, "grad_norm": 1.9293817281723022, "grad_norm_var": 1.3151683429148335, "learning_rate": 2e-05, "loss": 0.4151, "loss/crossentropy": 2.647552251815796, "loss/hidden": 0.158203125, "loss/logits": 0.024518443271517754, "loss/reg": 0.023242756724357605, "step": 1555 }, { "epoch": 0.778, "grad_norm": 1.7657341957092285, "grad_norm_var": 1.3038804133117678, "learning_rate": 2e-05, "loss": 0.4489, "loss/crossentropy": 2.2430570125579834, "loss/hidden": 0.17822265625, "loss/logits": 0.038234325125813484, "loss/reg": 0.02324022725224495, "step": 1556 }, { "epoch": 0.7785, "grad_norm": 2.497610330581665, "grad_norm_var": 1.3174225363238234, "learning_rate": 2e-05, "loss": 0.4612, "loss/crossentropy": 2.505717158317566, "loss/hidden": 0.19775390625, "loss/logits": 0.03111663181334734, "loss/reg": 0.023237932473421097, "step": 1557 }, { "epoch": 0.779, "grad_norm": 1.704455852508545, "grad_norm_var": 1.3108827016120235, "learning_rate": 2e-05, "loss": 0.4357, "loss/crossentropy": 2.389075994491577, "loss/hidden": 0.1767578125, "loss/logits": 0.02655597310513258, "loss/reg": 0.02323562279343605, "step": 1558 }, { "epoch": 0.7795, "grad_norm": 1.900311827659607, "grad_norm_var": 0.22688966028548616, "learning_rate": 2e-05, "loss": 0.5081, "loss/crossentropy": 2.4398266077041626, "loss/hidden": 0.23583984375, "loss/logits": 0.03995893709361553, "loss/reg": 0.023233113810420036, "step": 1559 }, { "epoch": 0.78, "grad_norm": 1.3378366231918335, "grad_norm_var": 0.2314262808823021, "learning_rate": 2e-05, "loss": 0.4106, "loss/crossentropy": 2.1936975717544556, "loss/hidden": 0.15576171875, "loss/logits": 0.022547971457242966, "loss/reg": 0.023230722174048424, "step": 1560 }, { "epoch": 0.7805, "grad_norm": 1.7965880632400513, "grad_norm_var": 0.22497679016718142, "learning_rate": 2e-05, "loss": 0.446, "loss/crossentropy": 2.281595468521118, "loss/hidden": 0.18359375, "loss/logits": 0.030129313468933105, "loss/reg": 0.02322840318083763, "step": 1561 }, { "epoch": 0.781, "grad_norm": 1.6652514934539795, "grad_norm_var": 0.22527719371189883, "learning_rate": 2e-05, "loss": 0.4288, "loss/crossentropy": 2.3167933225631714, "loss/hidden": 0.17041015625, "loss/logits": 0.02617516089230776, "loss/reg": 0.02322593703866005, "step": 1562 }, { "epoch": 0.7815, "grad_norm": 1.6363804340362549, "grad_norm_var": 0.2260242298357046, "learning_rate": 2e-05, "loss": 0.4647, "loss/crossentropy": 1.9782673716545105, "loss/hidden": 0.20556640625, "loss/logits": 0.026893844828009605, "loss/reg": 0.023223651573061943, "step": 1563 }, { "epoch": 0.782, "grad_norm": 1.7427809238433838, "grad_norm_var": 0.1799729760451602, "learning_rate": 2e-05, "loss": 0.4594, "loss/crossentropy": 2.475069522857666, "loss/hidden": 0.1875, "loss/logits": 0.03972475230693817, "loss/reg": 0.023221155628561974, "step": 1564 }, { "epoch": 0.7825, "grad_norm": 1.4905965328216553, "grad_norm_var": 0.16616583137613528, "learning_rate": 2e-05, "loss": 0.4287, "loss/crossentropy": 2.290923833847046, "loss/hidden": 0.16943359375, "loss/logits": 0.027077090926468372, "loss/reg": 0.023218607529997826, "step": 1565 }, { "epoch": 0.783, "grad_norm": 1.8255786895751953, "grad_norm_var": 0.16579392065956847, "learning_rate": 2e-05, "loss": 0.4183, "loss/crossentropy": 2.6674692630767822, "loss/hidden": 0.1591796875, "loss/logits": 0.026930052787065506, "loss/reg": 0.023216072469949722, "step": 1566 }, { "epoch": 0.7835, "grad_norm": 2.251720428466797, "grad_norm_var": 0.14890348739905898, "learning_rate": 2e-05, "loss": 0.4939, "loss/crossentropy": 2.6777660846710205, "loss/hidden": 0.22119140625, "loss/logits": 0.040533529594540596, "loss/reg": 0.023213520646095276, "step": 1567 }, { "epoch": 0.784, "grad_norm": 1.9184706211090088, "grad_norm_var": 0.10041432594898173, "learning_rate": 2e-05, "loss": 0.4455, "loss/crossentropy": 2.48405659198761, "loss/hidden": 0.18017578125, "loss/logits": 0.03325136937201023, "loss/reg": 0.0232110396027565, "step": 1568 }, { "epoch": 0.7845, "grad_norm": 2.3974111080169678, "grad_norm_var": 0.11212794269452289, "learning_rate": 2e-05, "loss": 0.5316, "loss/crossentropy": 2.4565058946609497, "loss/hidden": 0.24951171875, "loss/logits": 0.04996338486671448, "loss/reg": 0.023208467289805412, "step": 1569 }, { "epoch": 0.785, "grad_norm": 1.3549920320510864, "grad_norm_var": 0.10961390038742369, "learning_rate": 2e-05, "loss": 0.4089, "loss/crossentropy": 2.359605550765991, "loss/hidden": 0.1484375, "loss/logits": 0.028434154577553272, "loss/reg": 0.023205863311886787, "step": 1570 }, { "epoch": 0.7855, "grad_norm": 1.2709044218063354, "grad_norm_var": 0.12763188642899853, "learning_rate": 2e-05, "loss": 0.45, "loss/crossentropy": 2.117920219898224, "loss/hidden": 0.1826171875, "loss/logits": 0.035326533019542694, "loss/reg": 0.023203279823064804, "step": 1571 }, { "epoch": 0.786, "grad_norm": 1.4363126754760742, "grad_norm_var": 0.13525123557490268, "learning_rate": 2e-05, "loss": 0.4487, "loss/crossentropy": 2.070175528526306, "loss/hidden": 0.1875, "loss/logits": 0.029196069575846195, "loss/reg": 0.023200761526823044, "step": 1572 }, { "epoch": 0.7865, "grad_norm": 1.4775161743164062, "grad_norm_var": 0.10053524622992847, "learning_rate": 2e-05, "loss": 0.446, "loss/crossentropy": 2.640804171562195, "loss/hidden": 0.17919921875, "loss/logits": 0.034833875484764576, "loss/reg": 0.02319827489554882, "step": 1573 }, { "epoch": 0.787, "grad_norm": 1.5340831279754639, "grad_norm_var": 0.10225829614935306, "learning_rate": 2e-05, "loss": 0.4402, "loss/crossentropy": 2.298627734184265, "loss/hidden": 0.1748046875, "loss/logits": 0.03342457953840494, "loss/reg": 0.02319585159420967, "step": 1574 }, { "epoch": 0.7875, "grad_norm": 1.4753564596176147, "grad_norm_var": 0.10161700731717402, "learning_rate": 2e-05, "loss": 0.4379, "loss/crossentropy": 2.3629835844039917, "loss/hidden": 0.17626953125, "loss/logits": 0.029691355302929878, "loss/reg": 0.02319331094622612, "step": 1575 }, { "epoch": 0.788, "grad_norm": 2.231339454650879, "grad_norm_var": 0.11274765054892152, "learning_rate": 2e-05, "loss": 0.5219, "loss/crossentropy": 2.3111387491226196, "loss/hidden": 0.25048828125, "loss/logits": 0.03952763415873051, "loss/reg": 0.023190749809145927, "step": 1576 }, { "epoch": 0.7885, "grad_norm": 1.3988337516784668, "grad_norm_var": 0.11852513456262694, "learning_rate": 2e-05, "loss": 0.4242, "loss/crossentropy": 2.541161060333252, "loss/hidden": 0.16259765625, "loss/logits": 0.029688138514757156, "loss/reg": 0.023188097402453423, "step": 1577 }, { "epoch": 0.789, "grad_norm": 1.5263261795043945, "grad_norm_var": 0.12026800389912082, "learning_rate": 2e-05, "loss": 0.4356, "loss/crossentropy": 2.4031816720962524, "loss/hidden": 0.17431640625, "loss/logits": 0.02942817658185959, "loss/reg": 0.023185575380921364, "step": 1578 }, { "epoch": 0.7895, "grad_norm": 1.917905330657959, "grad_norm_var": 0.12337632181772822, "learning_rate": 2e-05, "loss": 0.4671, "loss/crossentropy": 2.400526762008667, "loss/hidden": 0.2021484375, "loss/logits": 0.033078462816774845, "loss/reg": 0.023183133453130722, "step": 1579 }, { "epoch": 0.79, "grad_norm": 2.0703365802764893, "grad_norm_var": 0.13181370320904567, "learning_rate": 2e-05, "loss": 0.4093, "loss/crossentropy": 1.912480115890503, "loss/hidden": 0.15869140625, "loss/logits": 0.018787679262459278, "loss/reg": 0.023180615156888962, "step": 1580 }, { "epoch": 0.7905, "grad_norm": 2.055941343307495, "grad_norm_var": 0.1342255915417723, "learning_rate": 2e-05, "loss": 0.526, "loss/crossentropy": 2.1755658388137817, "loss/hidden": 0.25927734375, "loss/logits": 0.03490355238318443, "loss/reg": 0.02317827008664608, "step": 1581 }, { "epoch": 0.791, "grad_norm": 1.1831824779510498, "grad_norm_var": 0.15430979289185978, "learning_rate": 2e-05, "loss": 0.4029, "loss/crossentropy": 2.3617440462112427, "loss/hidden": 0.14697265625, "loss/logits": 0.024197732098400593, "loss/reg": 0.02317577414214611, "step": 1582 }, { "epoch": 0.7915, "grad_norm": 1.4519609212875366, "grad_norm_var": 0.13745687144184232, "learning_rate": 2e-05, "loss": 0.4694, "loss/crossentropy": 2.295761823654175, "loss/hidden": 0.19921875, "loss/logits": 0.03844046592712402, "loss/reg": 0.02317335642874241, "step": 1583 }, { "epoch": 0.792, "grad_norm": 1.534197449684143, "grad_norm_var": 0.13389399149251766, "learning_rate": 2e-05, "loss": 0.4531, "loss/crossentropy": 2.2924489974975586, "loss/hidden": 0.1767578125, "loss/logits": 0.04463693127036095, "loss/reg": 0.023170989006757736, "step": 1584 }, { "epoch": 0.7925, "grad_norm": 1.8300005197525024, "grad_norm_var": 0.09707661533023403, "learning_rate": 2e-05, "loss": 0.5176, "loss/crossentropy": 2.3310474157333374, "loss/hidden": 0.23486328125, "loss/logits": 0.05109906196594238, "loss/reg": 0.023168709129095078, "step": 1585 }, { "epoch": 0.793, "grad_norm": 1.2439618110656738, "grad_norm_var": 0.10161223968455312, "learning_rate": 2e-05, "loss": 0.4037, "loss/crossentropy": 2.2324079275131226, "loss/hidden": 0.14453125, "loss/logits": 0.027506624348461628, "loss/reg": 0.02316616289317608, "step": 1586 }, { "epoch": 0.7935, "grad_norm": 1.6163288354873657, "grad_norm_var": 0.0938027555024466, "learning_rate": 2e-05, "loss": 0.4392, "loss/crossentropy": 2.536410927772522, "loss/hidden": 0.169921875, "loss/logits": 0.037604911252856255, "loss/reg": 0.023163635283708572, "step": 1587 }, { "epoch": 0.794, "grad_norm": 1.1579729318618774, "grad_norm_var": 0.10560929736321494, "learning_rate": 2e-05, "loss": 0.4123, "loss/crossentropy": 2.4252489805221558, "loss/hidden": 0.150390625, "loss/logits": 0.03028416447341442, "loss/reg": 0.023161334916949272, "step": 1588 }, { "epoch": 0.7945, "grad_norm": 2.056169271469116, "grad_norm_var": 0.11657917936412522, "learning_rate": 2e-05, "loss": 0.4863, "loss/crossentropy": 2.3423261642456055, "loss/hidden": 0.20361328125, "loss/logits": 0.05107624363154173, "loss/reg": 0.023159068077802658, "step": 1589 }, { "epoch": 0.795, "grad_norm": 1.2322633266448975, "grad_norm_var": 0.12664541026909054, "learning_rate": 2e-05, "loss": 0.4201, "loss/crossentropy": 2.258147120475769, "loss/hidden": 0.15966796875, "loss/logits": 0.028841855004429817, "loss/reg": 0.023156482726335526, "step": 1590 }, { "epoch": 0.7955, "grad_norm": 1.4345581531524658, "grad_norm_var": 0.12755737501392914, "learning_rate": 2e-05, "loss": 0.4291, "loss/crossentropy": 2.4102907180786133, "loss/hidden": 0.169921875, "loss/logits": 0.027651555836200714, "loss/reg": 0.023154061287641525, "step": 1591 }, { "epoch": 0.796, "grad_norm": 2.623196601867676, "grad_norm_var": 0.16902592388542997, "learning_rate": 2e-05, "loss": 0.5085, "loss/crossentropy": 2.4452123641967773, "loss/hidden": 0.23828125, "loss/logits": 0.03867449425160885, "loss/reg": 0.023151807487010956, "step": 1592 }, { "epoch": 0.7965, "grad_norm": 1.1513710021972656, "grad_norm_var": 0.18100263857273305, "learning_rate": 2e-05, "loss": 0.439, "loss/crossentropy": 2.2152241468429565, "loss/hidden": 0.1767578125, "loss/logits": 0.030737859196960926, "loss/reg": 0.023149540647864342, "step": 1593 }, { "epoch": 0.797, "grad_norm": 1.4377129077911377, "grad_norm_var": 0.18272251392224484, "learning_rate": 2e-05, "loss": 0.4097, "loss/crossentropy": 2.3043720722198486, "loss/hidden": 0.15283203125, "loss/logits": 0.025365683250129223, "loss/reg": 0.023146886378526688, "step": 1594 }, { "epoch": 0.7975, "grad_norm": 1.639028549194336, "grad_norm_var": 0.1766851802806513, "learning_rate": 2e-05, "loss": 0.4636, "loss/crossentropy": 2.422786235809326, "loss/hidden": 0.177734375, "loss/logits": 0.0544711509719491, "loss/reg": 0.023144405335187912, "step": 1595 }, { "epoch": 0.798, "grad_norm": 1.2421246767044067, "grad_norm_var": 0.1684333370511676, "learning_rate": 2e-05, "loss": 0.3977, "loss/crossentropy": 2.6057989597320557, "loss/hidden": 0.14111328125, "loss/logits": 0.025157983414828777, "loss/reg": 0.023141996935009956, "step": 1596 }, { "epoch": 0.7985, "grad_norm": 1.2711045742034912, "grad_norm_var": 0.1545756380850302, "learning_rate": 2e-05, "loss": 0.4284, "loss/crossentropy": 2.264176368713379, "loss/hidden": 0.16845703125, "loss/logits": 0.028568227775394917, "loss/reg": 0.02313930355012417, "step": 1597 }, { "epoch": 0.799, "grad_norm": 1.5185736417770386, "grad_norm_var": 0.14714454199060936, "learning_rate": 2e-05, "loss": 0.4114, "loss/crossentropy": 2.471498489379883, "loss/hidden": 0.14990234375, "loss/logits": 0.030109106563031673, "loss/reg": 0.02313670702278614, "step": 1598 }, { "epoch": 0.7995, "grad_norm": 1.553176760673523, "grad_norm_var": 0.14676495590723318, "learning_rate": 2e-05, "loss": 0.3957, "loss/crossentropy": 2.330732226371765, "loss/hidden": 0.13916015625, "loss/logits": 0.025215147994458675, "loss/reg": 0.023134108632802963, "step": 1599 }, { "epoch": 0.8, "grad_norm": 1.5054106712341309, "grad_norm_var": 0.14681544855401113, "learning_rate": 2e-05, "loss": 0.4242, "loss/crossentropy": 2.3858243227005005, "loss/hidden": 0.162109375, "loss/logits": 0.030760521069169044, "loss/reg": 0.023131774738430977, "step": 1600 }, { "epoch": 0.8005, "grad_norm": 1.565080165863037, "grad_norm_var": 0.1406777927219105, "learning_rate": 2e-05, "loss": 0.4291, "loss/crossentropy": 2.4610700607299805, "loss/hidden": 0.1630859375, "loss/logits": 0.03468041494488716, "loss/reg": 0.023129595443606377, "step": 1601 }, { "epoch": 0.801, "grad_norm": 1.212703824043274, "grad_norm_var": 0.1418705661983741, "learning_rate": 2e-05, "loss": 0.4174, "loss/crossentropy": 2.3021336793899536, "loss/hidden": 0.15966796875, "loss/logits": 0.026437725871801376, "loss/reg": 0.023127034306526184, "step": 1602 }, { "epoch": 0.8015, "grad_norm": 1.2017550468444824, "grad_norm_var": 0.1469311922279634, "learning_rate": 2e-05, "loss": 0.3971, "loss/crossentropy": 2.5906589031219482, "loss/hidden": 0.14208984375, "loss/logits": 0.023727728985249996, "loss/reg": 0.023124776780605316, "step": 1603 }, { "epoch": 0.802, "grad_norm": 3.218196392059326, "grad_norm_var": 0.3216560098977896, "learning_rate": 2e-05, "loss": 0.5194, "loss/crossentropy": 2.1525968313217163, "loss/hidden": 0.24365234375, "loss/logits": 0.044501783326268196, "loss/reg": 0.023122500628232956, "step": 1604 }, { "epoch": 0.8025, "grad_norm": 1.5749452114105225, "grad_norm_var": 0.3079126424294389, "learning_rate": 2e-05, "loss": 0.4253, "loss/crossentropy": 2.2141406536102295, "loss/hidden": 0.16552734375, "loss/logits": 0.02860566135495901, "loss/reg": 0.0231203343719244, "step": 1605 }, { "epoch": 0.803, "grad_norm": 1.7416962385177612, "grad_norm_var": 0.3000833317033832, "learning_rate": 2e-05, "loss": 0.4899, "loss/crossentropy": 2.4038604497909546, "loss/hidden": 0.22021484375, "loss/logits": 0.038527075201272964, "loss/reg": 0.023117849603295326, "step": 1606 }, { "epoch": 0.8035, "grad_norm": 12.239812850952148, "grad_norm_var": 7.332656902880251, "learning_rate": 2e-05, "loss": 0.5506, "loss/crossentropy": 2.144862651824951, "loss/hidden": 0.2763671875, "loss/logits": 0.04309249948710203, "loss/reg": 0.023115267977118492, "step": 1607 }, { "epoch": 0.804, "grad_norm": 1.863564133644104, "grad_norm_var": 7.335328194748469, "learning_rate": 2e-05, "loss": 0.4323, "loss/crossentropy": 2.537988543510437, "loss/hidden": 0.16845703125, "loss/logits": 0.032738376408815384, "loss/reg": 0.023112677037715912, "step": 1608 }, { "epoch": 0.8045, "grad_norm": 1.3629024028778076, "grad_norm_var": 7.307251217498798, "learning_rate": 2e-05, "loss": 0.4164, "loss/crossentropy": 2.1855788230895996, "loss/hidden": 0.1572265625, "loss/logits": 0.028044618666172028, "loss/reg": 0.023110322654247284, "step": 1609 }, { "epoch": 0.805, "grad_norm": 1.3974899053573608, "grad_norm_var": 7.311758223035725, "learning_rate": 2e-05, "loss": 0.4062, "loss/crossentropy": 2.217948317527771, "loss/hidden": 0.1484375, "loss/logits": 0.026731343939900398, "loss/reg": 0.02310797944664955, "step": 1610 }, { "epoch": 0.8055, "grad_norm": 2.0517637729644775, "grad_norm_var": 7.28841256335691, "learning_rate": 2e-05, "loss": 0.4398, "loss/crossentropy": 2.581295609474182, "loss/hidden": 0.17431640625, "loss/logits": 0.03437975142151117, "loss/reg": 0.023105405271053314, "step": 1611 }, { "epoch": 0.806, "grad_norm": 2.579063892364502, "grad_norm_var": 7.2146663129960755, "learning_rate": 2e-05, "loss": 0.4879, "loss/crossentropy": 2.704404354095459, "loss/hidden": 0.2119140625, "loss/logits": 0.04493995010852814, "loss/reg": 0.023102805018424988, "step": 1612 }, { "epoch": 0.8065, "grad_norm": 1.4066587686538696, "grad_norm_var": 7.196024324251629, "learning_rate": 2e-05, "loss": 0.4595, "loss/crossentropy": 2.1706148386001587, "loss/hidden": 0.19287109375, "loss/logits": 0.03565484471619129, "loss/reg": 0.023100463673472404, "step": 1613 }, { "epoch": 0.807, "grad_norm": 1.691936731338501, "grad_norm_var": 7.178116795127499, "learning_rate": 2e-05, "loss": 0.4514, "loss/crossentropy": 2.369131565093994, "loss/hidden": 0.18505859375, "loss/logits": 0.03535500913858414, "loss/reg": 0.02309785783290863, "step": 1614 }, { "epoch": 0.8075, "grad_norm": 1.2398101091384888, "grad_norm_var": 7.21902571074465, "learning_rate": 2e-05, "loss": 0.3903, "loss/crossentropy": 2.3177562952041626, "loss/hidden": 0.13525390625, "loss/logits": 0.0241070706397295, "loss/reg": 0.02309543453156948, "step": 1615 }, { "epoch": 0.808, "grad_norm": 2.775026559829712, "grad_norm_var": 7.17412256855064, "learning_rate": 2e-05, "loss": 0.4131, "loss/crossentropy": 2.624569892883301, "loss/hidden": 0.15576171875, "loss/logits": 0.02637580782175064, "loss/reg": 0.02309308759868145, "step": 1616 }, { "epoch": 0.8085, "grad_norm": 1.959854006767273, "grad_norm_var": 7.137539141392571, "learning_rate": 2e-05, "loss": 0.4264, "loss/crossentropy": 2.393427848815918, "loss/hidden": 0.16796875, "loss/logits": 0.0275327330455184, "loss/reg": 0.023090790957212448, "step": 1617 }, { "epoch": 0.809, "grad_norm": 1.2265393733978271, "grad_norm_var": 7.135232046007838, "learning_rate": 2e-05, "loss": 0.4337, "loss/crossentropy": 2.4681339263916016, "loss/hidden": 0.17138671875, "loss/logits": 0.03143185377120972, "loss/reg": 0.023088427260518074, "step": 1618 }, { "epoch": 0.8095, "grad_norm": 1.782827377319336, "grad_norm_var": 7.05802258224737, "learning_rate": 2e-05, "loss": 0.414, "loss/crossentropy": 2.497174382209778, "loss/hidden": 0.154296875, "loss/logits": 0.02879659365862608, "loss/reg": 0.023086171597242355, "step": 1619 }, { "epoch": 0.81, "grad_norm": 1.3240845203399658, "grad_norm_var": 7.1026412994491706, "learning_rate": 2e-05, "loss": 0.4085, "loss/crossentropy": 2.3640178442001343, "loss/hidden": 0.1533203125, "loss/logits": 0.024376518093049526, "loss/reg": 0.023083915933966637, "step": 1620 }, { "epoch": 0.8105, "grad_norm": 1.827821969985962, "grad_norm_var": 7.079203255275327, "learning_rate": 2e-05, "loss": 0.4672, "loss/crossentropy": 2.2232565879821777, "loss/hidden": 0.19677734375, "loss/logits": 0.039636192843317986, "loss/reg": 0.02308170683681965, "step": 1621 }, { "epoch": 0.811, "grad_norm": 1.488737940788269, "grad_norm_var": 7.105554975206242, "learning_rate": 2e-05, "loss": 0.4777, "loss/crossentropy": 2.3754860162734985, "loss/hidden": 0.201171875, "loss/logits": 0.04573565348982811, "loss/reg": 0.023079518228769302, "step": 1622 }, { "epoch": 0.8115, "grad_norm": 1.6029181480407715, "grad_norm_var": 0.20554311559602045, "learning_rate": 2e-05, "loss": 0.5146, "loss/crossentropy": 2.1231746673583984, "loss/hidden": 0.2275390625, "loss/logits": 0.056324394419789314, "loss/reg": 0.023077305406332016, "step": 1623 }, { "epoch": 0.812, "grad_norm": 1.273179054260254, "grad_norm_var": 0.21632680198712767, "learning_rate": 2e-05, "loss": 0.4327, "loss/crossentropy": 2.269154667854309, "loss/hidden": 0.16845703125, "loss/logits": 0.03346337750554085, "loss/reg": 0.02307521365582943, "step": 1624 }, { "epoch": 0.8125, "grad_norm": 1.5247483253479004, "grad_norm_var": 0.21097195205831215, "learning_rate": 2e-05, "loss": 0.4599, "loss/crossentropy": 2.123014211654663, "loss/hidden": 0.197265625, "loss/logits": 0.03189415484666824, "loss/reg": 0.02307269349694252, "step": 1625 }, { "epoch": 0.813, "grad_norm": 1.631244421005249, "grad_norm_var": 0.20505121684640598, "learning_rate": 2e-05, "loss": 0.4684, "loss/crossentropy": 2.1642907857894897, "loss/hidden": 0.19677734375, "loss/logits": 0.040870968252420425, "loss/reg": 0.02307022735476494, "step": 1626 }, { "epoch": 0.8135, "grad_norm": 1.8798401355743408, "grad_norm_var": 0.19910183072842996, "learning_rate": 2e-05, "loss": 0.4351, "loss/crossentropy": 2.588270664215088, "loss/hidden": 0.17333984375, "loss/logits": 0.031081863678991795, "loss/reg": 0.023067684844136238, "step": 1627 }, { "epoch": 0.814, "grad_norm": 1.2950395345687866, "grad_norm_var": 0.15180106705406657, "learning_rate": 2e-05, "loss": 0.4603, "loss/crossentropy": 2.419018030166626, "loss/hidden": 0.17138671875, "loss/logits": 0.05826069973409176, "loss/reg": 0.02306544780731201, "step": 1628 }, { "epoch": 0.8145, "grad_norm": 24.923316955566406, "grad_norm_var": 34.04542175114692, "learning_rate": 2e-05, "loss": 0.7786, "loss/crossentropy": 2.3988600969314575, "loss/hidden": 0.49560546875, "loss/logits": 0.0523617435246706, "loss/reg": 0.023063141852617264, "step": 1629 }, { "epoch": 0.815, "grad_norm": 2.5214052200317383, "grad_norm_var": 33.933755082592214, "learning_rate": 2e-05, "loss": 0.5531, "loss/crossentropy": 2.5637893676757812, "loss/hidden": 0.275390625, "loss/logits": 0.04713789001107216, "loss/reg": 0.023060709238052368, "step": 1630 }, { "epoch": 0.8155, "grad_norm": 1.2581124305725098, "grad_norm_var": 33.92913341630277, "learning_rate": 2e-05, "loss": 0.4302, "loss/crossentropy": 2.1320899724960327, "loss/hidden": 0.16845703125, "loss/logits": 0.03112439066171646, "loss/reg": 0.023058375343680382, "step": 1631 }, { "epoch": 0.816, "grad_norm": 2.5482232570648193, "grad_norm_var": 33.94348873438556, "learning_rate": 2e-05, "loss": 0.6642, "loss/crossentropy": 1.9117431640625, "loss/hidden": 0.37109375, "loss/logits": 0.0625968836247921, "loss/reg": 0.02305583469569683, "step": 1632 }, { "epoch": 0.8165, "grad_norm": 1.597198486328125, "grad_norm_var": 34.008253404182256, "learning_rate": 2e-05, "loss": 0.4802, "loss/crossentropy": 2.3248562812805176, "loss/hidden": 0.2060546875, "loss/logits": 0.04361774958670139, "loss/reg": 0.023053383454680443, "step": 1633 }, { "epoch": 0.817, "grad_norm": 1.1865489482879639, "grad_norm_var": 34.018377825217904, "learning_rate": 2e-05, "loss": 0.4076, "loss/crossentropy": 2.3980835676193237, "loss/hidden": 0.1484375, "loss/logits": 0.028623439371585846, "loss/reg": 0.023050816729664803, "step": 1634 }, { "epoch": 0.8175, "grad_norm": 1.4046670198440552, "grad_norm_var": 34.09393493073639, "learning_rate": 2e-05, "loss": 0.4186, "loss/crossentropy": 2.367344379425049, "loss/hidden": 0.15771484375, "loss/logits": 0.030404978431761265, "loss/reg": 0.023048415780067444, "step": 1635 }, { "epoch": 0.818, "grad_norm": 1.5358824729919434, "grad_norm_var": 34.04713949789893, "learning_rate": 2e-05, "loss": 0.3885, "loss/crossentropy": 2.635706663131714, "loss/hidden": 0.1318359375, "loss/logits": 0.026234203949570656, "loss/reg": 0.02304593101143837, "step": 1636 }, { "epoch": 0.8185, "grad_norm": 1.2111634016036987, "grad_norm_var": 34.174986550380915, "learning_rate": 2e-05, "loss": 0.4149, "loss/crossentropy": 2.4690955877304077, "loss/hidden": 0.15283203125, "loss/logits": 0.0316432137042284, "loss/reg": 0.023043323308229446, "step": 1637 }, { "epoch": 0.819, "grad_norm": 1.3363852500915527, "grad_norm_var": 34.20825665031358, "learning_rate": 2e-05, "loss": 0.4009, "loss/crossentropy": 2.242287516593933, "loss/hidden": 0.14501953125, "loss/logits": 0.025482993572950363, "loss/reg": 0.023040831089019775, "step": 1638 }, { "epoch": 0.8195, "grad_norm": 1.8490867614746094, "grad_norm_var": 34.16469112797808, "learning_rate": 2e-05, "loss": 0.4946, "loss/crossentropy": 2.4505655765533447, "loss/hidden": 0.21337890625, "loss/logits": 0.0508628049865365, "loss/reg": 0.023038217797875404, "step": 1639 }, { "epoch": 0.82, "grad_norm": 1.4550219774246216, "grad_norm_var": 34.12341073128782, "learning_rate": 2e-05, "loss": 0.4708, "loss/crossentropy": 2.149672269821167, "loss/hidden": 0.19970703125, "loss/logits": 0.040755780413746834, "loss/reg": 0.023035811260342598, "step": 1640 }, { "epoch": 0.8205, "grad_norm": 1.5251140594482422, "grad_norm_var": 34.12333527068636, "learning_rate": 2e-05, "loss": 0.4627, "loss/crossentropy": 2.3447247743606567, "loss/hidden": 0.197265625, "loss/logits": 0.03514695540070534, "loss/reg": 0.023033197969198227, "step": 1641 }, { "epoch": 0.821, "grad_norm": 13.460094451904297, "grad_norm_var": 40.59549407786183, "learning_rate": 2e-05, "loss": 0.5042, "loss/crossentropy": 2.601618528366089, "loss/hidden": 0.23779296875, "loss/logits": 0.03612148202955723, "loss/reg": 0.023030424490571022, "step": 1642 }, { "epoch": 0.8215, "grad_norm": 1.2327735424041748, "grad_norm_var": 40.78833425322313, "learning_rate": 2e-05, "loss": 0.4008, "loss/crossentropy": 2.4731369018554688, "loss/hidden": 0.140625, "loss/logits": 0.02993260882794857, "loss/reg": 0.023027580231428146, "step": 1643 }, { "epoch": 0.822, "grad_norm": 1.524492859840393, "grad_norm_var": 40.715868110383035, "learning_rate": 2e-05, "loss": 0.4387, "loss/crossentropy": 2.4846259355545044, "loss/hidden": 0.1748046875, "loss/logits": 0.03361409995704889, "loss/reg": 0.02302512526512146, "step": 1644 }, { "epoch": 0.8225, "grad_norm": 1.2758790254592896, "grad_norm_var": 9.018881776760608, "learning_rate": 2e-05, "loss": 0.434, "loss/crossentropy": 2.2878633737564087, "loss/hidden": 0.16796875, "loss/logits": 0.03582877665758133, "loss/reg": 0.023022696375846863, "step": 1645 }, { "epoch": 0.823, "grad_norm": 1.5986084938049316, "grad_norm_var": 9.045800842250319, "learning_rate": 2e-05, "loss": 0.4196, "loss/crossentropy": 2.3812626600265503, "loss/hidden": 0.1591796875, "loss/logits": 0.03021799586713314, "loss/reg": 0.02302025444805622, "step": 1646 }, { "epoch": 0.8235, "grad_norm": 1.4929598569869995, "grad_norm_var": 9.018190421650518, "learning_rate": 2e-05, "loss": 0.4045, "loss/crossentropy": 2.4511682987213135, "loss/hidden": 0.14892578125, "loss/logits": 0.025395757518708706, "loss/reg": 0.02301778830587864, "step": 1647 }, { "epoch": 0.824, "grad_norm": 1.4948441982269287, "grad_norm_var": 9.047710234698881, "learning_rate": 2e-05, "loss": 0.4522, "loss/crossentropy": 2.2096160650253296, "loss/hidden": 0.18505859375, "loss/logits": 0.03697221539914608, "loss/reg": 0.023015225306153297, "step": 1648 }, { "epoch": 0.8245, "grad_norm": 1.7332985401153564, "grad_norm_var": 9.0379509596088, "learning_rate": 2e-05, "loss": 0.4668, "loss/crossentropy": 2.3227975368499756, "loss/hidden": 0.19921875, "loss/logits": 0.037440777756273746, "loss/reg": 0.023012757301330566, "step": 1649 }, { "epoch": 0.825, "grad_norm": 2.2488136291503906, "grad_norm_var": 8.963901793690663, "learning_rate": 2e-05, "loss": 0.4448, "loss/crossentropy": 2.352696657180786, "loss/hidden": 0.18310546875, "loss/logits": 0.0316165778785944, "loss/reg": 0.023010345175862312, "step": 1650 }, { "epoch": 0.8255, "grad_norm": 1.5257325172424316, "grad_norm_var": 8.950789974582705, "learning_rate": 2e-05, "loss": 0.4114, "loss/crossentropy": 2.37747323513031, "loss/hidden": 0.1572265625, "loss/logits": 0.02410216350108385, "loss/reg": 0.02300778217613697, "step": 1651 }, { "epoch": 0.826, "grad_norm": 1.8591517210006714, "grad_norm_var": 8.92519375229253, "learning_rate": 2e-05, "loss": 0.4952, "loss/crossentropy": 2.2731701135635376, "loss/hidden": 0.2119140625, "loss/logits": 0.05323890969157219, "loss/reg": 0.023005163297057152, "step": 1652 }, { "epoch": 0.8265, "grad_norm": 1.6279247999191284, "grad_norm_var": 8.875463367206468, "learning_rate": 2e-05, "loss": 0.4234, "loss/crossentropy": 2.2822866439819336, "loss/hidden": 0.1630859375, "loss/logits": 0.030256139114499092, "loss/reg": 0.02300269901752472, "step": 1653 }, { "epoch": 0.827, "grad_norm": 1.2949538230895996, "grad_norm_var": 8.881045821586516, "learning_rate": 2e-05, "loss": 0.4295, "loss/crossentropy": 2.4483895301818848, "loss/hidden": 0.1630859375, "loss/logits": 0.0363735593855381, "loss/reg": 0.023000460118055344, "step": 1654 }, { "epoch": 0.8275, "grad_norm": 1.2874404191970825, "grad_norm_var": 8.936394709625619, "learning_rate": 2e-05, "loss": 0.4334, "loss/crossentropy": 2.3898115158081055, "loss/hidden": 0.16796875, "loss/logits": 0.03548043966293335, "loss/reg": 0.022998474538326263, "step": 1655 }, { "epoch": 0.828, "grad_norm": 1.5840253829956055, "grad_norm_var": 8.923075939282624, "learning_rate": 2e-05, "loss": 0.456, "loss/crossentropy": 2.3553361892700195, "loss/hidden": 0.1884765625, "loss/logits": 0.03758828155696392, "loss/reg": 0.022995930165052414, "step": 1656 }, { "epoch": 0.8285, "grad_norm": 1.1669474840164185, "grad_norm_var": 8.96799758421738, "learning_rate": 2e-05, "loss": 0.381, "loss/crossentropy": 2.548807144165039, "loss/hidden": 0.12744140625, "loss/logits": 0.023608416318893433, "loss/reg": 0.022993767634034157, "step": 1657 }, { "epoch": 0.829, "grad_norm": 1.3104734420776367, "grad_norm_var": 0.07534442062330123, "learning_rate": 2e-05, "loss": 0.4122, "loss/crossentropy": 2.2222912311553955, "loss/hidden": 0.154296875, "loss/logits": 0.027987757697701454, "loss/reg": 0.022991687059402466, "step": 1658 }, { "epoch": 0.8295, "grad_norm": 2.410997152328491, "grad_norm_var": 0.11759094401073747, "learning_rate": 2e-05, "loss": 0.6151, "loss/crossentropy": 2.0733728408813477, "loss/hidden": 0.333984375, "loss/logits": 0.05117853730916977, "loss/reg": 0.022989830002188683, "step": 1659 }, { "epoch": 0.83, "grad_norm": 1.55594003200531, "grad_norm_var": 0.11737898907536574, "learning_rate": 2e-05, "loss": 0.4358, "loss/crossentropy": 2.2041454315185547, "loss/hidden": 0.16845703125, "loss/logits": 0.03746410086750984, "loss/reg": 0.022987263277173042, "step": 1660 }, { "epoch": 0.8305, "grad_norm": 1.7716691493988037, "grad_norm_var": 0.11186125740769033, "learning_rate": 2e-05, "loss": 0.3991, "loss/crossentropy": 2.4366633892059326, "loss/hidden": 0.14697265625, "loss/logits": 0.02228802628815174, "loss/reg": 0.02298472821712494, "step": 1661 }, { "epoch": 0.831, "grad_norm": 1.4420490264892578, "grad_norm_var": 0.11389684457441239, "learning_rate": 2e-05, "loss": 0.4123, "loss/crossentropy": 2.599808931350708, "loss/hidden": 0.15673828125, "loss/logits": 0.025708637200295925, "loss/reg": 0.02298245020210743, "step": 1662 }, { "epoch": 0.8315, "grad_norm": 1.126309871673584, "grad_norm_var": 0.12816484039347759, "learning_rate": 2e-05, "loss": 0.3985, "loss/crossentropy": 2.2323700189590454, "loss/hidden": 0.1396484375, "loss/logits": 0.029093537479639053, "loss/reg": 0.022979876026511192, "step": 1663 }, { "epoch": 0.832, "grad_norm": 1.8172188997268677, "grad_norm_var": 0.13056853667108398, "learning_rate": 2e-05, "loss": 0.4711, "loss/crossentropy": 2.5063730478286743, "loss/hidden": 0.19873046875, "loss/logits": 0.0425629410892725, "loss/reg": 0.022977303713560104, "step": 1664 }, { "epoch": 0.8325, "grad_norm": 3.0363452434539795, "grad_norm_var": 0.2580790516701178, "learning_rate": 2e-05, "loss": 0.5382, "loss/crossentropy": 2.3657928705215454, "loss/hidden": 0.268310546875, "loss/logits": 0.04010665416717529, "loss/reg": 0.022974872961640358, "step": 1665 }, { "epoch": 0.833, "grad_norm": 1.2701658010482788, "grad_norm_var": 0.24523293891671988, "learning_rate": 2e-05, "loss": 0.3718, "loss/crossentropy": 2.386319637298584, "loss/hidden": 0.124267578125, "loss/logits": 0.01782753597944975, "loss/reg": 0.022972485050559044, "step": 1666 }, { "epoch": 0.8335, "grad_norm": 2.1191272735595703, "grad_norm_var": 0.2589543825866409, "learning_rate": 2e-05, "loss": 0.4094, "loss/crossentropy": 2.2309489250183105, "loss/hidden": 0.1494140625, "loss/logits": 0.030267059803009033, "loss/reg": 0.02296994999051094, "step": 1667 }, { "epoch": 0.834, "grad_norm": 1.3244271278381348, "grad_norm_var": 0.26316420886106257, "learning_rate": 2e-05, "loss": 0.4091, "loss/crossentropy": 2.2874940633773804, "loss/hidden": 0.1494140625, "loss/logits": 0.03000403381884098, "loss/reg": 0.02296753227710724, "step": 1668 }, { "epoch": 0.8345, "grad_norm": 1.4092116355895996, "grad_norm_var": 0.2663347603033822, "learning_rate": 2e-05, "loss": 0.4036, "loss/crossentropy": 2.384338140487671, "loss/hidden": 0.1474609375, "loss/logits": 0.026476314291357994, "loss/reg": 0.022965088486671448, "step": 1669 }, { "epoch": 0.835, "grad_norm": 1.62082040309906, "grad_norm_var": 0.2588288547408162, "learning_rate": 2e-05, "loss": 0.4793, "loss/crossentropy": 2.1965416073799133, "loss/hidden": 0.20751953125, "loss/logits": 0.042130330577492714, "loss/reg": 0.02296249382197857, "step": 1670 }, { "epoch": 0.8355, "grad_norm": 2.5013601779937744, "grad_norm_var": 0.29373184542219466, "learning_rate": 2e-05, "loss": 0.4534, "loss/crossentropy": 2.3103402853012085, "loss/hidden": 0.18896484375, "loss/logits": 0.03484947420656681, "loss/reg": 0.022959880530834198, "step": 1671 }, { "epoch": 0.836, "grad_norm": 1.700095295906067, "grad_norm_var": 0.2925206968647416, "learning_rate": 2e-05, "loss": 0.4456, "loss/crossentropy": 2.3426826000213623, "loss/hidden": 0.1748046875, "loss/logits": 0.04122760146856308, "loss/reg": 0.02295738458633423, "step": 1672 }, { "epoch": 0.8365, "grad_norm": 2.7791571617126465, "grad_norm_var": 0.3352385341546844, "learning_rate": 2e-05, "loss": 0.5213, "loss/crossentropy": 2.452883005142212, "loss/hidden": 0.248046875, "loss/logits": 0.04374842904508114, "loss/reg": 0.022954750806093216, "step": 1673 }, { "epoch": 0.837, "grad_norm": 1.3409507274627686, "grad_norm_var": 0.33320691501421645, "learning_rate": 2e-05, "loss": 0.4048, "loss/crossentropy": 2.351631283760071, "loss/hidden": 0.14892578125, "loss/logits": 0.0263042114675045, "loss/reg": 0.022952163591980934, "step": 1674 }, { "epoch": 0.8375, "grad_norm": 1.8099993467330933, "grad_norm_var": 0.3089535187739005, "learning_rate": 2e-05, "loss": 0.4723, "loss/crossentropy": 2.3619593381881714, "loss/hidden": 0.20703125, "loss/logits": 0.03580853994935751, "loss/reg": 0.02294965460896492, "step": 1675 }, { "epoch": 0.838, "grad_norm": 1.2641894817352295, "grad_norm_var": 0.3233415272972024, "learning_rate": 2e-05, "loss": 0.4738, "loss/crossentropy": 2.2749900817871094, "loss/hidden": 0.197265625, "loss/logits": 0.047040607780218124, "loss/reg": 0.022947140038013458, "step": 1676 }, { "epoch": 0.8385, "grad_norm": 1.43521249294281, "grad_norm_var": 0.3303785607627444, "learning_rate": 2e-05, "loss": 0.4134, "loss/crossentropy": 2.2958513498306274, "loss/hidden": 0.15576171875, "loss/logits": 0.02822498418390751, "loss/reg": 0.02294457145035267, "step": 1677 }, { "epoch": 0.839, "grad_norm": 1.880581259727478, "grad_norm_var": 0.32440405684143336, "learning_rate": 2e-05, "loss": 0.5203, "loss/crossentropy": 2.5314308404922485, "loss/hidden": 0.23583984375, "loss/logits": 0.05500957649201155, "loss/reg": 0.022941984236240387, "step": 1678 }, { "epoch": 0.8395, "grad_norm": 1.4752358198165894, "grad_norm_var": 0.3017318093173941, "learning_rate": 2e-05, "loss": 0.4043, "loss/crossentropy": 2.373136043548584, "loss/hidden": 0.14794921875, "loss/logits": 0.026981882750988007, "loss/reg": 0.022939518094062805, "step": 1679 }, { "epoch": 0.84, "grad_norm": 1.6214615106582642, "grad_norm_var": 0.3036514979065638, "learning_rate": 2e-05, "loss": 0.4526, "loss/crossentropy": 2.3645347356796265, "loss/hidden": 0.1845703125, "loss/logits": 0.038687046617269516, "loss/reg": 0.022937096655368805, "step": 1680 }, { "epoch": 0.8405, "grad_norm": 2.2418832778930664, "grad_norm_var": 0.2107344148931207, "learning_rate": 2e-05, "loss": 0.4214, "loss/crossentropy": 2.4823267459869385, "loss/hidden": 0.1640625, "loss/logits": 0.02803431637585163, "loss/reg": 0.022934794425964355, "step": 1681 }, { "epoch": 0.841, "grad_norm": 2.971312999725342, "grad_norm_var": 0.2856894840213674, "learning_rate": 2e-05, "loss": 0.6283, "loss/crossentropy": 2.1208351850509644, "loss/hidden": 0.37158203125, "loss/logits": 0.02735395822674036, "loss/reg": 0.02293219417333603, "step": 1682 }, { "epoch": 0.8415, "grad_norm": 1.5801922082901, "grad_norm_var": 0.28403227039487244, "learning_rate": 2e-05, "loss": 0.3926, "loss/crossentropy": 2.56937313079834, "loss/hidden": 0.13916015625, "loss/logits": 0.02419054415076971, "loss/reg": 0.022929731756448746, "step": 1683 }, { "epoch": 0.842, "grad_norm": 2.2374184131622314, "grad_norm_var": 0.2770492394930005, "learning_rate": 2e-05, "loss": 0.5308, "loss/crossentropy": 2.353764295578003, "loss/hidden": 0.25732421875, "loss/logits": 0.044213516637682915, "loss/reg": 0.02292727865278721, "step": 1684 }, { "epoch": 0.8425, "grad_norm": 1.4383546113967896, "grad_norm_var": 0.2753241881358552, "learning_rate": 2e-05, "loss": 0.4721, "loss/crossentropy": 2.1140084862709045, "loss/hidden": 0.20703125, "loss/logits": 0.03579618874937296, "loss/reg": 0.02292483299970627, "step": 1685 }, { "epoch": 0.843, "grad_norm": 1.4581494331359863, "grad_norm_var": 0.2823531072303079, "learning_rate": 2e-05, "loss": 0.4387, "loss/crossentropy": 2.34401535987854, "loss/hidden": 0.1787109375, "loss/logits": 0.030774756334722042, "loss/reg": 0.022922255098819733, "step": 1686 }, { "epoch": 0.8435, "grad_norm": 1.4093376398086548, "grad_norm_var": 0.263278753257605, "learning_rate": 2e-05, "loss": 0.4136, "loss/crossentropy": 2.465882182121277, "loss/hidden": 0.15234375, "loss/logits": 0.032055970281362534, "loss/reg": 0.022919660434126854, "step": 1687 }, { "epoch": 0.844, "grad_norm": 1.5240459442138672, "grad_norm_var": 0.2673313757129769, "learning_rate": 2e-05, "loss": 0.398, "loss/crossentropy": 2.320050001144409, "loss/hidden": 0.141845703125, "loss/logits": 0.026945553719997406, "loss/reg": 0.022917049005627632, "step": 1688 }, { "epoch": 0.8445, "grad_norm": 7.262933731079102, "grad_norm_var": 2.121647862424502, "learning_rate": 2e-05, "loss": 0.9268, "loss/crossentropy": 2.2761436700820923, "loss/hidden": 0.48876953125, "loss/logits": 0.20890014059841633, "loss/reg": 0.022914528846740723, "step": 1689 }, { "epoch": 0.845, "grad_norm": 1.4794998168945312, "grad_norm_var": 2.1095745457299615, "learning_rate": 2e-05, "loss": 0.4308, "loss/crossentropy": 2.334906578063965, "loss/hidden": 0.171875, "loss/logits": 0.029798144474625587, "loss/reg": 0.022911950945854187, "step": 1690 }, { "epoch": 0.8455, "grad_norm": 1.4296311140060425, "grad_norm_var": 2.1317074764367883, "learning_rate": 2e-05, "loss": 0.4302, "loss/crossentropy": 2.331926465034485, "loss/hidden": 0.16552734375, "loss/logits": 0.0355403907597065, "loss/reg": 0.02290956676006317, "step": 1691 }, { "epoch": 0.846, "grad_norm": 2.1096997261047363, "grad_norm_var": 2.0884379174542818, "learning_rate": 2e-05, "loss": 0.4455, "loss/crossentropy": 2.557571768760681, "loss/hidden": 0.177734375, "loss/logits": 0.03866210114210844, "loss/reg": 0.022907033562660217, "step": 1692 }, { "epoch": 0.8465, "grad_norm": 1.6497454643249512, "grad_norm_var": 2.072379136217235, "learning_rate": 2e-05, "loss": 0.4447, "loss/crossentropy": 2.39884877204895, "loss/hidden": 0.18359375, "loss/logits": 0.03204050101339817, "loss/reg": 0.022904478013515472, "step": 1693 }, { "epoch": 0.847, "grad_norm": 3.0329835414886475, "grad_norm_var": 2.120038982631581, "learning_rate": 2e-05, "loss": 0.6206, "loss/crossentropy": 2.4439542293548584, "loss/hidden": 0.294921875, "loss/logits": 0.09666961058974266, "loss/reg": 0.022901998832821846, "step": 1694 }, { "epoch": 0.8475, "grad_norm": 1.6038068532943726, "grad_norm_var": 2.1089456280954626, "learning_rate": 2e-05, "loss": 0.4652, "loss/crossentropy": 2.129871666431427, "loss/hidden": 0.19921875, "loss/logits": 0.03698125295341015, "loss/reg": 0.022899584844708443, "step": 1695 }, { "epoch": 0.848, "grad_norm": 1.957082986831665, "grad_norm_var": 2.0905146641594694, "learning_rate": 2e-05, "loss": 0.4753, "loss/crossentropy": 2.3653637170791626, "loss/hidden": 0.19970703125, "loss/logits": 0.04664120636880398, "loss/reg": 0.02289716713130474, "step": 1696 }, { "epoch": 0.8485, "grad_norm": 1.42384672164917, "grad_norm_var": 2.129038865225132, "learning_rate": 2e-05, "loss": 0.3962, "loss/crossentropy": 2.5141024589538574, "loss/hidden": 0.140625, "loss/logits": 0.026606767438352108, "loss/reg": 0.022894656285643578, "step": 1697 }, { "epoch": 0.849, "grad_norm": 1.4435638189315796, "grad_norm_var": 2.1097529678037024, "learning_rate": 2e-05, "loss": 0.4624, "loss/crossentropy": 2.211042284965515, "loss/hidden": 0.19580078125, "loss/logits": 0.0376845495775342, "loss/reg": 0.02289220504462719, "step": 1698 }, { "epoch": 0.8495, "grad_norm": 1.3736546039581299, "grad_norm_var": 2.125770387110932, "learning_rate": 2e-05, "loss": 0.4857, "loss/crossentropy": 2.224379062652588, "loss/hidden": 0.21533203125, "loss/logits": 0.041461410000920296, "loss/reg": 0.02288985066115856, "step": 1699 }, { "epoch": 0.85, "grad_norm": 1.2207798957824707, "grad_norm_var": 2.1652485676396744, "learning_rate": 2e-05, "loss": 0.4432, "loss/crossentropy": 2.3198060989379883, "loss/hidden": 0.17919921875, "loss/logits": 0.03512590378522873, "loss/reg": 0.022887248545885086, "step": 1700 }, { "epoch": 0.8505, "grad_norm": 3.1013894081115723, "grad_norm_var": 2.2161002754379138, "learning_rate": 2e-05, "loss": 0.4683, "loss/crossentropy": 2.369223117828369, "loss/hidden": 0.2021484375, "loss/logits": 0.03727924171835184, "loss/reg": 0.02288457751274109, "step": 1701 }, { "epoch": 0.851, "grad_norm": 1.519582986831665, "grad_norm_var": 2.2111400237679426, "learning_rate": 2e-05, "loss": 0.4225, "loss/crossentropy": 2.1703940629959106, "loss/hidden": 0.1669921875, "loss/logits": 0.02673946786671877, "loss/reg": 0.022881818935275078, "step": 1702 }, { "epoch": 0.8515, "grad_norm": 1.3196645975112915, "grad_norm_var": 2.219856788865909, "learning_rate": 2e-05, "loss": 0.3877, "loss/crossentropy": 2.4375799894332886, "loss/hidden": 0.1357421875, "loss/logits": 0.02319456171244383, "loss/reg": 0.02287893183529377, "step": 1703 }, { "epoch": 0.852, "grad_norm": 1.607952356338501, "grad_norm_var": 2.2139568549493474, "learning_rate": 2e-05, "loss": 0.4585, "loss/crossentropy": 2.397140145301819, "loss/hidden": 0.18994140625, "loss/logits": 0.039843300357460976, "loss/reg": 0.022875996306538582, "step": 1704 }, { "epoch": 0.8525, "grad_norm": 1.3758463859558105, "grad_norm_var": 0.32430155493763096, "learning_rate": 2e-05, "loss": 0.4734, "loss/crossentropy": 2.4673901796340942, "loss/hidden": 0.193359375, "loss/logits": 0.05126242712140083, "loss/reg": 0.022873075678944588, "step": 1705 }, { "epoch": 0.853, "grad_norm": 1.749706506729126, "grad_norm_var": 0.31991028408618616, "learning_rate": 2e-05, "loss": 0.4687, "loss/crossentropy": 2.74143385887146, "loss/hidden": 0.193359375, "loss/logits": 0.046618303284049034, "loss/reg": 0.022870399057865143, "step": 1706 }, { "epoch": 0.8535, "grad_norm": 2.9021739959716797, "grad_norm_var": 0.3935280096896221, "learning_rate": 2e-05, "loss": 0.61, "loss/crossentropy": 2.360695719718933, "loss/hidden": 0.33447265625, "loss/logits": 0.046809954568743706, "loss/reg": 0.022867854684591293, "step": 1707 }, { "epoch": 0.854, "grad_norm": 1.5873744487762451, "grad_norm_var": 0.3915854985762199, "learning_rate": 2e-05, "loss": 0.5241, "loss/crossentropy": 2.2110289335250854, "loss/hidden": 0.240234375, "loss/logits": 0.05524888634681702, "loss/reg": 0.022865328937768936, "step": 1708 }, { "epoch": 0.8545, "grad_norm": 2.6934444904327393, "grad_norm_var": 0.4381563541382609, "learning_rate": 2e-05, "loss": 0.4892, "loss/crossentropy": 1.9379909038543701, "loss/hidden": 0.23095703125, "loss/logits": 0.029659108258783817, "loss/reg": 0.02286284975707531, "step": 1709 }, { "epoch": 0.855, "grad_norm": 1.629195213317871, "grad_norm_var": 0.3435589120556684, "learning_rate": 2e-05, "loss": 0.4169, "loss/crossentropy": 2.4776086807250977, "loss/hidden": 0.15576171875, "loss/logits": 0.032544512301683426, "loss/reg": 0.022860383614897728, "step": 1710 }, { "epoch": 0.8555, "grad_norm": 2.148226022720337, "grad_norm_var": 0.3491618389264744, "learning_rate": 2e-05, "loss": 0.485, "loss/crossentropy": 2.5047671794891357, "loss/hidden": 0.205078125, "loss/logits": 0.05138644762337208, "loss/reg": 0.02285795472562313, "step": 1711 }, { "epoch": 0.856, "grad_norm": 1.456741452217102, "grad_norm_var": 0.35538574638478854, "learning_rate": 2e-05, "loss": 0.404, "loss/crossentropy": 2.458739399909973, "loss/hidden": 0.14892578125, "loss/logits": 0.0265263793990016, "loss/reg": 0.02285546064376831, "step": 1712 }, { "epoch": 0.8565, "grad_norm": 1.3901511430740356, "grad_norm_var": 0.3570773520934078, "learning_rate": 2e-05, "loss": 0.4505, "loss/crossentropy": 2.324171304702759, "loss/hidden": 0.18505859375, "loss/logits": 0.03695343807339668, "loss/reg": 0.022852910682559013, "step": 1713 }, { "epoch": 0.857, "grad_norm": 1.6300503015518188, "grad_norm_var": 0.35082418432478046, "learning_rate": 2e-05, "loss": 0.4168, "loss/crossentropy": 2.6420832872390747, "loss/hidden": 0.15771484375, "loss/logits": 0.030615486204624176, "loss/reg": 0.02285032905638218, "step": 1714 }, { "epoch": 0.8575, "grad_norm": 1.3466908931732178, "grad_norm_var": 0.35238126851175644, "learning_rate": 2e-05, "loss": 0.3992, "loss/crossentropy": 2.43982470035553, "loss/hidden": 0.1455078125, "loss/logits": 0.025227680802345276, "loss/reg": 0.022847697138786316, "step": 1715 }, { "epoch": 0.858, "grad_norm": 2.9759249687194824, "grad_norm_var": 0.41113615805510123, "learning_rate": 2e-05, "loss": 0.3993, "loss/crossentropy": 2.382121205329895, "loss/hidden": 0.148681640625, "loss/logits": 0.02220490388572216, "loss/reg": 0.022845016792416573, "step": 1716 }, { "epoch": 0.8585, "grad_norm": 1.4040857553482056, "grad_norm_var": 0.3197881529322027, "learning_rate": 2e-05, "loss": 0.4771, "loss/crossentropy": 2.0994767546653748, "loss/hidden": 0.20849609375, "loss/logits": 0.04016950540244579, "loss/reg": 0.022842474281787872, "step": 1717 }, { "epoch": 0.859, "grad_norm": 1.6990382671356201, "grad_norm_var": 0.3151857693459073, "learning_rate": 2e-05, "loss": 0.5367, "loss/crossentropy": 2.323665142059326, "loss/hidden": 0.27392578125, "loss/logits": 0.034341275691986084, "loss/reg": 0.022839896380901337, "step": 1718 }, { "epoch": 0.8595, "grad_norm": 1.2173619270324707, "grad_norm_var": 0.3224909500736409, "learning_rate": 2e-05, "loss": 0.4331, "loss/crossentropy": 2.1206226348876953, "loss/hidden": 0.17333984375, "loss/logits": 0.031357141211628914, "loss/reg": 0.02283727563917637, "step": 1719 }, { "epoch": 0.86, "grad_norm": 2.2023870944976807, "grad_norm_var": 0.3292850127322119, "learning_rate": 2e-05, "loss": 0.5505, "loss/crossentropy": 2.327507257461548, "loss/hidden": 0.27001953125, "loss/logits": 0.052125243470072746, "loss/reg": 0.022834734991192818, "step": 1720 }, { "epoch": 0.8605, "grad_norm": 1.5860549211502075, "grad_norm_var": 0.319092889556803, "learning_rate": 2e-05, "loss": 0.4335, "loss/crossentropy": 2.394924759864807, "loss/hidden": 0.173828125, "loss/logits": 0.031398216262459755, "loss/reg": 0.022832229733467102, "step": 1721 }, { "epoch": 0.861, "grad_norm": 2.052626132965088, "grad_norm_var": 0.3207301547447267, "learning_rate": 2e-05, "loss": 0.4896, "loss/crossentropy": 2.6150401830673218, "loss/hidden": 0.21728515625, "loss/logits": 0.04404893517494202, "loss/reg": 0.02282971516251564, "step": 1722 }, { "epoch": 0.8615, "grad_norm": 1.9768868684768677, "grad_norm_var": 0.246910721101532, "learning_rate": 2e-05, "loss": 0.5072, "loss/crossentropy": 2.564804196357727, "loss/hidden": 0.23876953125, "loss/logits": 0.040187520906329155, "loss/reg": 0.022827180102467537, "step": 1723 }, { "epoch": 0.862, "grad_norm": 1.905512809753418, "grad_norm_var": 0.2436969642283363, "learning_rate": 2e-05, "loss": 0.4706, "loss/crossentropy": 2.4824973344802856, "loss/hidden": 0.21337890625, "loss/logits": 0.028959065675735474, "loss/reg": 0.02282462827861309, "step": 1724 }, { "epoch": 0.8625, "grad_norm": 1.9956358671188354, "grad_norm_var": 0.19399456280608826, "learning_rate": 2e-05, "loss": 0.4786, "loss/crossentropy": 2.287666082382202, "loss/hidden": 0.20751953125, "loss/logits": 0.042856570333242416, "loss/reg": 0.022822000086307526, "step": 1725 }, { "epoch": 0.863, "grad_norm": 1.5660924911499023, "grad_norm_var": 0.19558407654289164, "learning_rate": 2e-05, "loss": 0.4187, "loss/crossentropy": 2.3453445434570312, "loss/hidden": 0.16455078125, "loss/logits": 0.025976940989494324, "loss/reg": 0.022819381207227707, "step": 1726 }, { "epoch": 0.8635, "grad_norm": 1.2870204448699951, "grad_norm_var": 0.2001835773595658, "learning_rate": 2e-05, "loss": 0.4523, "loss/crossentropy": 2.185749650001526, "loss/hidden": 0.185546875, "loss/logits": 0.038570983335375786, "loss/reg": 0.022816654294729233, "step": 1727 }, { "epoch": 0.864, "grad_norm": 1.5679943561553955, "grad_norm_var": 0.19689234439128783, "learning_rate": 2e-05, "loss": 0.48, "loss/crossentropy": 2.1167298555374146, "loss/hidden": 0.21630859375, "loss/logits": 0.0355659443885088, "loss/reg": 0.022814186289906502, "step": 1728 }, { "epoch": 0.8645, "grad_norm": 1.7449185848236084, "grad_norm_var": 0.1883177922944227, "learning_rate": 2e-05, "loss": 0.4252, "loss/crossentropy": 2.49346387386322, "loss/hidden": 0.162109375, "loss/logits": 0.03502298891544342, "loss/reg": 0.022811725735664368, "step": 1729 }, { "epoch": 0.865, "grad_norm": 1.6579017639160156, "grad_norm_var": 0.18788410072038247, "learning_rate": 2e-05, "loss": 0.52, "loss/crossentropy": 2.4262338876724243, "loss/hidden": 0.24169921875, "loss/logits": 0.05023909732699394, "loss/reg": 0.0228092223405838, "step": 1730 }, { "epoch": 0.8655, "grad_norm": 1.4867043495178223, "grad_norm_var": 0.18136299973852205, "learning_rate": 2e-05, "loss": 0.4134, "loss/crossentropy": 2.3994067907333374, "loss/hidden": 0.1552734375, "loss/logits": 0.03004833124577999, "loss/reg": 0.022806638851761818, "step": 1731 }, { "epoch": 0.866, "grad_norm": 1.3810359239578247, "grad_norm_var": 0.08398193136193673, "learning_rate": 2e-05, "loss": 0.4072, "loss/crossentropy": 2.3403743505477905, "loss/hidden": 0.1533203125, "loss/logits": 0.025856359861791134, "loss/reg": 0.02280416525900364, "step": 1732 }, { "epoch": 0.8665, "grad_norm": 1.6184099912643433, "grad_norm_var": 0.0792338392069519, "learning_rate": 2e-05, "loss": 0.416, "loss/crossentropy": 2.6625880002975464, "loss/hidden": 0.15234375, "loss/logits": 0.03561602905392647, "loss/reg": 0.022801598533988, "step": 1733 }, { "epoch": 0.867, "grad_norm": 2.438000202178955, "grad_norm_var": 0.11483483909980136, "learning_rate": 2e-05, "loss": 0.4855, "loss/crossentropy": 2.478938102722168, "loss/hidden": 0.20556640625, "loss/logits": 0.05192135088145733, "loss/reg": 0.022799065336585045, "step": 1734 }, { "epoch": 0.8675, "grad_norm": 1.7405439615249634, "grad_norm_var": 0.09616209020188246, "learning_rate": 2e-05, "loss": 0.5137, "loss/crossentropy": 2.30058753490448, "loss/hidden": 0.23095703125, "loss/logits": 0.05478241667151451, "loss/reg": 0.022796491160988808, "step": 1735 }, { "epoch": 0.868, "grad_norm": 1.431205153465271, "grad_norm_var": 0.08815077463142741, "learning_rate": 2e-05, "loss": 0.4983, "loss/crossentropy": 2.2017308473587036, "loss/hidden": 0.2216796875, "loss/logits": 0.0487048402428627, "loss/reg": 0.022793902084231377, "step": 1736 }, { "epoch": 0.8685, "grad_norm": 9.956767082214355, "grad_norm_var": 4.3237782917981535, "learning_rate": 2e-05, "loss": 1.0532, "loss/crossentropy": 3.484397292137146, "loss/hidden": 0.634765625, "loss/logits": 0.19047586619853973, "loss/reg": 0.022791236639022827, "step": 1737 }, { "epoch": 0.869, "grad_norm": 1.2082791328430176, "grad_norm_var": 4.389199988572325, "learning_rate": 2e-05, "loss": 0.4212, "loss/crossentropy": 2.3602211475372314, "loss/hidden": 0.162109375, "loss/logits": 0.03119662031531334, "loss/reg": 0.02278871089220047, "step": 1738 }, { "epoch": 0.8695, "grad_norm": 1.8653035163879395, "grad_norm_var": 4.393077132745998, "learning_rate": 2e-05, "loss": 0.563, "loss/crossentropy": 1.9802654385566711, "loss/hidden": 0.29443359375, "loss/logits": 0.04070642963051796, "loss/reg": 0.022786037996411324, "step": 1739 }, { "epoch": 0.87, "grad_norm": 1.7705045938491821, "grad_norm_var": 4.399125143377921, "learning_rate": 2e-05, "loss": 0.4537, "loss/crossentropy": 2.4105314016342163, "loss/hidden": 0.193359375, "loss/logits": 0.032505772076547146, "loss/reg": 0.022783316671848297, "step": 1740 }, { "epoch": 0.8705, "grad_norm": 1.8441373109817505, "grad_norm_var": 4.404077104357423, "learning_rate": 2e-05, "loss": 0.4757, "loss/crossentropy": 2.494503378868103, "loss/hidden": 0.21533203125, "loss/logits": 0.03259473852813244, "loss/reg": 0.022780809551477432, "step": 1741 }, { "epoch": 0.871, "grad_norm": 1.637903094291687, "grad_norm_var": 4.3987100041283655, "learning_rate": 2e-05, "loss": 0.4373, "loss/crossentropy": 2.5231049060821533, "loss/hidden": 0.17529296875, "loss/logits": 0.03421847615391016, "loss/reg": 0.022778036072850227, "step": 1742 }, { "epoch": 0.8715, "grad_norm": 1.2547270059585571, "grad_norm_var": 4.402554673430745, "learning_rate": 2e-05, "loss": 0.4012, "loss/crossentropy": 2.4014939069747925, "loss/hidden": 0.14404296875, "loss/logits": 0.029431598260998726, "loss/reg": 0.02277528867125511, "step": 1743 }, { "epoch": 0.872, "grad_norm": 1.684816598892212, "grad_norm_var": 4.394143219321391, "learning_rate": 2e-05, "loss": 0.4864, "loss/crossentropy": 2.567805290222168, "loss/hidden": 0.212890625, "loss/logits": 0.04573212191462517, "loss/reg": 0.022772807627916336, "step": 1744 }, { "epoch": 0.8725, "grad_norm": 1.8819024562835693, "grad_norm_var": 4.387550777046777, "learning_rate": 2e-05, "loss": 0.5153, "loss/crossentropy": 2.206387996673584, "loss/hidden": 0.23828125, "loss/logits": 0.049341777339577675, "loss/reg": 0.022770432755351067, "step": 1745 }, { "epoch": 0.873, "grad_norm": 1.7741963863372803, "grad_norm_var": 4.380321608464935, "learning_rate": 2e-05, "loss": 0.4103, "loss/crossentropy": 2.610047698020935, "loss/hidden": 0.15966796875, "loss/logits": 0.022908887825906277, "loss/reg": 0.022767851129174232, "step": 1746 }, { "epoch": 0.8735, "grad_norm": 3.9074506759643555, "grad_norm_var": 4.520894958490501, "learning_rate": 2e-05, "loss": 0.6931, "loss/crossentropy": 2.1363461017608643, "loss/hidden": 0.365234375, "loss/logits": 0.10018501989543438, "loss/reg": 0.02276543714106083, "step": 1747 }, { "epoch": 0.874, "grad_norm": 1.198498249053955, "grad_norm_var": 4.546248895237187, "learning_rate": 2e-05, "loss": 0.4262, "loss/crossentropy": 2.4276458024978638, "loss/hidden": 0.16796875, "loss/logits": 0.030584653839468956, "loss/reg": 0.022762905806303024, "step": 1748 }, { "epoch": 0.8745, "grad_norm": 1.802208662033081, "grad_norm_var": 4.531024858198469, "learning_rate": 2e-05, "loss": 0.4372, "loss/crossentropy": 2.446824789047241, "loss/hidden": 0.18017578125, "loss/logits": 0.029467890039086342, "loss/reg": 0.02276029624044895, "step": 1749 }, { "epoch": 0.875, "grad_norm": 1.8234444856643677, "grad_norm_var": 4.546376504661152, "learning_rate": 2e-05, "loss": 0.4537, "loss/crossentropy": 2.524027705192566, "loss/hidden": 0.18603515625, "loss/logits": 0.04007681459188461, "loss/reg": 0.022757630795240402, "step": 1750 }, { "epoch": 0.8755, "grad_norm": 1.8809150457382202, "grad_norm_var": 4.537158333397111, "learning_rate": 2e-05, "loss": 0.4648, "loss/crossentropy": 2.4926512241363525, "loss/hidden": 0.20556640625, "loss/logits": 0.03168141841888428, "loss/reg": 0.02275506965816021, "step": 1751 }, { "epoch": 0.876, "grad_norm": 1.2176272869110107, "grad_norm_var": 4.564967615041626, "learning_rate": 2e-05, "loss": 0.4126, "loss/crossentropy": 2.351579189300537, "loss/hidden": 0.15869140625, "loss/logits": 0.026421986520290375, "loss/reg": 0.022752393037080765, "step": 1752 }, { "epoch": 0.8765, "grad_norm": 2.546253204345703, "grad_norm_var": 0.4261500613285089, "learning_rate": 2e-05, "loss": 0.4591, "loss/crossentropy": 2.254626989364624, "loss/hidden": 0.19189453125, "loss/logits": 0.039690613746643066, "loss/reg": 0.022749925032258034, "step": 1753 }, { "epoch": 0.877, "grad_norm": 1.4721755981445312, "grad_norm_var": 0.4085867001765545, "learning_rate": 2e-05, "loss": 0.4259, "loss/crossentropy": 2.4233288764953613, "loss/hidden": 0.16748046875, "loss/logits": 0.03093926515430212, "loss/reg": 0.022747157141566277, "step": 1754 }, { "epoch": 0.8775, "grad_norm": 1.2497280836105347, "grad_norm_var": 0.43081935423290685, "learning_rate": 2e-05, "loss": 0.3901, "loss/crossentropy": 2.395901918411255, "loss/hidden": 0.138916015625, "loss/logits": 0.023775647394359112, "loss/reg": 0.022744452580809593, "step": 1755 }, { "epoch": 0.878, "grad_norm": 1.8739466667175293, "grad_norm_var": 0.4309550360190786, "learning_rate": 2e-05, "loss": 0.4859, "loss/crossentropy": 2.5788021087646484, "loss/hidden": 0.21923828125, "loss/logits": 0.03924744948744774, "loss/reg": 0.02274180017411709, "step": 1756 }, { "epoch": 0.8785, "grad_norm": 1.8786344528198242, "grad_norm_var": 0.43116057997378515, "learning_rate": 2e-05, "loss": 0.433, "loss/crossentropy": 2.2879987955093384, "loss/hidden": 0.17431640625, "loss/logits": 0.03129947930574417, "loss/reg": 0.02273917943239212, "step": 1757 }, { "epoch": 0.879, "grad_norm": 1.6786879301071167, "grad_norm_var": 0.4302863936647914, "learning_rate": 2e-05, "loss": 0.507, "loss/crossentropy": 2.3784435987472534, "loss/hidden": 0.236328125, "loss/logits": 0.04326160717755556, "loss/reg": 0.022736700251698494, "step": 1758 }, { "epoch": 0.8795, "grad_norm": 1.2841837406158447, "grad_norm_var": 0.42811919905549467, "learning_rate": 2e-05, "loss": 0.4027, "loss/crossentropy": 2.5569673776626587, "loss/hidden": 0.14990234375, "loss/logits": 0.025482705794274807, "loss/reg": 0.02273416332900524, "step": 1759 }, { "epoch": 0.88, "grad_norm": 1.5086617469787598, "grad_norm_var": 0.43328459560282606, "learning_rate": 2e-05, "loss": 0.5087, "loss/crossentropy": 2.386002779006958, "loss/hidden": 0.2529296875, "loss/logits": 0.028444298543035984, "loss/reg": 0.02273155376315117, "step": 1760 }, { "epoch": 0.8805, "grad_norm": 1.7824084758758545, "grad_norm_var": 0.4329647889707303, "learning_rate": 2e-05, "loss": 0.4786, "loss/crossentropy": 2.3847200870513916, "loss/hidden": 0.20849609375, "loss/logits": 0.04277382045984268, "loss/reg": 0.02272888645529747, "step": 1761 }, { "epoch": 0.881, "grad_norm": 4.4069061279296875, "grad_norm_var": 0.8553708809071431, "learning_rate": 2e-05, "loss": 0.4992, "loss/crossentropy": 2.23597252368927, "loss/hidden": 0.2265625, "loss/logits": 0.04541921988129616, "loss/reg": 0.02272612974047661, "step": 1762 }, { "epoch": 0.8815, "grad_norm": 1.4857374429702759, "grad_norm_var": 0.596154104293141, "learning_rate": 2e-05, "loss": 0.4126, "loss/crossentropy": 2.1902058124542236, "loss/hidden": 0.15771484375, "loss/logits": 0.027693829499185085, "loss/reg": 0.022723568603396416, "step": 1763 }, { "epoch": 0.882, "grad_norm": 1.1129963397979736, "grad_norm_var": 0.6036749302760668, "learning_rate": 2e-05, "loss": 0.404, "loss/crossentropy": 2.1529598236083984, "loss/hidden": 0.14794921875, "loss/logits": 0.028801556676626205, "loss/reg": 0.022720852866768837, "step": 1764 }, { "epoch": 0.8825, "grad_norm": 1.3617935180664062, "grad_norm_var": 0.6164186737964911, "learning_rate": 2e-05, "loss": 0.4394, "loss/crossentropy": 2.4010159969329834, "loss/hidden": 0.17578125, "loss/logits": 0.03646496683359146, "loss/reg": 0.022718340158462524, "step": 1765 }, { "epoch": 0.883, "grad_norm": 1.3944129943847656, "grad_norm_var": 0.6257383981751897, "learning_rate": 2e-05, "loss": 0.4295, "loss/crossentropy": 2.249167323112488, "loss/hidden": 0.17236328125, "loss/logits": 0.029935719445347786, "loss/reg": 0.022715754806995392, "step": 1766 }, { "epoch": 0.8835, "grad_norm": 1.9540674686431885, "grad_norm_var": 0.6272674150301994, "learning_rate": 2e-05, "loss": 0.445, "loss/crossentropy": 2.6090357303619385, "loss/hidden": 0.1884765625, "loss/logits": 0.029395846650004387, "loss/reg": 0.022712942212820053, "step": 1767 }, { "epoch": 0.884, "grad_norm": 1.2891789674758911, "grad_norm_var": 0.62238428300894, "learning_rate": 2e-05, "loss": 0.3992, "loss/crossentropy": 2.690170645713806, "loss/hidden": 0.146484375, "loss/logits": 0.02565884869545698, "loss/reg": 0.022710150107741356, "step": 1768 }, { "epoch": 0.8845, "grad_norm": 1.2826368808746338, "grad_norm_var": 0.590971243638222, "learning_rate": 2e-05, "loss": 0.4288, "loss/crossentropy": 2.5179227590560913, "loss/hidden": 0.17236328125, "loss/logits": 0.029347356408834457, "loss/reg": 0.022707320749759674, "step": 1769 }, { "epoch": 0.885, "grad_norm": 1.7831594944000244, "grad_norm_var": 0.588045487335727, "learning_rate": 2e-05, "loss": 0.4584, "loss/crossentropy": 2.4644254446029663, "loss/hidden": 0.197265625, "loss/logits": 0.03404002822935581, "loss/reg": 0.022704841569066048, "step": 1770 }, { "epoch": 0.8855, "grad_norm": 1.3630361557006836, "grad_norm_var": 0.5819252647022783, "learning_rate": 2e-05, "loss": 0.4236, "loss/crossentropy": 2.4129849672317505, "loss/hidden": 0.1630859375, "loss/logits": 0.03353873174637556, "loss/reg": 0.022702371701598167, "step": 1771 }, { "epoch": 0.886, "grad_norm": 1.395241379737854, "grad_norm_var": 0.5861043275034279, "learning_rate": 2e-05, "loss": 0.452, "loss/crossentropy": 2.3290704488754272, "loss/hidden": 0.1845703125, "loss/logits": 0.04044055938720703, "loss/reg": 0.022700009867548943, "step": 1772 }, { "epoch": 0.8865, "grad_norm": 1.7868410348892212, "grad_norm_var": 0.584262372098181, "learning_rate": 2e-05, "loss": 0.5297, "loss/crossentropy": 2.2407915592193604, "loss/hidden": 0.25830078125, "loss/logits": 0.044432349503040314, "loss/reg": 0.022697754204273224, "step": 1773 }, { "epoch": 0.887, "grad_norm": 1.151076316833496, "grad_norm_var": 0.6017088609785968, "learning_rate": 2e-05, "loss": 0.4063, "loss/crossentropy": 2.358174681663513, "loss/hidden": 0.15185546875, "loss/logits": 0.027467947453260422, "loss/reg": 0.022695155814290047, "step": 1774 }, { "epoch": 0.8875, "grad_norm": 1.4789233207702637, "grad_norm_var": 0.5946741348237327, "learning_rate": 2e-05, "loss": 0.492, "loss/crossentropy": 2.0869343280792236, "loss/hidden": 0.21875, "loss/logits": 0.046349382027983665, "loss/reg": 0.022692805156111717, "step": 1775 }, { "epoch": 0.888, "grad_norm": 1.7221815586090088, "grad_norm_var": 0.5932558452639825, "learning_rate": 2e-05, "loss": 0.4587, "loss/crossentropy": 2.397850751876831, "loss/hidden": 0.19140625, "loss/logits": 0.04042772948741913, "loss/reg": 0.02269033156335354, "step": 1776 }, { "epoch": 0.8885, "grad_norm": 1.2972921133041382, "grad_norm_var": 0.6008173321053277, "learning_rate": 2e-05, "loss": 0.4126, "loss/crossentropy": 2.3940863609313965, "loss/hidden": 0.15673828125, "loss/logits": 0.028958087787032127, "loss/reg": 0.02268776297569275, "step": 1777 }, { "epoch": 0.889, "grad_norm": 1.3768194913864136, "grad_norm_var": 0.05743777499220073, "learning_rate": 2e-05, "loss": 0.4215, "loss/crossentropy": 2.385925769805908, "loss/hidden": 0.162109375, "loss/logits": 0.03253248520195484, "loss/reg": 0.022685421630740166, "step": 1778 }, { "epoch": 0.8895, "grad_norm": 1.526502013206482, "grad_norm_var": 0.057723853573745744, "learning_rate": 2e-05, "loss": 0.4341, "loss/crossentropy": 2.2163840532302856, "loss/hidden": 0.17578125, "loss/logits": 0.03145410865545273, "loss/reg": 0.022683102637529373, "step": 1779 }, { "epoch": 0.89, "grad_norm": 1.6970871686935425, "grad_norm_var": 0.05243035328896462, "learning_rate": 2e-05, "loss": 0.4789, "loss/crossentropy": 2.2332208156585693, "loss/hidden": 0.20654296875, "loss/logits": 0.04551873542368412, "loss/reg": 0.022680532187223434, "step": 1780 }, { "epoch": 0.8905, "grad_norm": 1.4872901439666748, "grad_norm_var": 0.051248249436363774, "learning_rate": 2e-05, "loss": 0.4758, "loss/crossentropy": 2.301728844642639, "loss/hidden": 0.2080078125, "loss/logits": 0.04098478890955448, "loss/reg": 0.022678013890981674, "step": 1781 }, { "epoch": 0.891, "grad_norm": 1.4966388940811157, "grad_norm_var": 0.050474361598935265, "learning_rate": 2e-05, "loss": 0.4928, "loss/crossentropy": 2.034238338470459, "loss/hidden": 0.22265625, "loss/logits": 0.04338419623672962, "loss/reg": 0.022675424814224243, "step": 1782 }, { "epoch": 0.8915, "grad_norm": 1.5417438745498657, "grad_norm_var": 0.03643927829848502, "learning_rate": 2e-05, "loss": 0.4808, "loss/crossentropy": 2.180203914642334, "loss/hidden": 0.2060546875, "loss/logits": 0.048051947727799416, "loss/reg": 0.02267291769385338, "step": 1783 }, { "epoch": 0.892, "grad_norm": 1.3023936748504639, "grad_norm_var": 0.03611445252943189, "learning_rate": 2e-05, "loss": 0.4562, "loss/crossentropy": 2.403857707977295, "loss/hidden": 0.19189453125, "loss/logits": 0.037562835961580276, "loss/reg": 0.022670235484838486, "step": 1784 }, { "epoch": 0.8925, "grad_norm": 1.5010074377059937, "grad_norm_var": 0.033332240131487785, "learning_rate": 2e-05, "loss": 0.4343, "loss/crossentropy": 2.3974303007125854, "loss/hidden": 0.17431640625, "loss/logits": 0.03334982506930828, "loss/reg": 0.02266768552362919, "step": 1785 }, { "epoch": 0.893, "grad_norm": 1.8571279048919678, "grad_norm_var": 0.036524026921363806, "learning_rate": 2e-05, "loss": 0.4137, "loss/crossentropy": 2.7112414836883545, "loss/hidden": 0.1552734375, "loss/logits": 0.03175277356058359, "loss/reg": 0.022665170952677727, "step": 1786 }, { "epoch": 0.8935, "grad_norm": 1.2478469610214233, "grad_norm_var": 0.0394388347318376, "learning_rate": 2e-05, "loss": 0.4274, "loss/crossentropy": 2.3169617652893066, "loss/hidden": 0.1708984375, "loss/logits": 0.02990701049566269, "loss/reg": 0.02266273833811283, "step": 1787 }, { "epoch": 0.894, "grad_norm": 2.265127182006836, "grad_norm_var": 0.07555353783638541, "learning_rate": 2e-05, "loss": 0.4095, "loss/crossentropy": 2.3823176622390747, "loss/hidden": 0.16015625, "loss/logits": 0.02273565251380205, "loss/reg": 0.02266021817922592, "step": 1788 }, { "epoch": 0.8945, "grad_norm": 1.3702856302261353, "grad_norm_var": 0.07302160323975777, "learning_rate": 2e-05, "loss": 0.3994, "loss/crossentropy": 2.4472192525863647, "loss/hidden": 0.14794921875, "loss/logits": 0.02484053187072277, "loss/reg": 0.02265772968530655, "step": 1789 }, { "epoch": 0.895, "grad_norm": 1.5002284049987793, "grad_norm_var": 0.06346798172954345, "learning_rate": 2e-05, "loss": 0.4878, "loss/crossentropy": 2.291977286338806, "loss/hidden": 0.2177734375, "loss/logits": 0.043478766456246376, "loss/reg": 0.022655179724097252, "step": 1790 }, { "epoch": 0.8955, "grad_norm": 1.1956472396850586, "grad_norm_var": 0.07085745843397048, "learning_rate": 2e-05, "loss": 0.421, "loss/crossentropy": 2.269936203956604, "loss/hidden": 0.16552734375, "loss/logits": 0.02890065312385559, "loss/reg": 0.02265259623527527, "step": 1791 }, { "epoch": 0.896, "grad_norm": 1.631693720817566, "grad_norm_var": 0.068979061781067, "learning_rate": 2e-05, "loss": 0.4571, "loss/crossentropy": 2.319582223892212, "loss/hidden": 0.193359375, "loss/logits": 0.0372174559161067, "loss/reg": 0.02265011891722679, "step": 1792 }, { "epoch": 0.8965, "grad_norm": 1.1987940073013306, "grad_norm_var": 0.07248952922075773, "learning_rate": 2e-05, "loss": 0.4329, "loss/crossentropy": 2.2927104234695435, "loss/hidden": 0.17431640625, "loss/logits": 0.03212358243763447, "loss/reg": 0.022647712379693985, "step": 1793 }, { "epoch": 0.897, "grad_norm": 2.1740849018096924, "grad_norm_var": 0.09781844329650032, "learning_rate": 2e-05, "loss": 0.5068, "loss/crossentropy": 1.931494951248169, "loss/hidden": 0.24462890625, "loss/logits": 0.03571862727403641, "loss/reg": 0.022645175457000732, "step": 1794 }, { "epoch": 0.8975, "grad_norm": 2.1789815425872803, "grad_norm_var": 0.12133015992480196, "learning_rate": 2e-05, "loss": 0.4422, "loss/crossentropy": 2.1458447575569153, "loss/hidden": 0.1875, "loss/logits": 0.02825307659804821, "loss/reg": 0.022642606869339943, "step": 1795 }, { "epoch": 0.898, "grad_norm": 1.4859169721603394, "grad_norm_var": 0.12146453537655641, "learning_rate": 2e-05, "loss": 0.458, "loss/crossentropy": 2.554581642150879, "loss/hidden": 0.19189453125, "loss/logits": 0.03974371217191219, "loss/reg": 0.02264014631509781, "step": 1796 }, { "epoch": 0.8985, "grad_norm": 1.7171391248703003, "grad_norm_var": 0.121628688093484, "learning_rate": 2e-05, "loss": 0.4388, "loss/crossentropy": 2.5617785453796387, "loss/hidden": 0.17626953125, "loss/logits": 0.03613244369626045, "loss/reg": 0.022637590765953064, "step": 1797 }, { "epoch": 0.899, "grad_norm": 1.327169418334961, "grad_norm_var": 0.12585053460300416, "learning_rate": 2e-05, "loss": 0.4193, "loss/crossentropy": 2.2237210273742676, "loss/hidden": 0.1611328125, "loss/logits": 0.03177984245121479, "loss/reg": 0.022635027766227722, "step": 1798 }, { "epoch": 0.8995, "grad_norm": 1.7142618894577026, "grad_norm_var": 0.12652134086684536, "learning_rate": 2e-05, "loss": 0.4097, "loss/crossentropy": 2.3350863456726074, "loss/hidden": 0.153564453125, "loss/logits": 0.02983055729418993, "loss/reg": 0.02263249270617962, "step": 1799 }, { "epoch": 0.9, "grad_norm": 1.9557993412017822, "grad_norm_var": 0.12690867583912677, "learning_rate": 2e-05, "loss": 0.4866, "loss/crossentropy": 2.4636796712875366, "loss/hidden": 0.21826171875, "loss/logits": 0.042068254202604294, "loss/reg": 0.02263004146516323, "step": 1800 }, { "epoch": 0.9005, "grad_norm": 1.1416701078414917, "grad_norm_var": 0.14188113240769837, "learning_rate": 2e-05, "loss": 0.4022, "loss/crossentropy": 2.5174624919891357, "loss/hidden": 0.14794921875, "loss/logits": 0.02796847652643919, "loss/reg": 0.022627437487244606, "step": 1801 }, { "epoch": 0.901, "grad_norm": 1.4578477144241333, "grad_norm_var": 0.13936010822746894, "learning_rate": 2e-05, "loss": 0.4365, "loss/crossentropy": 2.3277111053466797, "loss/hidden": 0.17578125, "loss/logits": 0.03451576270163059, "loss/reg": 0.02262502908706665, "step": 1802 }, { "epoch": 0.9015, "grad_norm": 2.0312423706054688, "grad_norm_var": 0.14117838718365097, "learning_rate": 2e-05, "loss": 0.3985, "loss/crossentropy": 2.541975498199463, "loss/hidden": 0.1455078125, "loss/logits": 0.026732699014246464, "loss/reg": 0.02262257970869541, "step": 1803 }, { "epoch": 0.902, "grad_norm": 2.132697820663452, "grad_norm_var": 0.13135331477077988, "learning_rate": 2e-05, "loss": 0.4634, "loss/crossentropy": 2.439447283744812, "loss/hidden": 0.2001953125, "loss/logits": 0.03704650327563286, "loss/reg": 0.022620007395744324, "step": 1804 }, { "epoch": 0.9025, "grad_norm": 1.7248797416687012, "grad_norm_var": 0.12653841640955357, "learning_rate": 2e-05, "loss": 0.4427, "loss/crossentropy": 2.479643940925598, "loss/hidden": 0.17822265625, "loss/logits": 0.038322363048791885, "loss/reg": 0.02261737734079361, "step": 1805 }, { "epoch": 0.903, "grad_norm": 1.5996307134628296, "grad_norm_var": 0.12503174039449808, "learning_rate": 2e-05, "loss": 0.4873, "loss/crossentropy": 2.3944002389907837, "loss/hidden": 0.21728515625, "loss/logits": 0.04385751113295555, "loss/reg": 0.02261476404964924, "step": 1806 }, { "epoch": 0.9035, "grad_norm": 2.0911006927490234, "grad_norm_var": 0.11890385472204343, "learning_rate": 2e-05, "loss": 0.4095, "loss/crossentropy": 2.336695432662964, "loss/hidden": 0.158447265625, "loss/logits": 0.024968229234218597, "loss/reg": 0.022612126544117928, "step": 1807 }, { "epoch": 0.904, "grad_norm": 1.5537153482437134, "grad_norm_var": 0.12022990836071562, "learning_rate": 2e-05, "loss": 0.4726, "loss/crossentropy": 2.2479125261306763, "loss/hidden": 0.22021484375, "loss/logits": 0.026327339932322502, "loss/reg": 0.022609485313296318, "step": 1808 }, { "epoch": 0.9045, "grad_norm": 1.2499186992645264, "grad_norm_var": 0.11685534109740553, "learning_rate": 2e-05, "loss": 0.4078, "loss/crossentropy": 2.4833693504333496, "loss/hidden": 0.1494140625, "loss/logits": 0.03229031339287758, "loss/reg": 0.02260700799524784, "step": 1809 }, { "epoch": 0.905, "grad_norm": 1.5342835187911987, "grad_norm_var": 0.10378850866723704, "learning_rate": 2e-05, "loss": 0.4023, "loss/crossentropy": 2.1959651708602905, "loss/hidden": 0.150390625, "loss/logits": 0.02590491622686386, "loss/reg": 0.022604528814554214, "step": 1810 }, { "epoch": 0.9055, "grad_norm": 1.044647455215454, "grad_norm_var": 0.10889354132386113, "learning_rate": 2e-05, "loss": 0.3867, "loss/crossentropy": 2.1947706937789917, "loss/hidden": 0.1357421875, "loss/logits": 0.0249461866915226, "loss/reg": 0.022602051496505737, "step": 1811 }, { "epoch": 0.906, "grad_norm": 3.011995792388916, "grad_norm_var": 0.2291783334976803, "learning_rate": 2e-05, "loss": 0.5375, "loss/crossentropy": 2.6274040937423706, "loss/hidden": 0.26025390625, "loss/logits": 0.05128267593681812, "loss/reg": 0.022599538788199425, "step": 1812 }, { "epoch": 0.9065, "grad_norm": 1.5391746759414673, "grad_norm_var": 0.2308816121342284, "learning_rate": 2e-05, "loss": 0.436, "loss/crossentropy": 2.5975476503372192, "loss/hidden": 0.1787109375, "loss/logits": 0.031343039125204086, "loss/reg": 0.022597048431634903, "step": 1813 }, { "epoch": 0.907, "grad_norm": 1.6539846658706665, "grad_norm_var": 0.22155591112929945, "learning_rate": 2e-05, "loss": 0.4405, "loss/crossentropy": 2.389395594596863, "loss/hidden": 0.1796875, "loss/logits": 0.03484657034277916, "loss/reg": 0.022594643756747246, "step": 1814 }, { "epoch": 0.9075, "grad_norm": 1.2229188680648804, "grad_norm_var": 0.23667999380509078, "learning_rate": 2e-05, "loss": 0.4212, "loss/crossentropy": 2.35276997089386, "loss/hidden": 0.1650390625, "loss/logits": 0.030212889425456524, "loss/reg": 0.02259230427443981, "step": 1815 }, { "epoch": 0.908, "grad_norm": 1.4916632175445557, "grad_norm_var": 0.23332946859573647, "learning_rate": 2e-05, "loss": 0.4277, "loss/crossentropy": 2.2875664830207825, "loss/hidden": 0.16650390625, "loss/logits": 0.035321952775120735, "loss/reg": 0.02258998341858387, "step": 1816 }, { "epoch": 0.9085, "grad_norm": 1.2740663290023804, "grad_norm_var": 0.2253617779282422, "learning_rate": 2e-05, "loss": 0.4286, "loss/crossentropy": 2.3370351791381836, "loss/hidden": 0.16455078125, "loss/logits": 0.03822075389325619, "loss/reg": 0.02258743718266487, "step": 1817 }, { "epoch": 0.909, "grad_norm": 1.2872508764266968, "grad_norm_var": 0.23185537664945718, "learning_rate": 2e-05, "loss": 0.4221, "loss/crossentropy": 2.4487507343292236, "loss/hidden": 0.16357421875, "loss/logits": 0.03271046280860901, "loss/reg": 0.02258501760661602, "step": 1818 }, { "epoch": 0.9095, "grad_norm": 1.5939817428588867, "grad_norm_var": 0.2217355171208072, "learning_rate": 2e-05, "loss": 0.4138, "loss/crossentropy": 2.6092634201049805, "loss/hidden": 0.16015625, "loss/logits": 0.02780964784324169, "loss/reg": 0.022582601755857468, "step": 1819 }, { "epoch": 0.91, "grad_norm": 16.670120239257812, "grad_norm_var": 14.413642548278558, "learning_rate": 2e-05, "loss": 0.5563, "loss/crossentropy": 2.3885061740875244, "loss/hidden": 0.2939453125, "loss/logits": 0.036548784002661705, "loss/reg": 0.022580305114388466, "step": 1820 }, { "epoch": 0.9105, "grad_norm": 1.2790286540985107, "grad_norm_var": 14.474163637655296, "learning_rate": 2e-05, "loss": 0.373, "loss/crossentropy": 2.6208510398864746, "loss/hidden": 0.12646484375, "loss/logits": 0.020793078001588583, "loss/reg": 0.022577952593564987, "step": 1821 }, { "epoch": 0.911, "grad_norm": 2.2307803630828857, "grad_norm_var": 14.422778758807375, "learning_rate": 2e-05, "loss": 0.4049, "loss/crossentropy": 2.3613555431365967, "loss/hidden": 0.1572265625, "loss/logits": 0.02187713049352169, "loss/reg": 0.022575698792934418, "step": 1822 }, { "epoch": 0.9115, "grad_norm": 1.8077691793441772, "grad_norm_var": 14.44496363143063, "learning_rate": 2e-05, "loss": 0.4596, "loss/crossentropy": 2.2239835262298584, "loss/hidden": 0.20068359375, "loss/logits": 0.03322533704340458, "loss/reg": 0.022573480382561684, "step": 1823 }, { "epoch": 0.912, "grad_norm": 1.8814700841903687, "grad_norm_var": 14.409108100365703, "learning_rate": 2e-05, "loss": 0.4243, "loss/crossentropy": 2.503218650817871, "loss/hidden": 0.16748046875, "loss/logits": 0.031097950413823128, "loss/reg": 0.022571343928575516, "step": 1824 }, { "epoch": 0.9125, "grad_norm": 1.4032585620880127, "grad_norm_var": 14.384031530190613, "learning_rate": 2e-05, "loss": 0.4415, "loss/crossentropy": 2.498712182044983, "loss/hidden": 0.18408203125, "loss/logits": 0.03172986023128033, "loss/reg": 0.022569168359041214, "step": 1825 }, { "epoch": 0.913, "grad_norm": 1.4820616245269775, "grad_norm_var": 14.391329331953612, "learning_rate": 2e-05, "loss": 0.3925, "loss/crossentropy": 2.2149962186813354, "loss/hidden": 0.14404296875, "loss/logits": 0.022828245535492897, "loss/reg": 0.022566672414541245, "step": 1826 }, { "epoch": 0.9135, "grad_norm": 1.4449611902236938, "grad_norm_var": 14.320749149874818, "learning_rate": 2e-05, "loss": 0.4056, "loss/crossentropy": 2.385851740837097, "loss/hidden": 0.14794921875, "loss/logits": 0.03205075114965439, "loss/reg": 0.022564470767974854, "step": 1827 }, { "epoch": 0.914, "grad_norm": 1.4951767921447754, "grad_norm_var": 14.377107771874954, "learning_rate": 2e-05, "loss": 0.4165, "loss/crossentropy": 2.23067569732666, "loss/hidden": 0.16015625, "loss/logits": 0.030674993991851807, "loss/reg": 0.02256196364760399, "step": 1828 }, { "epoch": 0.9145, "grad_norm": 1.4566495418548584, "grad_norm_var": 14.38793906557842, "learning_rate": 2e-05, "loss": 0.4193, "loss/crossentropy": 2.438162684440613, "loss/hidden": 0.162109375, "loss/logits": 0.03160354122519493, "loss/reg": 0.02255944348871708, "step": 1829 }, { "epoch": 0.915, "grad_norm": 1.2917461395263672, "grad_norm_var": 14.436020724601905, "learning_rate": 2e-05, "loss": 0.4317, "loss/crossentropy": 2.349228262901306, "loss/hidden": 0.1708984375, "loss/logits": 0.03521360456943512, "loss/reg": 0.022557225078344345, "step": 1830 }, { "epoch": 0.9155, "grad_norm": 1.3939363956451416, "grad_norm_var": 14.40970744042121, "learning_rate": 2e-05, "loss": 0.376, "loss/crossentropy": 2.434686541557312, "loss/hidden": 0.130126953125, "loss/logits": 0.020286419428884983, "loss/reg": 0.022554853931069374, "step": 1831 }, { "epoch": 0.916, "grad_norm": 1.1982911825180054, "grad_norm_var": 14.453267319482267, "learning_rate": 2e-05, "loss": 0.406, "loss/crossentropy": 2.2608832120895386, "loss/hidden": 0.1513671875, "loss/logits": 0.02907765470445156, "loss/reg": 0.02255268208682537, "step": 1832 }, { "epoch": 0.9165, "grad_norm": 1.3199797868728638, "grad_norm_var": 14.446203864298448, "learning_rate": 2e-05, "loss": 0.4498, "loss/crossentropy": 2.0910937786102295, "loss/hidden": 0.1884765625, "loss/logits": 0.03581584058701992, "loss/reg": 0.022550417110323906, "step": 1833 }, { "epoch": 0.917, "grad_norm": 1.717532753944397, "grad_norm_var": 14.390936544297686, "learning_rate": 2e-05, "loss": 0.4676, "loss/crossentropy": 2.2329607009887695, "loss/hidden": 0.20849609375, "loss/logits": 0.03362170793116093, "loss/reg": 0.022548070177435875, "step": 1834 }, { "epoch": 0.9175, "grad_norm": 1.6943548917770386, "grad_norm_var": 14.379719646058886, "learning_rate": 2e-05, "loss": 0.4554, "loss/crossentropy": 2.37869393825531, "loss/hidden": 0.19384765625, "loss/logits": 0.036072161979973316, "loss/reg": 0.022545799612998962, "step": 1835 }, { "epoch": 0.918, "grad_norm": 2.670581817626953, "grad_norm_var": 0.15172412365397647, "learning_rate": 2e-05, "loss": 0.5334, "loss/crossentropy": 2.3655420541763306, "loss/hidden": 0.2705078125, "loss/logits": 0.03744707256555557, "loss/reg": 0.022543571889400482, "step": 1836 }, { "epoch": 0.9185, "grad_norm": 1.4796888828277588, "grad_norm_var": 0.14537294518872693, "learning_rate": 2e-05, "loss": 0.446, "loss/crossentropy": 2.2026679515838623, "loss/hidden": 0.1884765625, "loss/logits": 0.03215072676539421, "loss/reg": 0.022541362792253494, "step": 1837 }, { "epoch": 0.919, "grad_norm": 1.465540885925293, "grad_norm_var": 0.11996093294203304, "learning_rate": 2e-05, "loss": 0.3955, "loss/crossentropy": 2.6018182039260864, "loss/hidden": 0.146484375, "loss/logits": 0.023670999333262444, "loss/reg": 0.022538956254720688, "step": 1838 }, { "epoch": 0.9195, "grad_norm": 2.6521716117858887, "grad_norm_var": 0.19071007315725008, "learning_rate": 2e-05, "loss": 0.5236, "loss/crossentropy": 2.290159225463867, "loss/hidden": 0.25390625, "loss/logits": 0.044362759217619896, "loss/reg": 0.022536424919962883, "step": 1839 }, { "epoch": 0.92, "grad_norm": 1.2103041410446167, "grad_norm_var": 0.19617798026988734, "learning_rate": 2e-05, "loss": 0.4108, "loss/crossentropy": 2.3787648677825928, "loss/hidden": 0.15185546875, "loss/logits": 0.033578867092728615, "loss/reg": 0.022533901035785675, "step": 1840 }, { "epoch": 0.9205, "grad_norm": 1.3619086742401123, "grad_norm_var": 0.19729243671530577, "learning_rate": 2e-05, "loss": 0.3874, "loss/crossentropy": 2.400606870651245, "loss/hidden": 0.138671875, "loss/logits": 0.023377398028969765, "loss/reg": 0.022531181573867798, "step": 1841 }, { "epoch": 0.921, "grad_norm": 1.7256075143814087, "grad_norm_var": 0.19770787293851314, "learning_rate": 2e-05, "loss": 0.3938, "loss/crossentropy": 2.6239322423934937, "loss/hidden": 0.13623046875, "loss/logits": 0.03229370526969433, "loss/reg": 0.02252843603491783, "step": 1842 }, { "epoch": 0.9215, "grad_norm": 1.4745891094207764, "grad_norm_var": 0.1971555977191843, "learning_rate": 2e-05, "loss": 0.3882, "loss/crossentropy": 2.5068957805633545, "loss/hidden": 0.13671875, "loss/logits": 0.026268533430993557, "loss/reg": 0.022525638341903687, "step": 1843 }, { "epoch": 0.922, "grad_norm": 1.3436163663864136, "grad_norm_var": 0.20071971118220464, "learning_rate": 2e-05, "loss": 0.3943, "loss/crossentropy": 2.5273977518081665, "loss/hidden": 0.14453125, "loss/logits": 0.024529898539185524, "loss/reg": 0.02252272516489029, "step": 1844 }, { "epoch": 0.9225, "grad_norm": 2.0100479125976562, "grad_norm_var": 0.209944773268783, "learning_rate": 2e-05, "loss": 0.4533, "loss/crossentropy": 2.466127634048462, "loss/hidden": 0.193359375, "loss/logits": 0.03471413720399141, "loss/reg": 0.02252020128071308, "step": 1845 }, { "epoch": 0.923, "grad_norm": 1.6746459007263184, "grad_norm_var": 0.20206274459075116, "learning_rate": 2e-05, "loss": 0.4434, "loss/crossentropy": 2.244032859802246, "loss/hidden": 0.18359375, "loss/logits": 0.034620098769664764, "loss/reg": 0.02251766063272953, "step": 1846 }, { "epoch": 0.9235, "grad_norm": 1.7665760517120361, "grad_norm_var": 0.19804128550095795, "learning_rate": 2e-05, "loss": 0.4023, "loss/crossentropy": 2.1969178915023804, "loss/hidden": 0.15087890625, "loss/logits": 0.026284687221050262, "loss/reg": 0.022515103220939636, "step": 1847 }, { "epoch": 0.924, "grad_norm": 2.707465648651123, "grad_norm_var": 0.24490152911908372, "learning_rate": 2e-05, "loss": 0.4971, "loss/crossentropy": 2.5745354890823364, "loss/hidden": 0.23193359375, "loss/logits": 0.0400242879986763, "loss/reg": 0.022512590512633324, "step": 1848 }, { "epoch": 0.9245, "grad_norm": 1.9823905229568481, "grad_norm_var": 0.23282989475386653, "learning_rate": 2e-05, "loss": 0.4681, "loss/crossentropy": 2.053453028202057, "loss/hidden": 0.20849609375, "loss/logits": 0.03454894572496414, "loss/reg": 0.02251008152961731, "step": 1849 }, { "epoch": 0.925, "grad_norm": 1.7281039953231812, "grad_norm_var": 0.23270857087946387, "learning_rate": 2e-05, "loss": 0.4487, "loss/crossentropy": 2.4333807229995728, "loss/hidden": 0.18359375, "loss/logits": 0.04003257304430008, "loss/reg": 0.022507477551698685, "step": 1850 }, { "epoch": 0.9255, "grad_norm": 1.4039613008499146, "grad_norm_var": 0.24242675596621838, "learning_rate": 2e-05, "loss": 0.4234, "loss/crossentropy": 2.3681410551071167, "loss/hidden": 0.1640625, "loss/logits": 0.03433472663164139, "loss/reg": 0.022504812106490135, "step": 1851 }, { "epoch": 0.926, "grad_norm": 1.361372947692871, "grad_norm_var": 0.196025750965984, "learning_rate": 2e-05, "loss": 0.4174, "loss/crossentropy": 2.1348154544830322, "loss/hidden": 0.1572265625, "loss/logits": 0.03517003171145916, "loss/reg": 0.022502336651086807, "step": 1852 }, { "epoch": 0.9265, "grad_norm": 1.5376478433609009, "grad_norm_var": 0.19446169115935935, "learning_rate": 2e-05, "loss": 0.4399, "loss/crossentropy": 2.3104746341705322, "loss/hidden": 0.1748046875, "loss/logits": 0.04014399088919163, "loss/reg": 0.022499844431877136, "step": 1853 }, { "epoch": 0.927, "grad_norm": 1.6215567588806152, "grad_norm_var": 0.19083799211992075, "learning_rate": 2e-05, "loss": 0.4836, "loss/crossentropy": 2.3783109188079834, "loss/hidden": 0.2099609375, "loss/logits": 0.0486428327858448, "loss/reg": 0.02249729447066784, "step": 1854 }, { "epoch": 0.9275, "grad_norm": 1.4279130697250366, "grad_norm_var": 0.1327791587908031, "learning_rate": 2e-05, "loss": 0.3999, "loss/crossentropy": 2.4454482793807983, "loss/hidden": 0.146484375, "loss/logits": 0.02843039110302925, "loss/reg": 0.022494826465845108, "step": 1855 }, { "epoch": 0.928, "grad_norm": 1.3840880393981934, "grad_norm_var": 0.12456864834301858, "learning_rate": 2e-05, "loss": 0.4034, "loss/crossentropy": 2.2915083169937134, "loss/hidden": 0.15185546875, "loss/logits": 0.026664272882044315, "loss/reg": 0.02249237336218357, "step": 1856 }, { "epoch": 0.9285, "grad_norm": 1.6339174509048462, "grad_norm_var": 0.11849177496741632, "learning_rate": 2e-05, "loss": 0.4585, "loss/crossentropy": 2.2919251918792725, "loss/hidden": 0.1962890625, "loss/logits": 0.03729063458740711, "loss/reg": 0.022490020841360092, "step": 1857 }, { "epoch": 0.929, "grad_norm": 1.4191261529922485, "grad_norm_var": 0.12225227678708066, "learning_rate": 2e-05, "loss": 0.3906, "loss/crossentropy": 2.4586825370788574, "loss/hidden": 0.13818359375, "loss/logits": 0.027546225115656853, "loss/reg": 0.022487731650471687, "step": 1858 }, { "epoch": 0.9295, "grad_norm": 1.765230417251587, "grad_norm_var": 0.12054770545048896, "learning_rate": 2e-05, "loss": 0.4006, "loss/crossentropy": 2.3640472888946533, "loss/hidden": 0.1484375, "loss/logits": 0.027359573170542717, "loss/reg": 0.022485224530100822, "step": 1859 }, { "epoch": 0.93, "grad_norm": 1.314341425895691, "grad_norm_var": 0.12188687798420096, "learning_rate": 2e-05, "loss": 0.3958, "loss/crossentropy": 2.542987108230591, "loss/hidden": 0.1455078125, "loss/logits": 0.025465862825512886, "loss/reg": 0.022482680156826973, "step": 1860 }, { "epoch": 0.9305, "grad_norm": 1.4204936027526855, "grad_norm_var": 0.11697036921642787, "learning_rate": 2e-05, "loss": 0.3786, "loss/crossentropy": 2.3337528705596924, "loss/hidden": 0.13330078125, "loss/logits": 0.0205409936606884, "loss/reg": 0.0224803127348423, "step": 1861 }, { "epoch": 0.931, "grad_norm": 1.553285002708435, "grad_norm_var": 0.11723807462238127, "learning_rate": 2e-05, "loss": 0.4207, "loss/crossentropy": 2.5051403045654297, "loss/hidden": 0.16357421875, "loss/logits": 0.032310767099261284, "loss/reg": 0.022477777674794197, "step": 1862 }, { "epoch": 0.9315, "grad_norm": 1.88336980342865, "grad_norm_var": 0.12026858023636061, "learning_rate": 2e-05, "loss": 0.4247, "loss/crossentropy": 2.7639983892440796, "loss/hidden": 0.16650390625, "loss/logits": 0.033397359773516655, "loss/reg": 0.022475138306617737, "step": 1863 }, { "epoch": 0.932, "grad_norm": 1.633898377418518, "grad_norm_var": 0.03864676483869018, "learning_rate": 2e-05, "loss": 0.4802, "loss/crossentropy": 2.3575611114501953, "loss/hidden": 0.22021484375, "loss/logits": 0.035267666913568974, "loss/reg": 0.022472495213150978, "step": 1864 }, { "epoch": 0.9325, "grad_norm": 1.7001293897628784, "grad_norm_var": 0.02799001185132912, "learning_rate": 2e-05, "loss": 0.4325, "loss/crossentropy": 2.351699948310852, "loss/hidden": 0.1748046875, "loss/logits": 0.03303542733192444, "loss/reg": 0.022469859570264816, "step": 1865 }, { "epoch": 0.933, "grad_norm": 2.136624336242676, "grad_norm_var": 0.04816114932450143, "learning_rate": 2e-05, "loss": 0.5564, "loss/crossentropy": 2.115296185016632, "loss/hidden": 0.2880859375, "loss/logits": 0.043633848428726196, "loss/reg": 0.022467387840151787, "step": 1866 }, { "epoch": 0.9335, "grad_norm": 3.2611968517303467, "grad_norm_var": 0.22143645197999617, "learning_rate": 2e-05, "loss": 0.5975, "loss/crossentropy": 2.6474725008010864, "loss/hidden": 0.297607421875, "loss/logits": 0.07519873604178429, "loss/reg": 0.022464843466877937, "step": 1867 }, { "epoch": 0.934, "grad_norm": 1.7594748735427856, "grad_norm_var": 0.2138510846890559, "learning_rate": 2e-05, "loss": 0.4304, "loss/crossentropy": 2.440946102142334, "loss/hidden": 0.1806640625, "loss/logits": 0.025086318142712116, "loss/reg": 0.02246221713721752, "step": 1868 }, { "epoch": 0.9345, "grad_norm": 1.7738037109375, "grad_norm_var": 0.2117281243319851, "learning_rate": 2e-05, "loss": 0.4517, "loss/crossentropy": 2.58816659450531, "loss/hidden": 0.18505859375, "loss/logits": 0.04205223172903061, "loss/reg": 0.022459525614976883, "step": 1869 }, { "epoch": 0.935, "grad_norm": 1.5588525533676147, "grad_norm_var": 0.21288492425881386, "learning_rate": 2e-05, "loss": 0.4193, "loss/crossentropy": 2.6452767848968506, "loss/hidden": 0.15966796875, "loss/logits": 0.03501817770302296, "loss/reg": 0.022456802427768707, "step": 1870 }, { "epoch": 0.9355, "grad_norm": 1.714156150817871, "grad_norm_var": 0.20660591312481713, "learning_rate": 2e-05, "loss": 0.3944, "loss/crossentropy": 2.630972743034363, "loss/hidden": 0.142578125, "loss/logits": 0.02727540396153927, "loss/reg": 0.022454047575592995, "step": 1871 }, { "epoch": 0.936, "grad_norm": 1.4239534139633179, "grad_norm_var": 0.2047895173630837, "learning_rate": 2e-05, "loss": 0.418, "loss/crossentropy": 2.3476343154907227, "loss/hidden": 0.16357421875, "loss/logits": 0.029905791394412518, "loss/reg": 0.022451288998126984, "step": 1872 }, { "epoch": 0.9365, "grad_norm": 1.3508610725402832, "grad_norm_var": 0.21406456048783054, "learning_rate": 2e-05, "loss": 0.4097, "loss/crossentropy": 2.158454120159149, "loss/hidden": 0.15869140625, "loss/logits": 0.026525546796619892, "loss/reg": 0.02244875766336918, "step": 1873 }, { "epoch": 0.937, "grad_norm": 1.8326611518859863, "grad_norm_var": 0.20765040453607733, "learning_rate": 2e-05, "loss": 0.4879, "loss/crossentropy": 2.358833074569702, "loss/hidden": 0.23291015625, "loss/logits": 0.03050221409648657, "loss/reg": 0.022446228191256523, "step": 1874 }, { "epoch": 0.9375, "grad_norm": 4.5147013664245605, "grad_norm_var": 0.6838218076838546, "learning_rate": 2e-05, "loss": 0.556, "loss/crossentropy": 2.2411223649978638, "loss/hidden": 0.28271484375, "loss/logits": 0.048853909596800804, "loss/reg": 0.022443652153015137, "step": 1875 }, { "epoch": 0.938, "grad_norm": 2.6737983226776123, "grad_norm_var": 0.6882806728767182, "learning_rate": 2e-05, "loss": 0.4451, "loss/crossentropy": 2.5554691553115845, "loss/hidden": 0.18310546875, "loss/logits": 0.03760566934943199, "loss/reg": 0.022440902888774872, "step": 1876 }, { "epoch": 0.9385, "grad_norm": 1.9008327722549438, "grad_norm_var": 0.6648208335261887, "learning_rate": 2e-05, "loss": 0.4281, "loss/crossentropy": 2.85513436794281, "loss/hidden": 0.17236328125, "loss/logits": 0.03139635734260082, "loss/reg": 0.022438300773501396, "step": 1877 }, { "epoch": 0.939, "grad_norm": 1.3097538948059082, "grad_norm_var": 0.6843957065276801, "learning_rate": 2e-05, "loss": 0.4314, "loss/crossentropy": 2.171161413192749, "loss/hidden": 0.17333984375, "loss/logits": 0.03369998559355736, "loss/reg": 0.022435514256358147, "step": 1878 }, { "epoch": 0.9395, "grad_norm": 1.6398875713348389, "grad_norm_var": 0.6927558067930797, "learning_rate": 2e-05, "loss": 0.4433, "loss/crossentropy": 2.272761583328247, "loss/hidden": 0.1845703125, "loss/logits": 0.0343943927437067, "loss/reg": 0.022432943806052208, "step": 1879 }, { "epoch": 0.94, "grad_norm": 1.6524351835250854, "grad_norm_var": 0.6918439217164187, "learning_rate": 2e-05, "loss": 0.4265, "loss/crossentropy": 2.3826204538345337, "loss/hidden": 0.169921875, "loss/logits": 0.03229558374732733, "loss/reg": 0.022430358454585075, "step": 1880 }, { "epoch": 0.9405, "grad_norm": 1.5030336380004883, "grad_norm_var": 0.7024858941629423, "learning_rate": 2e-05, "loss": 0.4619, "loss/crossentropy": 2.4012389183044434, "loss/hidden": 0.20458984375, "loss/logits": 0.033006876707077026, "loss/reg": 0.022427737712860107, "step": 1881 }, { "epoch": 0.941, "grad_norm": 1.1325322389602661, "grad_norm_var": 0.7472577601143559, "learning_rate": 2e-05, "loss": 0.3661, "loss/crossentropy": 2.3976120948791504, "loss/hidden": 0.123291015625, "loss/logits": 0.018594788387417793, "loss/reg": 0.022425329312682152, "step": 1882 }, { "epoch": 0.9415, "grad_norm": 1.1853415966033936, "grad_norm_var": 0.6502409271460311, "learning_rate": 2e-05, "loss": 0.4157, "loss/crossentropy": 2.298361301422119, "loss/hidden": 0.16015625, "loss/logits": 0.031314633786678314, "loss/reg": 0.022422639653086662, "step": 1883 }, { "epoch": 0.942, "grad_norm": 1.0973137617111206, "grad_norm_var": 0.6819181070580886, "learning_rate": 2e-05, "loss": 0.3617, "loss/crossentropy": 2.4975024461746216, "loss/hidden": 0.117919921875, "loss/logits": 0.019550339318811893, "loss/reg": 0.02241992950439453, "step": 1884 }, { "epoch": 0.9425, "grad_norm": 1.7438507080078125, "grad_norm_var": 0.6819449915123499, "learning_rate": 2e-05, "loss": 0.4634, "loss/crossentropy": 2.204440951347351, "loss/hidden": 0.20458984375, "loss/logits": 0.03460996691137552, "loss/reg": 0.022417448461055756, "step": 1885 }, { "epoch": 0.943, "grad_norm": 1.2848305702209473, "grad_norm_var": 0.6941560719689528, "learning_rate": 2e-05, "loss": 0.4238, "loss/crossentropy": 2.4170058965682983, "loss/hidden": 0.16796875, "loss/logits": 0.03171874303370714, "loss/reg": 0.022415155544877052, "step": 1886 }, { "epoch": 0.9435, "grad_norm": 1.4587926864624023, "grad_norm_var": 0.6993669145136687, "learning_rate": 2e-05, "loss": 0.4306, "loss/crossentropy": 2.349593162536621, "loss/hidden": 0.17431640625, "loss/logits": 0.032118335366249084, "loss/reg": 0.022412730380892754, "step": 1887 }, { "epoch": 0.944, "grad_norm": 1.3421655893325806, "grad_norm_var": 0.7031391966357072, "learning_rate": 2e-05, "loss": 0.4002, "loss/crossentropy": 2.1012765169143677, "loss/hidden": 0.14599609375, "loss/logits": 0.030093910172581673, "loss/reg": 0.02241034060716629, "step": 1888 }, { "epoch": 0.9445, "grad_norm": 1.568566083908081, "grad_norm_var": 0.6951998080418774, "learning_rate": 2e-05, "loss": 0.4535, "loss/crossentropy": 2.2762022018432617, "loss/hidden": 0.193359375, "loss/logits": 0.03608548082411289, "loss/reg": 0.022408101707696915, "step": 1889 }, { "epoch": 0.945, "grad_norm": 1.2452459335327148, "grad_norm_var": 0.7095108720725463, "learning_rate": 2e-05, "loss": 0.4212, "loss/crossentropy": 2.4997655153274536, "loss/hidden": 0.16552734375, "loss/logits": 0.03162308409810066, "loss/reg": 0.022405438125133514, "step": 1890 }, { "epoch": 0.9455, "grad_norm": 1.5108202695846558, "grad_norm_var": 0.14745889251719102, "learning_rate": 2e-05, "loss": 0.4281, "loss/crossentropy": 2.2601327896118164, "loss/hidden": 0.16796875, "loss/logits": 0.03605945594608784, "loss/reg": 0.02240295149385929, "step": 1891 }, { "epoch": 0.946, "grad_norm": 1.6598831415176392, "grad_norm_var": 0.05513170444355821, "learning_rate": 2e-05, "loss": 0.4369, "loss/crossentropy": 2.2406809329986572, "loss/hidden": 0.1787109375, "loss/logits": 0.034201012924313545, "loss/reg": 0.02240018919110298, "step": 1892 }, { "epoch": 0.9465, "grad_norm": 1.3680381774902344, "grad_norm_var": 0.041003415881171415, "learning_rate": 2e-05, "loss": 0.4007, "loss/crossentropy": 2.4398785829544067, "loss/hidden": 0.148193359375, "loss/logits": 0.028537730686366558, "loss/reg": 0.022397480905056, "step": 1893 }, { "epoch": 0.947, "grad_norm": 1.0278970003128052, "grad_norm_var": 0.05007064750664269, "learning_rate": 2e-05, "loss": 0.3886, "loss/crossentropy": 2.148539900779724, "loss/hidden": 0.14208984375, "loss/logits": 0.022542059421539307, "loss/reg": 0.022394755855202675, "step": 1894 }, { "epoch": 0.9475, "grad_norm": 1.5086561441421509, "grad_norm_var": 0.0469721299358883, "learning_rate": 2e-05, "loss": 0.4413, "loss/crossentropy": 2.251150965690613, "loss/hidden": 0.1845703125, "loss/logits": 0.03278907388448715, "loss/reg": 0.022392379119992256, "step": 1895 }, { "epoch": 0.948, "grad_norm": 1.2127013206481934, "grad_norm_var": 0.04385164563974554, "learning_rate": 2e-05, "loss": 0.368, "loss/crossentropy": 2.422248601913452, "loss/hidden": 0.123779296875, "loss/logits": 0.020290100947022438, "loss/reg": 0.02238963358104229, "step": 1896 }, { "epoch": 0.9485, "grad_norm": 1.2324891090393066, "grad_norm_var": 0.043468858091787806, "learning_rate": 2e-05, "loss": 0.3832, "loss/crossentropy": 2.0850866436958313, "loss/hidden": 0.136962890625, "loss/logits": 0.02234545536339283, "loss/reg": 0.022387119010090828, "step": 1897 }, { "epoch": 0.949, "grad_norm": 1.4261040687561035, "grad_norm_var": 0.040394134059281585, "learning_rate": 2e-05, "loss": 0.4292, "loss/crossentropy": 2.431095004081726, "loss/hidden": 0.171875, "loss/logits": 0.033502984791994095, "loss/reg": 0.022384393960237503, "step": 1898 }, { "epoch": 0.9495, "grad_norm": 1.439329981803894, "grad_norm_var": 0.038272658552281236, "learning_rate": 2e-05, "loss": 0.387, "loss/crossentropy": 2.3343453407287598, "loss/hidden": 0.13916015625, "loss/logits": 0.02406618557870388, "loss/reg": 0.02238152176141739, "step": 1899 }, { "epoch": 0.95, "grad_norm": 1.0969916582107544, "grad_norm_var": 0.03828493091074415, "learning_rate": 2e-05, "loss": 0.3786, "loss/crossentropy": 2.563263773918152, "loss/hidden": 0.1318359375, "loss/logits": 0.02294111903756857, "loss/reg": 0.022378597408533096, "step": 1900 }, { "epoch": 0.9505, "grad_norm": 1.1716290712356567, "grad_norm_var": 0.031210427928216926, "learning_rate": 2e-05, "loss": 0.4071, "loss/crossentropy": 2.391364336013794, "loss/hidden": 0.1572265625, "loss/logits": 0.02612478658556938, "loss/reg": 0.022375814616680145, "step": 1901 }, { "epoch": 0.951, "grad_norm": 1.3930448293685913, "grad_norm_var": 0.03104337690990106, "learning_rate": 2e-05, "loss": 0.3902, "loss/crossentropy": 2.4488768577575684, "loss/hidden": 0.1396484375, "loss/logits": 0.02680811006575823, "loss/reg": 0.022373300045728683, "step": 1902 }, { "epoch": 0.9515, "grad_norm": 1.588849425315857, "grad_norm_var": 0.03391953124871445, "learning_rate": 2e-05, "loss": 0.4344, "loss/crossentropy": 2.354185700416565, "loss/hidden": 0.1806640625, "loss/logits": 0.03003675863146782, "loss/reg": 0.02237078920006752, "step": 1903 }, { "epoch": 0.952, "grad_norm": 1.7344504594802856, "grad_norm_var": 0.0424987168581661, "learning_rate": 2e-05, "loss": 0.4191, "loss/crossentropy": 2.253276824951172, "loss/hidden": 0.16455078125, "loss/logits": 0.030912759713828564, "loss/reg": 0.02236793003976345, "step": 1904 }, { "epoch": 0.9525, "grad_norm": 1.2497073411941528, "grad_norm_var": 0.0411145507961009, "learning_rate": 2e-05, "loss": 0.3802, "loss/crossentropy": 2.337361454963684, "loss/hidden": 0.13623046875, "loss/logits": 0.02029071655124426, "loss/reg": 0.02236493118107319, "step": 1905 }, { "epoch": 0.953, "grad_norm": 1.4437819719314575, "grad_norm_var": 0.040365271308345045, "learning_rate": 2e-05, "loss": 0.3873, "loss/crossentropy": 2.3124853372573853, "loss/hidden": 0.1435546875, "loss/logits": 0.020108817145228386, "loss/reg": 0.02236202545464039, "step": 1906 }, { "epoch": 0.9535, "grad_norm": 1.5539910793304443, "grad_norm_var": 0.04124039089983468, "learning_rate": 2e-05, "loss": 0.3817, "loss/crossentropy": 2.313749074935913, "loss/hidden": 0.13671875, "loss/logits": 0.02142718993127346, "loss/reg": 0.02235933393239975, "step": 1907 }, { "epoch": 0.954, "grad_norm": 1.723780632019043, "grad_norm_var": 0.04386541517829012, "learning_rate": 2e-05, "loss": 0.4172, "loss/crossentropy": 2.3829362392425537, "loss/hidden": 0.16455078125, "loss/logits": 0.02905107382684946, "loss/reg": 0.022356610745191574, "step": 1908 }, { "epoch": 0.9545, "grad_norm": 1.19853937625885, "grad_norm_var": 0.04606052697454756, "learning_rate": 2e-05, "loss": 0.4345, "loss/crossentropy": 2.2605234384536743, "loss/hidden": 0.17626953125, "loss/logits": 0.03473933879286051, "loss/reg": 0.022353753447532654, "step": 1909 }, { "epoch": 0.955, "grad_norm": 2.8696258068084717, "grad_norm_var": 0.17279256562972117, "learning_rate": 2e-05, "loss": 0.5594, "loss/crossentropy": 2.468735933303833, "loss/hidden": 0.29638671875, "loss/logits": 0.0395014937967062, "loss/reg": 0.022350985556840897, "step": 1910 }, { "epoch": 0.9555, "grad_norm": 1.2979068756103516, "grad_norm_var": 0.17505073259704976, "learning_rate": 2e-05, "loss": 0.3883, "loss/crossentropy": 2.4635722637176514, "loss/hidden": 0.13720703125, "loss/logits": 0.027581739239394665, "loss/reg": 0.02234843373298645, "step": 1911 }, { "epoch": 0.956, "grad_norm": 2.820190906524658, "grad_norm_var": 0.2798921413237015, "learning_rate": 2e-05, "loss": 0.6459, "loss/crossentropy": 2.131369948387146, "loss/hidden": 0.365234375, "loss/logits": 0.057183969765901566, "loss/reg": 0.02234589122235775, "step": 1912 }, { "epoch": 0.9565, "grad_norm": 1.3415361642837524, "grad_norm_var": 0.2756186472645955, "learning_rate": 2e-05, "loss": 0.422, "loss/crossentropy": 2.289743185043335, "loss/hidden": 0.16650390625, "loss/logits": 0.03201920446008444, "loss/reg": 0.022343412041664124, "step": 1913 }, { "epoch": 0.957, "grad_norm": 1.1303457021713257, "grad_norm_var": 0.2873257056445263, "learning_rate": 2e-05, "loss": 0.3862, "loss/crossentropy": 2.3576101064682007, "loss/hidden": 0.14013671875, "loss/logits": 0.02269960194826126, "loss/reg": 0.022340916097164154, "step": 1914 }, { "epoch": 0.9575, "grad_norm": 2.158590078353882, "grad_norm_var": 0.3075251014181994, "learning_rate": 2e-05, "loss": 0.4402, "loss/crossentropy": 2.332666039466858, "loss/hidden": 0.1787109375, "loss/logits": 0.038076866418123245, "loss/reg": 0.022338369861245155, "step": 1915 }, { "epoch": 0.958, "grad_norm": 7.00865364074707, "grad_norm_var": 2.08675653148509, "learning_rate": 2e-05, "loss": 0.6937, "loss/crossentropy": 2.646122097969055, "loss/hidden": 0.392578125, "loss/logits": 0.07780970819294453, "loss/reg": 0.02233590930700302, "step": 1916 }, { "epoch": 0.9585, "grad_norm": 1.3123027086257935, "grad_norm_var": 2.0728257314385186, "learning_rate": 2e-05, "loss": 0.4072, "loss/crossentropy": 2.2954723834991455, "loss/hidden": 0.1572265625, "loss/logits": 0.026589620858430862, "loss/reg": 0.022333433851599693, "step": 1917 }, { "epoch": 0.959, "grad_norm": 1.8116488456726074, "grad_norm_var": 2.050510475959324, "learning_rate": 2e-05, "loss": 0.4173, "loss/crossentropy": 2.6285065412521362, "loss/hidden": 0.16064453125, "loss/logits": 0.03334318473935127, "loss/reg": 0.022331027314066887, "step": 1918 }, { "epoch": 0.9595, "grad_norm": 1.3005446195602417, "grad_norm_var": 2.072096328270597, "learning_rate": 2e-05, "loss": 0.3712, "loss/crossentropy": 2.384632110595703, "loss/hidden": 0.1279296875, "loss/logits": 0.019956374540925026, "loss/reg": 0.022328531369566917, "step": 1919 }, { "epoch": 0.96, "grad_norm": 1.381271243095398, "grad_norm_var": 2.0922664903830954, "learning_rate": 2e-05, "loss": 0.3898, "loss/crossentropy": 2.2269625663757324, "loss/hidden": 0.146484375, "loss/logits": 0.020025085657835007, "loss/reg": 0.022326109930872917, "step": 1920 }, { "epoch": 0.9605, "grad_norm": 2.105039596557617, "grad_norm_var": 2.055258347725847, "learning_rate": 2e-05, "loss": 0.4958, "loss/crossentropy": 2.388508439064026, "loss/hidden": 0.22509765625, "loss/logits": 0.047498359344899654, "loss/reg": 0.02232373133301735, "step": 1921 }, { "epoch": 0.961, "grad_norm": 1.0844900608062744, "grad_norm_var": 2.091343013520689, "learning_rate": 2e-05, "loss": 0.3874, "loss/crossentropy": 2.326627492904663, "loss/hidden": 0.1396484375, "loss/logits": 0.02454256359487772, "loss/reg": 0.022321194410324097, "step": 1922 }, { "epoch": 0.9615, "grad_norm": 1.7085936069488525, "grad_norm_var": 2.0835161560615814, "learning_rate": 2e-05, "loss": 0.4864, "loss/crossentropy": 2.1795098781585693, "loss/hidden": 0.22021484375, "loss/logits": 0.04298657365143299, "loss/reg": 0.022318590432405472, "step": 1923 }, { "epoch": 0.962, "grad_norm": 1.0788378715515137, "grad_norm_var": 2.1346259374470815, "learning_rate": 2e-05, "loss": 0.3828, "loss/crossentropy": 2.402096390724182, "loss/hidden": 0.13623046875, "loss/logits": 0.023389977402985096, "loss/reg": 0.022316064685583115, "step": 1924 }, { "epoch": 0.9625, "grad_norm": 1.3279380798339844, "grad_norm_var": 2.1222672863766183, "learning_rate": 2e-05, "loss": 0.4175, "loss/crossentropy": 2.4705991744995117, "loss/hidden": 0.1650390625, "loss/logits": 0.02933623269200325, "loss/reg": 0.02231350913643837, "step": 1925 }, { "epoch": 0.963, "grad_norm": 1.8480850458145142, "grad_norm_var": 2.066806634795002, "learning_rate": 2e-05, "loss": 0.5545, "loss/crossentropy": 2.2896007299423218, "loss/hidden": 0.2978515625, "loss/logits": 0.03350013308227062, "loss/reg": 0.022310776636004448, "step": 1926 }, { "epoch": 0.9635, "grad_norm": 1.2676535844802856, "grad_norm_var": 2.0693722058326345, "learning_rate": 2e-05, "loss": 0.4228, "loss/crossentropy": 2.229649305343628, "loss/hidden": 0.16796875, "loss/logits": 0.03177023585885763, "loss/reg": 0.022308047860860825, "step": 1927 }, { "epoch": 0.964, "grad_norm": 1.5557312965393066, "grad_norm_var": 2.017172302933795, "learning_rate": 2e-05, "loss": 0.443, "loss/crossentropy": 2.5194848775863647, "loss/hidden": 0.18408203125, "loss/logits": 0.035887595266103745, "loss/reg": 0.02230549044907093, "step": 1928 }, { "epoch": 0.9645, "grad_norm": 1.9265832901000977, "grad_norm_var": 1.9997728547949178, "learning_rate": 2e-05, "loss": 0.4194, "loss/crossentropy": 2.3407788276672363, "loss/hidden": 0.16455078125, "loss/logits": 0.03186378628015518, "loss/reg": 0.02230297587811947, "step": 1929 }, { "epoch": 0.965, "grad_norm": 1.2221165895462036, "grad_norm_var": 1.9911827201257373, "learning_rate": 2e-05, "loss": 0.3808, "loss/crossentropy": 2.2330673933029175, "loss/hidden": 0.133544921875, "loss/logits": 0.02422002237290144, "loss/reg": 0.02230045385658741, "step": 1930 }, { "epoch": 0.9655, "grad_norm": 1.4464212656021118, "grad_norm_var": 1.996535291902523, "learning_rate": 2e-05, "loss": 0.431, "loss/crossentropy": 2.560730218887329, "loss/hidden": 0.17333984375, "loss/logits": 0.034654753282666206, "loss/reg": 0.022297974675893784, "step": 1931 }, { "epoch": 0.966, "grad_norm": 1.3387093544006348, "grad_norm_var": 0.09578263410320928, "learning_rate": 2e-05, "loss": 0.4139, "loss/crossentropy": 2.2281144857406616, "loss/hidden": 0.1640625, "loss/logits": 0.026879730634391308, "loss/reg": 0.02229529432952404, "step": 1932 }, { "epoch": 0.9665, "grad_norm": 1.6466373205184937, "grad_norm_var": 0.09519305136430105, "learning_rate": 2e-05, "loss": 0.4525, "loss/crossentropy": 2.416178584098816, "loss/hidden": 0.19677734375, "loss/logits": 0.03280434384942055, "loss/reg": 0.022292664274573326, "step": 1933 }, { "epoch": 0.967, "grad_norm": 1.9065243005752563, "grad_norm_var": 0.09965824271180447, "learning_rate": 2e-05, "loss": 0.398, "loss/crossentropy": 2.332263708114624, "loss/hidden": 0.14990234375, "loss/logits": 0.02516376320272684, "loss/reg": 0.02228992059826851, "step": 1934 }, { "epoch": 0.9675, "grad_norm": 1.438956618309021, "grad_norm_var": 0.09700722244866876, "learning_rate": 2e-05, "loss": 0.4454, "loss/crossentropy": 2.299056053161621, "loss/hidden": 0.1787109375, "loss/logits": 0.0438066478818655, "loss/reg": 0.022287409752607346, "step": 1935 }, { "epoch": 0.968, "grad_norm": 1.3283144235610962, "grad_norm_var": 0.09814598179248814, "learning_rate": 2e-05, "loss": 0.3993, "loss/crossentropy": 2.705706477165222, "loss/hidden": 0.150390625, "loss/logits": 0.026097907684743404, "loss/reg": 0.022284839302301407, "step": 1936 }, { "epoch": 0.9685, "grad_norm": 1.2671499252319336, "grad_norm_var": 0.07604085535109846, "learning_rate": 2e-05, "loss": 0.4034, "loss/crossentropy": 2.3038320541381836, "loss/hidden": 0.15283203125, "loss/logits": 0.027765167877078056, "loss/reg": 0.022282104939222336, "step": 1937 }, { "epoch": 0.969, "grad_norm": 1.7129496335983276, "grad_norm_var": 0.06908875770655426, "learning_rate": 2e-05, "loss": 0.4102, "loss/crossentropy": 2.409985303878784, "loss/hidden": 0.16015625, "loss/logits": 0.027242244221270084, "loss/reg": 0.022279653698205948, "step": 1938 }, { "epoch": 0.9695, "grad_norm": 2.031566619873047, "grad_norm_var": 0.08453384690603698, "learning_rate": 2e-05, "loss": 0.4135, "loss/crossentropy": 2.453916311264038, "loss/hidden": 0.162109375, "loss/logits": 0.028619682416319847, "loss/reg": 0.022277243435382843, "step": 1939 }, { "epoch": 0.97, "grad_norm": 1.2778832912445068, "grad_norm_var": 0.07526176615922105, "learning_rate": 2e-05, "loss": 0.4069, "loss/crossentropy": 2.225833773612976, "loss/hidden": 0.15185546875, "loss/logits": 0.032262424007058144, "loss/reg": 0.02227473258972168, "step": 1940 }, { "epoch": 0.9705, "grad_norm": 1.492366075515747, "grad_norm_var": 0.07243497295631436, "learning_rate": 2e-05, "loss": 0.4036, "loss/crossentropy": 2.5838898420333862, "loss/hidden": 0.15087890625, "loss/logits": 0.029996756464242935, "loss/reg": 0.022272255271673203, "step": 1941 }, { "epoch": 0.971, "grad_norm": 1.3968478441238403, "grad_norm_var": 0.06687936652998203, "learning_rate": 2e-05, "loss": 0.4118, "loss/crossentropy": 2.50420606136322, "loss/hidden": 0.15673828125, "loss/logits": 0.03238658234477043, "loss/reg": 0.022269796580076218, "step": 1942 }, { "epoch": 0.9715, "grad_norm": 1.295305848121643, "grad_norm_var": 0.06601141679391885, "learning_rate": 2e-05, "loss": 0.4081, "loss/crossentropy": 2.0319120287895203, "loss/hidden": 0.1572265625, "loss/logits": 0.028178725391626358, "loss/reg": 0.02226731739938259, "step": 1943 }, { "epoch": 0.972, "grad_norm": 2.280089855194092, "grad_norm_var": 0.10247276685499802, "learning_rate": 2e-05, "loss": 0.4834, "loss/crossentropy": 2.3557260036468506, "loss/hidden": 0.22216796875, "loss/logits": 0.03859390318393707, "loss/reg": 0.022265000268816948, "step": 1944 }, { "epoch": 0.9725, "grad_norm": 1.870656132698059, "grad_norm_var": 0.0999572300988054, "learning_rate": 2e-05, "loss": 0.4063, "loss/crossentropy": 2.2198551893234253, "loss/hidden": 0.15283203125, "loss/logits": 0.03083806298673153, "loss/reg": 0.022262422367930412, "step": 1945 }, { "epoch": 0.973, "grad_norm": 1.223396897315979, "grad_norm_var": 0.09989973331883183, "learning_rate": 2e-05, "loss": 0.3957, "loss/crossentropy": 2.442264437675476, "loss/hidden": 0.148681640625, "loss/logits": 0.024409527890384197, "loss/reg": 0.02225991152226925, "step": 1946 }, { "epoch": 0.9735, "grad_norm": 1.715518593788147, "grad_norm_var": 0.10036436305615364, "learning_rate": 2e-05, "loss": 0.4104, "loss/crossentropy": 2.298407196998596, "loss/hidden": 0.1552734375, "loss/logits": 0.03260168805718422, "loss/reg": 0.0222572460770607, "step": 1947 }, { "epoch": 0.974, "grad_norm": 1.156018853187561, "grad_norm_var": 0.10824091454887531, "learning_rate": 2e-05, "loss": 0.3887, "loss/crossentropy": 2.604636073112488, "loss/hidden": 0.13916015625, "loss/logits": 0.027005971409380436, "loss/reg": 0.022254537791013718, "step": 1948 }, { "epoch": 0.9745, "grad_norm": 1.4995312690734863, "grad_norm_var": 0.10799240399380565, "learning_rate": 2e-05, "loss": 0.4008, "loss/crossentropy": 2.598103880882263, "loss/hidden": 0.14794921875, "loss/logits": 0.030296839773654938, "loss/reg": 0.022251838818192482, "step": 1949 }, { "epoch": 0.975, "grad_norm": 1.2907731533050537, "grad_norm_var": 0.10289614463530032, "learning_rate": 2e-05, "loss": 0.413, "loss/crossentropy": 2.3919172286987305, "loss/hidden": 0.15966796875, "loss/logits": 0.03085498232394457, "loss/reg": 0.022249221801757812, "step": 1950 }, { "epoch": 0.9755, "grad_norm": 1.502394437789917, "grad_norm_var": 0.10248473161154052, "learning_rate": 2e-05, "loss": 0.3982, "loss/crossentropy": 2.357411503791809, "loss/hidden": 0.1484375, "loss/logits": 0.027324603870511055, "loss/reg": 0.0222467128187418, "step": 1951 }, { "epoch": 0.976, "grad_norm": 1.2249350547790527, "grad_norm_var": 0.1058127524217482, "learning_rate": 2e-05, "loss": 0.3818, "loss/crossentropy": 2.7032934427261353, "loss/hidden": 0.13525390625, "loss/logits": 0.024068184196949005, "loss/reg": 0.022244160994887352, "step": 1952 }, { "epoch": 0.9765, "grad_norm": 1.2931334972381592, "grad_norm_var": 0.10499684489912173, "learning_rate": 2e-05, "loss": 0.4009, "loss/crossentropy": 2.303010582923889, "loss/hidden": 0.150390625, "loss/logits": 0.028063518926501274, "loss/reg": 0.02224154584109783, "step": 1953 }, { "epoch": 0.977, "grad_norm": 1.1064225435256958, "grad_norm_var": 0.11209890357808133, "learning_rate": 2e-05, "loss": 0.394, "loss/crossentropy": 2.4800167083740234, "loss/hidden": 0.14306640625, "loss/logits": 0.028574367053806782, "loss/reg": 0.022238755598664284, "step": 1954 }, { "epoch": 0.9775, "grad_norm": 1.7888641357421875, "grad_norm_var": 0.09788471441156942, "learning_rate": 2e-05, "loss": 0.4597, "loss/crossentropy": 2.269771993160248, "loss/hidden": 0.19873046875, "loss/logits": 0.03857916593551636, "loss/reg": 0.022236214950680733, "step": 1955 }, { "epoch": 0.978, "grad_norm": 1.1377007961273193, "grad_norm_var": 0.1025800961707351, "learning_rate": 2e-05, "loss": 0.3758, "loss/crossentropy": 2.247413754463196, "loss/hidden": 0.1298828125, "loss/logits": 0.02357647381722927, "loss/reg": 0.022233642637729645, "step": 1956 }, { "epoch": 0.9785, "grad_norm": 1.4004080295562744, "grad_norm_var": 0.10264583324120663, "learning_rate": 2e-05, "loss": 0.4035, "loss/crossentropy": 2.3835976123809814, "loss/hidden": 0.15380859375, "loss/logits": 0.02738987375050783, "loss/reg": 0.022231118753552437, "step": 1957 }, { "epoch": 0.979, "grad_norm": 1.24699068069458, "grad_norm_var": 0.10508895477803246, "learning_rate": 2e-05, "loss": 0.3889, "loss/crossentropy": 2.438993811607361, "loss/hidden": 0.13818359375, "loss/logits": 0.028424736112356186, "loss/reg": 0.022228769958019257, "step": 1958 }, { "epoch": 0.9795, "grad_norm": 2.308201789855957, "grad_norm_var": 0.14973633890307708, "learning_rate": 2e-05, "loss": 0.4118, "loss/crossentropy": 2.373353362083435, "loss/hidden": 0.15673828125, "loss/logits": 0.032791512086987495, "loss/reg": 0.022226233035326004, "step": 1959 }, { "epoch": 0.98, "grad_norm": 1.3943239450454712, "grad_norm_var": 0.1069748260107654, "learning_rate": 2e-05, "loss": 0.4152, "loss/crossentropy": 2.4576724767684937, "loss/hidden": 0.16259765625, "loss/logits": 0.03036567196249962, "loss/reg": 0.022223642095923424, "step": 1960 }, { "epoch": 0.9805, "grad_norm": 1.372459053993225, "grad_norm_var": 0.09437562854595664, "learning_rate": 2e-05, "loss": 0.4329, "loss/crossentropy": 2.2118232250213623, "loss/hidden": 0.18115234375, "loss/logits": 0.029521776363253593, "loss/reg": 0.02222101204097271, "step": 1961 }, { "epoch": 0.981, "grad_norm": 1.6961666345596313, "grad_norm_var": 0.09618417236027692, "learning_rate": 2e-05, "loss": 0.3974, "loss/crossentropy": 2.430359721183777, "loss/hidden": 0.138671875, "loss/logits": 0.03654170501977205, "loss/reg": 0.02221854217350483, "step": 1962 }, { "epoch": 0.9815, "grad_norm": 1.9904968738555908, "grad_norm_var": 0.11079650013560964, "learning_rate": 2e-05, "loss": 0.3833, "loss/crossentropy": 2.349852681159973, "loss/hidden": 0.137939453125, "loss/logits": 0.023190665990114212, "loss/reg": 0.02221612073481083, "step": 1963 }, { "epoch": 0.982, "grad_norm": 1.5087324380874634, "grad_norm_var": 0.10413266118218628, "learning_rate": 2e-05, "loss": 0.385, "loss/crossentropy": 2.4656479358673096, "loss/hidden": 0.138671875, "loss/logits": 0.024239342659711838, "loss/reg": 0.02221374586224556, "step": 1964 }, { "epoch": 0.9825, "grad_norm": 3.662198781967163, "grad_norm_var": 0.40061585609098616, "learning_rate": 2e-05, "loss": 0.4676, "loss/crossentropy": 2.5416672229766846, "loss/hidden": 0.16748046875, "loss/logits": 0.07797094993293285, "loss/reg": 0.022211195901036263, "step": 1965 }, { "epoch": 0.983, "grad_norm": 3.0709733963012695, "grad_norm_var": 0.5204777832696872, "learning_rate": 2e-05, "loss": 0.446, "loss/crossentropy": 2.499261498451233, "loss/hidden": 0.18896484375, "loss/logits": 0.03491983376443386, "loss/reg": 0.022208670154213905, "step": 1966 }, { "epoch": 0.9835, "grad_norm": 1.299326777458191, "grad_norm_var": 0.5292589340957948, "learning_rate": 2e-05, "loss": 0.4253, "loss/crossentropy": 2.3157382011413574, "loss/hidden": 0.1669921875, "loss/logits": 0.03624746948480606, "loss/reg": 0.02220613695681095, "step": 1967 }, { "epoch": 0.984, "grad_norm": 2.600094795227051, "grad_norm_var": 0.5568919038873178, "learning_rate": 2e-05, "loss": 0.559, "loss/crossentropy": 2.232682466506958, "loss/hidden": 0.2373046875, "loss/logits": 0.09961535781621933, "loss/reg": 0.022203726693987846, "step": 1968 }, { "epoch": 0.9845, "grad_norm": 1.5083189010620117, "grad_norm_var": 0.5451060779468074, "learning_rate": 2e-05, "loss": 0.4475, "loss/crossentropy": 2.3565086126327515, "loss/hidden": 0.1845703125, "loss/logits": 0.04091835021972656, "loss/reg": 0.022201379761099815, "step": 1969 }, { "epoch": 0.985, "grad_norm": 1.5320773124694824, "grad_norm_var": 0.5160320549007683, "learning_rate": 2e-05, "loss": 0.4333, "loss/crossentropy": 2.222718358039856, "loss/hidden": 0.16845703125, "loss/logits": 0.04285791330039501, "loss/reg": 0.022199101746082306, "step": 1970 }, { "epoch": 0.9855, "grad_norm": 1.7056018114089966, "grad_norm_var": 0.5170866940808054, "learning_rate": 2e-05, "loss": 0.4092, "loss/crossentropy": 2.4271280765533447, "loss/hidden": 0.15673828125, "loss/logits": 0.03053828328847885, "loss/reg": 0.02219672128558159, "step": 1971 }, { "epoch": 0.986, "grad_norm": 1.1430257558822632, "grad_norm_var": 0.5165901006666008, "learning_rate": 2e-05, "loss": 0.3962, "loss/crossentropy": 2.436211943626404, "loss/hidden": 0.14453125, "loss/logits": 0.02975220326334238, "loss/reg": 0.02219444327056408, "step": 1972 }, { "epoch": 0.9865, "grad_norm": 1.38370943069458, "grad_norm_var": 0.5175861871168814, "learning_rate": 2e-05, "loss": 0.4147, "loss/crossentropy": 2.4028401374816895, "loss/hidden": 0.1611328125, "loss/logits": 0.03168147522956133, "loss/reg": 0.022191938012838364, "step": 1973 }, { "epoch": 0.987, "grad_norm": 1.449479579925537, "grad_norm_var": 0.5041676177403838, "learning_rate": 2e-05, "loss": 0.4342, "loss/crossentropy": 2.3942774534225464, "loss/hidden": 0.1767578125, "loss/logits": 0.03549867123365402, "loss/reg": 0.022189509123563766, "step": 1974 }, { "epoch": 0.9875, "grad_norm": 2.0965735912323, "grad_norm_var": 0.49408207054312825, "learning_rate": 2e-05, "loss": 0.4188, "loss/crossentropy": 2.363860249519348, "loss/hidden": 0.16259765625, "loss/logits": 0.034327320754528046, "loss/reg": 0.02218729257583618, "step": 1975 }, { "epoch": 0.988, "grad_norm": 1.2674319744110107, "grad_norm_var": 0.5026008210188041, "learning_rate": 2e-05, "loss": 0.4196, "loss/crossentropy": 2.3103872537612915, "loss/hidden": 0.16455078125, "loss/logits": 0.03319397568702698, "loss/reg": 0.022184785455465317, "step": 1976 }, { "epoch": 0.9885, "grad_norm": 1.3519755601882935, "grad_norm_var": 0.5038777873620819, "learning_rate": 2e-05, "loss": 0.3873, "loss/crossentropy": 2.3832825422286987, "loss/hidden": 0.140625, "loss/logits": 0.024883822537958622, "loss/reg": 0.022182263433933258, "step": 1977 }, { "epoch": 0.989, "grad_norm": 1.163076400756836, "grad_norm_var": 0.5310906853740609, "learning_rate": 2e-05, "loss": 0.3687, "loss/crossentropy": 2.3655554056167603, "loss/hidden": 0.1279296875, "loss/logits": 0.018932482227683067, "loss/reg": 0.02217974327504635, "step": 1978 }, { "epoch": 0.9895, "grad_norm": 1.2967028617858887, "grad_norm_var": 0.543166161422513, "learning_rate": 2e-05, "loss": 0.4113, "loss/crossentropy": 2.2330108880996704, "loss/hidden": 0.16064453125, "loss/logits": 0.02884120587259531, "loss/reg": 0.02217736653983593, "step": 1979 }, { "epoch": 0.99, "grad_norm": 0.9606475830078125, "grad_norm_var": 0.5797518155803723, "learning_rate": 2e-05, "loss": 0.3794, "loss/crossentropy": 2.309578061103821, "loss/hidden": 0.13427734375, "loss/logits": 0.023378074169158936, "loss/reg": 0.022174881771206856, "step": 1980 }, { "epoch": 0.9905, "grad_norm": 1.4933451414108276, "grad_norm_var": 0.3115809486329993, "learning_rate": 2e-05, "loss": 0.4478, "loss/crossentropy": 2.2966067790985107, "loss/hidden": 0.18994140625, "loss/logits": 0.036129954271018505, "loss/reg": 0.022172508761286736, "step": 1981 }, { "epoch": 0.991, "grad_norm": 1.1591241359710693, "grad_norm_var": 0.16063496865446744, "learning_rate": 2e-05, "loss": 0.3848, "loss/crossentropy": 2.3767281770706177, "loss/hidden": 0.13916015625, "loss/logits": 0.023951291106641293, "loss/reg": 0.022169925272464752, "step": 1982 }, { "epoch": 0.9915, "grad_norm": 1.2430766820907593, "grad_norm_var": 0.16206145180208573, "learning_rate": 2e-05, "loss": 0.4026, "loss/crossentropy": 2.3017172813415527, "loss/hidden": 0.150390625, "loss/logits": 0.03055698424577713, "loss/reg": 0.02216746285557747, "step": 1983 }, { "epoch": 0.992, "grad_norm": 1.0589817762374878, "grad_norm_var": 0.07615843072529553, "learning_rate": 2e-05, "loss": 0.3757, "loss/crossentropy": 2.41360604763031, "loss/hidden": 0.13134765625, "loss/logits": 0.022694111801683903, "loss/reg": 0.02216503396630287, "step": 1984 }, { "epoch": 0.9925, "grad_norm": 1.074315071105957, "grad_norm_var": 0.07954031445189572, "learning_rate": 2e-05, "loss": 0.3816, "loss/crossentropy": 2.434372305870056, "loss/hidden": 0.1376953125, "loss/logits": 0.02227596938610077, "loss/reg": 0.02216257154941559, "step": 1985 }, { "epoch": 0.993, "grad_norm": 1.1688265800476074, "grad_norm_var": 0.07830008007563455, "learning_rate": 2e-05, "loss": 0.3919, "loss/crossentropy": 2.4475741386413574, "loss/hidden": 0.1416015625, "loss/logits": 0.02868059929460287, "loss/reg": 0.022160008549690247, "step": 1986 }, { "epoch": 0.9935, "grad_norm": 1.343680739402771, "grad_norm_var": 0.06756511802767377, "learning_rate": 2e-05, "loss": 0.3966, "loss/crossentropy": 2.1644026041030884, "loss/hidden": 0.15087890625, "loss/logits": 0.02410024218261242, "loss/reg": 0.022157687693834305, "step": 1987 }, { "epoch": 0.994, "grad_norm": 2.975306272506714, "grad_norm_var": 0.2412736036708892, "learning_rate": 2e-05, "loss": 0.4244, "loss/crossentropy": 2.527232050895691, "loss/hidden": 0.17431640625, "loss/logits": 0.0285196453332901, "loss/reg": 0.022155148908495903, "step": 1988 }, { "epoch": 0.9945, "grad_norm": 1.4210851192474365, "grad_norm_var": 0.24125286489004902, "learning_rate": 2e-05, "loss": 0.4013, "loss/crossentropy": 2.453064799308777, "loss/hidden": 0.15283203125, "loss/logits": 0.026922681368887424, "loss/reg": 0.02215270884335041, "step": 1989 }, { "epoch": 0.995, "grad_norm": 1.1747286319732666, "grad_norm_var": 0.24444132193735152, "learning_rate": 2e-05, "loss": 0.3815, "loss/crossentropy": 2.421423316001892, "loss/hidden": 0.138427734375, "loss/logits": 0.021576720289885998, "loss/reg": 0.02215024270117283, "step": 1990 }, { "epoch": 0.9955, "grad_norm": 1.2867865562438965, "grad_norm_var": 0.20919603916842147, "learning_rate": 2e-05, "loss": 0.3843, "loss/crossentropy": 2.414217710494995, "loss/hidden": 0.13623046875, "loss/logits": 0.02661888301372528, "loss/reg": 0.022147882729768753, "step": 1991 }, { "epoch": 0.996, "grad_norm": 1.7244079113006592, "grad_norm_var": 0.21782960949894387, "learning_rate": 2e-05, "loss": 0.3918, "loss/crossentropy": 2.4352335929870605, "loss/hidden": 0.14208984375, "loss/logits": 0.028264615684747696, "loss/reg": 0.022145364433526993, "step": 1992 }, { "epoch": 0.9965, "grad_norm": 1.0717110633850098, "grad_norm_var": 0.22335652296934896, "learning_rate": 2e-05, "loss": 0.3637, "loss/crossentropy": 2.3417575359344482, "loss/hidden": 0.122314453125, "loss/logits": 0.019959733821451664, "loss/reg": 0.022142987698316574, "step": 1993 }, { "epoch": 0.997, "grad_norm": 0.9572432041168213, "grad_norm_var": 0.23116159615423915, "learning_rate": 2e-05, "loss": 0.3697, "loss/crossentropy": 2.3081470727920532, "loss/hidden": 0.128662109375, "loss/logits": 0.019598262384533882, "loss/reg": 0.022140614688396454, "step": 1994 }, { "epoch": 0.9975, "grad_norm": 1.2059389352798462, "grad_norm_var": 0.23217773839135317, "learning_rate": 2e-05, "loss": 0.3804, "loss/crossentropy": 2.416892886161804, "loss/hidden": 0.138671875, "loss/logits": 0.02030058763921261, "loss/reg": 0.022138269618153572, "step": 1995 }, { "epoch": 0.998, "grad_norm": 1.0577036142349243, "grad_norm_var": 0.22795505383013293, "learning_rate": 2e-05, "loss": 0.3623, "loss/crossentropy": 2.3178452253341675, "loss/hidden": 0.12255859375, "loss/logits": 0.018403733149170876, "loss/reg": 0.022135984152555466, "step": 1996 }, { "epoch": 0.9985, "grad_norm": 1.2266889810562134, "grad_norm_var": 0.22689434089943936, "learning_rate": 2e-05, "loss": 0.3796, "loss/crossentropy": 2.3155784606933594, "loss/hidden": 0.1357421875, "loss/logits": 0.02248302474617958, "loss/reg": 0.022133611142635345, "step": 1997 }, { "epoch": 0.999, "grad_norm": 1.0541908740997314, "grad_norm_var": 0.2298592464456514, "learning_rate": 2e-05, "loss": 0.3634, "loss/crossentropy": 2.4655479192733765, "loss/hidden": 0.12255859375, "loss/logits": 0.01956414245069027, "loss/reg": 0.02213137224316597, "step": 1998 }, { "epoch": 0.9995, "grad_norm": 1.7036995887756348, "grad_norm_var": 0.2386848838311654, "learning_rate": 2e-05, "loss": 0.4205, "loss/crossentropy": 2.35299813747406, "loss/hidden": 0.17236328125, "loss/logits": 0.026862223632633686, "loss/reg": 0.02212887816131115, "step": 1999 }, { "epoch": 1.0, "grad_norm": 1.3390984535217285, "grad_norm_var": 0.23294083127609208, "learning_rate": 2e-05, "loss": 0.3795, "loss/crossentropy": 2.3336535692214966, "loss/hidden": 0.1357421875, "loss/logits": 0.022448450326919556, "loss/reg": 0.022126398980617523, "step": 2000 } ], "logging_steps": 1, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.57623446257664e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }