{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7000190326001251, "eval_steps": 1000, "global_step": 12873, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "crossentropy": 5.638836145401001, "epoch": 5.437885750020392e-05, "grad_norm": 1.5967193841934204, "learning_rate": 3.875968992248062e-05, "loss": 5.6388, "step": 1 }, { "crossentropy": 5.714629888534546, "epoch": 0.00010875771500040784, "grad_norm": 1.6303424835205078, "learning_rate": 7.751937984496124e-05, "loss": 5.7146, "step": 2 }, { "crossentropy": 5.702889680862427, "epoch": 0.00016313657250061176, "grad_norm": 1.5842801332473755, "learning_rate": 0.00011627906976744187, "loss": 5.7029, "step": 3 }, { "crossentropy": 5.9462730884552, "epoch": 0.00021751543000081568, "grad_norm": 1.948069453239441, "learning_rate": 0.0001550387596899225, "loss": 5.9463, "step": 4 }, { "crossentropy": 5.727362632751465, "epoch": 0.0002718942875010196, "grad_norm": 1.495962381362915, "learning_rate": 0.0001937984496124031, "loss": 5.7274, "step": 5 }, { "crossentropy": 5.463281869888306, "epoch": 0.0003262731450012235, "grad_norm": 1.5488165616989136, "learning_rate": 0.00023255813953488373, "loss": 5.4633, "step": 6 }, { "crossentropy": 5.571461200714111, "epoch": 0.00038065200250142744, "grad_norm": 1.3516031503677368, "learning_rate": 0.00027131782945736437, "loss": 5.5715, "step": 7 }, { "crossentropy": 5.605658531188965, "epoch": 0.00043503086000163136, "grad_norm": 1.2457646131515503, "learning_rate": 0.000310077519379845, "loss": 5.6057, "step": 8 }, { "crossentropy": 5.4235618114471436, "epoch": 0.0004894097175018353, "grad_norm": 1.1078996658325195, "learning_rate": 0.0003488372093023256, "loss": 5.4236, "step": 9 }, { "crossentropy": 5.584756135940552, "epoch": 0.0005437885750020392, "grad_norm": 1.1383275985717773, "learning_rate": 0.0003875968992248062, "loss": 5.5848, "step": 10 }, { "crossentropy": 5.4516496658325195, "epoch": 0.0005981674325022431, "grad_norm": 0.9900858402252197, "learning_rate": 0.0004263565891472868, "loss": 5.4516, "step": 11 }, { "crossentropy": 5.513082027435303, "epoch": 0.000652546290002447, "grad_norm": 0.8560095429420471, "learning_rate": 0.00046511627906976747, "loss": 5.5131, "step": 12 }, { "crossentropy": 5.510245323181152, "epoch": 0.000706925147502651, "grad_norm": 0.8773645758628845, "learning_rate": 0.0005038759689922481, "loss": 5.5102, "step": 13 }, { "crossentropy": 5.374821186065674, "epoch": 0.0007613040050028549, "grad_norm": 0.7263167500495911, "learning_rate": 0.0005426356589147287, "loss": 5.3748, "step": 14 }, { "crossentropy": 5.319120407104492, "epoch": 0.0008156828625030589, "grad_norm": 0.7545031905174255, "learning_rate": 0.0005813953488372093, "loss": 5.3191, "step": 15 }, { "crossentropy": 5.359729766845703, "epoch": 0.0008700617200032627, "grad_norm": 0.7343056201934814, "grad_norm_var": 0.14622413202156395, "learning_rate": 0.00062015503875969, "loss": 5.3597, "step": 16 }, { "crossentropy": 5.4352662563323975, "epoch": 0.0009244405775034666, "grad_norm": 0.8176167607307434, "grad_norm_var": 0.14545886154616322, "learning_rate": 0.0006589147286821706, "loss": 5.4353, "step": 17 }, { "crossentropy": 5.3581702709198, "epoch": 0.0009788194350036706, "grad_norm": 0.6764337420463562, "grad_norm_var": 0.144473983730063, "learning_rate": 0.0006976744186046512, "loss": 5.3582, "step": 18 }, { "crossentropy": 5.182053327560425, "epoch": 0.0010331982925038746, "grad_norm": 0.5477958917617798, "grad_norm_var": 0.14687957088250833, "learning_rate": 0.0007364341085271318, "loss": 5.1821, "step": 19 }, { "crossentropy": 5.229555130004883, "epoch": 0.0010875771500040783, "grad_norm": 0.6014132499694824, "grad_norm_var": 0.099159524633617, "learning_rate": 0.0007751937984496124, "loss": 5.2296, "step": 20 }, { "crossentropy": 5.14061713218689, "epoch": 0.0011419560075042823, "grad_norm": 0.5848256945610046, "grad_norm_var": 0.08677069420470405, "learning_rate": 0.0008139534883720931, "loss": 5.1406, "step": 21 }, { "crossentropy": 5.117424249649048, "epoch": 0.0011963348650044863, "grad_norm": 0.5564687848091125, "grad_norm_var": 0.0637864790342921, "learning_rate": 0.0008527131782945736, "loss": 5.1174, "step": 22 }, { "crossentropy": 5.1112353801727295, "epoch": 0.0012507137225046903, "grad_norm": 0.5007688999176025, "grad_norm_var": 0.051891398907762704, "learning_rate": 0.0008914728682170544, "loss": 5.1112, "step": 23 }, { "crossentropy": 5.102404832839966, "epoch": 0.001305092580004894, "grad_norm": 0.5316453576087952, "grad_norm_var": 0.04081993812296679, "learning_rate": 0.0009302325581395349, "loss": 5.1024, "step": 24 }, { "crossentropy": 5.0071094036102295, "epoch": 0.001359471437505098, "grad_norm": 0.4701120853424072, "grad_norm_var": 0.03581752227406197, "learning_rate": 0.0009689922480620156, "loss": 5.0071, "step": 25 }, { "crossentropy": 5.028736114501953, "epoch": 0.001413850295005302, "grad_norm": 0.4358255863189697, "grad_norm_var": 0.026565085460050483, "learning_rate": 0.0010077519379844962, "loss": 5.0287, "step": 26 }, { "crossentropy": 4.997894763946533, "epoch": 0.0014682291525055058, "grad_norm": 0.455787718296051, "grad_norm_var": 0.02134389769232475, "learning_rate": 0.0010465116279069768, "loss": 4.9979, "step": 27 }, { "crossentropy": 4.975160121917725, "epoch": 0.0015226080100057097, "grad_norm": 0.49684271216392517, "grad_norm_var": 0.018724350312837142, "learning_rate": 0.0010852713178294575, "loss": 4.9752, "step": 28 }, { "crossentropy": 5.053090333938599, "epoch": 0.0015769868675059137, "grad_norm": 0.502557635307312, "grad_norm_var": 0.01416808926310627, "learning_rate": 0.0011240310077519381, "loss": 5.0531, "step": 29 }, { "crossentropy": 4.947605848312378, "epoch": 0.0016313657250061177, "grad_norm": 0.40679746866226196, "grad_norm_var": 0.014616870839835091, "learning_rate": 0.0011627906976744186, "loss": 4.9476, "step": 30 }, { "crossentropy": 4.893946886062622, "epoch": 0.0016857445825063215, "grad_norm": 0.9644978046417236, "grad_norm_var": 0.022619958527171132, "learning_rate": 0.0012015503875968993, "loss": 4.8939, "step": 31 }, { "crossentropy": 4.8387110233306885, "epoch": 0.0017401234400065255, "grad_norm": 0.4920768141746521, "grad_norm_var": 0.02131095634926764, "learning_rate": 0.00124031007751938, "loss": 4.8387, "step": 32 }, { "crossentropy": 5.0075719356536865, "epoch": 0.0017945022975067294, "grad_norm": 0.4226278066635132, "grad_norm_var": 0.0177626889113253, "learning_rate": 0.0012790697674418606, "loss": 5.0076, "step": 33 }, { "crossentropy": 4.919002532958984, "epoch": 0.0018488811550069332, "grad_norm": 0.3497632145881653, "grad_norm_var": 0.018507406070432674, "learning_rate": 0.0013178294573643412, "loss": 4.919, "step": 34 }, { "crossentropy": 4.889570236206055, "epoch": 0.0019032600125071372, "grad_norm": 0.6765391230583191, "grad_norm_var": 0.020020677375977885, "learning_rate": 0.0013565891472868217, "loss": 4.8896, "step": 35 }, { "crossentropy": 5.0372185707092285, "epoch": 0.001957638870007341, "grad_norm": 0.4056819975376129, "grad_norm_var": 0.020500092289695775, "learning_rate": 0.0013953488372093023, "loss": 5.0372, "step": 36 }, { "crossentropy": 4.834566354751587, "epoch": 0.002012017727507545, "grad_norm": 0.5156898498535156, "grad_norm_var": 0.020162551752766083, "learning_rate": 0.001434108527131783, "loss": 4.8346, "step": 37 }, { "crossentropy": 4.756373882293701, "epoch": 0.002066396585007749, "grad_norm": 0.3732472360134125, "grad_norm_var": 0.02116163430355572, "learning_rate": 0.0014728682170542637, "loss": 4.7564, "step": 38 }, { "crossentropy": 4.78908634185791, "epoch": 0.002120775442507953, "grad_norm": 0.3464740514755249, "grad_norm_var": 0.022634340411535534, "learning_rate": 0.0015116279069767441, "loss": 4.7891, "step": 39 }, { "crossentropy": 4.7621870040893555, "epoch": 0.0021751543000081567, "grad_norm": 0.33866196870803833, "grad_norm_var": 0.023900337425881116, "learning_rate": 0.0015503875968992248, "loss": 4.7622, "step": 40 }, { "crossentropy": 4.7910826206207275, "epoch": 0.0022295331575083606, "grad_norm": 0.32456713914871216, "grad_norm_var": 0.025383654868399778, "learning_rate": 0.0015891472868217054, "loss": 4.7911, "step": 41 }, { "crossentropy": 4.810269594192505, "epoch": 0.0022839120150085646, "grad_norm": 0.45656827092170715, "grad_norm_var": 0.02531816699609354, "learning_rate": 0.0016279069767441861, "loss": 4.8103, "step": 42 }, { "crossentropy": 4.659280061721802, "epoch": 0.0023382908725087686, "grad_norm": 0.3062628209590912, "grad_norm_var": 0.02700931108998491, "learning_rate": 0.0016666666666666666, "loss": 4.6593, "step": 43 }, { "crossentropy": 4.787783145904541, "epoch": 0.0023926697300089726, "grad_norm": 0.30951032042503357, "grad_norm_var": 0.028311841729148603, "learning_rate": 0.0017054263565891472, "loss": 4.7878, "step": 44 }, { "crossentropy": 4.4860680103302, "epoch": 0.0024470485875091766, "grad_norm": 0.27684733271598816, "grad_norm_var": 0.029898262816955526, "learning_rate": 0.0017441860465116279, "loss": 4.4861, "step": 45 }, { "crossentropy": 4.695001840591431, "epoch": 0.0025014274450093806, "grad_norm": 0.3550558090209961, "grad_norm_var": 0.030262660426417773, "learning_rate": 0.0017829457364341088, "loss": 4.695, "step": 46 }, { "crossentropy": 4.672351121902466, "epoch": 0.002555806302509584, "grad_norm": 0.30933570861816406, "grad_norm_var": 0.010584989201023494, "learning_rate": 0.001821705426356589, "loss": 4.6724, "step": 47 }, { "crossentropy": 4.620923042297363, "epoch": 0.002610185160009788, "grad_norm": 0.3006344735622406, "grad_norm_var": 0.010300215515385272, "learning_rate": 0.0018604651162790699, "loss": 4.6209, "step": 48 }, { "crossentropy": 4.720210075378418, "epoch": 0.002664564017509992, "grad_norm": 0.28604182600975037, "grad_norm_var": 0.010675618750010033, "learning_rate": 0.0018992248062015505, "loss": 4.7202, "step": 49 }, { "crossentropy": 4.639574289321899, "epoch": 0.002718942875010196, "grad_norm": 0.2975113093852997, "grad_norm_var": 0.010991986182507137, "learning_rate": 0.0019379844961240312, "loss": 4.6396, "step": 50 }, { "crossentropy": 4.513094902038574, "epoch": 0.0027733217325104, "grad_norm": 0.266658216714859, "grad_norm_var": 0.004598219993339943, "learning_rate": 0.0019767441860465114, "loss": 4.5131, "step": 51 }, { "crossentropy": 4.506099700927734, "epoch": 0.002827700590010604, "grad_norm": 0.26799148321151733, "grad_norm_var": 0.004610285386170902, "learning_rate": 0.0020155038759689923, "loss": 4.5061, "step": 52 }, { "crossentropy": 4.40588116645813, "epoch": 0.002882079447510808, "grad_norm": 0.4169268012046814, "grad_norm_var": 0.0028167015793561217, "learning_rate": 0.0020542635658914728, "loss": 4.4059, "step": 53 }, { "crossentropy": 4.518248796463013, "epoch": 0.0029364583050110115, "grad_norm": 0.2775914967060089, "grad_norm_var": 0.002798971020187842, "learning_rate": 0.0020930232558139536, "loss": 4.5182, "step": 54 }, { "crossentropy": 4.507075071334839, "epoch": 0.0029908371625112155, "grad_norm": 0.24736866354942322, "grad_norm_var": 0.0030767507160983716, "learning_rate": 0.002131782945736434, "loss": 4.5071, "step": 55 }, { "crossentropy": 4.413163185119629, "epoch": 0.0030452160200114195, "grad_norm": 0.23880819976329803, "grad_norm_var": 0.003382840303094791, "learning_rate": 0.002170542635658915, "loss": 4.4132, "step": 56 }, { "crossentropy": 4.514345407485962, "epoch": 0.0030995948775116235, "grad_norm": 0.23831014335155487, "grad_norm_var": 0.003664277554512548, "learning_rate": 0.0022093023255813954, "loss": 4.5143, "step": 57 }, { "crossentropy": 4.594726324081421, "epoch": 0.0031539737350118275, "grad_norm": 0.26751166582107544, "grad_norm_var": 0.0020324907345464103, "learning_rate": 0.0022480620155038763, "loss": 4.5947, "step": 58 }, { "crossentropy": 4.42350959777832, "epoch": 0.0032083525925120314, "grad_norm": 0.2544938027858734, "grad_norm_var": 0.0020973869831299712, "learning_rate": 0.0022868217054263567, "loss": 4.4235, "step": 59 }, { "crossentropy": 4.4713053703308105, "epoch": 0.0032627314500122354, "grad_norm": 0.25737330317497253, "grad_norm_var": 0.0021188760174762383, "learning_rate": 0.002325581395348837, "loss": 4.4713, "step": 60 }, { "crossentropy": 4.340162992477417, "epoch": 0.003317110307512439, "grad_norm": 0.21900710463523865, "grad_norm_var": 0.00239010071515701, "learning_rate": 0.002364341085271318, "loss": 4.3402, "step": 61 }, { "crossentropy": 4.330676794052124, "epoch": 0.003371489165012643, "grad_norm": 0.2527008354663849, "grad_norm_var": 0.00203816112640454, "learning_rate": 0.0024031007751937985, "loss": 4.3307, "step": 62 }, { "crossentropy": 4.3251426219940186, "epoch": 0.003425868022512847, "grad_norm": 0.22880013287067413, "grad_norm_var": 0.0020736709490899665, "learning_rate": 0.002441860465116279, "loss": 4.3251, "step": 63 }, { "crossentropy": 4.30205774307251, "epoch": 0.003480246880013051, "grad_norm": 0.3111916184425354, "grad_norm_var": 0.0021239582112602194, "learning_rate": 0.00248062015503876, "loss": 4.3021, "step": 64 }, { "crossentropy": 4.315934419631958, "epoch": 0.003534625737513255, "grad_norm": 0.405735045671463, "grad_norm_var": 0.003267110010534264, "learning_rate": 0.0025193798449612403, "loss": 4.3159, "step": 65 }, { "crossentropy": 4.254716873168945, "epoch": 0.003589004595013459, "grad_norm": 0.1869746893644333, "grad_norm_var": 0.0037431760551316176, "learning_rate": 0.002558139534883721, "loss": 4.2547, "step": 66 }, { "crossentropy": 4.373369216918945, "epoch": 0.003643383452513663, "grad_norm": 0.18999060988426208, "grad_norm_var": 0.00415585145154704, "learning_rate": 0.0025968992248062016, "loss": 4.3734, "step": 67 }, { "crossentropy": 4.292936563491821, "epoch": 0.0036977623100138664, "grad_norm": 0.1918153017759323, "grad_norm_var": 0.00450133152581921, "learning_rate": 0.0026356589147286825, "loss": 4.2929, "step": 68 }, { "crossentropy": 4.291545867919922, "epoch": 0.0037521411675140704, "grad_norm": 0.20005132257938385, "grad_norm_var": 0.0029476657514634834, "learning_rate": 0.0026744186046511625, "loss": 4.2915, "step": 69 }, { "crossentropy": 4.264338731765747, "epoch": 0.0038065200250142744, "grad_norm": 0.20518945157527924, "grad_norm_var": 0.002989463046078847, "learning_rate": 0.0027131782945736434, "loss": 4.2643, "step": 70 }, { "crossentropy": 4.260225296020508, "epoch": 0.0038608988825144784, "grad_norm": 0.38058605790138245, "grad_norm_var": 0.004168111917576844, "learning_rate": 0.002751937984496124, "loss": 4.2602, "step": 71 }, { "crossentropy": 4.269296884536743, "epoch": 0.003915277740014682, "grad_norm": 0.18909774720668793, "grad_norm_var": 0.004408559919673237, "learning_rate": 0.0027906976744186047, "loss": 4.2693, "step": 72 }, { "crossentropy": 4.27429723739624, "epoch": 0.003969656597514886, "grad_norm": 0.17849060893058777, "grad_norm_var": 0.004714892289419181, "learning_rate": 0.002829457364341085, "loss": 4.2743, "step": 73 }, { "crossentropy": 4.252473831176758, "epoch": 0.00402403545501509, "grad_norm": 0.19021974503993988, "grad_norm_var": 0.004855635757091972, "learning_rate": 0.002868217054263566, "loss": 4.2525, "step": 74 }, { "crossentropy": 4.27592134475708, "epoch": 0.004078414312515294, "grad_norm": 0.19290795922279358, "grad_norm_var": 0.0049745530733360434, "learning_rate": 0.002906976744186047, "loss": 4.2759, "step": 75 }, { "crossentropy": 4.157680511474609, "epoch": 0.004132793170015498, "grad_norm": 0.20590141415596008, "grad_norm_var": 0.004995226693471588, "learning_rate": 0.0029457364341085274, "loss": 4.1577, "step": 76 }, { "crossentropy": 4.227149963378906, "epoch": 0.004187172027515702, "grad_norm": 0.24871313571929932, "grad_norm_var": 0.004994793297967621, "learning_rate": 0.0029844961240310074, "loss": 4.2271, "step": 77 }, { "crossentropy": 4.277160882949829, "epoch": 0.004241550885015906, "grad_norm": 0.17956233024597168, "grad_norm_var": 0.0051555097372253675, "learning_rate": 0.0030232558139534882, "loss": 4.2772, "step": 78 }, { "crossentropy": 4.1154465675354, "epoch": 0.00429592974251611, "grad_norm": 0.15703128278255463, "grad_norm_var": 0.005492040705480201, "learning_rate": 0.003062015503875969, "loss": 4.1154, "step": 79 }, { "crossentropy": 4.15187668800354, "epoch": 0.004350308600016313, "grad_norm": 0.13991563022136688, "grad_norm_var": 0.005376375657144132, "learning_rate": 0.0031007751937984496, "loss": 4.1519, "step": 80 }, { "crossentropy": 4.1212992668151855, "epoch": 0.004404687457516518, "grad_norm": 0.15923456847667694, "grad_norm_var": 0.002909676565182646, "learning_rate": 0.0031395348837209304, "loss": 4.1213, "step": 81 }, { "crossentropy": 4.175853490829468, "epoch": 0.004459066315016721, "grad_norm": 0.15244433283805847, "grad_norm_var": 0.003042924750616497, "learning_rate": 0.003178294573643411, "loss": 4.1759, "step": 82 }, { "crossentropy": 4.129382848739624, "epoch": 0.004513445172516926, "grad_norm": 0.16179896891117096, "grad_norm_var": 0.0031210952487053254, "learning_rate": 0.0032170542635658918, "loss": 4.1294, "step": 83 }, { "crossentropy": 4.113377809524536, "epoch": 0.004567824030017129, "grad_norm": 0.16745366156101227, "grad_norm_var": 0.0031711639678689016, "learning_rate": 0.0032558139534883722, "loss": 4.1134, "step": 84 }, { "crossentropy": 4.151271104812622, "epoch": 0.004622202887517334, "grad_norm": 0.14339037239551544, "grad_norm_var": 0.003328272592350563, "learning_rate": 0.0032945736434108527, "loss": 4.1513, "step": 85 }, { "crossentropy": 4.019795775413513, "epoch": 0.004676581745017537, "grad_norm": 0.15447185933589935, "grad_norm_var": 0.003391368651411785, "learning_rate": 0.003333333333333333, "loss": 4.0198, "step": 86 }, { "crossentropy": 4.118191719055176, "epoch": 0.004730960602517741, "grad_norm": 0.14702387154102325, "grad_norm_var": 0.000790184920171048, "learning_rate": 0.003372093023255814, "loss": 4.1182, "step": 87 }, { "crossentropy": 4.077729225158691, "epoch": 0.004785339460017945, "grad_norm": 0.141363263130188, "grad_norm_var": 0.0008300042963908914, "learning_rate": 0.0034108527131782944, "loss": 4.0777, "step": 88 }, { "crossentropy": 4.093486070632935, "epoch": 0.004839718317518149, "grad_norm": 0.2444521188735962, "grad_norm_var": 0.001176652953557362, "learning_rate": 0.0034496124031007753, "loss": 4.0935, "step": 89 }, { "crossentropy": 4.162357330322266, "epoch": 0.004894097175018353, "grad_norm": 0.13947327435016632, "grad_norm_var": 0.0012286541831667394, "learning_rate": 0.0034883720930232558, "loss": 4.1624, "step": 90 }, { "crossentropy": 4.046319961547852, "epoch": 0.004948476032518557, "grad_norm": 0.1381562203168869, "grad_norm_var": 0.0012556872784415714, "learning_rate": 0.0035271317829457366, "loss": 4.0463, "step": 91 }, { "crossentropy": 4.033913612365723, "epoch": 0.005002854890018761, "grad_norm": 0.5767573118209839, "grad_norm_var": 0.011749226591005876, "learning_rate": 0.0035658914728682175, "loss": 4.0339, "step": 92 }, { "crossentropy": 4.061686038970947, "epoch": 0.005057233747518965, "grad_norm": 0.12608686089515686, "grad_norm_var": 0.011740570292099258, "learning_rate": 0.0036046511627906975, "loss": 4.0617, "step": 93 }, { "crossentropy": 4.0564563274383545, "epoch": 0.005111612605019168, "grad_norm": 0.13151514530181885, "grad_norm_var": 0.011907122868107963, "learning_rate": 0.003643410852713178, "loss": 4.0565, "step": 94 }, { "crossentropy": 4.0978924036026, "epoch": 0.005165991462519373, "grad_norm": 0.14305242896080017, "grad_norm_var": 0.011962212322069317, "learning_rate": 0.003682170542635659, "loss": 4.0979, "step": 95 }, { "crossentropy": 4.000400900840759, "epoch": 0.005220370320019576, "grad_norm": 0.12726163864135742, "grad_norm_var": 0.01203843624941882, "learning_rate": 0.0037209302325581397, "loss": 4.0004, "step": 96 }, { "crossentropy": 3.988866448402405, "epoch": 0.005274749177519781, "grad_norm": 0.1736975461244583, "grad_norm_var": 0.012014607231925819, "learning_rate": 0.00375968992248062, "loss": 3.9889, "step": 97 }, { "crossentropy": 3.9759955406188965, "epoch": 0.005329128035019984, "grad_norm": 0.12526874244213104, "grad_norm_var": 0.012157982584518662, "learning_rate": 0.003798449612403101, "loss": 3.976, "step": 98 }, { "crossentropy": 4.119030952453613, "epoch": 0.0053835068925201885, "grad_norm": 0.12652051448822021, "grad_norm_var": 0.012309982213211952, "learning_rate": 0.0038372093023255815, "loss": 4.119, "step": 99 }, { "crossentropy": 4.138980150222778, "epoch": 0.005437885750020392, "grad_norm": 0.14681276679039001, "grad_norm_var": 0.012358401100571202, "learning_rate": 0.0038759689922480624, "loss": 4.139, "step": 100 }, { "crossentropy": 3.9464231729507446, "epoch": 0.005492264607520596, "grad_norm": 0.2543572783470154, "grad_norm_var": 0.012673911286104197, "learning_rate": 0.003914728682170542, "loss": 3.9464, "step": 101 }, { "crossentropy": 4.033976078033447, "epoch": 0.0055466434650208, "grad_norm": 0.15010225772857666, "grad_norm_var": 0.012690570141127155, "learning_rate": 0.003953488372093023, "loss": 4.034, "step": 102 }, { "crossentropy": 4.059691667556763, "epoch": 0.005601022322521004, "grad_norm": 0.1349141150712967, "grad_norm_var": 0.01275418092268971, "learning_rate": 0.003992248062015504, "loss": 4.0597, "step": 103 }, { "crossentropy": 3.940764904022217, "epoch": 0.005655401180021208, "grad_norm": 0.13360857963562012, "grad_norm_var": 0.012797874648028262, "learning_rate": 0.004031007751937985, "loss": 3.9408, "step": 104 }, { "crossentropy": 3.867621421813965, "epoch": 0.0057097800375214116, "grad_norm": 0.12139341235160828, "grad_norm_var": 0.012678654549400772, "learning_rate": 0.004069767441860465, "loss": 3.8676, "step": 105 }, { "crossentropy": 3.832828998565674, "epoch": 0.005764158895021616, "grad_norm": 0.12798668444156647, "grad_norm_var": 0.012736427801420102, "learning_rate": 0.0041085271317829455, "loss": 3.8328, "step": 106 }, { "crossentropy": 3.9953399896621704, "epoch": 0.0058185377525218195, "grad_norm": 0.11683070659637451, "grad_norm_var": 0.012858504519581789, "learning_rate": 0.004147286821705427, "loss": 3.9953, "step": 107 }, { "crossentropy": 3.99992835521698, "epoch": 0.005872916610022023, "grad_norm": 0.11553674936294556, "grad_norm_var": 0.001125065782648434, "learning_rate": 0.004186046511627907, "loss": 3.9999, "step": 108 }, { "crossentropy": 3.869575262069702, "epoch": 0.0059272954675222275, "grad_norm": 0.11628300696611404, "grad_norm_var": 0.001150481012311699, "learning_rate": 0.004224806201550388, "loss": 3.8696, "step": 109 }, { "crossentropy": 3.923812985420227, "epoch": 0.005981674325022431, "grad_norm": 0.11618238687515259, "grad_norm_var": 0.0011831774726935305, "learning_rate": 0.004263565891472868, "loss": 3.9238, "step": 110 }, { "crossentropy": 3.8854827880859375, "epoch": 0.0060360531825226355, "grad_norm": 0.11377052962779999, "grad_norm_var": 0.0012223625583174522, "learning_rate": 0.004302325581395349, "loss": 3.8855, "step": 111 }, { "crossentropy": 4.052217483520508, "epoch": 0.006090432040022839, "grad_norm": 0.11283993721008301, "grad_norm_var": 0.0012551122542319435, "learning_rate": 0.00434108527131783, "loss": 4.0522, "step": 112 }, { "crossentropy": 3.794275999069214, "epoch": 0.006144810897523043, "grad_norm": 0.12474672496318817, "grad_norm_var": 0.001162952394248795, "learning_rate": 0.00437984496124031, "loss": 3.7943, "step": 113 }, { "crossentropy": 3.9406157732009888, "epoch": 0.006199189755023247, "grad_norm": 0.10543359816074371, "grad_norm_var": 0.0012095018572913635, "learning_rate": 0.004418604651162791, "loss": 3.9406, "step": 114 }, { "crossentropy": 3.9336589574813843, "epoch": 0.0062535686125234505, "grad_norm": 0.1409958153963089, "grad_norm_var": 0.001211380478731729, "learning_rate": 0.004457364341085271, "loss": 3.9337, "step": 115 }, { "crossentropy": 3.888079524040222, "epoch": 0.006307947470023655, "grad_norm": 0.13626986742019653, "grad_norm_var": 0.001199244022419643, "learning_rate": 0.004496124031007753, "loss": 3.8881, "step": 116 }, { "crossentropy": 3.9308043718338013, "epoch": 0.0063623263275238585, "grad_norm": 0.11348602920770645, "grad_norm_var": 0.00015218273489392328, "learning_rate": 0.004534883720930232, "loss": 3.9308, "step": 117 }, { "crossentropy": 3.9225785732269287, "epoch": 0.006416705185024063, "grad_norm": 0.1193360984325409, "grad_norm_var": 0.0001033390141984993, "learning_rate": 0.0045736434108527135, "loss": 3.9226, "step": 118 }, { "crossentropy": 3.8920642137527466, "epoch": 0.0064710840425242664, "grad_norm": 0.10460172593593597, "grad_norm_var": 0.00010796956484627289, "learning_rate": 0.004612403100775194, "loss": 3.8921, "step": 119 }, { "crossentropy": 3.8869340419769287, "epoch": 0.006525462900024471, "grad_norm": 0.1369384080171585, "grad_norm_var": 0.00011472382021468311, "learning_rate": 0.004651162790697674, "loss": 3.8869, "step": 120 }, { "crossentropy": 3.8934133052825928, "epoch": 0.006579841757524674, "grad_norm": 0.13856691122055054, "grad_norm_var": 0.00013597089565249668, "learning_rate": 0.004689922480620155, "loss": 3.8934, "step": 121 }, { "crossentropy": 3.847654938697815, "epoch": 0.006634220615024878, "grad_norm": 0.1327739655971527, "grad_norm_var": 0.00014171110027774066, "learning_rate": 0.004728682170542636, "loss": 3.8477, "step": 122 }, { "crossentropy": 3.8742947578430176, "epoch": 0.006688599472525082, "grad_norm": 0.36328452825546265, "grad_norm_var": 0.003783276842296719, "learning_rate": 0.0047674418604651166, "loss": 3.8743, "step": 123 }, { "crossentropy": 3.8521714210510254, "epoch": 0.006742978330025286, "grad_norm": 0.11482907086610794, "grad_norm_var": 0.003785327729266049, "learning_rate": 0.004806201550387597, "loss": 3.8522, "step": 124 }, { "crossentropy": 3.8809906244277954, "epoch": 0.00679735718752549, "grad_norm": 0.11548923701047897, "grad_norm_var": 0.0037875487225220695, "learning_rate": 0.0048449612403100775, "loss": 3.881, "step": 125 }, { "crossentropy": 3.8506332635879517, "epoch": 0.006851736045025694, "grad_norm": 0.12157391756772995, "grad_norm_var": 0.0037745106460563282, "learning_rate": 0.004883720930232558, "loss": 3.8506, "step": 126 }, { "crossentropy": 3.8630523681640625, "epoch": 0.006906114902525898, "grad_norm": 0.12979111075401306, "grad_norm_var": 0.003740539867234415, "learning_rate": 0.004922480620155038, "loss": 3.8631, "step": 127 }, { "crossentropy": 3.8340704441070557, "epoch": 0.006960493760026102, "grad_norm": 0.11517086625099182, "grad_norm_var": 0.00373300249772328, "learning_rate": 0.00496124031007752, "loss": 3.8341, "step": 128 }, { "crossentropy": 3.7526633739471436, "epoch": 0.007014872617526305, "grad_norm": 0.2062033861875534, "grad_norm_var": 0.004000169949640045, "learning_rate": 0.005, "loss": 3.7527, "step": 129 }, { "crossentropy": 3.815262794494629, "epoch": 0.00706925147502651, "grad_norm": 0.10637945681810379, "grad_norm_var": 0.003995435036173198, "learning_rate": 0.0050387596899224806, "loss": 3.8153, "step": 130 }, { "crossentropy": 3.8191882371902466, "epoch": 0.007123630332526713, "grad_norm": 0.1333419531583786, "grad_norm_var": 0.004001632197196418, "learning_rate": 0.005077519379844962, "loss": 3.8192, "step": 131 }, { "crossentropy": 3.8979804515838623, "epoch": 0.007178009190026918, "grad_norm": 0.1291196048259735, "grad_norm_var": 0.004011246060329574, "learning_rate": 0.005116279069767442, "loss": 3.898, "step": 132 }, { "crossentropy": 4.018152952194214, "epoch": 0.007232388047527121, "grad_norm": 0.10287975519895554, "grad_norm_var": 0.0040593858926219325, "learning_rate": 0.005155038759689923, "loss": 4.0182, "step": 133 }, { "crossentropy": 3.816002607345581, "epoch": 0.007286766905027326, "grad_norm": 0.10356906056404114, "grad_norm_var": 0.004122343044831285, "learning_rate": 0.005193798449612403, "loss": 3.816, "step": 134 }, { "crossentropy": 3.833424210548401, "epoch": 0.007341145762527529, "grad_norm": 0.10419629514217377, "grad_norm_var": 0.004124315891595986, "learning_rate": 0.0052325581395348845, "loss": 3.8334, "step": 135 }, { "crossentropy": 3.9065165519714355, "epoch": 0.007395524620027733, "grad_norm": 0.1043185368180275, "grad_norm_var": 0.0042079701039629425, "learning_rate": 0.005271317829457365, "loss": 3.9065, "step": 136 }, { "crossentropy": 3.8091856241226196, "epoch": 0.007449903477527937, "grad_norm": 0.101103775203228, "grad_norm_var": 0.004297066993524294, "learning_rate": 0.005310077519379845, "loss": 3.8092, "step": 137 }, { "crossentropy": 3.8544102907180786, "epoch": 0.007504282335028141, "grad_norm": 0.09948959946632385, "grad_norm_var": 0.004382850184756759, "learning_rate": 0.005348837209302325, "loss": 3.8544, "step": 138 }, { "crossentropy": 3.7258509397506714, "epoch": 0.007558661192528345, "grad_norm": 0.09777379781007767, "grad_norm_var": 0.0006867597836440743, "learning_rate": 0.005387596899224806, "loss": 3.7259, "step": 139 }, { "crossentropy": 3.8315542936325073, "epoch": 0.007613040050028549, "grad_norm": 0.16707365214824677, "grad_norm_var": 0.0008364710154258434, "learning_rate": 0.005426356589147287, "loss": 3.8316, "step": 140 }, { "crossentropy": 3.763762354850769, "epoch": 0.007667418907528753, "grad_norm": 0.11020097881555557, "grad_norm_var": 0.0008421694727573053, "learning_rate": 0.005465116279069767, "loss": 3.7638, "step": 141 }, { "crossentropy": 3.804951786994934, "epoch": 0.007721797765028957, "grad_norm": 0.09209636598825455, "grad_norm_var": 0.0008932847023456441, "learning_rate": 0.005503875968992248, "loss": 3.805, "step": 142 }, { "crossentropy": 3.7521172761917114, "epoch": 0.00777617662252916, "grad_norm": 0.1270759552717209, "grad_norm_var": 0.0008898096221147805, "learning_rate": 0.005542635658914729, "loss": 3.7521, "step": 143 }, { "crossentropy": 3.746659517288208, "epoch": 0.007830555480029365, "grad_norm": 0.09755942225456238, "grad_norm_var": 0.000917598280723225, "learning_rate": 0.005581395348837209, "loss": 3.7467, "step": 144 }, { "crossentropy": 3.7834253311157227, "epoch": 0.007884934337529568, "grad_norm": 0.09578251093626022, "grad_norm_var": 0.0003758771300035463, "learning_rate": 0.00562015503875969, "loss": 3.7834, "step": 145 }, { "crossentropy": 3.8035542964935303, "epoch": 0.007939313195029772, "grad_norm": 0.10352415591478348, "grad_norm_var": 0.00037804963728410855, "learning_rate": 0.00565891472868217, "loss": 3.8036, "step": 146 }, { "crossentropy": 3.799664616584778, "epoch": 0.007993692052529977, "grad_norm": 0.17500294744968414, "grad_norm_var": 0.0006130257301675925, "learning_rate": 0.005697674418604652, "loss": 3.7997, "step": 147 }, { "crossentropy": 3.7754770517349243, "epoch": 0.00804807091003018, "grad_norm": 0.09708363562822342, "grad_norm_var": 0.0006090539388915459, "learning_rate": 0.005736434108527132, "loss": 3.7755, "step": 148 }, { "crossentropy": 3.8091570138931274, "epoch": 0.008102449767530384, "grad_norm": 0.10928407311439514, "grad_norm_var": 0.0006045377218604468, "learning_rate": 0.0057751937984496125, "loss": 3.8092, "step": 149 }, { "crossentropy": 3.884873151779175, "epoch": 0.008156828625030588, "grad_norm": 0.11790856719017029, "grad_norm_var": 0.000602090028567596, "learning_rate": 0.005813953488372094, "loss": 3.8849, "step": 150 }, { "crossentropy": 3.8327094316482544, "epoch": 0.008211207482530791, "grad_norm": 0.13721150159835815, "grad_norm_var": 0.0006338067748914572, "learning_rate": 0.005852713178294574, "loss": 3.8327, "step": 151 }, { "crossentropy": 3.8449628353118896, "epoch": 0.008265586340030997, "grad_norm": 0.19074486196041107, "grad_norm_var": 0.000982972406940178, "learning_rate": 0.005891472868217055, "loss": 3.845, "step": 152 }, { "crossentropy": 3.79640793800354, "epoch": 0.0083199651975312, "grad_norm": 0.11655985563993454, "grad_norm_var": 0.0009591011612132121, "learning_rate": 0.005930232558139535, "loss": 3.7964, "step": 153 }, { "crossentropy": 3.723339080810547, "epoch": 0.008374344055031404, "grad_norm": 0.12555401027202606, "grad_norm_var": 0.000927160266461276, "learning_rate": 0.005968992248062015, "loss": 3.7233, "step": 154 }, { "crossentropy": 3.777983546257019, "epoch": 0.008428722912531607, "grad_norm": 0.19950932264328003, "grad_norm_var": 0.0012382682722322865, "learning_rate": 0.006007751937984496, "loss": 3.778, "step": 155 }, { "crossentropy": 3.7485573291778564, "epoch": 0.008483101770031812, "grad_norm": 0.12405721098184586, "grad_norm_var": 0.001134891408596583, "learning_rate": 0.0060465116279069765, "loss": 3.7486, "step": 156 }, { "crossentropy": 3.832965612411499, "epoch": 0.008537480627532016, "grad_norm": 1.7128417491912842, "grad_norm_var": 0.15824531949305973, "learning_rate": 0.006085271317829457, "loss": 3.833, "step": 157 }, { "crossentropy": 3.788973808288574, "epoch": 0.00859185948503222, "grad_norm": 0.11832281947135925, "grad_norm_var": 0.15781879957851677, "learning_rate": 0.006124031007751938, "loss": 3.789, "step": 158 }, { "crossentropy": 3.6508558988571167, "epoch": 0.008646238342532423, "grad_norm": 0.10111074149608612, "grad_norm_var": 0.15821034340845824, "learning_rate": 0.006162790697674419, "loss": 3.6509, "step": 159 }, { "crossentropy": 3.760523557662964, "epoch": 0.008700617200032627, "grad_norm": 0.09363985061645508, "grad_norm_var": 0.15827862572431212, "learning_rate": 0.006201550387596899, "loss": 3.7605, "step": 160 }, { "crossentropy": 3.7597063779830933, "epoch": 0.008754996057532832, "grad_norm": 0.09102148562669754, "grad_norm_var": 0.1583627897541532, "learning_rate": 0.00624031007751938, "loss": 3.7597, "step": 161 }, { "crossentropy": 3.8930171728134155, "epoch": 0.008809374915033035, "grad_norm": 0.1513589769601822, "grad_norm_var": 0.1577256980481089, "learning_rate": 0.006279069767441861, "loss": 3.893, "step": 162 }, { "crossentropy": 3.7778066396713257, "epoch": 0.008863753772533239, "grad_norm": 0.11236110329627991, "grad_norm_var": 0.1584204891656075, "learning_rate": 0.006317829457364341, "loss": 3.7778, "step": 163 }, { "crossentropy": 3.7360626459121704, "epoch": 0.008918132630033443, "grad_norm": 0.09922099858522415, "grad_norm_var": 0.15838434633354145, "learning_rate": 0.006356589147286822, "loss": 3.7361, "step": 164 }, { "crossentropy": 3.7793750762939453, "epoch": 0.008972511487533646, "grad_norm": 0.3068527579307556, "grad_norm_var": 0.15777452289088134, "learning_rate": 0.006395348837209303, "loss": 3.7794, "step": 165 }, { "crossentropy": 3.8092782497406006, "epoch": 0.009026890345033851, "grad_norm": 0.09996294230222702, "grad_norm_var": 0.15808054528403162, "learning_rate": 0.0064341085271317836, "loss": 3.8093, "step": 166 }, { "crossentropy": 3.7974082231521606, "epoch": 0.009081269202534055, "grad_norm": 0.08457635343074799, "grad_norm_var": 0.15894889792817177, "learning_rate": 0.006472868217054264, "loss": 3.7974, "step": 167 }, { "crossentropy": 3.707674264907837, "epoch": 0.009135648060034258, "grad_norm": 0.10778845846652985, "grad_norm_var": 0.15984617531773843, "learning_rate": 0.0065116279069767444, "loss": 3.7077, "step": 168 }, { "crossentropy": 3.6500507593154907, "epoch": 0.009190026917534462, "grad_norm": 0.31634393334388733, "grad_norm_var": 0.15937768104969022, "learning_rate": 0.006550387596899226, "loss": 3.6501, "step": 169 }, { "crossentropy": 3.686316728591919, "epoch": 0.009244405775034667, "grad_norm": 0.08650146424770355, "grad_norm_var": 0.1600703927176188, "learning_rate": 0.006589147286821705, "loss": 3.6863, "step": 170 }, { "crossentropy": 3.8108651638031006, "epoch": 0.009298784632534871, "grad_norm": 0.11837002635002136, "grad_norm_var": 0.1608965704290783, "learning_rate": 0.006627906976744186, "loss": 3.8109, "step": 171 }, { "crossentropy": 3.724506378173828, "epoch": 0.009353163490035074, "grad_norm": 0.10464204847812653, "grad_norm_var": 0.1612015550002498, "learning_rate": 0.006666666666666666, "loss": 3.7245, "step": 172 }, { "crossentropy": 3.8069660663604736, "epoch": 0.009407542347535278, "grad_norm": 0.09276324510574341, "grad_norm_var": 0.005269491801453822, "learning_rate": 0.0067054263565891475, "loss": 3.807, "step": 173 }, { "crossentropy": 3.791692018508911, "epoch": 0.009461921205035482, "grad_norm": 0.08620184659957886, "grad_norm_var": 0.005385282392068111, "learning_rate": 0.006744186046511628, "loss": 3.7917, "step": 174 }, { "crossentropy": 3.706003427505493, "epoch": 0.009516300062535687, "grad_norm": 0.10417436063289642, "grad_norm_var": 0.005374764803106522, "learning_rate": 0.006782945736434108, "loss": 3.706, "step": 175 }, { "crossentropy": 3.7066162824630737, "epoch": 0.00957067892003589, "grad_norm": 0.09082527458667755, "grad_norm_var": 0.005388336959914323, "learning_rate": 0.006821705426356589, "loss": 3.7066, "step": 176 }, { "crossentropy": 3.6590075492858887, "epoch": 0.009625057777536094, "grad_norm": 0.09205811470746994, "grad_norm_var": 0.005383250162377959, "learning_rate": 0.00686046511627907, "loss": 3.659, "step": 177 }, { "crossentropy": 3.768342614173889, "epoch": 0.009679436635036297, "grad_norm": 0.12111440300941467, "grad_norm_var": 0.005347736121244103, "learning_rate": 0.006899224806201551, "loss": 3.7683, "step": 178 }, { "crossentropy": 3.683881998062134, "epoch": 0.009733815492536501, "grad_norm": 0.08468785136938095, "grad_norm_var": 0.005447712447164398, "learning_rate": 0.006937984496124031, "loss": 3.6839, "step": 179 }, { "crossentropy": 3.647314190864563, "epoch": 0.009788194350036706, "grad_norm": 0.12685160338878632, "grad_norm_var": 0.005401357832747512, "learning_rate": 0.0069767441860465115, "loss": 3.6473, "step": 180 }, { "crossentropy": 3.7011477947235107, "epoch": 0.00984257320753691, "grad_norm": 0.10280420631170273, "grad_norm_var": 0.0030963483554693905, "learning_rate": 0.007015503875968993, "loss": 3.7011, "step": 181 }, { "crossentropy": 3.69332754611969, "epoch": 0.009896952065037113, "grad_norm": 0.0950852781534195, "grad_norm_var": 0.0031067882450948377, "learning_rate": 0.007054263565891473, "loss": 3.6933, "step": 182 }, { "crossentropy": 3.8360389471054077, "epoch": 0.009951330922537317, "grad_norm": 0.09220942854881287, "grad_norm_var": 0.00308106995018097, "learning_rate": 0.007093023255813954, "loss": 3.836, "step": 183 }, { "crossentropy": 3.628821015357971, "epoch": 0.010005709780037522, "grad_norm": 0.16774354875087738, "grad_norm_var": 0.003256866753192424, "learning_rate": 0.007131782945736435, "loss": 3.6288, "step": 184 }, { "crossentropy": 3.6842795610427856, "epoch": 0.010060088637537726, "grad_norm": 0.08090902864933014, "grad_norm_var": 0.0004839055880335135, "learning_rate": 0.0071705426356589155, "loss": 3.6843, "step": 185 }, { "crossentropy": 3.708488941192627, "epoch": 0.01011446749503793, "grad_norm": 0.08844894915819168, "grad_norm_var": 0.0004798757197612506, "learning_rate": 0.007209302325581395, "loss": 3.7085, "step": 186 }, { "crossentropy": 3.6663557291030884, "epoch": 0.010168846352538133, "grad_norm": 0.08370381593704224, "grad_norm_var": 0.00048419899205824163, "learning_rate": 0.0072480620155038755, "loss": 3.6664, "step": 187 }, { "crossentropy": 3.6957777738571167, "epoch": 0.010223225210038336, "grad_norm": 0.08908422291278839, "grad_norm_var": 0.000491541497394173, "learning_rate": 0.007286821705426356, "loss": 3.6958, "step": 188 }, { "crossentropy": 3.6697967052459717, "epoch": 0.010277604067538542, "grad_norm": 0.09533516317605972, "grad_norm_var": 0.0004895018834071338, "learning_rate": 0.007325581395348837, "loss": 3.6698, "step": 189 }, { "crossentropy": 3.607730746269226, "epoch": 0.010331982925038745, "grad_norm": 0.08723949640989304, "grad_norm_var": 0.00048764946079055127, "learning_rate": 0.007364341085271318, "loss": 3.6077, "step": 190 }, { "crossentropy": 3.679945707321167, "epoch": 0.010386361782538949, "grad_norm": 0.09514643996953964, "grad_norm_var": 0.000487889782575453, "learning_rate": 0.007403100775193798, "loss": 3.6799, "step": 191 }, { "crossentropy": 3.7590306997299194, "epoch": 0.010440740640039152, "grad_norm": 0.08632545918226242, "grad_norm_var": 0.0004944066795027485, "learning_rate": 0.0074418604651162795, "loss": 3.759, "step": 192 }, { "crossentropy": 3.647499918937683, "epoch": 0.010495119497539356, "grad_norm": 0.08155246078968048, "grad_norm_var": 0.000511444186860225, "learning_rate": 0.00748062015503876, "loss": 3.6475, "step": 193 }, { "crossentropy": 3.5766769647598267, "epoch": 0.010549498355039561, "grad_norm": 0.08309060335159302, "grad_norm_var": 0.00048786607388779724, "learning_rate": 0.00751937984496124, "loss": 3.5767, "step": 194 }, { "crossentropy": 3.6863993406295776, "epoch": 0.010603877212539765, "grad_norm": 0.0919685959815979, "grad_norm_var": 0.0004799418115889381, "learning_rate": 0.007558139534883721, "loss": 3.6864, "step": 195 }, { "crossentropy": 3.596468687057495, "epoch": 0.010658256070039968, "grad_norm": 0.08531089127063751, "grad_norm_var": 0.0004208944543102527, "learning_rate": 0.007596899224806202, "loss": 3.5965, "step": 196 }, { "crossentropy": 3.7269757986068726, "epoch": 0.010712634927540172, "grad_norm": 0.07712291926145554, "grad_norm_var": 0.0004323868175194099, "learning_rate": 0.007635658914728683, "loss": 3.727, "step": 197 }, { "crossentropy": 3.6419386863708496, "epoch": 0.010767013785040377, "grad_norm": 0.07941921800374985, "grad_norm_var": 0.0004423618291167828, "learning_rate": 0.007674418604651163, "loss": 3.6419, "step": 198 }, { "crossentropy": 3.671578526496887, "epoch": 0.01082139264254058, "grad_norm": 0.08738815784454346, "grad_norm_var": 0.00044338309136280317, "learning_rate": 0.007713178294573644, "loss": 3.6716, "step": 199 }, { "crossentropy": 3.5934096574783325, "epoch": 0.010875771500040784, "grad_norm": 0.09336249530315399, "grad_norm_var": 3.0413340911217047e-05, "learning_rate": 0.007751937984496125, "loss": 3.5934, "step": 200 }, { "crossentropy": 3.5713603496551514, "epoch": 0.010930150357540988, "grad_norm": 0.08722590655088425, "grad_norm_var": 2.8124163604081875e-05, "learning_rate": 0.007790697674418605, "loss": 3.5714, "step": 201 }, { "crossentropy": 3.6848793029785156, "epoch": 0.010984529215041191, "grad_norm": 0.08115727454423904, "grad_norm_var": 3.0021771377486405e-05, "learning_rate": 0.007829457364341085, "loss": 3.6849, "step": 202 }, { "crossentropy": 3.694795250892639, "epoch": 0.011038908072541397, "grad_norm": 0.08438830822706223, "grad_norm_var": 2.9793388395458337e-05, "learning_rate": 0.007868217054263566, "loss": 3.6948, "step": 203 }, { "crossentropy": 3.6348371505737305, "epoch": 0.0110932869300416, "grad_norm": 0.08449630439281464, "grad_norm_var": 2.957085274863172e-05, "learning_rate": 0.007906976744186046, "loss": 3.6348, "step": 204 }, { "crossentropy": 3.6770838499069214, "epoch": 0.011147665787541804, "grad_norm": 0.09289823472499847, "grad_norm_var": 2.700078772410741e-05, "learning_rate": 0.007945736434108527, "loss": 3.6771, "step": 205 }, { "crossentropy": 3.6005630493164062, "epoch": 0.011202044645042007, "grad_norm": 0.08574490249156952, "grad_norm_var": 2.691946036300977e-05, "learning_rate": 0.007984496124031008, "loss": 3.6006, "step": 206 }, { "crossentropy": 3.6470481157302856, "epoch": 0.01125642350254221, "grad_norm": 0.09729108959436417, "grad_norm_var": 2.9811694452733935e-05, "learning_rate": 0.008023255813953488, "loss": 3.647, "step": 207 }, { "crossentropy": 3.610029697418213, "epoch": 0.011310802360042416, "grad_norm": 0.0910760685801506, "grad_norm_var": 3.1319779202058226e-05, "learning_rate": 0.00806201550387597, "loss": 3.61, "step": 208 }, { "crossentropy": 3.5357600450515747, "epoch": 0.01136518121754262, "grad_norm": 0.07757166773080826, "grad_norm_var": 3.491941153185296e-05, "learning_rate": 0.00810077519379845, "loss": 3.5358, "step": 209 }, { "crossentropy": 3.6478623151779175, "epoch": 0.011419560075042823, "grad_norm": 0.260665625333786, "grad_norm_var": 0.0019316421424903306, "learning_rate": 0.00813953488372093, "loss": 3.6479, "step": 210 }, { "crossentropy": 3.6310808658599854, "epoch": 0.011473938932543027, "grad_norm": 0.07645844668149948, "grad_norm_var": 0.001957740068229574, "learning_rate": 0.008178294573643411, "loss": 3.6311, "step": 211 }, { "crossentropy": 3.645190715789795, "epoch": 0.011528317790043232, "grad_norm": 0.08186487853527069, "grad_norm_var": 0.001963553731376305, "learning_rate": 0.008217054263565891, "loss": 3.6452, "step": 212 }, { "crossentropy": 3.5350687503814697, "epoch": 0.011582696647543436, "grad_norm": 0.09612584859132767, "grad_norm_var": 0.0019379563390566584, "learning_rate": 0.008255813953488372, "loss": 3.5351, "step": 213 }, { "crossentropy": 3.5965148210525513, "epoch": 0.011637075505043639, "grad_norm": 0.08639203757047653, "grad_norm_var": 0.0019243517409744872, "learning_rate": 0.008294573643410854, "loss": 3.5965, "step": 214 }, { "crossentropy": 3.5300973653793335, "epoch": 0.011691454362543843, "grad_norm": 0.08408858627080917, "grad_norm_var": 0.001929593756805085, "learning_rate": 0.008333333333333333, "loss": 3.5301, "step": 215 }, { "crossentropy": 3.5999467372894287, "epoch": 0.011745833220044046, "grad_norm": 0.09819988161325455, "grad_norm_var": 0.0019283550895009898, "learning_rate": 0.008372093023255815, "loss": 3.5999, "step": 216 }, { "crossentropy": 3.513707160949707, "epoch": 0.011800212077544251, "grad_norm": 0.08338917791843414, "grad_norm_var": 0.001934711462454866, "learning_rate": 0.008410852713178296, "loss": 3.5137, "step": 217 }, { "crossentropy": 3.638343572616577, "epoch": 0.011854590935044455, "grad_norm": 0.1244392842054367, "grad_norm_var": 0.0019568296991841795, "learning_rate": 0.008449612403100775, "loss": 3.6383, "step": 218 }, { "crossentropy": 3.5671348571777344, "epoch": 0.011908969792544659, "grad_norm": 0.11046721041202545, "grad_norm_var": 0.001943945494831085, "learning_rate": 0.008488372093023255, "loss": 3.5671, "step": 219 }, { "crossentropy": 3.584064245223999, "epoch": 0.011963348650044862, "grad_norm": 0.07824593037366867, "grad_norm_var": 0.001960931208096624, "learning_rate": 0.008527131782945736, "loss": 3.5841, "step": 220 }, { "crossentropy": 3.61604905128479, "epoch": 0.012017727507545066, "grad_norm": 0.07479563355445862, "grad_norm_var": 0.0020023132450568432, "learning_rate": 0.008565891472868218, "loss": 3.616, "step": 221 }, { "crossentropy": 3.426172137260437, "epoch": 0.012072106365045271, "grad_norm": 0.14180700480937958, "grad_norm_var": 0.002089007651018206, "learning_rate": 0.008604651162790697, "loss": 3.4262, "step": 222 }, { "crossentropy": 3.5985110998153687, "epoch": 0.012126485222545474, "grad_norm": 0.0824151560664177, "grad_norm_var": 0.0021160062852212907, "learning_rate": 0.008643410852713179, "loss": 3.5985, "step": 223 }, { "crossentropy": 3.606351852416992, "epoch": 0.012180864080045678, "grad_norm": 0.09245358407497406, "grad_norm_var": 0.0021139348006685775, "learning_rate": 0.00868217054263566, "loss": 3.6064, "step": 224 }, { "crossentropy": 3.5477837324142456, "epoch": 0.012235242937545882, "grad_norm": 0.15377916395664215, "grad_norm_var": 0.002217655077510681, "learning_rate": 0.00872093023255814, "loss": 3.5478, "step": 225 }, { "crossentropy": 3.5412299633026123, "epoch": 0.012289621795046087, "grad_norm": 0.09205123782157898, "grad_norm_var": 0.000558974763907958, "learning_rate": 0.00875968992248062, "loss": 3.5412, "step": 226 }, { "crossentropy": 3.531318187713623, "epoch": 0.01234400065254629, "grad_norm": 0.087016761302948, "grad_norm_var": 0.0005365866887021454, "learning_rate": 0.0087984496124031, "loss": 3.5313, "step": 227 }, { "crossentropy": 3.6088088750839233, "epoch": 0.012398379510046494, "grad_norm": 0.17549583315849304, "grad_norm_var": 0.0008834416543503171, "learning_rate": 0.008837209302325582, "loss": 3.6088, "step": 228 }, { "crossentropy": 3.4962291717529297, "epoch": 0.012452758367546697, "grad_norm": 0.0897669866681099, "grad_norm_var": 0.0008924945656954147, "learning_rate": 0.008875968992248063, "loss": 3.4962, "step": 229 }, { "crossentropy": 3.3944568634033203, "epoch": 0.012507137225046901, "grad_norm": 0.09523353725671768, "grad_norm_var": 0.0008773004764088155, "learning_rate": 0.008914728682170543, "loss": 3.3945, "step": 230 }, { "crossentropy": 3.540575385093689, "epoch": 0.012561516082547106, "grad_norm": 0.09146452695131302, "grad_norm_var": 0.0008611405258804134, "learning_rate": 0.008953488372093024, "loss": 3.5406, "step": 231 }, { "crossentropy": 3.587121605873108, "epoch": 0.01261589494004731, "grad_norm": 0.09349583834409714, "grad_norm_var": 0.0008664366172463393, "learning_rate": 0.008992248062015505, "loss": 3.5871, "step": 232 }, { "crossentropy": 3.654576897621155, "epoch": 0.012670273797547513, "grad_norm": 0.18298651278018951, "grad_norm_var": 0.0012107860955183829, "learning_rate": 0.009031007751937985, "loss": 3.6546, "step": 233 }, { "crossentropy": 3.578448534011841, "epoch": 0.012724652655047717, "grad_norm": 0.08298065513372421, "grad_norm_var": 0.0012404377812091562, "learning_rate": 0.009069767441860464, "loss": 3.5784, "step": 234 }, { "crossentropy": 3.567777991294861, "epoch": 0.012779031512547922, "grad_norm": 0.08627218008041382, "grad_norm_var": 0.0012683513726813845, "learning_rate": 0.009108527131782946, "loss": 3.5678, "step": 235 }, { "crossentropy": 3.6384066343307495, "epoch": 0.012833410370048126, "grad_norm": 0.07926711440086365, "grad_norm_var": 0.0012646013570720483, "learning_rate": 0.009147286821705427, "loss": 3.6384, "step": 236 }, { "crossentropy": 3.708517909049988, "epoch": 0.01288778922754833, "grad_norm": 0.08653037995100021, "grad_norm_var": 0.001223867999578785, "learning_rate": 0.009186046511627907, "loss": 3.7085, "step": 237 }, { "crossentropy": 3.4970686435699463, "epoch": 0.012942168085048533, "grad_norm": 0.08177584409713745, "grad_norm_var": 0.001171009612652879, "learning_rate": 0.009224806201550388, "loss": 3.4971, "step": 238 }, { "crossentropy": 3.570704936981201, "epoch": 0.012996546942548736, "grad_norm": 0.28061383962631226, "grad_norm_var": 0.003073960283064835, "learning_rate": 0.009263565891472867, "loss": 3.5707, "step": 239 }, { "crossentropy": 3.5393412113189697, "epoch": 0.013050925800048942, "grad_norm": 0.07681635022163391, "grad_norm_var": 0.0031377088424652663, "learning_rate": 0.009302325581395349, "loss": 3.5393, "step": 240 }, { "crossentropy": 3.5425487756729126, "epoch": 0.013105304657549145, "grad_norm": 0.09008538722991943, "grad_norm_var": 0.0030595690326224918, "learning_rate": 0.00934108527131783, "loss": 3.5425, "step": 241 }, { "crossentropy": 3.529078722000122, "epoch": 0.013159683515049349, "grad_norm": 0.09887199848890305, "grad_norm_var": 0.003045479758198009, "learning_rate": 0.00937984496124031, "loss": 3.5291, "step": 242 }, { "crossentropy": 3.574668288230896, "epoch": 0.013214062372549552, "grad_norm": 0.0841749981045723, "grad_norm_var": 0.0030551350936095945, "learning_rate": 0.009418604651162791, "loss": 3.5747, "step": 243 }, { "crossentropy": 3.512553572654724, "epoch": 0.013268441230049756, "grad_norm": 0.07676009833812714, "grad_norm_var": 0.0028152209683664016, "learning_rate": 0.009457364341085272, "loss": 3.5126, "step": 244 }, { "crossentropy": 3.487416386604309, "epoch": 0.013322820087549961, "grad_norm": 0.0820871964097023, "grad_norm_var": 0.002834319511595936, "learning_rate": 0.009496124031007752, "loss": 3.4874, "step": 245 }, { "crossentropy": 3.5970929861068726, "epoch": 0.013377198945050165, "grad_norm": 0.07988698780536652, "grad_norm_var": 0.0028676699911466593, "learning_rate": 0.009534883720930233, "loss": 3.5971, "step": 246 }, { "crossentropy": 3.598742365837097, "epoch": 0.013431577802550368, "grad_norm": 0.07307852804660797, "grad_norm_var": 0.002918006637867557, "learning_rate": 0.009573643410852714, "loss": 3.5987, "step": 247 }, { "crossentropy": 3.5432448387145996, "epoch": 0.013485956660050572, "grad_norm": 0.07640985399484634, "grad_norm_var": 0.002956150439856642, "learning_rate": 0.009612403100775194, "loss": 3.5432, "step": 248 }, { "crossentropy": 3.466787815093994, "epoch": 0.013540335517550777, "grad_norm": 0.06709449738264084, "grad_norm_var": 0.0025312167544649577, "learning_rate": 0.009651162790697675, "loss": 3.4668, "step": 249 }, { "crossentropy": 3.4825587272644043, "epoch": 0.01359471437505098, "grad_norm": 0.08869365602731705, "grad_norm_var": 0.002524924459067035, "learning_rate": 0.009689922480620155, "loss": 3.4826, "step": 250 }, { "crossentropy": 3.557650566101074, "epoch": 0.013649093232551184, "grad_norm": 0.08912874013185501, "grad_norm_var": 0.0025223859334497036, "learning_rate": 0.009728682170542636, "loss": 3.5577, "step": 251 }, { "crossentropy": 3.501117706298828, "epoch": 0.013703472090051388, "grad_norm": 0.08176298439502716, "grad_norm_var": 0.0025177210980438006, "learning_rate": 0.009767441860465116, "loss": 3.5011, "step": 252 }, { "crossentropy": 3.516678214073181, "epoch": 0.013757850947551591, "grad_norm": 0.0751088410615921, "grad_norm_var": 0.002538179625262593, "learning_rate": 0.009806201550387597, "loss": 3.5167, "step": 253 }, { "crossentropy": 3.5166773796081543, "epoch": 0.013812229805051797, "grad_norm": 0.0745319053530693, "grad_norm_var": 0.0025531664795212477, "learning_rate": 0.009844961240310077, "loss": 3.5167, "step": 254 }, { "crossentropy": 3.6435049772262573, "epoch": 0.013866608662552, "grad_norm": 0.07099827378988266, "grad_norm_var": 6.817599442155778e-05, "learning_rate": 0.009883720930232558, "loss": 3.6435, "step": 255 }, { "crossentropy": 3.435419797897339, "epoch": 0.013920987520052204, "grad_norm": 0.08580715209245682, "grad_norm_var": 6.900032514154387e-05, "learning_rate": 0.00992248062015504, "loss": 3.4354, "step": 256 }, { "crossentropy": 3.494046211242676, "epoch": 0.013975366377552407, "grad_norm": 0.07390860468149185, "grad_norm_var": 6.555477454100802e-05, "learning_rate": 0.009961240310077519, "loss": 3.494, "step": 257 }, { "crossentropy": 3.4698500633239746, "epoch": 0.01402974523505261, "grad_norm": 0.08222740888595581, "grad_norm_var": 4.075249989901701e-05, "learning_rate": 0.01, "loss": 3.4699, "step": 258 }, { "crossentropy": 3.4859423637390137, "epoch": 0.014084124092552816, "grad_norm": 0.07362310588359833, "grad_norm_var": 4.022481282566377e-05, "learning_rate": 0.009999999844952331, "loss": 3.4859, "step": 259 }, { "crossentropy": 3.5849229097366333, "epoch": 0.01413850295005302, "grad_norm": 0.0862388089299202, "grad_norm_var": 4.4027676931301016e-05, "learning_rate": 0.009999999379809332, "loss": 3.5849, "step": 260 }, { "crossentropy": 3.3799198865890503, "epoch": 0.014192881807553223, "grad_norm": 0.0814206525683403, "grad_norm_var": 4.376211742826985e-05, "learning_rate": 0.009999998604571035, "loss": 3.3799, "step": 261 }, { "crossentropy": 3.553397297859192, "epoch": 0.014247260665053427, "grad_norm": 0.07757677882909775, "grad_norm_var": 4.3743919978892624e-05, "learning_rate": 0.009999997519237485, "loss": 3.5534, "step": 262 }, { "crossentropy": 3.6091744899749756, "epoch": 0.014301639522553632, "grad_norm": 0.0835760235786438, "grad_norm_var": 4.290218470157994e-05, "learning_rate": 0.00999999612380875, "loss": 3.6092, "step": 263 }, { "crossentropy": 3.407516360282898, "epoch": 0.014356018380053836, "grad_norm": 0.08491606265306473, "grad_norm_var": 4.419562044994362e-05, "learning_rate": 0.009999994418284917, "loss": 3.4075, "step": 264 }, { "crossentropy": 3.527740240097046, "epoch": 0.014410397237554039, "grad_norm": 0.07588035613298416, "grad_norm_var": 3.4149899024465734e-05, "learning_rate": 0.009999992402666092, "loss": 3.5277, "step": 265 }, { "crossentropy": 3.408521890640259, "epoch": 0.014464776095054243, "grad_norm": 0.08575274050235748, "grad_norm_var": 3.14138115260848e-05, "learning_rate": 0.0099999900769524, "loss": 3.4085, "step": 266 }, { "crossentropy": 3.389896512031555, "epoch": 0.014519154952554446, "grad_norm": 0.08300282806158066, "grad_norm_var": 2.642849006319059e-05, "learning_rate": 0.009999987441143983, "loss": 3.3899, "step": 267 }, { "crossentropy": 3.3431397676467896, "epoch": 0.014573533810054651, "grad_norm": 0.07683193683624268, "grad_norm_var": 2.6638373271776574e-05, "learning_rate": 0.009999984495241008, "loss": 3.3431, "step": 268 }, { "crossentropy": 3.4841312170028687, "epoch": 0.014627912667554855, "grad_norm": 0.07401034981012344, "grad_norm_var": 2.7351465341816234e-05, "learning_rate": 0.009999981239243656, "loss": 3.4841, "step": 269 }, { "crossentropy": 3.4940598011016846, "epoch": 0.014682291525055059, "grad_norm": 0.08107045292854309, "grad_norm_var": 2.578475327089195e-05, "learning_rate": 0.009999977673152128, "loss": 3.4941, "step": 270 }, { "crossentropy": 3.514147400856018, "epoch": 0.014736670382555262, "grad_norm": 0.07026305794715881, "grad_norm_var": 2.668161412430365e-05, "learning_rate": 0.009999973796966648, "loss": 3.5141, "step": 271 }, { "crossentropy": 3.45902943611145, "epoch": 0.014791049240055466, "grad_norm": 0.068415567278862, "grad_norm_var": 3.1555427931273525e-05, "learning_rate": 0.009999969610687455, "loss": 3.459, "step": 272 }, { "crossentropy": 3.406840682029724, "epoch": 0.014845428097555671, "grad_norm": 0.07088036090135574, "grad_norm_var": 3.4050925122131036e-05, "learning_rate": 0.009999965114314805, "loss": 3.4068, "step": 273 }, { "crossentropy": 3.391509175300598, "epoch": 0.014899806955055874, "grad_norm": 0.1691248118877411, "grad_norm_var": 0.0005494123197215206, "learning_rate": 0.009999960307848983, "loss": 3.3915, "step": 274 }, { "crossentropy": 3.4723175764083862, "epoch": 0.014954185812556078, "grad_norm": 0.07668375968933105, "grad_norm_var": 0.0005457992358329324, "learning_rate": 0.009999955191290282, "loss": 3.4723, "step": 275 }, { "crossentropy": 3.465295195579529, "epoch": 0.015008564670056282, "grad_norm": 0.07340843230485916, "grad_norm_var": 0.0005524337626783051, "learning_rate": 0.009999949764639023, "loss": 3.4653, "step": 276 }, { "crossentropy": 3.4192365407943726, "epoch": 0.015062943527556487, "grad_norm": 0.08026103675365448, "grad_norm_var": 0.0005528085200641591, "learning_rate": 0.00999994402789554, "loss": 3.4192, "step": 277 }, { "crossentropy": 3.5382388830184937, "epoch": 0.01511732238505669, "grad_norm": 0.09014740586280823, "grad_norm_var": 0.0005532122056302555, "learning_rate": 0.00999993798106019, "loss": 3.5382, "step": 278 }, { "crossentropy": 3.5052764415740967, "epoch": 0.015171701242556894, "grad_norm": 0.07389292865991592, "grad_norm_var": 0.0005596379088515399, "learning_rate": 0.00999993162413335, "loss": 3.5053, "step": 279 }, { "crossentropy": 3.50546658039093, "epoch": 0.015226080100057097, "grad_norm": 0.07086962461471558, "grad_norm_var": 0.0005691465714606094, "learning_rate": 0.00999992495711541, "loss": 3.5055, "step": 280 }, { "crossentropy": 3.4771530628204346, "epoch": 0.015280458957557301, "grad_norm": 0.07612856477499008, "grad_norm_var": 0.000568930323017125, "learning_rate": 0.009999917980006785, "loss": 3.4772, "step": 281 }, { "crossentropy": 3.45108163356781, "epoch": 0.015334837815057506, "grad_norm": 0.07557900995016098, "grad_norm_var": 0.0005710501037039996, "learning_rate": 0.009999910692807909, "loss": 3.4511, "step": 282 }, { "crossentropy": 3.4078657627105713, "epoch": 0.01538921667255771, "grad_norm": 0.06914747506380081, "grad_norm_var": 0.0005810305794031557, "learning_rate": 0.009999903095519234, "loss": 3.4079, "step": 283 }, { "crossentropy": 3.43261456489563, "epoch": 0.015443595530057913, "grad_norm": 0.07554497569799423, "grad_norm_var": 0.0005818569799888993, "learning_rate": 0.00999989518814123, "loss": 3.4326, "step": 284 }, { "crossentropy": 3.512303590774536, "epoch": 0.015497974387558117, "grad_norm": 0.06696019321680069, "grad_norm_var": 0.0005915003246876872, "learning_rate": 0.009999886970674388, "loss": 3.5123, "step": 285 }, { "crossentropy": 3.4201513528823853, "epoch": 0.01555235324505832, "grad_norm": 0.06818956881761551, "grad_norm_var": 0.0006009309611042954, "learning_rate": 0.009999878443119217, "loss": 3.4202, "step": 286 }, { "crossentropy": 3.393484592437744, "epoch": 0.015606732102558526, "grad_norm": 0.0680573508143425, "grad_norm_var": 0.0006040158382816972, "learning_rate": 0.009999869605476245, "loss": 3.3935, "step": 287 }, { "crossentropy": 3.5317713022232056, "epoch": 0.01566111096005873, "grad_norm": 0.06516802310943604, "grad_norm_var": 0.0006095095618314524, "learning_rate": 0.009999860457746024, "loss": 3.5318, "step": 288 }, { "crossentropy": 3.444399833679199, "epoch": 0.015715489817558933, "grad_norm": 0.07314152270555496, "grad_norm_var": 0.0006072672612415064, "learning_rate": 0.009999850999929118, "loss": 3.4444, "step": 289 }, { "crossentropy": 3.5313533544540405, "epoch": 0.015769868675059136, "grad_norm": 0.06758904457092285, "grad_norm_var": 3.8519693910177286e-05, "learning_rate": 0.009999841232026114, "loss": 3.5314, "step": 290 }, { "crossentropy": 3.50236976146698, "epoch": 0.01582424753255934, "grad_norm": 0.0711338073015213, "grad_norm_var": 3.784691963929036e-05, "learning_rate": 0.00999983115403762, "loss": 3.5024, "step": 291 }, { "crossentropy": 3.484432816505432, "epoch": 0.015878626390059544, "grad_norm": 0.06274712830781937, "grad_norm_var": 4.412321396065529e-05, "learning_rate": 0.009999820765964255, "loss": 3.4844, "step": 292 }, { "crossentropy": 3.3365036249160767, "epoch": 0.015933005247559747, "grad_norm": 0.07929203659296036, "grad_norm_var": 4.313522602676529e-05, "learning_rate": 0.009999810067806671, "loss": 3.3365, "step": 293 }, { "crossentropy": 3.441346287727356, "epoch": 0.015987384105059954, "grad_norm": 0.07353205978870392, "grad_norm_var": 2.0406161154314327e-05, "learning_rate": 0.009999799059565526, "loss": 3.4413, "step": 294 }, { "crossentropy": 3.3346978425979614, "epoch": 0.016041762962560158, "grad_norm": 0.07999410480260849, "grad_norm_var": 2.5036565691906552e-05, "learning_rate": 0.009999787741241507, "loss": 3.3347, "step": 295 }, { "crossentropy": 3.456690788269043, "epoch": 0.01609614182006036, "grad_norm": 0.0703643187880516, "grad_norm_var": 2.5091097833499867e-05, "learning_rate": 0.00999977611283531, "loss": 3.4567, "step": 296 }, { "crossentropy": 3.4737279415130615, "epoch": 0.016150520677560565, "grad_norm": 0.07280790060758591, "grad_norm_var": 2.3691358444447955e-05, "learning_rate": 0.009999764174347663, "loss": 3.4737, "step": 297 }, { "crossentropy": 3.5287946462631226, "epoch": 0.01620489953506077, "grad_norm": 0.07490158826112747, "grad_norm_var": 2.332478876620192e-05, "learning_rate": 0.009999751925779302, "loss": 3.5288, "step": 298 }, { "crossentropy": 3.414504885673523, "epoch": 0.016259278392560972, "grad_norm": 0.07066501677036285, "grad_norm_var": 2.3061369645025616e-05, "learning_rate": 0.009999739367130988, "loss": 3.4145, "step": 299 }, { "crossentropy": 3.4083255529403687, "epoch": 0.016313657250061175, "grad_norm": 0.06843928247690201, "grad_norm_var": 2.2153127525420304e-05, "learning_rate": 0.009999726498403498, "loss": 3.4083, "step": 300 }, { "crossentropy": 3.5281803607940674, "epoch": 0.01636803610756138, "grad_norm": 0.06657624244689941, "grad_norm_var": 2.2359499420413855e-05, "learning_rate": 0.009999713319597633, "loss": 3.5282, "step": 301 }, { "crossentropy": 3.384378433227539, "epoch": 0.016422414965061582, "grad_norm": 0.06961915642023087, "grad_norm_var": 2.1992047895854257e-05, "learning_rate": 0.00999969983071421, "loss": 3.3844, "step": 302 }, { "crossentropy": 3.5387587547302246, "epoch": 0.01647679382256179, "grad_norm": 0.16005438566207886, "grad_norm_var": 0.0005163739863045504, "learning_rate": 0.009999686031754065, "loss": 3.5388, "step": 303 }, { "crossentropy": 3.250088334083557, "epoch": 0.016531172680061993, "grad_norm": 0.07206714153289795, "grad_norm_var": 0.0005088083060656166, "learning_rate": 0.009999671922718053, "loss": 3.2501, "step": 304 }, { "crossentropy": 3.387649416923523, "epoch": 0.016585551537562197, "grad_norm": 0.07948385924100876, "grad_norm_var": 0.0005080106066299843, "learning_rate": 0.009999657503607049, "loss": 3.3876, "step": 305 }, { "crossentropy": 3.3840749263763428, "epoch": 0.0166399303950624, "grad_norm": 0.07016249746084213, "grad_norm_var": 0.0005050395238775402, "learning_rate": 0.009999642774421948, "loss": 3.3841, "step": 306 }, { "crossentropy": 3.395912289619446, "epoch": 0.016694309252562604, "grad_norm": 0.06661797314882278, "grad_norm_var": 0.000510216489915303, "learning_rate": 0.009999627735163663, "loss": 3.3959, "step": 307 }, { "crossentropy": 3.403930425643921, "epoch": 0.016748688110062807, "grad_norm": 0.06205541640520096, "grad_norm_var": 0.0005115916044082622, "learning_rate": 0.009999612385833127, "loss": 3.4039, "step": 308 }, { "crossentropy": 3.363746404647827, "epoch": 0.01680306696756301, "grad_norm": 0.26956138014793396, "grad_norm_var": 0.002825044336515281, "learning_rate": 0.009999596726431293, "loss": 3.3637, "step": 309 }, { "crossentropy": 3.476794958114624, "epoch": 0.016857445825063214, "grad_norm": 0.08131230622529984, "grad_norm_var": 0.0028125935096534656, "learning_rate": 0.00999958075695913, "loss": 3.4768, "step": 310 }, { "crossentropy": 3.4142271280288696, "epoch": 0.016911824682563418, "grad_norm": 0.07251974940299988, "grad_norm_var": 0.0028257256131941594, "learning_rate": 0.00999956447741763, "loss": 3.4142, "step": 311 }, { "crossentropy": 3.453118681907654, "epoch": 0.016966203540063625, "grad_norm": 0.0737333670258522, "grad_norm_var": 0.0028179736787855856, "learning_rate": 0.009999547887807802, "loss": 3.4531, "step": 312 }, { "crossentropy": 3.2724900245666504, "epoch": 0.01702058239756383, "grad_norm": 0.09837423264980316, "grad_norm_var": 0.0028022283627642712, "learning_rate": 0.009999530988130676, "loss": 3.2725, "step": 313 }, { "crossentropy": 3.4408209323883057, "epoch": 0.017074961255064032, "grad_norm": 0.06418675184249878, "grad_norm_var": 0.002832415580015187, "learning_rate": 0.009999513778387299, "loss": 3.4408, "step": 314 }, { "crossentropy": 3.3572994470596313, "epoch": 0.017129340112564236, "grad_norm": 0.06895001977682114, "grad_norm_var": 0.002837098250223737, "learning_rate": 0.009999496258578737, "loss": 3.3573, "step": 315 }, { "crossentropy": 3.4239859580993652, "epoch": 0.01718371897006444, "grad_norm": 0.06894141435623169, "grad_norm_var": 0.0028356549589504325, "learning_rate": 0.009999478428706078, "loss": 3.424, "step": 316 }, { "crossentropy": 3.421961545944214, "epoch": 0.017238097827564643, "grad_norm": 0.0728181004524231, "grad_norm_var": 0.0028183763475124893, "learning_rate": 0.00999946028877043, "loss": 3.422, "step": 317 }, { "crossentropy": 3.2918691635131836, "epoch": 0.017292476685064846, "grad_norm": 0.07161250710487366, "grad_norm_var": 0.0028130341490914383, "learning_rate": 0.009999441838772915, "loss": 3.2919, "step": 318 }, { "crossentropy": 3.358832836151123, "epoch": 0.01734685554256505, "grad_norm": 0.06694286316633224, "grad_norm_var": 0.00249483898377137, "learning_rate": 0.009999423078714677, "loss": 3.3588, "step": 319 }, { "crossentropy": 3.3152624368667603, "epoch": 0.017401234400065253, "grad_norm": 0.08311066031455994, "grad_norm_var": 0.0024834789830908096, "learning_rate": 0.009999404008596882, "loss": 3.3153, "step": 320 }, { "crossentropy": 3.378923773765564, "epoch": 0.017455613257565457, "grad_norm": 0.07224222272634506, "grad_norm_var": 0.002492709271039653, "learning_rate": 0.009999384628420711, "loss": 3.3789, "step": 321 }, { "crossentropy": 3.3498390913009644, "epoch": 0.017509992115065664, "grad_norm": 0.06689100712537766, "grad_norm_var": 0.002499935929951405, "learning_rate": 0.009999364938187365, "loss": 3.3498, "step": 322 }, { "crossentropy": 3.373263955116272, "epoch": 0.017564370972565867, "grad_norm": 0.06665370613336563, "grad_norm_var": 0.0024998484691825555, "learning_rate": 0.009999344937898069, "loss": 3.3733, "step": 323 }, { "crossentropy": 3.3216311931610107, "epoch": 0.01761874983006607, "grad_norm": 0.06982091069221497, "grad_norm_var": 0.002479866698221237, "learning_rate": 0.009999324627554058, "loss": 3.3216, "step": 324 }, { "crossentropy": 3.3044458627700806, "epoch": 0.017673128687566275, "grad_norm": 0.0669289082288742, "grad_norm_var": 7.264807515810096e-05, "learning_rate": 0.009999304007156597, "loss": 3.3044, "step": 325 }, { "crossentropy": 3.361254096031189, "epoch": 0.017727507545066478, "grad_norm": 0.0675644651055336, "grad_norm_var": 6.888467651710746e-05, "learning_rate": 0.009999283076706961, "loss": 3.3613, "step": 326 }, { "crossentropy": 3.329406499862671, "epoch": 0.01778188640256668, "grad_norm": 0.6242774724960327, "grad_norm_var": 0.019137668497096454, "learning_rate": 0.009999261836206448, "loss": 3.3294, "step": 327 }, { "crossentropy": 3.3870599269866943, "epoch": 0.017836265260066885, "grad_norm": 0.07314576208591461, "grad_norm_var": 0.01914025259643993, "learning_rate": 0.009999240285656378, "loss": 3.3871, "step": 328 }, { "crossentropy": 3.415145516395569, "epoch": 0.01789064411756709, "grad_norm": 0.06713801622390747, "grad_norm_var": 0.01923467574945389, "learning_rate": 0.009999218425058088, "loss": 3.4151, "step": 329 }, { "crossentropy": 3.321105718612671, "epoch": 0.017945022975067292, "grad_norm": 0.0654146820306778, "grad_norm_var": 0.019228177673038797, "learning_rate": 0.00999919625441293, "loss": 3.3211, "step": 330 }, { "crossentropy": 3.303057909011841, "epoch": 0.0179994018325675, "grad_norm": 0.07082128524780273, "grad_norm_var": 0.019219519672164648, "learning_rate": 0.009999173773722283, "loss": 3.3031, "step": 331 }, { "crossentropy": 3.3781274557113647, "epoch": 0.018053780690067703, "grad_norm": 0.07477091252803802, "grad_norm_var": 0.019193892220910204, "learning_rate": 0.009999150982987537, "loss": 3.3781, "step": 332 }, { "crossentropy": 3.416086792945862, "epoch": 0.018108159547567906, "grad_norm": 0.06654556095600128, "grad_norm_var": 0.0192232742553561, "learning_rate": 0.009999127882210109, "loss": 3.4161, "step": 333 }, { "crossentropy": 3.3981311321258545, "epoch": 0.01816253840506811, "grad_norm": 0.06890884041786194, "grad_norm_var": 0.019235629073236855, "learning_rate": 0.009999104471391431, "loss": 3.3981, "step": 334 }, { "crossentropy": 3.31412935256958, "epoch": 0.018216917262568313, "grad_norm": 0.06607803702354431, "grad_norm_var": 0.019240000608393253, "learning_rate": 0.009999080750532952, "loss": 3.3141, "step": 335 }, { "crossentropy": 3.304350256919861, "epoch": 0.018271296120068517, "grad_norm": 0.11871770769357681, "grad_norm_var": 0.01921819454725964, "learning_rate": 0.009999056719636149, "loss": 3.3044, "step": 336 }, { "crossentropy": 3.440518021583557, "epoch": 0.01832567497756872, "grad_norm": 0.08838114142417908, "grad_norm_var": 0.01916049763816865, "learning_rate": 0.009999032378702505, "loss": 3.4405, "step": 337 }, { "crossentropy": 3.273046851158142, "epoch": 0.018380053835068924, "grad_norm": 0.06485237926244736, "grad_norm_var": 0.019171830574497787, "learning_rate": 0.009999007727733536, "loss": 3.273, "step": 338 }, { "crossentropy": 3.256856322288513, "epoch": 0.018434432692569128, "grad_norm": 0.16671641170978546, "grad_norm_var": 0.019252639383045483, "learning_rate": 0.009998982766730766, "loss": 3.2569, "step": 339 }, { "crossentropy": 3.3471001386642456, "epoch": 0.018488811550069335, "grad_norm": 0.19053782522678375, "grad_norm_var": 0.019456277688729817, "learning_rate": 0.00999895749569575, "loss": 3.3471, "step": 340 }, { "crossentropy": 3.3234068155288696, "epoch": 0.018543190407569538, "grad_norm": 0.06966140866279602, "grad_norm_var": 0.019436935157884055, "learning_rate": 0.009998931914630044, "loss": 3.3234, "step": 341 }, { "crossentropy": 3.381627082824707, "epoch": 0.018597569265069742, "grad_norm": 0.08428682386875153, "grad_norm_var": 0.01933422046944117, "learning_rate": 0.009998906023535245, "loss": 3.3816, "step": 342 }, { "crossentropy": 3.350903868675232, "epoch": 0.018651948122569945, "grad_norm": 0.0648796558380127, "grad_norm_var": 0.001467512593933687, "learning_rate": 0.009998879822412956, "loss": 3.3509, "step": 343 }, { "crossentropy": 3.395211338996887, "epoch": 0.01870632698007015, "grad_norm": 0.07966339588165283, "grad_norm_var": 0.001457646960101955, "learning_rate": 0.009998853311264799, "loss": 3.3952, "step": 344 }, { "crossentropy": 3.239471673965454, "epoch": 0.018760705837570352, "grad_norm": 0.06852038204669952, "grad_norm_var": 0.0014539284181479457, "learning_rate": 0.00999882649009242, "loss": 3.2395, "step": 345 }, { "crossentropy": 3.371743679046631, "epoch": 0.018815084695070556, "grad_norm": 0.07980605214834213, "grad_norm_var": 0.0014234443467539084, "learning_rate": 0.009998799358897484, "loss": 3.3717, "step": 346 }, { "crossentropy": 3.2922704219818115, "epoch": 0.01886946355257076, "grad_norm": 0.07571585476398468, "grad_norm_var": 0.0014131128084711266, "learning_rate": 0.00999877191768167, "loss": 3.2923, "step": 347 }, { "crossentropy": 3.449204683303833, "epoch": 0.018923842410070963, "grad_norm": 0.0703648254275322, "grad_norm_var": 0.0014228338645273426, "learning_rate": 0.009998744166446685, "loss": 3.4492, "step": 348 }, { "crossentropy": 3.4094408750534058, "epoch": 0.01897822126757117, "grad_norm": 0.07604625821113586, "grad_norm_var": 0.001400059735406467, "learning_rate": 0.009998716105194245, "loss": 3.4094, "step": 349 }, { "crossentropy": 3.284795045852661, "epoch": 0.019032600125071374, "grad_norm": 0.07315584272146225, "grad_norm_var": 0.0013894867157350848, "learning_rate": 0.009998687733926093, "loss": 3.2848, "step": 350 }, { "crossentropy": 3.448261260986328, "epoch": 0.019086978982571577, "grad_norm": 0.5441951751708984, "grad_norm_var": 0.014162159459999777, "learning_rate": 0.009998659052643986, "loss": 3.4483, "step": 351 }, { "crossentropy": 3.308355689048767, "epoch": 0.01914135784007178, "grad_norm": 0.10573507100343704, "grad_norm_var": 0.014174426709282132, "learning_rate": 0.009998630061349708, "loss": 3.3084, "step": 352 }, { "crossentropy": 3.2317174673080444, "epoch": 0.019195736697571984, "grad_norm": 0.09520689398050308, "grad_norm_var": 0.014149556676190422, "learning_rate": 0.009998600760045051, "loss": 3.2317, "step": 353 }, { "crossentropy": 3.3533629179000854, "epoch": 0.019250115555072188, "grad_norm": 0.08033286780118942, "grad_norm_var": 0.014052080874459234, "learning_rate": 0.009998571148731837, "loss": 3.3534, "step": 354 }, { "crossentropy": 3.3487950563430786, "epoch": 0.01930449441257239, "grad_norm": 0.1695932000875473, "grad_norm_var": 0.014070401551547573, "learning_rate": 0.0099985412274119, "loss": 3.3488, "step": 355 }, { "crossentropy": 3.356499671936035, "epoch": 0.019358873270072595, "grad_norm": 0.07152798771858215, "grad_norm_var": 0.013843955692840573, "learning_rate": 0.009998510996087095, "loss": 3.3565, "step": 356 }, { "crossentropy": 3.5154892206192017, "epoch": 0.0194132521275728, "grad_norm": 0.42922234535217285, "grad_norm_var": 0.019844422071085135, "learning_rate": 0.0099984804547593, "loss": 3.5155, "step": 357 }, { "crossentropy": 3.3450675010681152, "epoch": 0.019467630985073002, "grad_norm": 0.12214180827140808, "grad_norm_var": 0.019675415002011375, "learning_rate": 0.009998449603430408, "loss": 3.3451, "step": 358 }, { "crossentropy": 3.3442468643188477, "epoch": 0.01952200984257321, "grad_norm": 0.08665797859430313, "grad_norm_var": 0.019493076774698796, "learning_rate": 0.009998418442102329, "loss": 3.3442, "step": 359 }, { "crossentropy": 3.401782989501953, "epoch": 0.019576388700073413, "grad_norm": 0.09089110791683197, "grad_norm_var": 0.01941176346060295, "learning_rate": 0.009998386970777, "loss": 3.4018, "step": 360 }, { "crossentropy": 3.3462846279144287, "epoch": 0.019630767557573616, "grad_norm": 0.08168740570545197, "grad_norm_var": 0.01929720652963861, "learning_rate": 0.00999835518945637, "loss": 3.3463, "step": 361 }, { "crossentropy": 3.2699739933013916, "epoch": 0.01968514641507382, "grad_norm": 0.06988145411014557, "grad_norm_var": 0.01938403173711008, "learning_rate": 0.009998323098142413, "loss": 3.27, "step": 362 }, { "crossentropy": 3.347229242324829, "epoch": 0.019739525272574023, "grad_norm": 0.06358539313077927, "grad_norm_var": 0.019497439510143193, "learning_rate": 0.009998290696837116, "loss": 3.3472, "step": 363 }, { "crossentropy": 3.409914255142212, "epoch": 0.019793904130074227, "grad_norm": 0.07303816825151443, "grad_norm_var": 0.01947328277587255, "learning_rate": 0.00999825798554249, "loss": 3.4099, "step": 364 }, { "crossentropy": 3.2691829204559326, "epoch": 0.01984828298757443, "grad_norm": 0.13955241441726685, "grad_norm_var": 0.01918757775833221, "learning_rate": 0.009998224964260562, "loss": 3.2692, "step": 365 }, { "crossentropy": 3.3396424055099487, "epoch": 0.019902661845074634, "grad_norm": 0.06675408035516739, "grad_norm_var": 0.019250204324295413, "learning_rate": 0.009998191632993383, "loss": 3.3396, "step": 366 }, { "crossentropy": 3.337554454803467, "epoch": 0.019957040702574837, "grad_norm": 0.0737956091761589, "grad_norm_var": 0.007924853766261862, "learning_rate": 0.00999815799174302, "loss": 3.3376, "step": 367 }, { "crossentropy": 3.363837480545044, "epoch": 0.020011419560075044, "grad_norm": 0.0653655007481575, "grad_norm_var": 0.00806971809420267, "learning_rate": 0.009998124040511557, "loss": 3.3638, "step": 368 }, { "crossentropy": 3.3690396547317505, "epoch": 0.020065798417575248, "grad_norm": 0.0812348946928978, "grad_norm_var": 0.008111717214584764, "learning_rate": 0.0099980897793011, "loss": 3.369, "step": 369 }, { "crossentropy": 3.3581022024154663, "epoch": 0.02012017727507545, "grad_norm": 0.06538533419370651, "grad_norm_var": 0.008185463715849824, "learning_rate": 0.009998055208113775, "loss": 3.3581, "step": 370 }, { "crossentropy": 3.314501643180847, "epoch": 0.020174556132575655, "grad_norm": 0.06498236954212189, "grad_norm_var": 0.008029772036055668, "learning_rate": 0.009998020326951726, "loss": 3.3145, "step": 371 }, { "crossentropy": 3.2638334035873413, "epoch": 0.02022893499007586, "grad_norm": 0.06988417357206345, "grad_norm_var": 0.008036807350396996, "learning_rate": 0.009997985135817115, "loss": 3.2638, "step": 372 }, { "crossentropy": 3.289473056793213, "epoch": 0.020283313847576062, "grad_norm": 0.06489303708076477, "grad_norm_var": 0.0004738541010219271, "learning_rate": 0.009997949634712127, "loss": 3.2895, "step": 373 }, { "crossentropy": 3.396390438079834, "epoch": 0.020337692705076266, "grad_norm": 0.07396254688501358, "grad_norm_var": 0.0003481087412773087, "learning_rate": 0.009997913823638961, "loss": 3.3964, "step": 374 }, { "crossentropy": 3.2013171911239624, "epoch": 0.02039207156257647, "grad_norm": 0.06636767089366913, "grad_norm_var": 0.0003476354899068647, "learning_rate": 0.00999787770259984, "loss": 3.2013, "step": 375 }, { "crossentropy": 3.4503889083862305, "epoch": 0.020446450420076673, "grad_norm": 0.06740888953208923, "grad_norm_var": 0.00033454808092784314, "learning_rate": 0.009997841271597004, "loss": 3.4504, "step": 376 }, { "crossentropy": 3.3211491107940674, "epoch": 0.02050082927757688, "grad_norm": 0.06109267473220825, "grad_norm_var": 0.000340596227099012, "learning_rate": 0.00999780453063271, "loss": 3.3211, "step": 377 }, { "crossentropy": 3.2785515785217285, "epoch": 0.020555208135077083, "grad_norm": 0.06522738188505173, "grad_norm_var": 0.0003438535535785538, "learning_rate": 0.00999776747970924, "loss": 3.2786, "step": 378 }, { "crossentropy": 3.242287039756775, "epoch": 0.020609586992577287, "grad_norm": 0.07044260203838348, "grad_norm_var": 0.00033849722871705674, "learning_rate": 0.009997730118828891, "loss": 3.2423, "step": 379 }, { "crossentropy": 3.362986207008362, "epoch": 0.02066396585007749, "grad_norm": 0.06508927047252655, "grad_norm_var": 0.00034249773603528083, "learning_rate": 0.009997692447993978, "loss": 3.363, "step": 380 }, { "crossentropy": 3.2283509969711304, "epoch": 0.020718344707577694, "grad_norm": 0.07017350941896439, "grad_norm_var": 2.3899170247742898e-05, "learning_rate": 0.009997654467206837, "loss": 3.2284, "step": 381 }, { "crossentropy": 3.1602760553359985, "epoch": 0.020772723565077898, "grad_norm": 0.0749887228012085, "grad_norm_var": 2.6490719194137887e-05, "learning_rate": 0.009997616176469827, "loss": 3.1603, "step": 382 }, { "crossentropy": 3.223981022834778, "epoch": 0.0208271024225781, "grad_norm": 0.07971374690532684, "grad_norm_var": 3.2646647308753655e-05, "learning_rate": 0.009997577575785324, "loss": 3.224, "step": 383 }, { "crossentropy": 3.221740484237671, "epoch": 0.020881481280078305, "grad_norm": 0.0779888853430748, "grad_norm_var": 3.625599602191457e-05, "learning_rate": 0.009997538665155715, "loss": 3.2217, "step": 384 }, { "crossentropy": 3.326294183731079, "epoch": 0.020935860137578508, "grad_norm": 0.0701688900589943, "grad_norm_var": 2.7225438128558367e-05, "learning_rate": 0.009997499444583419, "loss": 3.3263, "step": 385 }, { "crossentropy": 3.2947232723236084, "epoch": 0.020990238995078712, "grad_norm": 0.07052719593048096, "grad_norm_var": 2.6238183739945207e-05, "learning_rate": 0.009997459914070866, "loss": 3.2947, "step": 386 }, { "crossentropy": 3.216952085494995, "epoch": 0.02104461785257892, "grad_norm": 0.10973214358091354, "grad_norm_var": 0.0001241020802203061, "learning_rate": 0.009997420073620507, "loss": 3.217, "step": 387 }, { "crossentropy": 3.3354313373565674, "epoch": 0.021098996710079122, "grad_norm": 0.06173468381166458, "grad_norm_var": 0.00013093649613434266, "learning_rate": 0.009997379923234816, "loss": 3.3354, "step": 388 }, { "crossentropy": 3.281916618347168, "epoch": 0.021153375567579326, "grad_norm": 0.06188591569662094, "grad_norm_var": 0.00013428885180453523, "learning_rate": 0.009997339462916281, "loss": 3.2819, "step": 389 }, { "crossentropy": 3.2923394441604614, "epoch": 0.02120775442507953, "grad_norm": 0.06403574347496033, "grad_norm_var": 0.00013739552810189287, "learning_rate": 0.009997298692667411, "loss": 3.2923, "step": 390 }, { "crossentropy": 3.2112622261047363, "epoch": 0.021262133282579733, "grad_norm": 0.06150186434388161, "grad_norm_var": 0.0001419040523199831, "learning_rate": 0.009997257612490736, "loss": 3.2113, "step": 391 }, { "crossentropy": 3.3886613845825195, "epoch": 0.021316512140079936, "grad_norm": 0.06579659134149551, "grad_norm_var": 0.00014278090235620474, "learning_rate": 0.009997216222388802, "loss": 3.3887, "step": 392 }, { "crossentropy": 3.2773184776306152, "epoch": 0.02137089099758014, "grad_norm": 0.06153217703104019, "grad_norm_var": 0.00014223401221628198, "learning_rate": 0.009997174522364176, "loss": 3.2773, "step": 393 }, { "crossentropy": 3.2679330110549927, "epoch": 0.021425269855080344, "grad_norm": 0.0606406070291996, "grad_norm_var": 0.00014687055454990657, "learning_rate": 0.009997132512419446, "loss": 3.2679, "step": 394 }, { "crossentropy": 3.162135124206543, "epoch": 0.021479648712580547, "grad_norm": 0.06170500069856644, "grad_norm_var": 0.0001515599470393869, "learning_rate": 0.009997090192557217, "loss": 3.1621, "step": 395 }, { "crossentropy": 3.332362174987793, "epoch": 0.021534027570080754, "grad_norm": 0.06834445148706436, "grad_norm_var": 0.00015016638335579104, "learning_rate": 0.009997047562780111, "loss": 3.3324, "step": 396 }, { "crossentropy": 3.253801107406616, "epoch": 0.021588406427580958, "grad_norm": 0.06753119826316833, "grad_norm_var": 0.00015055196947941495, "learning_rate": 0.009997004623090775, "loss": 3.2538, "step": 397 }, { "crossentropy": 3.23642098903656, "epoch": 0.02164278528508116, "grad_norm": 0.06551223248243332, "grad_norm_var": 0.0001496897613031102, "learning_rate": 0.009996961373491871, "loss": 3.2364, "step": 398 }, { "crossentropy": 3.2498202323913574, "epoch": 0.021697164142581365, "grad_norm": 0.061929699033498764, "grad_norm_var": 0.00014469714714378378, "learning_rate": 0.009996917813986081, "loss": 3.2498, "step": 399 }, { "crossentropy": 3.2605373859405518, "epoch": 0.02175154300008157, "grad_norm": 0.0625394880771637, "grad_norm_var": 0.0001393691126862911, "learning_rate": 0.009996873944576108, "loss": 3.2605, "step": 400 }, { "crossentropy": 3.2348986864089966, "epoch": 0.021805921857581772, "grad_norm": 0.07600051909685135, "grad_norm_var": 0.0001438070589916203, "learning_rate": 0.009996829765264672, "loss": 3.2349, "step": 401 }, { "crossentropy": 3.234248399734497, "epoch": 0.021860300715081975, "grad_norm": 0.05892547592520714, "grad_norm_var": 0.00014762859561563772, "learning_rate": 0.00999678527605451, "loss": 3.2342, "step": 402 }, { "crossentropy": 3.258663773536682, "epoch": 0.02191467957258218, "grad_norm": 0.06160751357674599, "grad_norm_var": 1.7117895224714497e-05, "learning_rate": 0.009996740476948386, "loss": 3.2587, "step": 403 }, { "crossentropy": 3.149019956588745, "epoch": 0.021969058430082383, "grad_norm": 0.0673244297504425, "grad_norm_var": 1.751173354543976e-05, "learning_rate": 0.009996695367949074, "loss": 3.149, "step": 404 }, { "crossentropy": 3.3387171030044556, "epoch": 0.02202343728758259, "grad_norm": 0.061481330543756485, "grad_norm_var": 1.7645491570977603e-05, "learning_rate": 0.009996649949059376, "loss": 3.3387, "step": 405 }, { "crossentropy": 3.229107618331909, "epoch": 0.022077816145082793, "grad_norm": 0.057250916957855225, "grad_norm_var": 2.0626440463466836e-05, "learning_rate": 0.009996604220282104, "loss": 3.2291, "step": 406 }, { "crossentropy": 3.3407572507858276, "epoch": 0.022132195002582997, "grad_norm": 0.057880837470293045, "grad_norm_var": 2.2519977036411278e-05, "learning_rate": 0.009996558181620098, "loss": 3.3408, "step": 407 }, { "crossentropy": 3.265431523323059, "epoch": 0.0221865738600832, "grad_norm": 0.06230003386735916, "grad_norm_var": 2.2213480124162974e-05, "learning_rate": 0.009996511833076212, "loss": 3.2654, "step": 408 }, { "crossentropy": 3.215889573097229, "epoch": 0.022240952717583404, "grad_norm": 0.06311461329460144, "grad_norm_var": 2.2000869172678996e-05, "learning_rate": 0.009996465174653319, "loss": 3.2159, "step": 409 }, { "crossentropy": 3.1176416873931885, "epoch": 0.022295331575083607, "grad_norm": 0.06835338473320007, "grad_norm_var": 2.2901155856529187e-05, "learning_rate": 0.009996418206354316, "loss": 3.1176, "step": 410 }, { "crossentropy": 3.280820846557617, "epoch": 0.02234971043258381, "grad_norm": 0.06267677992582321, "grad_norm_var": 2.2680620522638694e-05, "learning_rate": 0.009996370928182112, "loss": 3.2808, "step": 411 }, { "crossentropy": 3.2444405555725098, "epoch": 0.022404089290084014, "grad_norm": 0.06099102273583412, "grad_norm_var": 2.172543441418609e-05, "learning_rate": 0.009996323340139642, "loss": 3.2444, "step": 412 }, { "crossentropy": 3.203736662864685, "epoch": 0.022458468147584218, "grad_norm": 0.06456132233142853, "grad_norm_var": 2.0666039403053477e-05, "learning_rate": 0.009996275442229858, "loss": 3.2037, "step": 413 }, { "crossentropy": 3.2672332525253296, "epoch": 0.02251284700508442, "grad_norm": 0.09201914817094803, "grad_norm_var": 7.24755705005684e-05, "learning_rate": 0.009996227234455727, "loss": 3.2672, "step": 414 }, { "crossentropy": 3.234895944595337, "epoch": 0.02256722586258463, "grad_norm": 0.06308036297559738, "grad_norm_var": 7.209727677131242e-05, "learning_rate": 0.009996178716820242, "loss": 3.2349, "step": 415 }, { "crossentropy": 3.2622172832489014, "epoch": 0.022621604720084832, "grad_norm": 0.06923907995223999, "grad_norm_var": 7.269865273170402e-05, "learning_rate": 0.00999612988932641, "loss": 3.2622, "step": 416 }, { "crossentropy": 3.245298147201538, "epoch": 0.022675983577585036, "grad_norm": 0.06015429645776749, "grad_norm_var": 6.604919945663075e-05, "learning_rate": 0.009996080751977261, "loss": 3.2453, "step": 417 }, { "crossentropy": 3.110771894454956, "epoch": 0.02273036243508524, "grad_norm": 0.062176961451768875, "grad_norm_var": 6.432139276814948e-05, "learning_rate": 0.00999603130477584, "loss": 3.1108, "step": 418 }, { "crossentropy": 3.182268500328064, "epoch": 0.022784741292585443, "grad_norm": 0.06231215223670006, "grad_norm_var": 6.406768158799437e-05, "learning_rate": 0.009995981547725219, "loss": 3.1823, "step": 419 }, { "crossentropy": 3.222161889076233, "epoch": 0.022839120150085646, "grad_norm": 0.05787523835897446, "grad_norm_var": 6.631932407253159e-05, "learning_rate": 0.009995931480828477, "loss": 3.2222, "step": 420 }, { "crossentropy": 3.2498297691345215, "epoch": 0.02289349900758585, "grad_norm": 0.06648773699998856, "grad_norm_var": 6.614334286362637e-05, "learning_rate": 0.009995881104088724, "loss": 3.2498, "step": 421 }, { "crossentropy": 3.2737042903900146, "epoch": 0.022947877865086053, "grad_norm": 0.06306195259094238, "grad_norm_var": 6.271112975779758e-05, "learning_rate": 0.00999583041750908, "loss": 3.2737, "step": 422 }, { "crossentropy": 3.1155083179473877, "epoch": 0.023002256722586257, "grad_norm": 0.05849751457571983, "grad_norm_var": 6.216862633956465e-05, "learning_rate": 0.009995779421092695, "loss": 3.1155, "step": 423 }, { "crossentropy": 3.3037221431732178, "epoch": 0.023056635580086464, "grad_norm": 0.06366145610809326, "grad_norm_var": 6.182951430273216e-05, "learning_rate": 0.009995728114842724, "loss": 3.3037, "step": 424 }, { "crossentropy": 3.1704448461532593, "epoch": 0.023111014437586667, "grad_norm": 0.06575530767440796, "grad_norm_var": 6.163973634853252e-05, "learning_rate": 0.009995676498762355, "loss": 3.1704, "step": 425 }, { "crossentropy": 3.2287373542785645, "epoch": 0.02316539329508687, "grad_norm": 0.16612108051776886, "grad_norm_var": 0.00070202478936299, "learning_rate": 0.009995624572854786, "loss": 3.2287, "step": 426 }, { "crossentropy": 3.297630786895752, "epoch": 0.023219772152587075, "grad_norm": 0.059629831463098526, "grad_norm_var": 0.0007060542529604595, "learning_rate": 0.00999557233712324, "loss": 3.2976, "step": 427 }, { "crossentropy": 3.2012851238250732, "epoch": 0.023274151010087278, "grad_norm": 0.06056234985589981, "grad_norm_var": 0.0007066364734203634, "learning_rate": 0.009995519791570952, "loss": 3.2013, "step": 428 }, { "crossentropy": 3.2521086931228638, "epoch": 0.02332852986758748, "grad_norm": 0.06505224108695984, "grad_norm_var": 0.0007062333769875149, "learning_rate": 0.009995466936201187, "loss": 3.2521, "step": 429 }, { "crossentropy": 3.181937575340271, "epoch": 0.023382908725087685, "grad_norm": 0.060147352516651154, "grad_norm_var": 0.0006803159783354171, "learning_rate": 0.009995413771017219, "loss": 3.1819, "step": 430 }, { "crossentropy": 3.223262310028076, "epoch": 0.02343728758258789, "grad_norm": 0.06234416738152504, "grad_norm_var": 0.000680929784934478, "learning_rate": 0.009995360296022346, "loss": 3.2233, "step": 431 }, { "crossentropy": 3.3001279830932617, "epoch": 0.023491666440088092, "grad_norm": 0.06513009965419769, "grad_norm_var": 0.0006818224880313042, "learning_rate": 0.009995306511219883, "loss": 3.3001, "step": 432 }, { "crossentropy": 3.2070647478103638, "epoch": 0.0235460452975883, "grad_norm": 0.06002810597419739, "grad_norm_var": 0.000681967026005716, "learning_rate": 0.009995252416613169, "loss": 3.2071, "step": 433 }, { "crossentropy": 3.224783778190613, "epoch": 0.023600424155088503, "grad_norm": 0.05815751105546951, "grad_norm_var": 0.000686460706004634, "learning_rate": 0.009995198012205558, "loss": 3.2248, "step": 434 }, { "crossentropy": 3.2888426780700684, "epoch": 0.023654803012588706, "grad_norm": 0.06370346993207932, "grad_norm_var": 0.0006854474234773926, "learning_rate": 0.009995143298000423, "loss": 3.2888, "step": 435 }, { "crossentropy": 3.0878247022628784, "epoch": 0.02370918187008891, "grad_norm": 0.05982378497719765, "grad_norm_var": 0.0006829208485854036, "learning_rate": 0.009995088274001156, "loss": 3.0878, "step": 436 }, { "crossentropy": 3.2410528659820557, "epoch": 0.023763560727589114, "grad_norm": 0.06515657156705856, "grad_norm_var": 0.0006834127575990066, "learning_rate": 0.00999503294021117, "loss": 3.2411, "step": 437 }, { "crossentropy": 3.192412257194519, "epoch": 0.023817939585089317, "grad_norm": 0.06403941661119461, "grad_norm_var": 0.000682756956010429, "learning_rate": 0.009994977296633902, "loss": 3.1924, "step": 438 }, { "crossentropy": 3.2256323099136353, "epoch": 0.02387231844258952, "grad_norm": 0.058767758309841156, "grad_norm_var": 0.000682397029191301, "learning_rate": 0.009994921343272796, "loss": 3.2256, "step": 439 }, { "crossentropy": 3.2858057022094727, "epoch": 0.023926697300089724, "grad_norm": 0.0674552321434021, "grad_norm_var": 0.0006807832869073127, "learning_rate": 0.009994865080131326, "loss": 3.2858, "step": 440 }, { "crossentropy": 3.288561463356018, "epoch": 0.023981076157589928, "grad_norm": 0.06109977513551712, "grad_norm_var": 0.000684069543991401, "learning_rate": 0.00999480850721298, "loss": 3.2886, "step": 441 }, { "crossentropy": 3.23559308052063, "epoch": 0.02403545501509013, "grad_norm": 0.06629807502031326, "grad_norm_var": 8.562275578740709e-06, "learning_rate": 0.009994751624521268, "loss": 3.2356, "step": 442 }, { "crossentropy": 3.187263607978821, "epoch": 0.024089833872590338, "grad_norm": 0.06052869185805321, "grad_norm_var": 8.288295542378752e-06, "learning_rate": 0.009994694432059716, "loss": 3.1873, "step": 443 }, { "crossentropy": 3.225001573562622, "epoch": 0.024144212730090542, "grad_norm": 0.06713346391916275, "grad_norm_var": 9.382733760255185e-06, "learning_rate": 0.009994636929831872, "loss": 3.225, "step": 444 }, { "crossentropy": 3.1889032125473022, "epoch": 0.024198591587590745, "grad_norm": 0.068536676466465, "grad_norm_var": 1.1186027957274929e-05, "learning_rate": 0.009994579117841301, "loss": 3.1889, "step": 445 }, { "crossentropy": 3.2530786991119385, "epoch": 0.02425297044509095, "grad_norm": 0.059997767210006714, "grad_norm_var": 1.124475814293224e-05, "learning_rate": 0.009994520996091591, "loss": 3.2531, "step": 446 }, { "crossentropy": 3.2384088039398193, "epoch": 0.024307349302591152, "grad_norm": 0.06679324060678482, "grad_norm_var": 1.2085416463840832e-05, "learning_rate": 0.009994462564586346, "loss": 3.2384, "step": 447 }, { "crossentropy": 3.1823559999465942, "epoch": 0.024361728160091356, "grad_norm": 0.06048982962965965, "grad_norm_var": 1.2293071299034469e-05, "learning_rate": 0.009994403823329188, "loss": 3.1824, "step": 448 }, { "crossentropy": 3.259485602378845, "epoch": 0.02441610701759156, "grad_norm": 0.06889597326517105, "grad_norm_var": 1.369340581894949e-05, "learning_rate": 0.009994344772323761, "loss": 3.2595, "step": 449 }, { "crossentropy": 3.0663996934890747, "epoch": 0.024470485875091763, "grad_norm": 0.06305443495512009, "grad_norm_var": 1.1668114340281955e-05, "learning_rate": 0.009994285411573727, "loss": 3.0664, "step": 450 }, { "crossentropy": 3.2321908473968506, "epoch": 0.024524864732591967, "grad_norm": 0.06474529951810837, "grad_norm_var": 1.1714085758385688e-05, "learning_rate": 0.009994225741082767, "loss": 3.2322, "step": 451 }, { "crossentropy": 3.2085098028182983, "epoch": 0.024579243590092174, "grad_norm": 0.06512083113193512, "grad_norm_var": 1.0570471794988748e-05, "learning_rate": 0.009994165760854585, "loss": 3.2085, "step": 452 }, { "crossentropy": 3.175427556037903, "epoch": 0.024633622447592377, "grad_norm": 0.05968019738793373, "grad_norm_var": 1.1788084203274002e-05, "learning_rate": 0.009994105470892897, "loss": 3.1754, "step": 453 }, { "crossentropy": 3.208691954612732, "epoch": 0.02468800130509258, "grad_norm": 0.06043791398406029, "grad_norm_var": 1.2538915479602212e-05, "learning_rate": 0.009994044871201445, "loss": 3.2087, "step": 454 }, { "crossentropy": 3.16926646232605, "epoch": 0.024742380162592784, "grad_norm": 0.0640651136636734, "grad_norm_var": 1.081635402538153e-05, "learning_rate": 0.009993983961783985, "loss": 3.1693, "step": 455 }, { "crossentropy": 3.208819627761841, "epoch": 0.024796759020092988, "grad_norm": 0.06496075540781021, "grad_norm_var": 1.0062967528579047e-05, "learning_rate": 0.009993922742644297, "loss": 3.2088, "step": 456 }, { "crossentropy": 3.2054349184036255, "epoch": 0.02485113787759319, "grad_norm": 0.07249043881893158, "grad_norm_var": 1.3972655327254326e-05, "learning_rate": 0.009993861213786174, "loss": 3.2054, "step": 457 }, { "crossentropy": 3.2167307138442993, "epoch": 0.024905516735093395, "grad_norm": 0.06717649102210999, "grad_norm_var": 1.422248135831324e-05, "learning_rate": 0.009993799375213438, "loss": 3.2167, "step": 458 }, { "crossentropy": 3.2262017726898193, "epoch": 0.0249598955925936, "grad_norm": 0.06769227981567383, "grad_norm_var": 1.3510830267560112e-05, "learning_rate": 0.009993737226929918, "loss": 3.2262, "step": 459 }, { "crossentropy": 3.201409339904785, "epoch": 0.025014274450093802, "grad_norm": 0.11848774552345276, "grad_norm_var": 0.000192404252505679, "learning_rate": 0.00999367476893947, "loss": 3.2014, "step": 460 }, { "crossentropy": 3.2800880670547485, "epoch": 0.02506865330759401, "grad_norm": 0.0692969411611557, "grad_norm_var": 0.0001924654780170509, "learning_rate": 0.00999361200124597, "loss": 3.2801, "step": 461 }, { "crossentropy": 3.1594455242156982, "epoch": 0.025123032165094213, "grad_norm": 0.07056381553411484, "grad_norm_var": 0.00018769529327237737, "learning_rate": 0.009993548923853308, "loss": 3.1594, "step": 462 }, { "crossentropy": 3.1767804622650146, "epoch": 0.025177411022594416, "grad_norm": 0.06618255376815796, "grad_norm_var": 0.00018789803928248068, "learning_rate": 0.009993485536765398, "loss": 3.1768, "step": 463 }, { "crossentropy": 3.177563428878784, "epoch": 0.02523178988009462, "grad_norm": 0.06331683695316315, "grad_norm_var": 0.0001852052962996932, "learning_rate": 0.00999342183998617, "loss": 3.1776, "step": 464 }, { "crossentropy": 3.156345248222351, "epoch": 0.025286168737594823, "grad_norm": 0.06484714150428772, "grad_norm_var": 0.00018635915558397608, "learning_rate": 0.009993357833519576, "loss": 3.1563, "step": 465 }, { "crossentropy": 3.2622199058532715, "epoch": 0.025340547595095027, "grad_norm": 0.07809585332870483, "grad_norm_var": 0.00018881125861445723, "learning_rate": 0.009993293517369583, "loss": 3.2622, "step": 466 }, { "crossentropy": 3.1877392530441284, "epoch": 0.02539492645259523, "grad_norm": 0.06919003278017044, "grad_norm_var": 0.00018703707233688172, "learning_rate": 0.009993228891540182, "loss": 3.1877, "step": 467 }, { "crossentropy": 3.1140637397766113, "epoch": 0.025449305310095434, "grad_norm": 0.05534876510500908, "grad_norm_var": 0.0001994933733428752, "learning_rate": 0.00999316395603538, "loss": 3.1141, "step": 468 }, { "crossentropy": 3.1782134771347046, "epoch": 0.025503684167595637, "grad_norm": 0.09477922320365906, "grad_norm_var": 0.00023058320593443611, "learning_rate": 0.009993098710859205, "loss": 3.1782, "step": 469 }, { "crossentropy": 3.20959734916687, "epoch": 0.025558063025095844, "grad_norm": 0.06104212626814842, "grad_norm_var": 0.00022970008077234834, "learning_rate": 0.009993033156015702, "loss": 3.2096, "step": 470 }, { "crossentropy": 3.235594391822815, "epoch": 0.025612441882596048, "grad_norm": 0.07311085611581802, "grad_norm_var": 0.00022558040641598143, "learning_rate": 0.009992967291508939, "loss": 3.2356, "step": 471 }, { "crossentropy": 3.1408987045288086, "epoch": 0.02566682074009625, "grad_norm": 0.06541889905929565, "grad_norm_var": 0.00022514603395865825, "learning_rate": 0.009992901117342998, "loss": 3.1409, "step": 472 }, { "crossentropy": 3.2576647996902466, "epoch": 0.025721199597596455, "grad_norm": 0.058800239115953445, "grad_norm_var": 0.0002365396429529431, "learning_rate": 0.009992834633521986, "loss": 3.2577, "step": 473 }, { "crossentropy": 3.0547285079956055, "epoch": 0.02577557845509666, "grad_norm": 0.05856812745332718, "grad_norm_var": 0.0002460869401531252, "learning_rate": 0.009992767840050026, "loss": 3.0547, "step": 474 }, { "crossentropy": 3.124190092086792, "epoch": 0.025829957312596862, "grad_norm": 0.1065036728978157, "grad_norm_var": 0.0003235222959047059, "learning_rate": 0.009992700736931256, "loss": 3.1242, "step": 475 }, { "crossentropy": 3.197759509086609, "epoch": 0.025884336170097066, "grad_norm": 0.07693889737129211, "grad_norm_var": 0.0001813439596313484, "learning_rate": 0.009992633324169843, "loss": 3.1978, "step": 476 }, { "crossentropy": 3.1460808515548706, "epoch": 0.02593871502759727, "grad_norm": 0.058578308671712875, "grad_norm_var": 0.0001906015234206489, "learning_rate": 0.009992565601769966, "loss": 3.1461, "step": 477 }, { "crossentropy": 3.239993691444397, "epoch": 0.025993093885097473, "grad_norm": 0.05888889357447624, "grad_norm_var": 0.00019836789703524577, "learning_rate": 0.009992497569735824, "loss": 3.24, "step": 478 }, { "crossentropy": 3.208915948867798, "epoch": 0.026047472742597676, "grad_norm": 0.05812390521168709, "grad_norm_var": 0.00020583083894478506, "learning_rate": 0.009992429228071639, "loss": 3.2089, "step": 479 }, { "crossentropy": 3.1540523767471313, "epoch": 0.026101851600097883, "grad_norm": 0.059936463832855225, "grad_norm_var": 0.00020903755062902997, "learning_rate": 0.009992360576781645, "loss": 3.1541, "step": 480 }, { "crossentropy": 3.1621973514556885, "epoch": 0.026156230457598087, "grad_norm": 0.05649339407682419, "grad_norm_var": 0.00021761895487784898, "learning_rate": 0.009992291615870104, "loss": 3.1622, "step": 481 }, { "crossentropy": 3.028660535812378, "epoch": 0.02621060931509829, "grad_norm": 0.06393558531999588, "grad_norm_var": 0.0002113041851143876, "learning_rate": 0.00999222234534129, "loss": 3.0287, "step": 482 }, { "crossentropy": 3.211292028427124, "epoch": 0.026264988172598494, "grad_norm": 0.06360537558794022, "grad_norm_var": 0.00021179292613339521, "learning_rate": 0.009992152765199503, "loss": 3.2113, "step": 483 }, { "crossentropy": 3.039893388748169, "epoch": 0.026319367030098698, "grad_norm": 0.05767446383833885, "grad_norm_var": 0.00020855536452028882, "learning_rate": 0.009992082875449053, "loss": 3.0399, "step": 484 }, { "crossentropy": 3.098160147666931, "epoch": 0.0263737458875989, "grad_norm": 0.06071813777089119, "grad_norm_var": 0.00015501956995731068, "learning_rate": 0.00999201267609428, "loss": 3.0982, "step": 485 }, { "crossentropy": 3.0822969675064087, "epoch": 0.026428124745099105, "grad_norm": 0.062419451773166656, "grad_norm_var": 0.00015443038012520686, "learning_rate": 0.009991942167139533, "loss": 3.0823, "step": 486 }, { "crossentropy": 3.1295443773269653, "epoch": 0.026482503602599308, "grad_norm": 0.09866369515657425, "grad_norm_var": 0.00022293441541386844, "learning_rate": 0.009991871348589186, "loss": 3.1295, "step": 487 }, { "crossentropy": 3.1348966360092163, "epoch": 0.026536882460099512, "grad_norm": 0.06562826782464981, "grad_norm_var": 0.00022290476380669815, "learning_rate": 0.009991800220447633, "loss": 3.1349, "step": 488 }, { "crossentropy": 3.165089726448059, "epoch": 0.02659126131759972, "grad_norm": 0.061336200684309006, "grad_norm_var": 0.0002206719903523296, "learning_rate": 0.009991728782719285, "loss": 3.1651, "step": 489 }, { "crossentropy": 3.215384364128113, "epoch": 0.026645640175099922, "grad_norm": 0.05971238762140274, "grad_norm_var": 0.00021950540900072697, "learning_rate": 0.00999165703540857, "loss": 3.2154, "step": 490 }, { "crossentropy": 3.185685157775879, "epoch": 0.026700019032600126, "grad_norm": 0.08151329308748245, "grad_norm_var": 0.00012631757040610901, "learning_rate": 0.009991584978519942, "loss": 3.1857, "step": 491 }, { "crossentropy": 3.1934248208999634, "epoch": 0.02675439789010033, "grad_norm": 0.06179944425821304, "grad_norm_var": 0.00011706865920574072, "learning_rate": 0.009991512612057864, "loss": 3.1934, "step": 492 }, { "crossentropy": 3.1854279041290283, "epoch": 0.026808776747600533, "grad_norm": 0.06405873596668243, "grad_norm_var": 0.00011475449740915941, "learning_rate": 0.009991439936026828, "loss": 3.1854, "step": 493 }, { "crossentropy": 3.1567113399505615, "epoch": 0.026863155605100737, "grad_norm": 0.05943259969353676, "grad_norm_var": 0.0001143548389655977, "learning_rate": 0.009991366950431343, "loss": 3.1567, "step": 494 }, { "crossentropy": 3.2589341402053833, "epoch": 0.02691753446260094, "grad_norm": 0.06422775238752365, "grad_norm_var": 0.00011133902652604721, "learning_rate": 0.00999129365527593, "loss": 3.2589, "step": 495 }, { "crossentropy": 3.205109119415283, "epoch": 0.026971913320101144, "grad_norm": 0.06325613707304001, "grad_norm_var": 0.00010975459411881027, "learning_rate": 0.00999122005056514, "loss": 3.2051, "step": 496 }, { "crossentropy": 3.1992039680480957, "epoch": 0.027026292177601347, "grad_norm": 0.05811939388513565, "grad_norm_var": 0.00010801496919553543, "learning_rate": 0.009991146136303536, "loss": 3.1992, "step": 497 }, { "crossentropy": 3.0697816610336304, "epoch": 0.027080671035101554, "grad_norm": 0.05953560024499893, "grad_norm_var": 0.00011007311519280449, "learning_rate": 0.009991071912495701, "loss": 3.0698, "step": 498 }, { "crossentropy": 3.2579550743103027, "epoch": 0.027135049892601758, "grad_norm": 0.05815594270825386, "grad_norm_var": 0.00011301969957245993, "learning_rate": 0.00999099737914624, "loss": 3.258, "step": 499 }, { "crossentropy": 3.0981425046920776, "epoch": 0.02718942875010196, "grad_norm": 0.06483691930770874, "grad_norm_var": 0.0001094538910202741, "learning_rate": 0.009990922536259774, "loss": 3.0981, "step": 500 }, { "crossentropy": 3.2861239910125732, "epoch": 0.027243807607602165, "grad_norm": 0.05904975160956383, "grad_norm_var": 0.00011062783219168981, "learning_rate": 0.009990847383840944, "loss": 3.2861, "step": 501 }, { "crossentropy": 3.116645932197571, "epoch": 0.02729818646510237, "grad_norm": 0.06520840525627136, "grad_norm_var": 0.00011011380031958687, "learning_rate": 0.009990771921894414, "loss": 3.1166, "step": 502 }, { "crossentropy": 3.1214624643325806, "epoch": 0.027352565322602572, "grad_norm": 0.05684122070670128, "grad_norm_var": 3.3294266227310423e-05, "learning_rate": 0.009990696150424862, "loss": 3.1215, "step": 503 }, { "crossentropy": 3.0816168785095215, "epoch": 0.027406944180102775, "grad_norm": 0.06167994812130928, "grad_norm_var": 3.2710973221992264e-05, "learning_rate": 0.009990620069436986, "loss": 3.0816, "step": 504 }, { "crossentropy": 3.1990774869918823, "epoch": 0.02746132303760298, "grad_norm": 0.058822765946388245, "grad_norm_var": 3.346993169710723e-05, "learning_rate": 0.009990543678935508, "loss": 3.1991, "step": 505 }, { "crossentropy": 3.167070746421814, "epoch": 0.027515701895103183, "grad_norm": 0.05848664045333862, "grad_norm_var": 3.398112137875705e-05, "learning_rate": 0.009990466978925163, "loss": 3.1671, "step": 506 }, { "crossentropy": 3.0546159744262695, "epoch": 0.027570080752603386, "grad_norm": 0.06243890896439552, "grad_norm_var": 7.5741851189201995e-06, "learning_rate": 0.009990389969410709, "loss": 3.0546, "step": 507 }, { "crossentropy": 3.1827216148376465, "epoch": 0.027624459610103593, "grad_norm": 0.06174895912408829, "grad_norm_var": 7.5689421102485366e-06, "learning_rate": 0.009990312650396922, "loss": 3.1827, "step": 508 }, { "crossentropy": 3.037477493286133, "epoch": 0.027678838467603797, "grad_norm": 0.06188230216503143, "grad_norm_var": 6.975558463918433e-06, "learning_rate": 0.009990235021888597, "loss": 3.0375, "step": 509 }, { "crossentropy": 3.1807576417922974, "epoch": 0.027733217325104, "grad_norm": 0.0633305013179779, "grad_norm_var": 7.184505793388027e-06, "learning_rate": 0.009990157083890547, "loss": 3.1808, "step": 510 }, { "crossentropy": 3.1540002822875977, "epoch": 0.027787596182604204, "grad_norm": 0.06374044716358185, "grad_norm_var": 6.99621064978815e-06, "learning_rate": 0.009990078836407608, "loss": 3.154, "step": 511 }, { "crossentropy": 3.1888716220855713, "epoch": 0.027841975040104407, "grad_norm": 0.06443309783935547, "grad_norm_var": 7.425718499586876e-06, "learning_rate": 0.009990000279444631, "loss": 3.1889, "step": 512 }, { "crossentropy": 3.1789424419403076, "epoch": 0.02789635389760461, "grad_norm": 0.06211194396018982, "grad_norm_var": 6.811651751820133e-06, "learning_rate": 0.009989921413006489, "loss": 3.1789, "step": 513 }, { "crossentropy": 3.106834888458252, "epoch": 0.027950732755104814, "grad_norm": 0.06695716083049774, "grad_norm_var": 8.415200728319137e-06, "learning_rate": 0.009989842237098074, "loss": 3.1068, "step": 514 }, { "crossentropy": 3.040956735610962, "epoch": 0.028005111612605018, "grad_norm": 0.0652519091963768, "grad_norm_var": 8.059806136846242e-06, "learning_rate": 0.009989762751724293, "loss": 3.041, "step": 515 }, { "crossentropy": 3.1145845651626587, "epoch": 0.02805949047010522, "grad_norm": 0.06569627672433853, "grad_norm_var": 8.396495265690595e-06, "learning_rate": 0.00998968295689008, "loss": 3.1146, "step": 516 }, { "crossentropy": 3.1260277032852173, "epoch": 0.02811386932760543, "grad_norm": 0.06198617070913315, "grad_norm_var": 7.641319961558478e-06, "learning_rate": 0.009989602852600383, "loss": 3.126, "step": 517 }, { "crossentropy": 3.19792103767395, "epoch": 0.028168248185105632, "grad_norm": 0.059596478939056396, "grad_norm_var": 7.611933191601211e-06, "learning_rate": 0.009989522438860168, "loss": 3.1979, "step": 518 }, { "crossentropy": 3.127088189125061, "epoch": 0.028222627042605836, "grad_norm": 0.058871686458587646, "grad_norm_var": 6.422135968255314e-06, "learning_rate": 0.009989441715674422, "loss": 3.1271, "step": 519 }, { "crossentropy": 3.1900343894958496, "epoch": 0.02827700590010604, "grad_norm": 0.06280898302793503, "grad_norm_var": 6.4062517062469956e-06, "learning_rate": 0.009989360683048154, "loss": 3.19, "step": 520 }, { "crossentropy": 3.1662704944610596, "epoch": 0.028331384757606243, "grad_norm": 0.08092974126338959, "grad_norm_var": 2.6450339897082728e-05, "learning_rate": 0.009989279340986385, "loss": 3.1663, "step": 521 }, { "crossentropy": 3.1374858617782593, "epoch": 0.028385763615106446, "grad_norm": 0.08659527450799942, "grad_norm_var": 5.604165626820473e-05, "learning_rate": 0.009989197689494167, "loss": 3.1375, "step": 522 }, { "crossentropy": 3.110137104988098, "epoch": 0.02844014247260665, "grad_norm": 0.06714168936014175, "grad_norm_var": 5.548961075884258e-05, "learning_rate": 0.009989115728576557, "loss": 3.1101, "step": 523 }, { "crossentropy": 3.180479884147644, "epoch": 0.028494521330106853, "grad_norm": 0.058722760528326035, "grad_norm_var": 5.770367262547902e-05, "learning_rate": 0.009989033458238641, "loss": 3.1805, "step": 524 }, { "crossentropy": 3.156387686729431, "epoch": 0.028548900187607057, "grad_norm": 0.059519656002521515, "grad_norm_var": 5.923268735408497e-05, "learning_rate": 0.009988950878485522, "loss": 3.1564, "step": 525 }, { "crossentropy": 3.2151756286621094, "epoch": 0.028603279045107264, "grad_norm": 0.061070673167705536, "grad_norm_var": 6.0199789621853164e-05, "learning_rate": 0.00998886798932232, "loss": 3.2152, "step": 526 }, { "crossentropy": 3.1323987245559692, "epoch": 0.028657657902607468, "grad_norm": 0.05849049240350723, "grad_norm_var": 6.304182882102914e-05, "learning_rate": 0.009988784790754178, "loss": 3.1324, "step": 527 }, { "crossentropy": 3.1548337936401367, "epoch": 0.02871203676010767, "grad_norm": 0.0547940731048584, "grad_norm_var": 6.959211764180901e-05, "learning_rate": 0.009988701282786252, "loss": 3.1548, "step": 528 }, { "crossentropy": 3.1578946113586426, "epoch": 0.028766415617607875, "grad_norm": 0.06555633246898651, "grad_norm_var": 6.927865093316373e-05, "learning_rate": 0.009988617465423726, "loss": 3.1579, "step": 529 }, { "crossentropy": 3.11134672164917, "epoch": 0.028820794475108078, "grad_norm": 0.05907180905342102, "grad_norm_var": 7.07121371634015e-05, "learning_rate": 0.009988533338671793, "loss": 3.1113, "step": 530 }, { "crossentropy": 3.0613373517990112, "epoch": 0.02887517333260828, "grad_norm": 0.06010322645306587, "grad_norm_var": 7.159979496862609e-05, "learning_rate": 0.009988448902535673, "loss": 3.0613, "step": 531 }, { "crossentropy": 3.0908756256103516, "epoch": 0.028929552190108485, "grad_norm": 0.05884264409542084, "grad_norm_var": 7.281158240120152e-05, "learning_rate": 0.009988364157020606, "loss": 3.0909, "step": 532 }, { "crossentropy": 3.0791083574295044, "epoch": 0.02898393104760869, "grad_norm": 0.08102703839540482, "grad_norm_var": 9.192917644579463e-05, "learning_rate": 0.009988279102131844, "loss": 3.0791, "step": 533 }, { "crossentropy": 2.959784150123596, "epoch": 0.029038309905108892, "grad_norm": 0.0634656548500061, "grad_norm_var": 9.029831632069325e-05, "learning_rate": 0.00998819373787466, "loss": 2.9598, "step": 534 }, { "crossentropy": 3.1300792694091797, "epoch": 0.029092688762609096, "grad_norm": 0.06231067702174187, "grad_norm_var": 8.831309245701704e-05, "learning_rate": 0.009988108064254353, "loss": 3.1301, "step": 535 }, { "crossentropy": 3.1061800718307495, "epoch": 0.029147067620109303, "grad_norm": 0.06014997139573097, "grad_norm_var": 8.954176820854809e-05, "learning_rate": 0.009988022081276233, "loss": 3.1062, "step": 536 }, { "crossentropy": 3.1011780500411987, "epoch": 0.029201446477609506, "grad_norm": 0.05979752168059349, "grad_norm_var": 7.217944867677992e-05, "learning_rate": 0.009987935788945634, "loss": 3.1012, "step": 537 }, { "crossentropy": 3.113202691078186, "epoch": 0.02925582533510971, "grad_norm": 0.059830013662576675, "grad_norm_var": 3.4680104584364904e-05, "learning_rate": 0.009987849187267908, "loss": 3.1132, "step": 538 }, { "crossentropy": 3.077216863632202, "epoch": 0.029310204192609914, "grad_norm": 0.06268703192472458, "grad_norm_var": 3.2788253692087735e-05, "learning_rate": 0.009987762276248426, "loss": 3.0772, "step": 539 }, { "crossentropy": 3.054700493812561, "epoch": 0.029364583050110117, "grad_norm": 0.0630662813782692, "grad_norm_var": 3.2306882943913294e-05, "learning_rate": 0.009987675055892578, "loss": 3.0547, "step": 540 }, { "crossentropy": 3.1574513912200928, "epoch": 0.02941896190761032, "grad_norm": 0.06411833316087723, "grad_norm_var": 3.2192739088854746e-05, "learning_rate": 0.009987587526205773, "loss": 3.1575, "step": 541 }, { "crossentropy": 3.0540820360183716, "epoch": 0.029473340765110524, "grad_norm": 0.06070505827665329, "grad_norm_var": 3.2253653927535356e-05, "learning_rate": 0.009987499687193438, "loss": 3.0541, "step": 542 }, { "crossentropy": 3.1487436294555664, "epoch": 0.029527719622610728, "grad_norm": 0.058791160583496094, "grad_norm_var": 3.211355941933746e-05, "learning_rate": 0.009987411538861022, "loss": 3.1487, "step": 543 }, { "crossentropy": 3.0936580896377563, "epoch": 0.02958209848011093, "grad_norm": 0.21475616097450256, "grad_norm_var": 0.0014745770204453903, "learning_rate": 0.009987323081213995, "loss": 3.0937, "step": 544 }, { "crossentropy": 3.0786052942276, "epoch": 0.02963647733761114, "grad_norm": 0.05905447155237198, "grad_norm_var": 0.001482928744923952, "learning_rate": 0.009987234314257838, "loss": 3.0786, "step": 545 }, { "crossentropy": 3.160004138946533, "epoch": 0.029690856195111342, "grad_norm": 0.06413467228412628, "grad_norm_var": 0.001475981794716237, "learning_rate": 0.009987145237998058, "loss": 3.16, "step": 546 }, { "crossentropy": 3.052586793899536, "epoch": 0.029745235052611545, "grad_norm": 0.0550924614071846, "grad_norm_var": 0.001485534360304169, "learning_rate": 0.00998705585244018, "loss": 3.0526, "step": 547 }, { "crossentropy": 3.137404680252075, "epoch": 0.02979961391011175, "grad_norm": 0.056124672293663025, "grad_norm_var": 0.0014906697785725784, "learning_rate": 0.00998696615758975, "loss": 3.1374, "step": 548 }, { "crossentropy": 3.0928350687026978, "epoch": 0.029853992767611953, "grad_norm": 0.058563873171806335, "grad_norm_var": 0.001493880570878702, "learning_rate": 0.009986876153452326, "loss": 3.0928, "step": 549 }, { "crossentropy": 3.175341248512268, "epoch": 0.029908371625112156, "grad_norm": 0.058546535670757294, "grad_norm_var": 0.001499787241432711, "learning_rate": 0.009986785840033492, "loss": 3.1753, "step": 550 }, { "crossentropy": 3.056047558784485, "epoch": 0.02996275048261236, "grad_norm": 0.0604729950428009, "grad_norm_var": 0.001501847599608629, "learning_rate": 0.00998669521733885, "loss": 3.056, "step": 551 }, { "crossentropy": 2.9985969066619873, "epoch": 0.030017129340112563, "grad_norm": 0.062036145478487015, "grad_norm_var": 0.0014996573527848033, "learning_rate": 0.009986604285374021, "loss": 2.9986, "step": 552 }, { "crossentropy": 3.1167232990264893, "epoch": 0.030071508197612767, "grad_norm": 0.0588810034096241, "grad_norm_var": 0.00150093964531539, "learning_rate": 0.00998651304414464, "loss": 3.1167, "step": 553 }, { "crossentropy": 3.0787190198898315, "epoch": 0.030125887055112974, "grad_norm": 0.06191382184624672, "grad_norm_var": 0.0014984399074503124, "learning_rate": 0.009986421493656373, "loss": 3.0787, "step": 554 }, { "crossentropy": 3.1519042253494263, "epoch": 0.030180265912613177, "grad_norm": 0.06319347769021988, "grad_norm_var": 0.0014979665755381376, "learning_rate": 0.00998632963391489, "loss": 3.1519, "step": 555 }, { "crossentropy": 3.150858759880066, "epoch": 0.03023464477011338, "grad_norm": 0.059755291789770126, "grad_norm_var": 0.001501697592963824, "learning_rate": 0.009986237464925893, "loss": 3.1509, "step": 556 }, { "crossentropy": 3.145945191383362, "epoch": 0.030289023627613584, "grad_norm": 0.06418012827634811, "grad_norm_var": 0.0015016513581979454, "learning_rate": 0.009986144986695098, "loss": 3.1459, "step": 557 }, { "crossentropy": 3.1174288988113403, "epoch": 0.030343402485113788, "grad_norm": 0.05988631770014763, "grad_norm_var": 0.0015026820267136285, "learning_rate": 0.009986052199228239, "loss": 3.1174, "step": 558 }, { "crossentropy": 3.1973464488983154, "epoch": 0.03039778134261399, "grad_norm": 0.06038828194141388, "grad_norm_var": 0.0015005159813190107, "learning_rate": 0.009985959102531073, "loss": 3.1973, "step": 559 }, { "crossentropy": 3.1062761545181274, "epoch": 0.030452160200114195, "grad_norm": 0.06546524912118912, "grad_norm_var": 8.307999886411563e-06, "learning_rate": 0.00998586569660937, "loss": 3.1063, "step": 560 }, { "crossentropy": 3.0861412286758423, "epoch": 0.0305065390576144, "grad_norm": 0.06339120119810104, "grad_norm_var": 8.658827527736832e-06, "learning_rate": 0.009985771981468924, "loss": 3.0861, "step": 561 }, { "crossentropy": 3.1502305269241333, "epoch": 0.030560917915114602, "grad_norm": 0.059750303626060486, "grad_norm_var": 7.88258005176793e-06, "learning_rate": 0.009985677957115548, "loss": 3.1502, "step": 562 }, { "crossentropy": 3.1474711894989014, "epoch": 0.030615296772614806, "grad_norm": 0.06030471995472908, "grad_norm_var": 5.838052609913325e-06, "learning_rate": 0.009985583623555075, "loss": 3.1475, "step": 563 }, { "crossentropy": 3.0928499698638916, "epoch": 0.030669675630115013, "grad_norm": 0.061614200472831726, "grad_norm_var": 4.296968207230985e-06, "learning_rate": 0.009985488980793353, "loss": 3.0928, "step": 564 }, { "crossentropy": 3.094353437423706, "epoch": 0.030724054487615216, "grad_norm": 0.05979853495955467, "grad_norm_var": 3.967091144911559e-06, "learning_rate": 0.009985394028836253, "loss": 3.0944, "step": 565 }, { "crossentropy": 3.0438718795776367, "epoch": 0.03077843334511542, "grad_norm": 0.06360968202352524, "grad_norm_var": 3.762032038330474e-06, "learning_rate": 0.009985298767689662, "loss": 3.0439, "step": 566 }, { "crossentropy": 3.065308690071106, "epoch": 0.030832812202615623, "grad_norm": 0.06012721732258797, "grad_norm_var": 3.818701453080683e-06, "learning_rate": 0.00998520319735949, "loss": 3.0653, "step": 567 }, { "crossentropy": 3.1194169521331787, "epoch": 0.030887191060115827, "grad_norm": 0.06393712759017944, "grad_norm_var": 4.175771108157142e-06, "learning_rate": 0.009985107317851663, "loss": 3.1194, "step": 568 }, { "crossentropy": 3.149047017097473, "epoch": 0.03094156991761603, "grad_norm": 0.05932405963540077, "grad_norm_var": 4.025214759187321e-06, "learning_rate": 0.009985011129172128, "loss": 3.149, "step": 569 }, { "crossentropy": 3.192098617553711, "epoch": 0.030995948775116234, "grad_norm": 0.05663330480456352, "grad_norm_var": 5.592751355013893e-06, "learning_rate": 0.00998491463132685, "loss": 3.1921, "step": 570 }, { "crossentropy": 3.044298768043518, "epoch": 0.031050327632616437, "grad_norm": 0.05808240547776222, "grad_norm_var": 5.958895648228158e-06, "learning_rate": 0.009984817824321814, "loss": 3.0443, "step": 571 }, { "crossentropy": 3.142951011657715, "epoch": 0.03110470649011664, "grad_norm": 0.05594656243920326, "grad_norm_var": 7.505519940627601e-06, "learning_rate": 0.009984720708163024, "loss": 3.143, "step": 572 }, { "crossentropy": 3.1685410737991333, "epoch": 0.031159085347616848, "grad_norm": 0.07398718595504761, "grad_norm_var": 1.7966029102566467e-05, "learning_rate": 0.009984623282856502, "loss": 3.1685, "step": 573 }, { "crossentropy": 3.051766514778137, "epoch": 0.03121346420511705, "grad_norm": 0.056426629424095154, "grad_norm_var": 1.9407938754763207e-05, "learning_rate": 0.009984525548408292, "loss": 3.0518, "step": 574 }, { "crossentropy": 3.1220805644989014, "epoch": 0.031267843062617255, "grad_norm": 0.058664169162511826, "grad_norm_var": 1.9774384568734446e-05, "learning_rate": 0.009984427504824455, "loss": 3.1221, "step": 575 }, { "crossentropy": 3.106940269470215, "epoch": 0.03132222192011746, "grad_norm": 0.052930884063243866, "grad_norm_var": 2.2242224017753517e-05, "learning_rate": 0.009984329152111072, "loss": 3.1069, "step": 576 }, { "crossentropy": 3.0841976404190063, "epoch": 0.03137660077761766, "grad_norm": 0.060079995542764664, "grad_norm_var": 2.1555231937730028e-05, "learning_rate": 0.00998423049027424, "loss": 3.0842, "step": 577 }, { "crossentropy": 2.9868699312210083, "epoch": 0.031430979635117866, "grad_norm": 0.06566765904426575, "grad_norm_var": 2.3486658777830256e-05, "learning_rate": 0.00998413151932008, "loss": 2.9869, "step": 578 }, { "crossentropy": 3.035885214805603, "epoch": 0.03148535849261807, "grad_norm": 0.06708498299121857, "grad_norm_var": 2.623227850360709e-05, "learning_rate": 0.00998403223925473, "loss": 3.0359, "step": 579 }, { "crossentropy": 3.10762882232666, "epoch": 0.03153973735011827, "grad_norm": 0.058560844510793686, "grad_norm_var": 2.651185301338754e-05, "learning_rate": 0.009983932650084347, "loss": 3.1076, "step": 580 }, { "crossentropy": 3.0620908737182617, "epoch": 0.031594116207618476, "grad_norm": 0.05860366299748421, "grad_norm_var": 2.6741330417168602e-05, "learning_rate": 0.009983832751815108, "loss": 3.0621, "step": 581 }, { "crossentropy": 3.158211350440979, "epoch": 0.03164849506511868, "grad_norm": 0.05896781384944916, "grad_norm_var": 2.6227842005092408e-05, "learning_rate": 0.009983732544453209, "loss": 3.1582, "step": 582 }, { "crossentropy": 2.9915295839309692, "epoch": 0.031702873922618884, "grad_norm": 0.05923546105623245, "grad_norm_var": 2.6299756200307695e-05, "learning_rate": 0.009983632028004863, "loss": 2.9915, "step": 583 }, { "crossentropy": 3.1766154766082764, "epoch": 0.03175725278011909, "grad_norm": 0.05935577303171158, "grad_norm_var": 2.536435302855263e-05, "learning_rate": 0.009983531202476303, "loss": 3.1766, "step": 584 }, { "crossentropy": 3.112359404563904, "epoch": 0.03181163163761929, "grad_norm": 0.08709250390529633, "grad_norm_var": 7.115842654835894e-05, "learning_rate": 0.009983430067873786, "loss": 3.1124, "step": 585 }, { "crossentropy": 3.0825403928756714, "epoch": 0.031866010495119494, "grad_norm": 0.08272768557071686, "grad_norm_var": 9.606135840837711e-05, "learning_rate": 0.009983328624203583, "loss": 3.0825, "step": 586 }, { "crossentropy": 3.166721224784851, "epoch": 0.031920389352619705, "grad_norm": 0.05471222475171089, "grad_norm_var": 9.913305589986746e-05, "learning_rate": 0.009983226871471984, "loss": 3.1667, "step": 587 }, { "crossentropy": 3.0622613430023193, "epoch": 0.03197476821011991, "grad_norm": 0.0557759590446949, "grad_norm_var": 9.92982263791307e-05, "learning_rate": 0.0099831248096853, "loss": 3.0623, "step": 588 }, { "crossentropy": 3.1268306970596313, "epoch": 0.03202914706762011, "grad_norm": 0.0601082406938076, "grad_norm_var": 9.122190032126205e-05, "learning_rate": 0.009983022438849862, "loss": 3.1268, "step": 589 }, { "crossentropy": 3.131944179534912, "epoch": 0.032083525925120315, "grad_norm": 0.059151556342840195, "grad_norm_var": 8.957033421984779e-05, "learning_rate": 0.009982919758972015, "loss": 3.1319, "step": 590 }, { "crossentropy": 3.055466413497925, "epoch": 0.03213790478262052, "grad_norm": 0.05920170247554779, "grad_norm_var": 8.931921115315067e-05, "learning_rate": 0.009982816770058134, "loss": 3.0555, "step": 591 }, { "crossentropy": 2.985701084136963, "epoch": 0.03219228364012072, "grad_norm": 0.06071602925658226, "grad_norm_var": 8.322252019241545e-05, "learning_rate": 0.0099827134721146, "loss": 2.9857, "step": 592 }, { "crossentropy": 3.115323066711426, "epoch": 0.032246662497620926, "grad_norm": 0.057753294706344604, "grad_norm_var": 8.444815683125942e-05, "learning_rate": 0.009982609865147823, "loss": 3.1153, "step": 593 }, { "crossentropy": 3.077110171318054, "epoch": 0.03230104135512113, "grad_norm": 0.05814215913414955, "grad_norm_var": 8.510501374731361e-05, "learning_rate": 0.009982505949164227, "loss": 3.0771, "step": 594 }, { "crossentropy": 3.128618597984314, "epoch": 0.03235542021262133, "grad_norm": 0.0670320987701416, "grad_norm_var": 8.507162035764626e-05, "learning_rate": 0.009982401724170257, "loss": 3.1286, "step": 595 }, { "crossentropy": 3.093104600906372, "epoch": 0.03240979907012154, "grad_norm": 0.059504732489585876, "grad_norm_var": 8.465407311948377e-05, "learning_rate": 0.009982297190172377, "loss": 3.0931, "step": 596 }, { "crossentropy": 3.2397581338882446, "epoch": 0.03246417792762174, "grad_norm": 0.06671004742383957, "grad_norm_var": 8.467944563534553e-05, "learning_rate": 0.009982192347177071, "loss": 3.2398, "step": 597 }, { "crossentropy": 3.008832097053528, "epoch": 0.032518556785121944, "grad_norm": 0.05592670664191246, "grad_norm_var": 8.684650226405127e-05, "learning_rate": 0.00998208719519084, "loss": 3.0088, "step": 598 }, { "crossentropy": 3.075597047805786, "epoch": 0.03257293564262215, "grad_norm": 0.05959593132138252, "grad_norm_var": 8.668827002196085e-05, "learning_rate": 0.009981981734220206, "loss": 3.0756, "step": 599 }, { "crossentropy": 3.047071695327759, "epoch": 0.03262731450012235, "grad_norm": 0.06425075978040695, "grad_norm_var": 8.599065786745926e-05, "learning_rate": 0.009981875964271709, "loss": 3.0471, "step": 600 }, { "crossentropy": 3.0106348991394043, "epoch": 0.032681693357622554, "grad_norm": 0.06354891508817673, "grad_norm_var": 4.5083375246805087e-05, "learning_rate": 0.00998176988535191, "loss": 3.0106, "step": 601 }, { "crossentropy": 3.061580181121826, "epoch": 0.03273607221512276, "grad_norm": 0.07574610412120819, "grad_norm_var": 2.8419325615989392e-05, "learning_rate": 0.009981663497467387, "loss": 3.0616, "step": 602 }, { "crossentropy": 3.038737177848816, "epoch": 0.03279045107262296, "grad_norm": 0.05913951247930527, "grad_norm_var": 2.5863444734743925e-05, "learning_rate": 0.009981556800624737, "loss": 3.0387, "step": 603 }, { "crossentropy": 3.059869647026062, "epoch": 0.032844829930123165, "grad_norm": 0.05743143707513809, "grad_norm_var": 2.4794663706075488e-05, "learning_rate": 0.009981449794830581, "loss": 3.0599, "step": 604 }, { "crossentropy": 3.0147197246551514, "epoch": 0.032899208787623375, "grad_norm": 0.05760345980525017, "grad_norm_var": 2.5650740065301843e-05, "learning_rate": 0.009981342480091552, "loss": 3.0147, "step": 605 }, { "crossentropy": 3.1312564611434937, "epoch": 0.03295358764512358, "grad_norm": 0.05796036496758461, "grad_norm_var": 2.60871477243405e-05, "learning_rate": 0.009981234856414306, "loss": 3.1313, "step": 606 }, { "crossentropy": 3.067404270172119, "epoch": 0.03300796650262378, "grad_norm": 0.057348717004060745, "grad_norm_var": 2.6811872067936982e-05, "learning_rate": 0.00998112692380552, "loss": 3.0674, "step": 607 }, { "crossentropy": 3.037852644920349, "epoch": 0.033062345360123986, "grad_norm": 0.06455346941947937, "grad_norm_var": 2.7509870431298897e-05, "learning_rate": 0.009981018682271883, "loss": 3.0379, "step": 608 }, { "crossentropy": 3.0911799669265747, "epoch": 0.03311672421762419, "grad_norm": 0.06465062499046326, "grad_norm_var": 2.7138275739393056e-05, "learning_rate": 0.009980910131820113, "loss": 3.0912, "step": 609 }, { "crossentropy": 3.144911766052246, "epoch": 0.03317110307512439, "grad_norm": 0.06573472917079926, "grad_norm_var": 2.701640112811824e-05, "learning_rate": 0.009980801272456942, "loss": 3.1449, "step": 610 }, { "crossentropy": 3.1645277738571167, "epoch": 0.0332254819326246, "grad_norm": 0.06754983216524124, "grad_norm_var": 2.7360085370708477e-05, "learning_rate": 0.009980692104189117, "loss": 3.1645, "step": 611 }, { "crossentropy": 3.0400177240371704, "epoch": 0.0332798607901248, "grad_norm": 0.0617150254547596, "grad_norm_var": 2.6833254179864143e-05, "learning_rate": 0.00998058262702341, "loss": 3.04, "step": 612 }, { "crossentropy": 3.0117634534835815, "epoch": 0.033334239647625004, "grad_norm": 0.06035070866346359, "grad_norm_var": 2.576276118699238e-05, "learning_rate": 0.009980472840966615, "loss": 3.0118, "step": 613 }, { "crossentropy": 3.0575637817382812, "epoch": 0.03338861850512521, "grad_norm": 0.05827189236879349, "grad_norm_var": 2.4185817284468464e-05, "learning_rate": 0.009980362746025536, "loss": 3.0576, "step": 614 }, { "crossentropy": 3.092461347579956, "epoch": 0.03344299736262541, "grad_norm": 0.06362821161746979, "grad_norm_var": 2.3793527469980387e-05, "learning_rate": 0.009980252342207004, "loss": 3.0925, "step": 615 }, { "crossentropy": 3.114761233329773, "epoch": 0.033497376220125614, "grad_norm": 0.057690974324941635, "grad_norm_var": 2.492345043821244e-05, "learning_rate": 0.009980141629517863, "loss": 3.1148, "step": 616 }, { "crossentropy": 3.0531994104385376, "epoch": 0.03355175507762582, "grad_norm": 0.057358358055353165, "grad_norm_var": 2.6087817610599232e-05, "learning_rate": 0.009980030607964983, "loss": 3.0532, "step": 617 }, { "crossentropy": 3.00201952457428, "epoch": 0.03360613393512602, "grad_norm": 0.06011006608605385, "grad_norm_var": 1.2023988543187885e-05, "learning_rate": 0.009979919277555246, "loss": 3.002, "step": 618 }, { "crossentropy": 2.991894245147705, "epoch": 0.033660512792626225, "grad_norm": 0.058117374777793884, "grad_norm_var": 1.2301083402409668e-05, "learning_rate": 0.009979807638295559, "loss": 2.9919, "step": 619 }, { "crossentropy": 2.9827592372894287, "epoch": 0.03371489165012643, "grad_norm": 0.06427401304244995, "grad_norm_var": 1.230946937900105e-05, "learning_rate": 0.009979695690192844, "loss": 2.9828, "step": 620 }, { "crossentropy": 3.044547915458679, "epoch": 0.03376927050762663, "grad_norm": 0.05779316648840904, "grad_norm_var": 1.2224354843569779e-05, "learning_rate": 0.009979583433254047, "loss": 3.0445, "step": 621 }, { "crossentropy": 3.0017133951187134, "epoch": 0.033823649365126836, "grad_norm": 0.07541931420564651, "grad_norm_var": 2.403830641391653e-05, "learning_rate": 0.009979470867486125, "loss": 3.0017, "step": 622 }, { "crossentropy": 3.1014431715011597, "epoch": 0.03387802822262704, "grad_norm": 0.05830878019332886, "grad_norm_var": 2.3479977411112034e-05, "learning_rate": 0.009979357992896065, "loss": 3.1014, "step": 623 }, { "crossentropy": 3.0423115491867065, "epoch": 0.03393240708012725, "grad_norm": 0.05660174787044525, "grad_norm_var": 2.4958264863488415e-05, "learning_rate": 0.009979244809490863, "loss": 3.0423, "step": 624 }, { "crossentropy": 3.0472238063812256, "epoch": 0.03398678593762745, "grad_norm": 0.06653065234422684, "grad_norm_var": 2.5912933126818045e-05, "learning_rate": 0.00997913131727754, "loss": 3.0472, "step": 625 }, { "crossentropy": 3.0555477142333984, "epoch": 0.03404116479512766, "grad_norm": 0.07066521048545837, "grad_norm_var": 2.9992061238311568e-05, "learning_rate": 0.009979017516263136, "loss": 3.0555, "step": 626 }, { "crossentropy": 3.081781029701233, "epoch": 0.03409554365262786, "grad_norm": 0.06577036529779434, "grad_norm_var": 2.890857374302404e-05, "learning_rate": 0.009978903406454706, "loss": 3.0818, "step": 627 }, { "crossentropy": 3.1677082777023315, "epoch": 0.034149922510128064, "grad_norm": 0.06174301728606224, "grad_norm_var": 2.890741779346297e-05, "learning_rate": 0.009978788987859328, "loss": 3.1677, "step": 628 }, { "crossentropy": 3.107518196105957, "epoch": 0.03420430136762827, "grad_norm": 0.05471029505133629, "grad_norm_var": 3.216596072217262e-05, "learning_rate": 0.009978674260484101, "loss": 3.1075, "step": 629 }, { "crossentropy": 3.049934148788452, "epoch": 0.03425868022512847, "grad_norm": 0.06291560083627701, "grad_norm_var": 3.1399154891290775e-05, "learning_rate": 0.009978559224336136, "loss": 3.0499, "step": 630 }, { "crossentropy": 3.0060492753982544, "epoch": 0.034313059082628675, "grad_norm": 0.058562833815813065, "grad_norm_var": 3.188779903847087e-05, "learning_rate": 0.009978443879422571, "loss": 3.006, "step": 631 }, { "crossentropy": 3.0724825859069824, "epoch": 0.03436743794012888, "grad_norm": 0.061353448778390884, "grad_norm_var": 3.078760323231329e-05, "learning_rate": 0.009978328225750557, "loss": 3.0725, "step": 632 }, { "crossentropy": 3.0542484521865845, "epoch": 0.03442181679762908, "grad_norm": 0.05786314606666565, "grad_norm_var": 3.049855065621887e-05, "learning_rate": 0.009978212263327268, "loss": 3.0542, "step": 633 }, { "crossentropy": 3.1246825456619263, "epoch": 0.034476195655129285, "grad_norm": 0.058651093393564224, "grad_norm_var": 3.098390552666123e-05, "learning_rate": 0.009978095992159897, "loss": 3.1247, "step": 634 }, { "crossentropy": 3.1544363498687744, "epoch": 0.03453057451262949, "grad_norm": 0.06368304789066315, "grad_norm_var": 3.0164846454872978e-05, "learning_rate": 0.009977979412255651, "loss": 3.1544, "step": 635 }, { "crossentropy": 3.1056054830551147, "epoch": 0.03458495337012969, "grad_norm": 0.057135146111249924, "grad_norm_var": 3.135483764940174e-05, "learning_rate": 0.009977862523621765, "loss": 3.1056, "step": 636 }, { "crossentropy": 3.154773235321045, "epoch": 0.034639332227629896, "grad_norm": 0.06083236634731293, "grad_norm_var": 3.0336143359622573e-05, "learning_rate": 0.009977745326265483, "loss": 3.1548, "step": 637 }, { "crossentropy": 2.9576175212860107, "epoch": 0.0346937110851301, "grad_norm": 0.05990045890212059, "grad_norm_var": 1.745917316714468e-05, "learning_rate": 0.00997762782019408, "loss": 2.9576, "step": 638 }, { "crossentropy": 3.026512384414673, "epoch": 0.0347480899426303, "grad_norm": 0.06206187605857849, "grad_norm_var": 1.7016980016122153e-05, "learning_rate": 0.009977510005414839, "loss": 3.0265, "step": 639 }, { "crossentropy": 3.1070432662963867, "epoch": 0.03480246880013051, "grad_norm": 0.055953383445739746, "grad_norm_var": 1.7439578943820717e-05, "learning_rate": 0.009977391881935069, "loss": 3.107, "step": 640 }, { "crossentropy": 3.1480385065078735, "epoch": 0.03485684765763071, "grad_norm": 0.056693851947784424, "grad_norm_var": 1.642454472402007e-05, "learning_rate": 0.009977273449762094, "loss": 3.148, "step": 641 }, { "crossentropy": 3.0095590353012085, "epoch": 0.034911226515130914, "grad_norm": 0.058862123638391495, "grad_norm_var": 9.182851800684787e-06, "learning_rate": 0.00997715470890326, "loss": 3.0096, "step": 642 }, { "crossentropy": 3.007493734359741, "epoch": 0.034965605372631124, "grad_norm": 0.060051098465919495, "grad_norm_var": 6.669267931036273e-06, "learning_rate": 0.00997703565936593, "loss": 3.0075, "step": 643 }, { "crossentropy": 3.0283807516098022, "epoch": 0.03501998423013133, "grad_norm": 0.061011508107185364, "grad_norm_var": 6.4776785467054435e-06, "learning_rate": 0.00997691630115749, "loss": 3.0284, "step": 644 }, { "crossentropy": 3.0159218311309814, "epoch": 0.03507436308763153, "grad_norm": 0.0620160736143589, "grad_norm_var": 5.254982301038045e-06, "learning_rate": 0.00997679663428534, "loss": 3.0159, "step": 645 }, { "crossentropy": 3.022797107696533, "epoch": 0.035128741945131735, "grad_norm": 0.06208885461091995, "grad_norm_var": 4.959407026978943e-06, "learning_rate": 0.009976676658756903, "loss": 3.0228, "step": 646 }, { "crossentropy": 3.0016894340515137, "epoch": 0.03518312080263194, "grad_norm": 0.06564085185527802, "grad_norm_var": 6.927695618165012e-06, "learning_rate": 0.00997655637457962, "loss": 3.0017, "step": 647 }, { "crossentropy": 3.0350375175476074, "epoch": 0.03523749966013214, "grad_norm": 0.06434887647628784, "grad_norm_var": 7.93422320101474e-06, "learning_rate": 0.00997643578176095, "loss": 3.035, "step": 648 }, { "crossentropy": 2.9872864484786987, "epoch": 0.035291878517632345, "grad_norm": 0.11463742703199387, "grad_norm_var": 0.00019000162444367063, "learning_rate": 0.009976314880308373, "loss": 2.9873, "step": 649 }, { "crossentropy": 3.039174199104309, "epoch": 0.03534625737513255, "grad_norm": 0.0647561177611351, "grad_norm_var": 0.00018799903090511137, "learning_rate": 0.009976193670229385, "loss": 3.0392, "step": 650 }, { "crossentropy": 3.068402886390686, "epoch": 0.03540063623263275, "grad_norm": 0.0676182433962822, "grad_norm_var": 0.00018861455025610542, "learning_rate": 0.009976072151531505, "loss": 3.0684, "step": 651 }, { "crossentropy": 3.0840961933135986, "epoch": 0.035455015090132956, "grad_norm": 0.060166772454977036, "grad_norm_var": 0.00018617134433498718, "learning_rate": 0.00997595032422227, "loss": 3.0841, "step": 652 }, { "crossentropy": 3.041886806488037, "epoch": 0.03550939394763316, "grad_norm": 0.0597691684961319, "grad_norm_var": 0.00018680302567537037, "learning_rate": 0.009975828188309235, "loss": 3.0419, "step": 653 }, { "crossentropy": 2.978552222251892, "epoch": 0.03556377280513336, "grad_norm": 0.06196580082178116, "grad_norm_var": 0.00018574145233049745, "learning_rate": 0.009975705743799974, "loss": 2.9786, "step": 654 }, { "crossentropy": 2.9929726123809814, "epoch": 0.03561815166263357, "grad_norm": 0.05881491303443909, "grad_norm_var": 0.0001876085706310685, "learning_rate": 0.009975582990702083, "loss": 2.993, "step": 655 }, { "crossentropy": 3.023211359977722, "epoch": 0.03567253052013377, "grad_norm": 0.058890439569950104, "grad_norm_var": 0.000184742174959504, "learning_rate": 0.009975459929023172, "loss": 3.0232, "step": 656 }, { "crossentropy": 3.151142120361328, "epoch": 0.035726909377633974, "grad_norm": 0.06342770159244537, "grad_norm_var": 0.00018026828281925277, "learning_rate": 0.009975336558770876, "loss": 3.1511, "step": 657 }, { "crossentropy": 2.860145092010498, "epoch": 0.03578128823513418, "grad_norm": 0.06358429789543152, "grad_norm_var": 0.0001776374144996475, "learning_rate": 0.009975212879952846, "loss": 2.8601, "step": 658 }, { "crossentropy": 3.0731258392333984, "epoch": 0.03583566709263438, "grad_norm": 0.059668201953172684, "grad_norm_var": 0.00017792727447814703, "learning_rate": 0.009975088892576749, "loss": 3.0731, "step": 659 }, { "crossentropy": 2.918477773666382, "epoch": 0.035890045950134584, "grad_norm": 0.0578233040869236, "grad_norm_var": 0.0001804813619645386, "learning_rate": 0.009974964596650279, "loss": 2.9185, "step": 660 }, { "crossentropy": 2.993516445159912, "epoch": 0.035944424807634795, "grad_norm": 0.05968186631798744, "grad_norm_var": 0.00018185205559219259, "learning_rate": 0.009974839992181142, "loss": 2.9935, "step": 661 }, { "crossentropy": 3.0034173727035522, "epoch": 0.035998803665135, "grad_norm": 0.056614432483911514, "grad_norm_var": 0.00018598156409650358, "learning_rate": 0.009974715079177068, "loss": 3.0034, "step": 662 }, { "crossentropy": 3.0498112440109253, "epoch": 0.0360531825226352, "grad_norm": 0.0572628490626812, "grad_norm_var": 0.00018947168691616317, "learning_rate": 0.009974589857645802, "loss": 3.0498, "step": 663 }, { "crossentropy": 3.04518723487854, "epoch": 0.036107561380135406, "grad_norm": 0.06501022726297379, "grad_norm_var": 0.00018950206354086374, "learning_rate": 0.00997446432759511, "loss": 3.0452, "step": 664 }, { "crossentropy": 3.0125991106033325, "epoch": 0.03616194023763561, "grad_norm": 0.07053180783987045, "grad_norm_var": 1.5389646015384387e-05, "learning_rate": 0.009974338489032779, "loss": 3.0126, "step": 665 }, { "crossentropy": 3.086272716522217, "epoch": 0.03621631909513581, "grad_norm": 0.0664203017950058, "grad_norm_var": 1.626324722552493e-05, "learning_rate": 0.009974212341966613, "loss": 3.0863, "step": 666 }, { "crossentropy": 2.8528302907943726, "epoch": 0.036270697952636016, "grad_norm": 0.0754006877541542, "grad_norm_var": 2.6186505766061705e-05, "learning_rate": 0.009974085886404433, "loss": 2.8528, "step": 667 }, { "crossentropy": 2.878618836402893, "epoch": 0.03632507681013622, "grad_norm": 0.065501369535923, "grad_norm_var": 2.652636694310626e-05, "learning_rate": 0.009973959122354084, "loss": 2.8786, "step": 668 }, { "crossentropy": 3.0469785928726196, "epoch": 0.03637945566763642, "grad_norm": 0.0628872662782669, "grad_norm_var": 2.5989146233279468e-05, "learning_rate": 0.009973832049823429, "loss": 3.047, "step": 669 }, { "crossentropy": 3.0780484676361084, "epoch": 0.03643383452513663, "grad_norm": 0.0654931589961052, "grad_norm_var": 2.641309153951169e-05, "learning_rate": 0.009973704668820346, "loss": 3.078, "step": 670 }, { "crossentropy": 2.9451674222946167, "epoch": 0.03648821338263683, "grad_norm": 0.08121753484010696, "grad_norm_var": 4.546380443899937e-05, "learning_rate": 0.00997357697935274, "loss": 2.9452, "step": 671 }, { "crossentropy": 3.033063530921936, "epoch": 0.036542592240137034, "grad_norm": 0.05746497958898544, "grad_norm_var": 4.662625949857311e-05, "learning_rate": 0.009973448981428521, "loss": 3.0331, "step": 672 }, { "crossentropy": 2.9543120861053467, "epoch": 0.03659697109763724, "grad_norm": 0.05829761549830437, "grad_norm_var": 4.8833154456220015e-05, "learning_rate": 0.009973320675055637, "loss": 2.9543, "step": 673 }, { "crossentropy": 3.0502912998199463, "epoch": 0.03665134995513744, "grad_norm": 0.06339951604604721, "grad_norm_var": 4.884377479106282e-05, "learning_rate": 0.00997319206024204, "loss": 3.0503, "step": 674 }, { "crossentropy": 3.006015181541443, "epoch": 0.036705728812637645, "grad_norm": 0.05803437530994415, "grad_norm_var": 4.993622736541344e-05, "learning_rate": 0.009973063136995709, "loss": 3.006, "step": 675 }, { "crossentropy": 2.977598190307617, "epoch": 0.03676010767013785, "grad_norm": 0.05635220557451248, "grad_norm_var": 5.124675138247006e-05, "learning_rate": 0.009972933905324636, "loss": 2.9776, "step": 676 }, { "crossentropy": 3.0397560596466064, "epoch": 0.03681448652763805, "grad_norm": 0.054731205105781555, "grad_norm_var": 5.544616181011036e-05, "learning_rate": 0.00997280436523684, "loss": 3.0398, "step": 677 }, { "crossentropy": 2.906584858894348, "epoch": 0.036868865385138255, "grad_norm": 0.058770567178726196, "grad_norm_var": 5.3782028249947196e-05, "learning_rate": 0.009972674516740353, "loss": 2.9066, "step": 678 }, { "crossentropy": 3.0823757648468018, "epoch": 0.03692324424263846, "grad_norm": 0.06179613620042801, "grad_norm_var": 5.126717099881711e-05, "learning_rate": 0.00997254435984323, "loss": 3.0824, "step": 679 }, { "crossentropy": 3.093528151512146, "epoch": 0.03697762310013867, "grad_norm": 0.05681333318352699, "grad_norm_var": 5.4178572813525184e-05, "learning_rate": 0.00997241389455354, "loss": 3.0935, "step": 680 }, { "crossentropy": 3.0928597450256348, "epoch": 0.03703200195763887, "grad_norm": 0.060900866985321045, "grad_norm_var": 5.07142585154811e-05, "learning_rate": 0.009972283120879376, "loss": 3.0929, "step": 681 }, { "crossentropy": 3.0887988805770874, "epoch": 0.037086380815139076, "grad_norm": 0.05783766880631447, "grad_norm_var": 5.108088274631313e-05, "learning_rate": 0.00997215203882885, "loss": 3.0888, "step": 682 }, { "crossentropy": 3.0449496507644653, "epoch": 0.03714075967263928, "grad_norm": 0.060057032853364944, "grad_norm_var": 3.8750324596229886e-05, "learning_rate": 0.00997202064841009, "loss": 3.0449, "step": 683 }, { "crossentropy": 3.064734935760498, "epoch": 0.037195138530139484, "grad_norm": 0.06056426465511322, "grad_norm_var": 3.7456852735895156e-05, "learning_rate": 0.009971888949631243, "loss": 3.0647, "step": 684 }, { "crossentropy": 3.0440436601638794, "epoch": 0.03724951738763969, "grad_norm": 0.06163174286484718, "grad_norm_var": 3.722497735946327e-05, "learning_rate": 0.009971756942500479, "loss": 3.044, "step": 685 }, { "crossentropy": 3.069283962249756, "epoch": 0.03730389624513989, "grad_norm": 0.06301704794168472, "grad_norm_var": 3.6070335671900024e-05, "learning_rate": 0.009971624627025985, "loss": 3.0693, "step": 686 }, { "crossentropy": 3.1407045125961304, "epoch": 0.037358275102640094, "grad_norm": 0.06274325400590897, "grad_norm_var": 6.813639581209653e-06, "learning_rate": 0.009971492003215968, "loss": 3.1407, "step": 687 }, { "crossentropy": 2.9450682401657104, "epoch": 0.0374126539601403, "grad_norm": 0.05932976305484772, "grad_norm_var": 6.5185956798200285e-06, "learning_rate": 0.00997135907107865, "loss": 2.9451, "step": 688 }, { "crossentropy": 3.059021234512329, "epoch": 0.0374670328176405, "grad_norm": 0.058742258697748184, "grad_norm_var": 6.4512325193203174e-06, "learning_rate": 0.00997122583062228, "loss": 3.059, "step": 689 }, { "crossentropy": 3.1101068258285522, "epoch": 0.037521411675140705, "grad_norm": 0.05809002369642258, "grad_norm_var": 5.5729617073831e-06, "learning_rate": 0.009971092281855115, "loss": 3.1101, "step": 690 }, { "crossentropy": 3.0629262924194336, "epoch": 0.03757579053264091, "grad_norm": 0.058776773512363434, "grad_norm_var": 5.478344574458688e-06, "learning_rate": 0.009970958424785443, "loss": 3.0629, "step": 691 }, { "crossentropy": 3.080201268196106, "epoch": 0.03763016939014111, "grad_norm": 0.0630103349685669, "grad_norm_var": 5.556972384775961e-06, "learning_rate": 0.009970824259421564, "loss": 3.0802, "step": 692 }, { "crossentropy": 2.9810022115707397, "epoch": 0.037684548247641315, "grad_norm": 0.06113829463720322, "grad_norm_var": 3.791828801056179e-06, "learning_rate": 0.009970689785771798, "loss": 2.981, "step": 693 }, { "crossentropy": 3.070796489715576, "epoch": 0.03773892710514152, "grad_norm": 0.058285191655159, "grad_norm_var": 3.899139686055984e-06, "learning_rate": 0.009970555003844486, "loss": 3.0708, "step": 694 }, { "crossentropy": 2.997870683670044, "epoch": 0.03779330596264172, "grad_norm": 0.055411141365766525, "grad_norm_var": 5.063511056550146e-06, "learning_rate": 0.009970419913647988, "loss": 2.9979, "step": 695 }, { "crossentropy": 3.056097984313965, "epoch": 0.037847684820141926, "grad_norm": 0.06489531695842743, "grad_norm_var": 5.957864483195572e-06, "learning_rate": 0.00997028451519068, "loss": 3.0561, "step": 696 }, { "crossentropy": 3.0351805686950684, "epoch": 0.03790206367764213, "grad_norm": 0.05745242163538933, "grad_norm_var": 6.4142215396837765e-06, "learning_rate": 0.009970148808480959, "loss": 3.0352, "step": 697 }, { "crossentropy": 3.0391197204589844, "epoch": 0.03795644253514234, "grad_norm": 0.057529836893081665, "grad_norm_var": 6.511415797334254e-06, "learning_rate": 0.009970012793527242, "loss": 3.0391, "step": 698 }, { "crossentropy": 2.9671233892440796, "epoch": 0.038010821392642544, "grad_norm": 0.05733336880803108, "grad_norm_var": 6.969664421592875e-06, "learning_rate": 0.009969876470337969, "loss": 2.9671, "step": 699 }, { "crossentropy": 3.126606822013855, "epoch": 0.03806520025014275, "grad_norm": 0.06049204617738724, "grad_norm_var": 6.9633239037196365e-06, "learning_rate": 0.009969739838921588, "loss": 3.1266, "step": 700 }, { "crossentropy": 3.0050108432769775, "epoch": 0.03811957910764295, "grad_norm": 0.05857168138027191, "grad_norm_var": 6.828716699436299e-06, "learning_rate": 0.009969602899286575, "loss": 3.005, "step": 701 }, { "crossentropy": 3.042618989944458, "epoch": 0.038173957965143154, "grad_norm": 0.0637718141078949, "grad_norm_var": 7.200531859483799e-06, "learning_rate": 0.009969465651441424, "loss": 3.0426, "step": 702 }, { "crossentropy": 2.9497053623199463, "epoch": 0.03822833682264336, "grad_norm": 0.06139027699828148, "grad_norm_var": 6.770158724275256e-06, "learning_rate": 0.009969328095394647, "loss": 2.9497, "step": 703 }, { "crossentropy": 3.124552011489868, "epoch": 0.03828271568014356, "grad_norm": 0.056553035974502563, "grad_norm_var": 7.366455977088753e-06, "learning_rate": 0.009969190231154773, "loss": 3.1246, "step": 704 }, { "crossentropy": 2.9449578523635864, "epoch": 0.038337094537643765, "grad_norm": 0.06040225550532341, "grad_norm_var": 7.378661072550823e-06, "learning_rate": 0.009969052058730355, "loss": 2.945, "step": 705 }, { "crossentropy": 3.0076874494552612, "epoch": 0.03839147339514397, "grad_norm": 0.05640886351466179, "grad_norm_var": 7.886821669067281e-06, "learning_rate": 0.00996891357812996, "loss": 3.0077, "step": 706 }, { "crossentropy": 2.8675854206085205, "epoch": 0.03844585225264417, "grad_norm": 0.06271214783191681, "grad_norm_var": 8.494214868021156e-06, "learning_rate": 0.009968774789362178, "loss": 2.8676, "step": 707 }, { "crossentropy": 3.0342934131622314, "epoch": 0.038500231110144376, "grad_norm": 0.06316079199314117, "grad_norm_var": 8.561839982876297e-06, "learning_rate": 0.009968635692435616, "loss": 3.0343, "step": 708 }, { "crossentropy": 3.1128222942352295, "epoch": 0.03855460996764458, "grad_norm": 0.07152464985847473, "grad_norm_var": 1.7269231614395337e-05, "learning_rate": 0.0099684962873589, "loss": 3.1128, "step": 709 }, { "crossentropy": 3.039461135864258, "epoch": 0.03860898882514478, "grad_norm": 0.06829336285591125, "grad_norm_var": 2.0749532268619226e-05, "learning_rate": 0.009968356574140677, "loss": 3.0395, "step": 710 }, { "crossentropy": 3.0750261545181274, "epoch": 0.038663367682644986, "grad_norm": 0.06162254884839058, "grad_norm_var": 1.8537277696052214e-05, "learning_rate": 0.009968216552789611, "loss": 3.075, "step": 711 }, { "crossentropy": 3.1054517030715942, "epoch": 0.03871774654014519, "grad_norm": 0.0628054141998291, "grad_norm_var": 1.7831301903457423e-05, "learning_rate": 0.009968076223314385, "loss": 3.1055, "step": 712 }, { "crossentropy": 2.8960793018341064, "epoch": 0.03877212539764539, "grad_norm": 0.059326156973838806, "grad_norm_var": 1.710159511000188e-05, "learning_rate": 0.009967935585723706, "loss": 2.8961, "step": 713 }, { "crossentropy": 2.9659582376480103, "epoch": 0.0388265042551456, "grad_norm": 0.0589926652610302, "grad_norm_var": 1.648660196397663e-05, "learning_rate": 0.009967794640026293, "loss": 2.966, "step": 714 }, { "crossentropy": 2.9011032581329346, "epoch": 0.0388808831126458, "grad_norm": 0.08000623434782028, "grad_norm_var": 3.6140067620481164e-05, "learning_rate": 0.009967653386230886, "loss": 2.9011, "step": 715 }, { "crossentropy": 3.047470808029175, "epoch": 0.038935261970146004, "grad_norm": 0.06145835295319557, "grad_norm_var": 3.589113164547593e-05, "learning_rate": 0.00996751182434625, "loss": 3.0475, "step": 716 }, { "crossentropy": 3.090545177459717, "epoch": 0.038989640827646214, "grad_norm": 0.059263184666633606, "grad_norm_var": 3.55184858497661e-05, "learning_rate": 0.00996736995438116, "loss": 3.0905, "step": 717 }, { "crossentropy": 2.983991503715515, "epoch": 0.03904401968514642, "grad_norm": 0.058152616024017334, "grad_norm_var": 3.689924999049354e-05, "learning_rate": 0.009967227776344416, "loss": 2.984, "step": 718 }, { "crossentropy": 3.103682279586792, "epoch": 0.03909839854264662, "grad_norm": 0.05765942111611366, "grad_norm_var": 3.838567100718666e-05, "learning_rate": 0.009967085290244837, "loss": 3.1037, "step": 719 }, { "crossentropy": 2.928007125854492, "epoch": 0.039152777400146825, "grad_norm": 0.05963914468884468, "grad_norm_var": 3.657650892859432e-05, "learning_rate": 0.00996694249609126, "loss": 2.928, "step": 720 }, { "crossentropy": 3.005379557609558, "epoch": 0.03920715625764703, "grad_norm": 0.07757055014371872, "grad_norm_var": 4.999217020542045e-05, "learning_rate": 0.009966799393892539, "loss": 3.0054, "step": 721 }, { "crossentropy": 2.995793581008911, "epoch": 0.03926153511514723, "grad_norm": 0.05589659512042999, "grad_norm_var": 5.050399592291804e-05, "learning_rate": 0.009966655983657549, "loss": 2.9958, "step": 722 }, { "crossentropy": 2.940536141395569, "epoch": 0.039315913972647436, "grad_norm": 0.06434846669435501, "grad_norm_var": 5.0471036641528263e-05, "learning_rate": 0.009966512265395187, "loss": 2.9405, "step": 723 }, { "crossentropy": 3.0116602182388306, "epoch": 0.03937029283014764, "grad_norm": 0.060823649168014526, "grad_norm_var": 5.099058456360008e-05, "learning_rate": 0.009966368239114365, "loss": 3.0117, "step": 724 }, { "crossentropy": 2.839818239212036, "epoch": 0.03942467168764784, "grad_norm": 0.06355155259370804, "grad_norm_var": 4.652477602266839e-05, "learning_rate": 0.009966223904824014, "loss": 2.8398, "step": 725 }, { "crossentropy": 3.0072935819625854, "epoch": 0.039479050545148046, "grad_norm": 0.056979354470968246, "grad_norm_var": 4.6672911545838035e-05, "learning_rate": 0.009966079262533086, "loss": 3.0073, "step": 726 }, { "crossentropy": 3.070927858352661, "epoch": 0.03953342940264825, "grad_norm": 0.0627264603972435, "grad_norm_var": 4.663744112837435e-05, "learning_rate": 0.009965934312250553, "loss": 3.0709, "step": 727 }, { "crossentropy": 3.031925916671753, "epoch": 0.039587808260148453, "grad_norm": 0.056358907371759415, "grad_norm_var": 4.892928136251695e-05, "learning_rate": 0.009965789053985404, "loss": 3.0319, "step": 728 }, { "crossentropy": 3.0364245176315308, "epoch": 0.03964218711764866, "grad_norm": 0.0668339654803276, "grad_norm_var": 4.972847277441086e-05, "learning_rate": 0.009965643487746647, "loss": 3.0364, "step": 729 }, { "crossentropy": 2.993990421295166, "epoch": 0.03969656597514886, "grad_norm": 0.05768408626317978, "grad_norm_var": 5.045029387855665e-05, "learning_rate": 0.00996549761354331, "loss": 2.994, "step": 730 }, { "crossentropy": 3.03268563747406, "epoch": 0.039750944832649064, "grad_norm": 0.06468421220779419, "grad_norm_var": 2.9225201977478056e-05, "learning_rate": 0.00996535143138444, "loss": 3.0327, "step": 731 }, { "crossentropy": 3.0336239337921143, "epoch": 0.03980532369014927, "grad_norm": 0.06075684353709221, "grad_norm_var": 2.9257694681949532e-05, "learning_rate": 0.009965204941279105, "loss": 3.0336, "step": 732 }, { "crossentropy": 3.079004406929016, "epoch": 0.03985970254764947, "grad_norm": 0.06416825205087662, "grad_norm_var": 2.9342305054790888e-05, "learning_rate": 0.009965058143236387, "loss": 3.079, "step": 733 }, { "crossentropy": 3.0050703287124634, "epoch": 0.039914081405149675, "grad_norm": 0.14038623869419098, "grad_norm_var": 0.00041266059663502364, "learning_rate": 0.009964911037265392, "loss": 3.0051, "step": 734 }, { "crossentropy": 2.944147825241089, "epoch": 0.03996846026264988, "grad_norm": 0.05739820748567581, "grad_norm_var": 0.00041298597318033535, "learning_rate": 0.009964763623375244, "loss": 2.9441, "step": 735 }, { "crossentropy": 2.9881703853607178, "epoch": 0.04002283912015009, "grad_norm": 0.060751643031835556, "grad_norm_var": 0.000411991803556318, "learning_rate": 0.009964615901575085, "loss": 2.9882, "step": 736 }, { "crossentropy": 3.0121716260910034, "epoch": 0.04007721797765029, "grad_norm": 0.05887284129858017, "grad_norm_var": 0.0004073209598778269, "learning_rate": 0.009964467871874074, "loss": 3.0122, "step": 737 }, { "crossentropy": 3.0307023525238037, "epoch": 0.040131596835150496, "grad_norm": 0.060683343559503555, "grad_norm_var": 0.00040245542392704773, "learning_rate": 0.009964319534281396, "loss": 3.0307, "step": 738 }, { "crossentropy": 2.98019278049469, "epoch": 0.0401859756926507, "grad_norm": 0.05676284432411194, "grad_norm_var": 0.00040778588659466506, "learning_rate": 0.009964170888806249, "loss": 2.9802, "step": 739 }, { "crossentropy": 3.0798354148864746, "epoch": 0.0402403545501509, "grad_norm": 0.05436794087290764, "grad_norm_var": 0.0004144923898748006, "learning_rate": 0.00996402193545785, "loss": 3.0798, "step": 740 }, { "crossentropy": 2.9947946071624756, "epoch": 0.04029473340765111, "grad_norm": 0.060199182480573654, "grad_norm_var": 0.00041592509865924493, "learning_rate": 0.00996387267424544, "loss": 2.9948, "step": 741 }, { "crossentropy": 3.060669422149658, "epoch": 0.04034911226515131, "grad_norm": 0.06301764398813248, "grad_norm_var": 0.00041176585033676956, "learning_rate": 0.009963723105178275, "loss": 3.0607, "step": 742 }, { "crossentropy": 2.9562700986862183, "epoch": 0.040403491122651514, "grad_norm": 0.06159183010458946, "grad_norm_var": 0.00041224370909282847, "learning_rate": 0.009963573228265628, "loss": 2.9563, "step": 743 }, { "crossentropy": 2.9774423837661743, "epoch": 0.04045786998015172, "grad_norm": 0.05740725249052048, "grad_norm_var": 0.00041106508196412164, "learning_rate": 0.009963423043516799, "loss": 2.9774, "step": 744 }, { "crossentropy": 2.9793492555618286, "epoch": 0.04051224883765192, "grad_norm": 0.058490145951509476, "grad_norm_var": 0.0004137630230959282, "learning_rate": 0.0099632725509411, "loss": 2.9793, "step": 745 }, { "crossentropy": 3.0733444690704346, "epoch": 0.040566627695152124, "grad_norm": 0.05569569393992424, "grad_norm_var": 0.0004159036943003585, "learning_rate": 0.009963121750547864, "loss": 3.0733, "step": 746 }, { "crossentropy": 3.017704486846924, "epoch": 0.04062100655265233, "grad_norm": 0.05554655194282532, "grad_norm_var": 0.0004211440794642712, "learning_rate": 0.009962970642346445, "loss": 3.0177, "step": 747 }, { "crossentropy": 2.997084856033325, "epoch": 0.04067538541015253, "grad_norm": 0.0553351454436779, "grad_norm_var": 0.0004254204300489763, "learning_rate": 0.009962819226346214, "loss": 2.9971, "step": 748 }, { "crossentropy": 3.1512430906295776, "epoch": 0.040729764267652735, "grad_norm": 0.058984920382499695, "grad_norm_var": 0.0004268397002241168, "learning_rate": 0.009962667502556559, "loss": 3.1512, "step": 749 }, { "crossentropy": 2.9325473308563232, "epoch": 0.04078414312515294, "grad_norm": 0.05904935672879219, "grad_norm_var": 6.150585156091253e-06, "learning_rate": 0.009962515470986895, "loss": 2.9325, "step": 750 }, { "crossentropy": 3.05277419090271, "epoch": 0.04083852198265314, "grad_norm": 0.061932455748319626, "grad_norm_var": 6.839172636492983e-06, "learning_rate": 0.009962363131646649, "loss": 3.0528, "step": 751 }, { "crossentropy": 3.061272144317627, "epoch": 0.040892900840153346, "grad_norm": 0.06386718153953552, "grad_norm_var": 8.311369342111664e-06, "learning_rate": 0.009962210484545264, "loss": 3.0613, "step": 752 }, { "crossentropy": 2.9452918767929077, "epoch": 0.04094727969765355, "grad_norm": 0.055794212967157364, "grad_norm_var": 8.899607539589877e-06, "learning_rate": 0.009962057529692212, "loss": 2.9453, "step": 753 }, { "crossentropy": 2.9591753482818604, "epoch": 0.04100165855515376, "grad_norm": 0.05436498671770096, "grad_norm_var": 9.698873457867663e-06, "learning_rate": 0.009961904267096979, "loss": 2.9592, "step": 754 }, { "crossentropy": 2.956863045692444, "epoch": 0.04105603741265396, "grad_norm": 0.05743281543254852, "grad_norm_var": 9.591806195052323e-06, "learning_rate": 0.009961750696769071, "loss": 2.9569, "step": 755 }, { "crossentropy": 3.0673009157180786, "epoch": 0.04111041627015417, "grad_norm": 0.06203604117035866, "grad_norm_var": 9.228880724938306e-06, "learning_rate": 0.009961596818718008, "loss": 3.0673, "step": 756 }, { "crossentropy": 3.0190192461013794, "epoch": 0.04116479512765437, "grad_norm": 0.06430745124816895, "grad_norm_var": 1.1052045445597338e-05, "learning_rate": 0.009961442632953336, "loss": 3.019, "step": 757 }, { "crossentropy": 3.107649087905884, "epoch": 0.041219173985154574, "grad_norm": 0.06046900153160095, "grad_norm_var": 1.0110878520222522e-05, "learning_rate": 0.00996128813948462, "loss": 3.1076, "step": 758 }, { "crossentropy": 3.041380286216736, "epoch": 0.04127355284265478, "grad_norm": 0.0548931248486042, "grad_norm_var": 1.0505881691911384e-05, "learning_rate": 0.009961133338321437, "loss": 3.0414, "step": 759 }, { "crossentropy": 2.8784072399139404, "epoch": 0.04132793170015498, "grad_norm": 0.05820148438215256, "grad_norm_var": 1.0432193128049466e-05, "learning_rate": 0.00996097822947339, "loss": 2.8784, "step": 760 }, { "crossentropy": 3.108930826187134, "epoch": 0.041382310557655184, "grad_norm": 0.05861591920256615, "grad_norm_var": 1.0432596718304432e-05, "learning_rate": 0.009960822812950099, "loss": 3.1089, "step": 761 }, { "crossentropy": 2.825646758079529, "epoch": 0.04143668941515539, "grad_norm": 0.07019087672233582, "grad_norm_var": 1.808105609534625e-05, "learning_rate": 0.009960667088761201, "loss": 2.8256, "step": 762 }, { "crossentropy": 2.9300631284713745, "epoch": 0.04149106827265559, "grad_norm": 0.0620015487074852, "grad_norm_var": 1.733527701195016e-05, "learning_rate": 0.009960511056916357, "loss": 2.9301, "step": 763 }, { "crossentropy": 2.9706674814224243, "epoch": 0.041545447130155795, "grad_norm": 0.05418604984879494, "grad_norm_var": 1.8108354176322022e-05, "learning_rate": 0.00996035471742524, "loss": 2.9707, "step": 764 }, { "crossentropy": 2.955359101295471, "epoch": 0.041599825987656, "grad_norm": 0.05822041258215904, "grad_norm_var": 1.8224957604033125e-05, "learning_rate": 0.009960198070297547, "loss": 2.9554, "step": 765 }, { "crossentropy": 2.998953938484192, "epoch": 0.0416542048451562, "grad_norm": 0.05573857203125954, "grad_norm_var": 1.9207270098604382e-05, "learning_rate": 0.009960041115542996, "loss": 2.999, "step": 766 }, { "crossentropy": 2.978398084640503, "epoch": 0.041708583702656406, "grad_norm": 0.056782882660627365, "grad_norm_var": 1.9205323597111062e-05, "learning_rate": 0.00995988385317132, "loss": 2.9784, "step": 767 }, { "crossentropy": 3.0440038442611694, "epoch": 0.04176296256015661, "grad_norm": 0.05401923134922981, "grad_norm_var": 1.9130420365986017e-05, "learning_rate": 0.00995972628319227, "loss": 3.044, "step": 768 }, { "crossentropy": 2.9049782752990723, "epoch": 0.04181734141765681, "grad_norm": 0.06678026169538498, "grad_norm_var": 2.259543492332423e-05, "learning_rate": 0.00995956840561562, "loss": 2.905, "step": 769 }, { "crossentropy": 2.8562495708465576, "epoch": 0.041871720275157016, "grad_norm": 0.05421856790781021, "grad_norm_var": 2.269243618146302e-05, "learning_rate": 0.009959410220451161, "loss": 2.8562, "step": 770 }, { "crossentropy": 2.8720961809158325, "epoch": 0.04192609913265722, "grad_norm": 0.052148960530757904, "grad_norm_var": 2.5721762935522938e-05, "learning_rate": 0.009959251727708705, "loss": 2.8721, "step": 771 }, { "crossentropy": 3.019200325012207, "epoch": 0.041980477990157423, "grad_norm": 0.05801307037472725, "grad_norm_var": 2.5064879144460575e-05, "learning_rate": 0.00995909292739808, "loss": 3.0192, "step": 772 }, { "crossentropy": 3.003159761428833, "epoch": 0.042034856847657634, "grad_norm": 0.055672578513622284, "grad_norm_var": 2.32393045085597e-05, "learning_rate": 0.009958933819529133, "loss": 3.0032, "step": 773 }, { "crossentropy": 3.015499234199524, "epoch": 0.04208923570515784, "grad_norm": 0.0627250149846077, "grad_norm_var": 2.425961634412351e-05, "learning_rate": 0.009958774404111735, "loss": 3.0155, "step": 774 }, { "crossentropy": 2.9380348920822144, "epoch": 0.04214361456265804, "grad_norm": 0.05771037936210632, "grad_norm_var": 2.3485126081710694e-05, "learning_rate": 0.009958614681155773, "loss": 2.938, "step": 775 }, { "crossentropy": 2.872120499610901, "epoch": 0.042197993420158245, "grad_norm": 0.062076304107904434, "grad_norm_var": 2.4294288126706225e-05, "learning_rate": 0.00995845465067115, "loss": 2.8721, "step": 776 }, { "crossentropy": 2.987724781036377, "epoch": 0.04225237227765845, "grad_norm": 0.06063123047351837, "grad_norm_var": 2.452720623537883e-05, "learning_rate": 0.009958294312667792, "loss": 2.9877, "step": 777 }, { "crossentropy": 2.9780677556991577, "epoch": 0.04230675113515865, "grad_norm": 0.0595758892595768, "grad_norm_var": 1.5475657839558733e-05, "learning_rate": 0.009958133667155644, "loss": 2.9781, "step": 778 }, { "crossentropy": 3.062593460083008, "epoch": 0.042361129992658855, "grad_norm": 0.06234007701277733, "grad_norm_var": 1.5656383396140706e-05, "learning_rate": 0.009957972714144668, "loss": 3.0626, "step": 779 }, { "crossentropy": 3.0667484998703003, "epoch": 0.04241550885015906, "grad_norm": 0.06073441728949547, "grad_norm_var": 1.4851483755815888e-05, "learning_rate": 0.009957811453644848, "loss": 3.0667, "step": 780 }, { "crossentropy": 2.977708578109741, "epoch": 0.04246988770765926, "grad_norm": 0.05723189935088158, "grad_norm_var": 1.4960838839769783e-05, "learning_rate": 0.009957649885666182, "loss": 2.9777, "step": 781 }, { "crossentropy": 3.09585964679718, "epoch": 0.042524266565159466, "grad_norm": 0.05629859119653702, "grad_norm_var": 1.4772382867226745e-05, "learning_rate": 0.009957488010218693, "loss": 3.0959, "step": 782 }, { "crossentropy": 2.8573254346847534, "epoch": 0.04257864542265967, "grad_norm": 0.057145945727825165, "grad_norm_var": 1.469459581694369e-05, "learning_rate": 0.00995732582731242, "loss": 2.8573, "step": 783 }, { "crossentropy": 2.910283327102661, "epoch": 0.04263302428015987, "grad_norm": 0.056985314935445786, "grad_norm_var": 1.3439717763015444e-05, "learning_rate": 0.009957163336957418, "loss": 2.9103, "step": 784 }, { "crossentropy": 3.067343831062317, "epoch": 0.04268740313766008, "grad_norm": 0.054199300706386566, "grad_norm_var": 9.892046438981078e-06, "learning_rate": 0.009957000539163771, "loss": 3.0673, "step": 785 }, { "crossentropy": 2.963623881340027, "epoch": 0.04274178199516028, "grad_norm": 0.058892492204904556, "grad_norm_var": 8.912235213745512e-06, "learning_rate": 0.00995683743394157, "loss": 2.9636, "step": 786 }, { "crossentropy": 3.0808597803115845, "epoch": 0.042796160852660484, "grad_norm": 0.05663065239787102, "grad_norm_var": 6.5076055321225745e-06, "learning_rate": 0.009956674021300934, "loss": 3.0809, "step": 787 }, { "crossentropy": 3.1148312091827393, "epoch": 0.04285053971016069, "grad_norm": 0.05939679592847824, "grad_norm_var": 6.527484015241234e-06, "learning_rate": 0.009956510301251996, "loss": 3.1148, "step": 788 }, { "crossentropy": 2.884376645088196, "epoch": 0.04290491856766089, "grad_norm": 0.05977372080087662, "grad_norm_var": 5.955816903293008e-06, "learning_rate": 0.009956346273804909, "loss": 2.8844, "step": 789 }, { "crossentropy": 3.0754506587982178, "epoch": 0.042959297425161094, "grad_norm": 0.05695514380931854, "grad_norm_var": 5.091385372298548e-06, "learning_rate": 0.009956181938969846, "loss": 3.0755, "step": 790 }, { "crossentropy": 2.9737985134124756, "epoch": 0.0430136762826613, "grad_norm": 0.059438277035951614, "grad_norm_var": 5.0877445159945465e-06, "learning_rate": 0.009956017296757002, "loss": 2.9738, "step": 791 }, { "crossentropy": 3.019282817840576, "epoch": 0.04306805514016151, "grad_norm": 0.06399255245923996, "grad_norm_var": 6.194165173424813e-06, "learning_rate": 0.009955852347176584, "loss": 3.0193, "step": 792 }, { "crossentropy": 2.7849735021591187, "epoch": 0.04312243399766171, "grad_norm": 0.06514883786439896, "grad_norm_var": 8.594499576005295e-06, "learning_rate": 0.009955687090238825, "loss": 2.785, "step": 793 }, { "crossentropy": 2.9796351194381714, "epoch": 0.043176812855161915, "grad_norm": 0.06196307763457298, "grad_norm_var": 9.11924800214269e-06, "learning_rate": 0.009955521525953972, "loss": 2.9796, "step": 794 }, { "crossentropy": 2.9994442462921143, "epoch": 0.04323119171266212, "grad_norm": 0.05808218568563461, "grad_norm_var": 8.467082722867815e-06, "learning_rate": 0.009955355654332294, "loss": 2.9994, "step": 795 }, { "crossentropy": 3.005137085914612, "epoch": 0.04328557057016232, "grad_norm": 0.06011443957686424, "grad_norm_var": 8.341890423106621e-06, "learning_rate": 0.00995518947538408, "loss": 3.0051, "step": 796 }, { "crossentropy": 3.0888339281082153, "epoch": 0.043339949427662526, "grad_norm": 0.05559494346380234, "grad_norm_var": 8.871391180880391e-06, "learning_rate": 0.009955022989119633, "loss": 3.0888, "step": 797 }, { "crossentropy": 3.0121766328811646, "epoch": 0.04339432828516273, "grad_norm": 0.05958910658955574, "grad_norm_var": 8.455800544553543e-06, "learning_rate": 0.00995485619554928, "loss": 3.0122, "step": 798 }, { "crossentropy": 2.947220206260681, "epoch": 0.04344870714266293, "grad_norm": 0.05668981373310089, "grad_norm_var": 8.581193682131022e-06, "learning_rate": 0.009954689094683365, "loss": 2.9472, "step": 799 }, { "crossentropy": 2.949976325035095, "epoch": 0.04350308600016314, "grad_norm": 0.05410008504986763, "grad_norm_var": 9.863217691258713e-06, "learning_rate": 0.009954521686532252, "loss": 2.95, "step": 800 }, { "crossentropy": 3.078596591949463, "epoch": 0.04355746485766334, "grad_norm": 0.056388501077890396, "grad_norm_var": 8.824193807621441e-06, "learning_rate": 0.009954353971106323, "loss": 3.0786, "step": 801 }, { "crossentropy": 3.0237587690353394, "epoch": 0.043611843715163544, "grad_norm": 0.05834180489182472, "grad_norm_var": 8.845307640514601e-06, "learning_rate": 0.009954185948415979, "loss": 3.0238, "step": 802 }, { "crossentropy": 2.9674206972122192, "epoch": 0.04366622257266375, "grad_norm": 0.0687539204955101, "grad_norm_var": 1.438311660158133e-05, "learning_rate": 0.009954017618471641, "loss": 2.9674, "step": 803 }, { "crossentropy": 2.998755931854248, "epoch": 0.04372060143016395, "grad_norm": 0.05911652743816376, "grad_norm_var": 1.439730866203594e-05, "learning_rate": 0.00995384898128375, "loss": 2.9988, "step": 804 }, { "crossentropy": 2.8626508712768555, "epoch": 0.043774980287664154, "grad_norm": 0.06128324195742607, "grad_norm_var": 1.4569117377164856e-05, "learning_rate": 0.009953680036862764, "loss": 2.8627, "step": 805 }, { "crossentropy": 2.9339520931243896, "epoch": 0.04382935914516436, "grad_norm": 0.056608617305755615, "grad_norm_var": 1.4704462274098273e-05, "learning_rate": 0.00995351078521916, "loss": 2.934, "step": 806 }, { "crossentropy": 3.001691222190857, "epoch": 0.04388373800266456, "grad_norm": 0.05549906939268112, "grad_norm_var": 1.581195596105315e-05, "learning_rate": 0.009953341226363434, "loss": 3.0017, "step": 807 }, { "crossentropy": 2.9784817695617676, "epoch": 0.043938116860164765, "grad_norm": 0.0662262812256813, "grad_norm_var": 1.7475471114726004e-05, "learning_rate": 0.009953171360306103, "loss": 2.9785, "step": 808 }, { "crossentropy": 3.036158800125122, "epoch": 0.04399249571766497, "grad_norm": 0.06401002407073975, "grad_norm_var": 1.6713036712923992e-05, "learning_rate": 0.009953001187057704, "loss": 3.0362, "step": 809 }, { "crossentropy": 3.0285489559173584, "epoch": 0.04404687457516518, "grad_norm": 0.06442060321569443, "grad_norm_var": 1.789017189664672e-05, "learning_rate": 0.009952830706628788, "loss": 3.0285, "step": 810 }, { "crossentropy": 2.9522318840026855, "epoch": 0.04410125343266538, "grad_norm": 0.06341134756803513, "grad_norm_var": 1.8532536631597067e-05, "learning_rate": 0.00995265991902993, "loss": 2.9522, "step": 811 }, { "crossentropy": 3.0568349361419678, "epoch": 0.044155632290165586, "grad_norm": 0.06579968333244324, "grad_norm_var": 2.0632383039840587e-05, "learning_rate": 0.009952488824271721, "loss": 3.0568, "step": 812 }, { "crossentropy": 2.9856133460998535, "epoch": 0.04421001114766579, "grad_norm": 0.05945832282304764, "grad_norm_var": 1.9108307885905228e-05, "learning_rate": 0.009952317422364772, "loss": 2.9856, "step": 813 }, { "crossentropy": 2.9961228370666504, "epoch": 0.04426439000516599, "grad_norm": 0.06800837069749832, "grad_norm_var": 2.2396959170153016e-05, "learning_rate": 0.009952145713319713, "loss": 2.9961, "step": 814 }, { "crossentropy": 2.9703924655914307, "epoch": 0.0443187688626662, "grad_norm": 0.05760686844587326, "grad_norm_var": 2.1906325084806306e-05, "learning_rate": 0.009951973697147194, "loss": 2.9704, "step": 815 }, { "crossentropy": 2.9634664058685303, "epoch": 0.0443731477201664, "grad_norm": 0.05878416448831558, "grad_norm_var": 1.8849912022929755e-05, "learning_rate": 0.009951801373857884, "loss": 2.9635, "step": 816 }, { "crossentropy": 3.0311086177825928, "epoch": 0.044427526577666604, "grad_norm": 0.05362819507718086, "grad_norm_var": 2.1200856045050913e-05, "learning_rate": 0.009951628743462466, "loss": 3.0311, "step": 817 }, { "crossentropy": 2.9633467197418213, "epoch": 0.04448190543516681, "grad_norm": 0.053371720016002655, "grad_norm_var": 2.471155007840268e-05, "learning_rate": 0.009951455805971654, "loss": 2.9633, "step": 818 }, { "crossentropy": 2.905082583427429, "epoch": 0.04453628429266701, "grad_norm": 0.0670008510351181, "grad_norm_var": 2.309101632984196e-05, "learning_rate": 0.009951282561396166, "loss": 2.9051, "step": 819 }, { "crossentropy": 3.003321409225464, "epoch": 0.044590663150167215, "grad_norm": 0.05921514332294464, "grad_norm_var": 2.3068310162123693e-05, "learning_rate": 0.009951109009746752, "loss": 3.0033, "step": 820 }, { "crossentropy": 2.9344526529312134, "epoch": 0.04464504200766742, "grad_norm": 0.06171407550573349, "grad_norm_var": 2.310216872068906e-05, "learning_rate": 0.009950935151034172, "loss": 2.9345, "step": 821 }, { "crossentropy": 2.8150253295898438, "epoch": 0.04469942086516762, "grad_norm": 0.05345988646149635, "grad_norm_var": 2.5533013649012975e-05, "learning_rate": 0.00995076098526921, "loss": 2.815, "step": 822 }, { "crossentropy": 2.9938368797302246, "epoch": 0.044753799722667825, "grad_norm": 0.05725124478340149, "grad_norm_var": 2.4503783182147633e-05, "learning_rate": 0.009950586512462667, "loss": 2.9938, "step": 823 }, { "crossentropy": 3.032329797744751, "epoch": 0.04480817858016803, "grad_norm": 0.05662453919649124, "grad_norm_var": 2.3364324266059166e-05, "learning_rate": 0.009950411732625364, "loss": 3.0323, "step": 824 }, { "crossentropy": 2.9375839233398438, "epoch": 0.04486255743766823, "grad_norm": 0.0689457431435585, "grad_norm_var": 2.737102741124722e-05, "learning_rate": 0.00995023664576814, "loss": 2.9376, "step": 825 }, { "crossentropy": 3.042143225669861, "epoch": 0.044916936295168436, "grad_norm": 0.06694195419549942, "grad_norm_var": 2.9071658154568534e-05, "learning_rate": 0.009950061251901855, "loss": 3.0421, "step": 826 }, { "crossentropy": 3.0342180728912354, "epoch": 0.04497131515266864, "grad_norm": 0.07282096147537231, "grad_norm_var": 3.800542438160507e-05, "learning_rate": 0.009949885551037386, "loss": 3.0342, "step": 827 }, { "crossentropy": 2.949041485786438, "epoch": 0.04502569401016884, "grad_norm": 0.06129920855164528, "grad_norm_var": 3.656491059620814e-05, "learning_rate": 0.00994970954318563, "loss": 2.949, "step": 828 }, { "crossentropy": 3.1420880556106567, "epoch": 0.045080072867669053, "grad_norm": 0.059138223528862, "grad_norm_var": 3.663746331193923e-05, "learning_rate": 0.009949533228357503, "loss": 3.1421, "step": 829 }, { "crossentropy": 2.9959070682525635, "epoch": 0.04513445172516926, "grad_norm": 0.05695391446352005, "grad_norm_var": 3.392779873257473e-05, "learning_rate": 0.00994935660656394, "loss": 2.9959, "step": 830 }, { "crossentropy": 2.903855800628662, "epoch": 0.04518883058266946, "grad_norm": 0.05315859988331795, "grad_norm_var": 3.6760189730696235e-05, "learning_rate": 0.009949179677815891, "loss": 2.9039, "step": 831 }, { "crossentropy": 2.8636971712112427, "epoch": 0.045243209440169664, "grad_norm": 0.06501718610525131, "grad_norm_var": 3.816188559448407e-05, "learning_rate": 0.009949002442124336, "loss": 2.8637, "step": 832 }, { "crossentropy": 2.95835018157959, "epoch": 0.04529758829766987, "grad_norm": 0.051786549389362335, "grad_norm_var": 4.003887043983739e-05, "learning_rate": 0.009948824899500264, "loss": 2.9584, "step": 833 }, { "crossentropy": 2.9818748235702515, "epoch": 0.04535196715517007, "grad_norm": 0.06317198276519775, "grad_norm_var": 3.699668021251891e-05, "learning_rate": 0.009948647049954684, "loss": 2.9819, "step": 834 }, { "crossentropy": 3.066809892654419, "epoch": 0.045406346012670275, "grad_norm": 0.05905000492930412, "grad_norm_var": 3.448671689835573e-05, "learning_rate": 0.009948468893498628, "loss": 3.0668, "step": 835 }, { "crossentropy": 2.9568525552749634, "epoch": 0.04546072487017048, "grad_norm": 0.06131599470973015, "grad_norm_var": 3.4428058691822876e-05, "learning_rate": 0.009948290430143144, "loss": 2.9569, "step": 836 }, { "crossentropy": 2.917979121208191, "epoch": 0.04551510372767068, "grad_norm": 0.05699586495757103, "grad_norm_var": 3.508119429127358e-05, "learning_rate": 0.009948111659899301, "loss": 2.918, "step": 837 }, { "crossentropy": 2.8729965686798096, "epoch": 0.045569482585170885, "grad_norm": 0.05786748602986336, "grad_norm_var": 3.2307466965531924e-05, "learning_rate": 0.009947932582778188, "loss": 2.873, "step": 838 }, { "crossentropy": 2.895754814147949, "epoch": 0.04562386144267109, "grad_norm": 0.05595144256949425, "grad_norm_var": 3.297976862746913e-05, "learning_rate": 0.009947753198790906, "loss": 2.8958, "step": 839 }, { "crossentropy": 2.9709688425064087, "epoch": 0.04567824030017129, "grad_norm": 0.055262621492147446, "grad_norm_var": 3.3788536777494085e-05, "learning_rate": 0.009947573507948585, "loss": 2.971, "step": 840 }, { "crossentropy": 2.8907830715179443, "epoch": 0.045732619157671496, "grad_norm": 0.06090293824672699, "grad_norm_var": 2.8618815126709118e-05, "learning_rate": 0.009947393510262368, "loss": 2.8908, "step": 841 }, { "crossentropy": 2.9238799810409546, "epoch": 0.0457869980151717, "grad_norm": 0.05711574852466583, "grad_norm_var": 2.5364720437810276e-05, "learning_rate": 0.009947213205743417, "loss": 2.9239, "step": 842 }, { "crossentropy": 2.8776776790618896, "epoch": 0.0458413768726719, "grad_norm": 0.05848417431116104, "grad_norm_var": 1.2246470308542822e-05, "learning_rate": 0.009947032594402916, "loss": 2.8777, "step": 843 }, { "crossentropy": 2.9351534843444824, "epoch": 0.04589575573017211, "grad_norm": 0.0573839470744133, "grad_norm_var": 1.1660781930056669e-05, "learning_rate": 0.009946851676252066, "loss": 2.9352, "step": 844 }, { "crossentropy": 2.9871413707733154, "epoch": 0.04595013458767231, "grad_norm": 0.05930629000067711, "grad_norm_var": 1.1685873408391844e-05, "learning_rate": 0.009946670451302085, "loss": 2.9871, "step": 845 }, { "crossentropy": 2.9694011211395264, "epoch": 0.046004513445172514, "grad_norm": 0.05717763677239418, "grad_norm_var": 1.1654581748036092e-05, "learning_rate": 0.009946488919564216, "loss": 2.9694, "step": 846 }, { "crossentropy": 2.9870073795318604, "epoch": 0.046058892302672724, "grad_norm": 0.05612712725996971, "grad_norm_var": 1.0240896918031702e-05, "learning_rate": 0.009946307081049715, "loss": 2.987, "step": 847 }, { "crossentropy": 2.9796230792999268, "epoch": 0.04611327116017293, "grad_norm": 0.05832208693027496, "grad_norm_var": 7.052649475109268e-06, "learning_rate": 0.009946124935769863, "loss": 2.9796, "step": 848 }, { "crossentropy": 2.932759642601013, "epoch": 0.04616765001767313, "grad_norm": 0.056731220334768295, "grad_norm_var": 4.557565481570075e-06, "learning_rate": 0.00994594248373595, "loss": 2.9328, "step": 849 }, { "crossentropy": 2.973347306251526, "epoch": 0.046222028875173335, "grad_norm": 0.05769922211766243, "grad_norm_var": 2.799922157397233e-06, "learning_rate": 0.009945759724959298, "loss": 2.9733, "step": 850 }, { "crossentropy": 3.0583068132400513, "epoch": 0.04627640773267354, "grad_norm": 0.05765309929847717, "grad_norm_var": 2.6994674096512497e-06, "learning_rate": 0.009945576659451236, "loss": 3.0583, "step": 851 }, { "crossentropy": 2.935524106025696, "epoch": 0.04633078659017374, "grad_norm": 0.0544954352080822, "grad_norm_var": 2.380900761109195e-06, "learning_rate": 0.009945393287223124, "loss": 2.9355, "step": 852 }, { "crossentropy": 3.077533483505249, "epoch": 0.046385165447673946, "grad_norm": 0.0646735355257988, "grad_norm_var": 5.710452320752067e-06, "learning_rate": 0.009945209608286328, "loss": 3.0775, "step": 853 }, { "crossentropy": 2.958067297935486, "epoch": 0.04643954430517415, "grad_norm": 0.06130131706595421, "grad_norm_var": 6.468169991700965e-06, "learning_rate": 0.009945025622652244, "loss": 2.9581, "step": 854 }, { "crossentropy": 3.01697838306427, "epoch": 0.04649392316267435, "grad_norm": 0.0609988197684288, "grad_norm_var": 6.657049886366987e-06, "learning_rate": 0.00994484133033228, "loss": 3.017, "step": 855 }, { "crossentropy": 2.8566795587539673, "epoch": 0.046548302020174556, "grad_norm": 0.057543639093637466, "grad_norm_var": 6.042588527628798e-06, "learning_rate": 0.009944656731337867, "loss": 2.8567, "step": 856 }, { "crossentropy": 3.0435543060302734, "epoch": 0.04660268087767476, "grad_norm": 0.056494828313589096, "grad_norm_var": 5.841654022974557e-06, "learning_rate": 0.009944471825680453, "loss": 3.0436, "step": 857 }, { "crossentropy": 2.9037795066833496, "epoch": 0.04665705973517496, "grad_norm": 0.058676827698946, "grad_norm_var": 5.764275785209947e-06, "learning_rate": 0.009944286613371506, "loss": 2.9038, "step": 858 }, { "crossentropy": 2.9507960081100464, "epoch": 0.04671143859267517, "grad_norm": 0.05235249176621437, "grad_norm_var": 7.977304095001652e-06, "learning_rate": 0.009944101094422515, "loss": 2.9508, "step": 859 }, { "crossentropy": 3.0488224029541016, "epoch": 0.04676581745017537, "grad_norm": 0.05791955068707466, "grad_norm_var": 7.955981072708488e-06, "learning_rate": 0.009943915268844982, "loss": 3.0488, "step": 860 }, { "crossentropy": 2.973878264427185, "epoch": 0.046820196307675574, "grad_norm": 0.05840336158871651, "grad_norm_var": 7.845706808904535e-06, "learning_rate": 0.009943729136650434, "loss": 2.9739, "step": 861 }, { "crossentropy": 2.9860448837280273, "epoch": 0.04687457516517578, "grad_norm": 0.053346287459135056, "grad_norm_var": 9.1376100494626e-06, "learning_rate": 0.009943542697850413, "loss": 2.986, "step": 862 }, { "crossentropy": 2.9307522773742676, "epoch": 0.04692895402267598, "grad_norm": 0.058645766228437424, "grad_norm_var": 9.015560547460482e-06, "learning_rate": 0.009943355952456483, "loss": 2.9308, "step": 863 }, { "crossentropy": 3.000841975212097, "epoch": 0.046983332880176185, "grad_norm": 0.05796642228960991, "grad_norm_var": 9.00006418869803e-06, "learning_rate": 0.009943168900480226, "loss": 3.0008, "step": 864 }, { "crossentropy": 2.893749237060547, "epoch": 0.04703771173767639, "grad_norm": 0.052576690912246704, "grad_norm_var": 1.0674383506336278e-05, "learning_rate": 0.00994298154193324, "loss": 2.8937, "step": 865 }, { "crossentropy": 2.955524206161499, "epoch": 0.0470920905951766, "grad_norm": 0.05802442133426666, "grad_norm_var": 1.0687606250072736e-05, "learning_rate": 0.00994279387682715, "loss": 2.9555, "step": 866 }, { "crossentropy": 2.8796879053115845, "epoch": 0.0471464694526768, "grad_norm": 0.05685019865632057, "grad_norm_var": 1.0718682922745508e-05, "learning_rate": 0.009942605905173593, "loss": 2.8797, "step": 867 }, { "crossentropy": 2.930345296859741, "epoch": 0.047200848310177006, "grad_norm": 0.05767840892076492, "grad_norm_var": 1.0069612848370192e-05, "learning_rate": 0.009942417626984222, "loss": 2.9303, "step": 868 }, { "crossentropy": 2.8632009029388428, "epoch": 0.04725522716767721, "grad_norm": 0.058320142328739166, "grad_norm_var": 6.698420240803561e-06, "learning_rate": 0.009942229042270719, "loss": 2.8632, "step": 869 }, { "crossentropy": 2.9951401948928833, "epoch": 0.04730960602517741, "grad_norm": 0.05674367770552635, "grad_norm_var": 5.57649640166389e-06, "learning_rate": 0.009942040151044778, "loss": 2.9951, "step": 870 }, { "crossentropy": 2.775408983230591, "epoch": 0.047363984882677616, "grad_norm": 0.06172839179635048, "grad_norm_var": 5.9954614786340845e-06, "learning_rate": 0.009941850953318116, "loss": 2.7754, "step": 871 }, { "crossentropy": 2.999764323234558, "epoch": 0.04741836374017782, "grad_norm": 0.061262160539627075, "grad_norm_var": 7.089823256294497e-06, "learning_rate": 0.009941661449102464, "loss": 2.9998, "step": 872 }, { "crossentropy": 2.9316779375076294, "epoch": 0.04747274259767802, "grad_norm": 0.058392565697431564, "grad_norm_var": 7.108178412187395e-06, "learning_rate": 0.009941471638409576, "loss": 2.9317, "step": 873 }, { "crossentropy": 2.9310590028762817, "epoch": 0.04752712145517823, "grad_norm": 0.05610418692231178, "grad_norm_var": 7.094306052463952e-06, "learning_rate": 0.009941281521251224, "loss": 2.9311, "step": 874 }, { "crossentropy": 2.9966992139816284, "epoch": 0.04758150031267843, "grad_norm": 0.05560911446809769, "grad_norm_var": 5.622036149830617e-06, "learning_rate": 0.0099410910976392, "loss": 2.9967, "step": 875 }, { "crossentropy": 2.9255259037017822, "epoch": 0.047635879170178634, "grad_norm": 0.057185567915439606, "grad_norm_var": 5.612025892956449e-06, "learning_rate": 0.00994090036758531, "loss": 2.9255, "step": 876 }, { "crossentropy": 2.9214857816696167, "epoch": 0.04769025802767884, "grad_norm": 0.052413519471883774, "grad_norm_var": 7.074914811475095e-06, "learning_rate": 0.009940709331101387, "loss": 2.9215, "step": 877 }, { "crossentropy": 2.977203130722046, "epoch": 0.04774463688517904, "grad_norm": 0.05862739309668541, "grad_norm_var": 6.207993550447616e-06, "learning_rate": 0.009940517988199279, "loss": 2.9772, "step": 878 }, { "crossentropy": 2.9742510318756104, "epoch": 0.047799015742679245, "grad_norm": 0.06014971807599068, "grad_norm_var": 6.6025712373333005e-06, "learning_rate": 0.009940326338890849, "loss": 2.9743, "step": 879 }, { "crossentropy": 2.8245599269866943, "epoch": 0.04785339460017945, "grad_norm": 0.10450160503387451, "grad_norm_var": 0.00014498426242298226, "learning_rate": 0.009940134383187986, "loss": 2.8246, "step": 880 }, { "crossentropy": 3.0172159671783447, "epoch": 0.04790777345767965, "grad_norm": 0.05478832498192787, "grad_norm_var": 0.0001429872775256075, "learning_rate": 0.009939942121102595, "loss": 3.0172, "step": 881 }, { "crossentropy": 2.909218430519104, "epoch": 0.047962152315179855, "grad_norm": 0.05618515610694885, "grad_norm_var": 0.0001438116230926896, "learning_rate": 0.009939749552646598, "loss": 2.9092, "step": 882 }, { "crossentropy": 2.9147050380706787, "epoch": 0.04801653117268006, "grad_norm": 0.056391727179288864, "grad_norm_var": 0.00014404229342396773, "learning_rate": 0.00993955667783194, "loss": 2.9147, "step": 883 }, { "crossentropy": 2.9203810691833496, "epoch": 0.04807091003018026, "grad_norm": 0.05562411621212959, "grad_norm_var": 0.00014504606043487593, "learning_rate": 0.009939363496670583, "loss": 2.9204, "step": 884 }, { "crossentropy": 2.916391968727112, "epoch": 0.04812528888768047, "grad_norm": 0.06166457012295723, "grad_norm_var": 0.0001448838032402144, "learning_rate": 0.009939170009174506, "loss": 2.9164, "step": 885 }, { "crossentropy": 2.954128623008728, "epoch": 0.048179667745180677, "grad_norm": 0.06793128699064255, "grad_norm_var": 0.0001471617979147059, "learning_rate": 0.009938976215355708, "loss": 2.9541, "step": 886 }, { "crossentropy": 2.908265709877014, "epoch": 0.04823404660268088, "grad_norm": 0.0522712878882885, "grad_norm_var": 0.00015203483985097092, "learning_rate": 0.009938782115226212, "loss": 2.9083, "step": 887 }, { "crossentropy": 2.989992618560791, "epoch": 0.048288425460181084, "grad_norm": 0.05975163355469704, "grad_norm_var": 0.00015203781916979097, "learning_rate": 0.009938587708798052, "loss": 2.99, "step": 888 }, { "crossentropy": 2.9410165548324585, "epoch": 0.04834280431768129, "grad_norm": 0.05995957925915718, "grad_norm_var": 0.00015175630360535618, "learning_rate": 0.009938392996083288, "loss": 2.941, "step": 889 }, { "crossentropy": 2.908294200897217, "epoch": 0.04839718317518149, "grad_norm": 0.05986521393060684, "grad_norm_var": 0.00015039969821854492, "learning_rate": 0.009938197977093995, "loss": 2.9083, "step": 890 }, { "crossentropy": 3.0238037109375, "epoch": 0.048451562032681694, "grad_norm": 0.05789131298661232, "grad_norm_var": 0.00014914339559572297, "learning_rate": 0.009938002651842266, "loss": 3.0238, "step": 891 }, { "crossentropy": 2.906211733818054, "epoch": 0.0485059408901819, "grad_norm": 0.057976942509412766, "grad_norm_var": 0.0001487853143092741, "learning_rate": 0.009937807020340216, "loss": 2.9062, "step": 892 }, { "crossentropy": 2.9566009044647217, "epoch": 0.0485603197476821, "grad_norm": 0.061198361217975616, "grad_norm_var": 0.00014355168900000338, "learning_rate": 0.00993761108259998, "loss": 2.9566, "step": 893 }, { "crossentropy": 2.900616765022278, "epoch": 0.048614698605182305, "grad_norm": 0.05371738225221634, "grad_norm_var": 0.00014697089865836158, "learning_rate": 0.009937414838633704, "loss": 2.9006, "step": 894 }, { "crossentropy": 3.020187497138977, "epoch": 0.04866907746268251, "grad_norm": 0.05433869734406471, "grad_norm_var": 0.00014992751620476752, "learning_rate": 0.009937218288453566, "loss": 3.0202, "step": 895 }, { "crossentropy": 2.929895520210266, "epoch": 0.04872345632018271, "grad_norm": 0.05716269463300705, "grad_norm_var": 1.4646064592747226e-05, "learning_rate": 0.009937021432071754, "loss": 2.9299, "step": 896 }, { "crossentropy": 2.7774658203125, "epoch": 0.048777835177682916, "grad_norm": 0.06565888226032257, "grad_norm_var": 1.7492709537817763e-05, "learning_rate": 0.009936824269500473, "loss": 2.7775, "step": 897 }, { "crossentropy": 2.880231261253357, "epoch": 0.04883221403518312, "grad_norm": 0.05536800995469093, "grad_norm_var": 1.7797470615885796e-05, "learning_rate": 0.009936626800751953, "loss": 2.8802, "step": 898 }, { "crossentropy": 2.7960976362228394, "epoch": 0.04888659289268332, "grad_norm": 0.06925090402364731, "grad_norm_var": 2.4434923656686756e-05, "learning_rate": 0.009936429025838443, "loss": 2.7961, "step": 899 }, { "crossentropy": 2.939084768295288, "epoch": 0.048940971750183526, "grad_norm": 0.06001248583197594, "grad_norm_var": 2.345733227607314e-05, "learning_rate": 0.009936230944772207, "loss": 2.9391, "step": 900 }, { "crossentropy": 3.015801787376404, "epoch": 0.04899535060768373, "grad_norm": 0.05312541127204895, "grad_norm_var": 2.569386707518065e-05, "learning_rate": 0.00993603255756553, "loss": 3.0158, "step": 901 }, { "crossentropy": 2.898020029067993, "epoch": 0.04904972946518393, "grad_norm": 0.07535801082849503, "grad_norm_var": 3.789355661568475e-05, "learning_rate": 0.009935833864230713, "loss": 2.898, "step": 902 }, { "crossentropy": 3.016543745994568, "epoch": 0.049104108322684144, "grad_norm": 0.06515160948038101, "grad_norm_var": 3.5750722816292546e-05, "learning_rate": 0.009935634864780083, "loss": 3.0165, "step": 903 }, { "crossentropy": 2.9779138565063477, "epoch": 0.04915848718018435, "grad_norm": 0.05796615779399872, "grad_norm_var": 3.609520153884568e-05, "learning_rate": 0.009935435559225981, "loss": 2.9779, "step": 904 }, { "crossentropy": 2.9518280029296875, "epoch": 0.04921286603768455, "grad_norm": 0.05797748267650604, "grad_norm_var": 3.641752532481385e-05, "learning_rate": 0.009935235947580766, "loss": 2.9518, "step": 905 }, { "crossentropy": 2.922651529312134, "epoch": 0.049267244895184754, "grad_norm": 0.05644507333636284, "grad_norm_var": 3.7267635201955936e-05, "learning_rate": 0.009935036029856818, "loss": 2.9227, "step": 906 }, { "crossentropy": 2.94489848613739, "epoch": 0.04932162375268496, "grad_norm": 0.05660970136523247, "grad_norm_var": 3.771567058089541e-05, "learning_rate": 0.00993483580606654, "loss": 2.9449, "step": 907 }, { "crossentropy": 2.913500428199768, "epoch": 0.04937600261018516, "grad_norm": 0.0539216510951519, "grad_norm_var": 3.974674342279878e-05, "learning_rate": 0.009934635276222343, "loss": 2.9135, "step": 908 }, { "crossentropy": 2.965059757232666, "epoch": 0.049430381467685365, "grad_norm": 0.05793245881795883, "grad_norm_var": 3.970817866418448e-05, "learning_rate": 0.009934434440336665, "loss": 2.9651, "step": 909 }, { "crossentropy": 2.981885313987732, "epoch": 0.04948476032518557, "grad_norm": 0.0585346594452858, "grad_norm_var": 3.7524791212907815e-05, "learning_rate": 0.009934233298421967, "loss": 2.9819, "step": 910 }, { "crossentropy": 2.9718087911605835, "epoch": 0.04953913918268577, "grad_norm": 0.05362459644675255, "grad_norm_var": 3.8064832928746035e-05, "learning_rate": 0.009934031850490719, "loss": 2.9718, "step": 911 }, { "crossentropy": 2.875010132789612, "epoch": 0.049593518040185976, "grad_norm": 0.058517519384622574, "grad_norm_var": 3.773362920148991e-05, "learning_rate": 0.009933830096555414, "loss": 2.875, "step": 912 }, { "crossentropy": 2.9292417764663696, "epoch": 0.04964789689768618, "grad_norm": 0.07308276742696762, "grad_norm_var": 4.706091444327352e-05, "learning_rate": 0.009933628036628568, "loss": 2.9292, "step": 913 }, { "crossentropy": 3.0476540327072144, "epoch": 0.04970227575518638, "grad_norm": 0.06267968565225601, "grad_norm_var": 4.5711132060497625e-05, "learning_rate": 0.00993342567072271, "loss": 3.0477, "step": 914 }, { "crossentropy": 2.934458613395691, "epoch": 0.049756654612686586, "grad_norm": 0.061166372150182724, "grad_norm_var": 4.051073669126311e-05, "learning_rate": 0.009933222998850392, "loss": 2.9345, "step": 915 }, { "crossentropy": 2.9016083478927612, "epoch": 0.04981103347018679, "grad_norm": 0.055762533098459244, "grad_norm_var": 4.170711663199714e-05, "learning_rate": 0.009933020021024181, "loss": 2.9016, "step": 916 }, { "crossentropy": 2.934544324874878, "epoch": 0.04986541232768699, "grad_norm": 0.053466442972421646, "grad_norm_var": 4.14078858346302e-05, "learning_rate": 0.00993281673725667, "loss": 2.9345, "step": 917 }, { "crossentropy": 2.9593578577041626, "epoch": 0.0499197911851872, "grad_norm": 0.06551826000213623, "grad_norm_var": 2.7162114335513298e-05, "learning_rate": 0.009932613147560464, "loss": 2.9594, "step": 918 }, { "crossentropy": 2.859462857246399, "epoch": 0.0499741700426874, "grad_norm": 0.05625991150736809, "grad_norm_var": 2.513324776101892e-05, "learning_rate": 0.009932409251948186, "loss": 2.8595, "step": 919 }, { "crossentropy": 2.985567808151245, "epoch": 0.050028548900187604, "grad_norm": 0.057638123631477356, "grad_norm_var": 2.5172795025740247e-05, "learning_rate": 0.009932205050432488, "loss": 2.9856, "step": 920 }, { "crossentropy": 2.9577059745788574, "epoch": 0.05008292775768781, "grad_norm": 0.06072140485048294, "grad_norm_var": 2.538046191841118e-05, "learning_rate": 0.009932000543026027, "loss": 2.9577, "step": 921 }, { "crossentropy": 3.036247968673706, "epoch": 0.05013730661518802, "grad_norm": 0.05423452705144882, "grad_norm_var": 2.6399875323582926e-05, "learning_rate": 0.009931795729741495, "loss": 3.0362, "step": 922 }, { "crossentropy": 2.9134219884872437, "epoch": 0.05019168547268822, "grad_norm": 0.05883992090821266, "grad_norm_var": 2.60804196708847e-05, "learning_rate": 0.009931590610591587, "loss": 2.9134, "step": 923 }, { "crossentropy": 2.884853720664978, "epoch": 0.050246064330188425, "grad_norm": 0.05425512418150902, "grad_norm_var": 2.586740437535336e-05, "learning_rate": 0.009931385185589028, "loss": 2.8849, "step": 924 }, { "crossentropy": 3.00618839263916, "epoch": 0.05030044318768863, "grad_norm": 0.056935086846351624, "grad_norm_var": 2.6056865622408933e-05, "learning_rate": 0.009931179454746556, "loss": 3.0062, "step": 925 }, { "crossentropy": 2.89305579662323, "epoch": 0.05035482204518883, "grad_norm": 0.05371486395597458, "grad_norm_var": 2.769683520201215e-05, "learning_rate": 0.009930973418076933, "loss": 2.8931, "step": 926 }, { "crossentropy": 2.8448392152786255, "epoch": 0.050409200902689036, "grad_norm": 0.05464530363678932, "grad_norm_var": 2.709488765630467e-05, "learning_rate": 0.009930767075592935, "loss": 2.8448, "step": 927 }, { "crossentropy": 2.9644267559051514, "epoch": 0.05046357976018924, "grad_norm": 0.05807648226618767, "grad_norm_var": 2.711129907194664e-05, "learning_rate": 0.00993056042730736, "loss": 2.9644, "step": 928 }, { "crossentropy": 2.9912214279174805, "epoch": 0.05051795861768944, "grad_norm": 0.06021884083747864, "grad_norm_var": 1.2548474511582993e-05, "learning_rate": 0.009930353473233025, "loss": 2.9912, "step": 929 }, { "crossentropy": 2.963263154029846, "epoch": 0.050572337475189647, "grad_norm": 0.05736931413412094, "grad_norm_var": 1.0826396236328824e-05, "learning_rate": 0.009930146213382764, "loss": 2.9633, "step": 930 }, { "crossentropy": 2.847991704940796, "epoch": 0.05062671633268985, "grad_norm": 0.05577189847826958, "grad_norm_var": 9.955148736810286e-06, "learning_rate": 0.00992993864776943, "loss": 2.848, "step": 931 }, { "crossentropy": 2.934782862663269, "epoch": 0.050681095190190054, "grad_norm": 0.0559631772339344, "grad_norm_var": 9.9221717398947e-06, "learning_rate": 0.009929730776405899, "loss": 2.9348, "step": 932 }, { "crossentropy": 2.9167312383651733, "epoch": 0.05073547404769026, "grad_norm": 0.053285300731658936, "grad_norm_var": 1.0012024573538964e-05, "learning_rate": 0.009929522599305058, "loss": 2.9167, "step": 933 }, { "crossentropy": 2.9159839153289795, "epoch": 0.05078985290519046, "grad_norm": 0.053832050412893295, "grad_norm_var": 5.415638910938094e-06, "learning_rate": 0.009929314116479824, "loss": 2.916, "step": 934 }, { "crossentropy": 2.985842227935791, "epoch": 0.050844231762690664, "grad_norm": 0.0684199333190918, "grad_norm_var": 1.4494860123443788e-05, "learning_rate": 0.009929105327943125, "loss": 2.9858, "step": 935 }, { "crossentropy": 2.9810268878936768, "epoch": 0.05089861062019087, "grad_norm": 0.06565681844949722, "grad_norm_var": 1.906744311305908e-05, "learning_rate": 0.009928896233707907, "loss": 2.981, "step": 936 }, { "crossentropy": 2.973832607269287, "epoch": 0.05095298947769107, "grad_norm": 0.0682145357131958, "grad_norm_var": 2.5673943758001913e-05, "learning_rate": 0.009928686833787139, "loss": 2.9738, "step": 937 }, { "crossentropy": 2.9522645473480225, "epoch": 0.051007368335191275, "grad_norm": 0.05745110660791397, "grad_norm_var": 2.4667250890134974e-05, "learning_rate": 0.00992847712819381, "loss": 2.9523, "step": 938 }, { "crossentropy": 2.9329960346221924, "epoch": 0.05106174719269148, "grad_norm": 0.05814057216048241, "grad_norm_var": 2.4646597599305094e-05, "learning_rate": 0.009928267116940925, "loss": 2.933, "step": 939 }, { "crossentropy": 2.931981086730957, "epoch": 0.05111612605019169, "grad_norm": 0.05700695514678955, "grad_norm_var": 2.3655257607050478e-05, "learning_rate": 0.009928056800041508, "loss": 2.932, "step": 940 }, { "crossentropy": 3.0567290782928467, "epoch": 0.05117050490769189, "grad_norm": 0.05633963271975517, "grad_norm_var": 2.3795222852547534e-05, "learning_rate": 0.0099278461775086, "loss": 3.0567, "step": 941 }, { "crossentropy": 3.0120041370391846, "epoch": 0.051224883765192096, "grad_norm": 0.05908595398068428, "grad_norm_var": 2.2256146525189333e-05, "learning_rate": 0.00992763524935527, "loss": 3.012, "step": 942 }, { "crossentropy": 2.8249611854553223, "epoch": 0.0512792626226923, "grad_norm": 0.05216961354017258, "grad_norm_var": 2.398336728723077e-05, "learning_rate": 0.009927424015594592, "loss": 2.825, "step": 943 }, { "crossentropy": 2.8902604579925537, "epoch": 0.0513336414801925, "grad_norm": 0.05443485081195831, "grad_norm_var": 2.5048262424416888e-05, "learning_rate": 0.009927212476239674, "loss": 2.8903, "step": 944 }, { "crossentropy": 2.9744232892990112, "epoch": 0.05138802033769271, "grad_norm": 0.06496994942426682, "grad_norm_var": 2.7652432679890298e-05, "learning_rate": 0.00992700063130363, "loss": 2.9744, "step": 945 }, { "crossentropy": 3.0150907039642334, "epoch": 0.05144239919519291, "grad_norm": 0.05841607600450516, "grad_norm_var": 2.7544686665734174e-05, "learning_rate": 0.0099267884807996, "loss": 3.0151, "step": 946 }, { "crossentropy": 2.9093897342681885, "epoch": 0.051496778052693114, "grad_norm": 0.052754178643226624, "grad_norm_var": 2.929096437971571e-05, "learning_rate": 0.009926576024740742, "loss": 2.9094, "step": 947 }, { "crossentropy": 2.866421937942505, "epoch": 0.05155115691019332, "grad_norm": 0.06061013787984848, "grad_norm_var": 2.906335375678984e-05, "learning_rate": 0.009926363263140232, "loss": 2.8664, "step": 948 }, { "crossentropy": 2.874050736427307, "epoch": 0.05160553576769352, "grad_norm": 0.06066013127565384, "grad_norm_var": 2.704070662738663e-05, "learning_rate": 0.009926150196011266, "loss": 2.8741, "step": 949 }, { "crossentropy": 2.9599987268447876, "epoch": 0.051659914625193724, "grad_norm": 0.05716449022293091, "grad_norm_var": 2.532293413098439e-05, "learning_rate": 0.009925936823367057, "loss": 2.96, "step": 950 }, { "crossentropy": 2.873924136161804, "epoch": 0.05171429348269393, "grad_norm": 0.05543571710586548, "grad_norm_var": 2.036270649265013e-05, "learning_rate": 0.009925723145220838, "loss": 2.8739, "step": 951 }, { "crossentropy": 2.86227023601532, "epoch": 0.05176867234019413, "grad_norm": 0.06367101520299911, "grad_norm_var": 1.8755780420339798e-05, "learning_rate": 0.009925509161585862, "loss": 2.8623, "step": 952 }, { "crossentropy": 2.864550828933716, "epoch": 0.051823051197694335, "grad_norm": 0.05722469836473465, "grad_norm_var": 1.2117563794762122e-05, "learning_rate": 0.0099252948724754, "loss": 2.8646, "step": 953 }, { "crossentropy": 2.9409340620040894, "epoch": 0.05187743005519454, "grad_norm": 0.05608430504798889, "grad_norm_var": 1.23062779160266e-05, "learning_rate": 0.009925080277902744, "loss": 2.9409, "step": 954 }, { "crossentropy": 2.9596980810165405, "epoch": 0.05193180891269474, "grad_norm": 0.05510114133358002, "grad_norm_var": 1.2729641607852013e-05, "learning_rate": 0.009924865377881197, "loss": 2.9597, "step": 955 }, { "crossentropy": 2.98806631565094, "epoch": 0.051986187770194946, "grad_norm": 0.055504780262708664, "grad_norm_var": 1.2983557683346874e-05, "learning_rate": 0.009924650172424093, "loss": 2.9881, "step": 956 }, { "crossentropy": 3.0101557970046997, "epoch": 0.05204056662769515, "grad_norm": 0.0646587535738945, "grad_norm_var": 1.6047826686688137e-05, "learning_rate": 0.009924434661544775, "loss": 3.0102, "step": 957 }, { "crossentropy": 2.9717912673950195, "epoch": 0.05209494548519535, "grad_norm": 0.05405230447649956, "grad_norm_var": 1.6900312993728175e-05, "learning_rate": 0.009924218845256612, "loss": 2.9718, "step": 958 }, { "crossentropy": 2.9078128337860107, "epoch": 0.05214932434269556, "grad_norm": 0.04917340725660324, "grad_norm_var": 1.966356108411537e-05, "learning_rate": 0.009924002723572987, "loss": 2.9078, "step": 959 }, { "crossentropy": 2.9052118062973022, "epoch": 0.05220370320019577, "grad_norm": 0.056081999093294144, "grad_norm_var": 1.916111618322404e-05, "learning_rate": 0.009923786296507302, "loss": 2.9052, "step": 960 }, { "crossentropy": 2.975624203681946, "epoch": 0.05225808205769597, "grad_norm": 0.05349527299404144, "grad_norm_var": 1.6111144090578897e-05, "learning_rate": 0.009923569564072983, "loss": 2.9756, "step": 961 }, { "crossentropy": 2.8256289958953857, "epoch": 0.052312460915196174, "grad_norm": 0.11432865262031555, "grad_norm_var": 0.0002229472024326333, "learning_rate": 0.00992335252628347, "loss": 2.8256, "step": 962 }, { "crossentropy": 3.004561185836792, "epoch": 0.05236683977269638, "grad_norm": 0.061130546033382416, "grad_norm_var": 0.00021882104771794628, "learning_rate": 0.009923135183152224, "loss": 3.0046, "step": 963 }, { "crossentropy": 2.934750199317932, "epoch": 0.05242121863019658, "grad_norm": 0.06312992423772812, "grad_norm_var": 0.00021912097056010996, "learning_rate": 0.009922917534692722, "loss": 2.9348, "step": 964 }, { "crossentropy": 2.984598398208618, "epoch": 0.052475597487696785, "grad_norm": 0.07110881060361862, "grad_norm_var": 0.00022539279522610454, "learning_rate": 0.009922699580918464, "loss": 2.9846, "step": 965 }, { "crossentropy": 2.88892924785614, "epoch": 0.05252997634519699, "grad_norm": 0.2443470060825348, "grad_norm_var": 0.00230180047917126, "learning_rate": 0.00992248132184297, "loss": 2.8889, "step": 966 }, { "crossentropy": 2.845343828201294, "epoch": 0.05258435520269719, "grad_norm": 0.07849713414907455, "grad_norm_var": 0.002279777549875153, "learning_rate": 0.00992226275747977, "loss": 2.8453, "step": 967 }, { "crossentropy": 3.0200690031051636, "epoch": 0.052638734060197395, "grad_norm": 0.11114969849586487, "grad_norm_var": 0.002349902226441783, "learning_rate": 0.009922043887842425, "loss": 3.0201, "step": 968 }, { "crossentropy": 2.9057148694992065, "epoch": 0.0526931129176976, "grad_norm": 0.07067085802555084, "grad_norm_var": 0.0023242842598053468, "learning_rate": 0.009921824712944506, "loss": 2.9057, "step": 969 }, { "crossentropy": 2.972044348716736, "epoch": 0.0527474917751978, "grad_norm": 0.06906262040138245, "grad_norm_var": 0.002295750530602929, "learning_rate": 0.009921605232799608, "loss": 2.972, "step": 970 }, { "crossentropy": 2.922685742378235, "epoch": 0.052801870632698006, "grad_norm": 0.05643844231963158, "grad_norm_var": 0.0022915174727099526, "learning_rate": 0.009921385447421341, "loss": 2.9227, "step": 971 }, { "crossentropy": 2.9740582704544067, "epoch": 0.05285624949019821, "grad_norm": 0.05953483283519745, "grad_norm_var": 0.002279611074235643, "learning_rate": 0.009921165356823336, "loss": 2.9741, "step": 972 }, { "crossentropy": 2.915442109107971, "epoch": 0.05291062834769841, "grad_norm": 0.0573844350874424, "grad_norm_var": 0.0022976075923613333, "learning_rate": 0.009920944961019244, "loss": 2.9154, "step": 973 }, { "crossentropy": 2.8855223655700684, "epoch": 0.052965007205198616, "grad_norm": 0.055578093975782394, "grad_norm_var": 0.0022926067454147377, "learning_rate": 0.009920724260022735, "loss": 2.8855, "step": 974 }, { "crossentropy": 2.9628206491470337, "epoch": 0.05301938606269882, "grad_norm": 0.06773682683706284, "grad_norm_var": 0.00223921965458783, "learning_rate": 0.009920503253847494, "loss": 2.9628, "step": 975 }, { "crossentropy": 2.8700335025787354, "epoch": 0.053073764920199024, "grad_norm": 0.06665056198835373, "grad_norm_var": 0.0022116446053430487, "learning_rate": 0.009920281942507228, "loss": 2.87, "step": 976 }, { "crossentropy": 2.9432088136672974, "epoch": 0.05312814377769923, "grad_norm": 0.06123071536421776, "grad_norm_var": 0.0021867426994972793, "learning_rate": 0.009920060326015664, "loss": 2.9432, "step": 977 }, { "crossentropy": 2.9925153255462646, "epoch": 0.05318252263519944, "grad_norm": 0.055889301002025604, "grad_norm_var": 0.0021463298689047643, "learning_rate": 0.009919838404386545, "loss": 2.9925, "step": 978 }, { "crossentropy": 2.936923027038574, "epoch": 0.05323690149269964, "grad_norm": 0.05099203810095787, "grad_norm_var": 0.002175688443145194, "learning_rate": 0.009919616177633634, "loss": 2.9369, "step": 979 }, { "crossentropy": 2.9442083835601807, "epoch": 0.053291280350199845, "grad_norm": 0.0604671984910965, "grad_norm_var": 0.0021812200995552484, "learning_rate": 0.009919393645770715, "loss": 2.9442, "step": 980 }, { "crossentropy": 2.8747618198394775, "epoch": 0.05334565920770005, "grad_norm": 0.05972210690379143, "grad_norm_var": 0.002198717466584486, "learning_rate": 0.00991917080881159, "loss": 2.8748, "step": 981 }, { "crossentropy": 2.831022620201111, "epoch": 0.05340003806520025, "grad_norm": 0.05154674872756004, "grad_norm_var": 0.00020934288044248864, "learning_rate": 0.009918947666770076, "loss": 2.831, "step": 982 }, { "crossentropy": 2.984055280685425, "epoch": 0.053454416922700455, "grad_norm": 0.05622237175703049, "grad_norm_var": 0.00019888454245630584, "learning_rate": 0.009918724219660014, "loss": 2.9841, "step": 983 }, { "crossentropy": 3.0018725395202637, "epoch": 0.05350879578020066, "grad_norm": 0.06009102240204811, "grad_norm_var": 3.499544448795448e-05, "learning_rate": 0.009918500467495261, "loss": 3.0019, "step": 984 }, { "crossentropy": 2.898564577102661, "epoch": 0.05356317463770086, "grad_norm": 0.05735344439744949, "grad_norm_var": 2.7045508773517144e-05, "learning_rate": 0.009918276410289695, "loss": 2.8986, "step": 985 }, { "crossentropy": 2.8558149337768555, "epoch": 0.053617553495201066, "grad_norm": 0.0535055473446846, "grad_norm_var": 2.1545679235564683e-05, "learning_rate": 0.009918052048057211, "loss": 2.8558, "step": 986 }, { "crossentropy": 2.909510374069214, "epoch": 0.05367193235270127, "grad_norm": 0.05940284579992294, "grad_norm_var": 2.1419801169843673e-05, "learning_rate": 0.009917827380811726, "loss": 2.9095, "step": 987 }, { "crossentropy": 2.832650899887085, "epoch": 0.05372631121020147, "grad_norm": 0.06693067401647568, "grad_norm_var": 2.6024824344369977e-05, "learning_rate": 0.00991760240856717, "loss": 2.8327, "step": 988 }, { "crossentropy": 2.969797372817993, "epoch": 0.05378069006770168, "grad_norm": 0.05370572209358215, "grad_norm_var": 2.756201498107328e-05, "learning_rate": 0.009917377131337498, "loss": 2.9698, "step": 989 }, { "crossentropy": 2.9305906295776367, "epoch": 0.05383506892520188, "grad_norm": 0.061163876205682755, "grad_norm_var": 2.7288202316392423e-05, "learning_rate": 0.00991715154913668, "loss": 2.9306, "step": 990 }, { "crossentropy": 2.829235553741455, "epoch": 0.053889447782702084, "grad_norm": 0.06089966744184494, "grad_norm_var": 2.2166057039416794e-05, "learning_rate": 0.009916925661978706, "loss": 2.8292, "step": 991 }, { "crossentropy": 2.825047492980957, "epoch": 0.05394382664020229, "grad_norm": 0.05921103432774544, "grad_norm_var": 1.7526352692616375e-05, "learning_rate": 0.009916699469877588, "loss": 2.825, "step": 992 }, { "crossentropy": 3.008751153945923, "epoch": 0.05399820549770249, "grad_norm": 0.053330592811107635, "grad_norm_var": 1.8046035111068024e-05, "learning_rate": 0.009916472972847352, "loss": 3.0088, "step": 993 }, { "crossentropy": 2.8852912187576294, "epoch": 0.054052584355202694, "grad_norm": 0.05584191530942917, "grad_norm_var": 1.805652344782268e-05, "learning_rate": 0.009916246170902048, "loss": 2.8853, "step": 994 }, { "crossentropy": 3.0539923906326294, "epoch": 0.0541069632127029, "grad_norm": 0.0558638721704483, "grad_norm_var": 1.5296814536799816e-05, "learning_rate": 0.009916019064055738, "loss": 3.054, "step": 995 }, { "crossentropy": 2.9064877033233643, "epoch": 0.05416134207020311, "grad_norm": 0.05187397077679634, "grad_norm_var": 1.688890132770492e-05, "learning_rate": 0.00991579165232251, "loss": 2.9065, "step": 996 }, { "crossentropy": 3.0280697345733643, "epoch": 0.05421572092770331, "grad_norm": 0.06738472729921341, "grad_norm_var": 2.304185466220071e-05, "learning_rate": 0.009915563935716465, "loss": 3.0281, "step": 997 }, { "crossentropy": 2.9011834859848022, "epoch": 0.054270099785203516, "grad_norm": 0.05741942673921585, "grad_norm_var": 2.0324029775967144e-05, "learning_rate": 0.009915335914251729, "loss": 2.9012, "step": 998 }, { "crossentropy": 2.83752703666687, "epoch": 0.05432447864270372, "grad_norm": 0.05536618456244469, "grad_norm_var": 2.058847865047068e-05, "learning_rate": 0.00991510758794244, "loss": 2.8375, "step": 999 }, { "crossentropy": 2.9361441135406494, "epoch": 0.05437885750020392, "grad_norm": 0.0563778430223465, "grad_norm_var": 2.0456568281409145e-05, "learning_rate": 0.009914878956802764, "loss": 2.9361, "step": 1000 }, { "crossentropy": 2.76516854763031, "epoch": 0.054433236357704126, "grad_norm": 0.05525344982743263, "grad_norm_var": 2.0871775589402913e-05, "learning_rate": 0.009914650020846875, "loss": 2.7652, "step": 1001 }, { "crossentropy": 2.841576099395752, "epoch": 0.05448761521520433, "grad_norm": 0.11174475401639938, "grad_norm_var": 0.00020012790796916963, "learning_rate": 0.009914420780088974, "loss": 2.8416, "step": 1002 }, { "crossentropy": 2.847742438316345, "epoch": 0.05454199407270453, "grad_norm": 0.055633820593357086, "grad_norm_var": 0.0002019996282706946, "learning_rate": 0.009914191234543278, "loss": 2.8477, "step": 1003 }, { "crossentropy": 2.8464466333389282, "epoch": 0.05459637293020474, "grad_norm": 0.05474163591861725, "grad_norm_var": 0.00020185016351089197, "learning_rate": 0.009913961384224025, "loss": 2.8464, "step": 1004 }, { "crossentropy": 2.875818967819214, "epoch": 0.05465075178770494, "grad_norm": 0.05859677121043205, "grad_norm_var": 0.0001990036515386373, "learning_rate": 0.009913731229145465, "loss": 2.8758, "step": 1005 }, { "crossentropy": 2.9575871229171753, "epoch": 0.054705130645205144, "grad_norm": 0.05721774697303772, "grad_norm_var": 0.00019971650308536475, "learning_rate": 0.009913500769321877, "loss": 2.9576, "step": 1006 }, { "crossentropy": 2.904797315597534, "epoch": 0.05475950950270535, "grad_norm": 0.0499749556183815, "grad_norm_var": 0.00020648054502088862, "learning_rate": 0.009913270004767551, "loss": 2.9048, "step": 1007 }, { "crossentropy": 2.97242534160614, "epoch": 0.05481388836020555, "grad_norm": 0.05931400507688522, "grad_norm_var": 0.00020647395157100956, "learning_rate": 0.0099130389354968, "loss": 2.9724, "step": 1008 }, { "crossentropy": 2.8958581686019897, "epoch": 0.054868267217705755, "grad_norm": 0.05672326683998108, "grad_norm_var": 0.00020429129928247353, "learning_rate": 0.009912807561523957, "loss": 2.8959, "step": 1009 }, { "crossentropy": 2.968396306037903, "epoch": 0.05492264607520596, "grad_norm": 0.05682554095983505, "grad_norm_var": 0.0002038119415463666, "learning_rate": 0.009912575882863367, "loss": 2.9684, "step": 1010 }, { "crossentropy": 2.795261025428772, "epoch": 0.05497702493270616, "grad_norm": 0.06112593039870262, "grad_norm_var": 0.00020262690059281435, "learning_rate": 0.009912343899529401, "loss": 2.7953, "step": 1011 }, { "crossentropy": 2.844757914543152, "epoch": 0.055031403790206365, "grad_norm": 0.05914643779397011, "grad_norm_var": 0.00019771513728993, "learning_rate": 0.009912111611536448, "loss": 2.8448, "step": 1012 }, { "crossentropy": 2.935696601867676, "epoch": 0.05508578264770657, "grad_norm": 0.056204766035079956, "grad_norm_var": 0.00019571584061284456, "learning_rate": 0.00991187901889891, "loss": 2.9357, "step": 1013 }, { "crossentropy": 2.962240695953369, "epoch": 0.05514016150520677, "grad_norm": 0.05220509693026543, "grad_norm_var": 0.0001992817113845478, "learning_rate": 0.009911646121631215, "loss": 2.9622, "step": 1014 }, { "crossentropy": 2.8550254106521606, "epoch": 0.05519454036270698, "grad_norm": 0.04914398863911629, "grad_norm_var": 0.00020536181943006635, "learning_rate": 0.009911412919747807, "loss": 2.855, "step": 1015 }, { "crossentropy": 2.995598316192627, "epoch": 0.055248919220207186, "grad_norm": 0.05207134410738945, "grad_norm_var": 0.00020825016181611194, "learning_rate": 0.00991117941326315, "loss": 2.9956, "step": 1016 }, { "crossentropy": 2.8943707942962646, "epoch": 0.05530329807770739, "grad_norm": 0.060156773775815964, "grad_norm_var": 0.00020722482022327704, "learning_rate": 0.009910945602191723, "loss": 2.8944, "step": 1017 }, { "crossentropy": 2.8732614517211914, "epoch": 0.05535767693520759, "grad_norm": 0.0545065701007843, "grad_norm_var": 1.2709032472689395e-05, "learning_rate": 0.009910711486548028, "loss": 2.8733, "step": 1018 }, { "crossentropy": 2.906235456466675, "epoch": 0.0554120557927078, "grad_norm": 0.05495147407054901, "grad_norm_var": 1.2757735640370136e-05, "learning_rate": 0.009910477066346585, "loss": 2.9062, "step": 1019 }, { "crossentropy": 2.9554742574691772, "epoch": 0.055466434650208, "grad_norm": 0.05401645973324776, "grad_norm_var": 1.2893578974034389e-05, "learning_rate": 0.009910242341601934, "loss": 2.9555, "step": 1020 }, { "crossentropy": 2.809577703475952, "epoch": 0.055520813507708204, "grad_norm": 0.05066491663455963, "grad_norm_var": 1.3827006112455443e-05, "learning_rate": 0.009910007312328628, "loss": 2.8096, "step": 1021 }, { "crossentropy": 2.8703378438949585, "epoch": 0.05557519236520841, "grad_norm": 0.05129409208893776, "grad_norm_var": 1.4478249460049784e-05, "learning_rate": 0.00990977197854125, "loss": 2.8703, "step": 1022 }, { "crossentropy": 2.9111716747283936, "epoch": 0.05562957122270861, "grad_norm": 0.05298768728971481, "grad_norm_var": 1.3069023097156176e-05, "learning_rate": 0.009909536340254389, "loss": 2.9112, "step": 1023 }, { "crossentropy": 2.8497653007507324, "epoch": 0.055683950080208815, "grad_norm": 0.054153598845005035, "grad_norm_var": 1.1822669570373211e-05, "learning_rate": 0.009909300397482659, "loss": 2.8498, "step": 1024 }, { "crossentropy": 2.88861882686615, "epoch": 0.05573832893770902, "grad_norm": 0.05217840522527695, "grad_norm_var": 1.19246311513721e-05, "learning_rate": 0.009909064150240699, "loss": 2.8886, "step": 1025 }, { "crossentropy": 2.8528014421463013, "epoch": 0.05579270779520922, "grad_norm": 0.05230042710900307, "grad_norm_var": 1.1787474239813705e-05, "learning_rate": 0.009908827598543155, "loss": 2.8528, "step": 1026 }, { "crossentropy": 3.0384305715560913, "epoch": 0.055847086652709425, "grad_norm": 0.05346693471074104, "grad_norm_var": 8.375107643431429e-06, "learning_rate": 0.0099085907424047, "loss": 3.0384, "step": 1027 }, { "crossentropy": 2.8944531679153442, "epoch": 0.05590146551020963, "grad_norm": 0.05451442301273346, "grad_norm_var": 6.361959878134482e-06, "learning_rate": 0.009908353581840024, "loss": 2.8945, "step": 1028 }, { "crossentropy": 2.9662495851516724, "epoch": 0.05595584436770983, "grad_norm": 0.0631289854645729, "grad_norm_var": 1.1923893603915427e-05, "learning_rate": 0.009908116116863834, "loss": 2.9662, "step": 1029 }, { "crossentropy": 2.9491021633148193, "epoch": 0.056010223225210036, "grad_norm": 0.053766194730997086, "grad_norm_var": 1.1731990539638285e-05, "learning_rate": 0.00990787834749086, "loss": 2.9491, "step": 1030 }, { "crossentropy": 2.962319493293762, "epoch": 0.05606460208271024, "grad_norm": 0.054840657860040665, "grad_norm_var": 1.0104953537667013e-05, "learning_rate": 0.009907640273735845, "loss": 2.9623, "step": 1031 }, { "crossentropy": 2.9393322467803955, "epoch": 0.05611898094021044, "grad_norm": 0.05349424481391907, "grad_norm_var": 9.806314144407355e-06, "learning_rate": 0.009907401895613557, "loss": 2.9393, "step": 1032 }, { "crossentropy": 3.004130721092224, "epoch": 0.05617335979771065, "grad_norm": 0.05413435027003288, "grad_norm_var": 7.451628927663117e-06, "learning_rate": 0.009907163213138778, "loss": 3.0041, "step": 1033 }, { "crossentropy": 2.9246782064437866, "epoch": 0.05622773865521086, "grad_norm": 0.0533953420817852, "grad_norm_var": 7.457449081553028e-06, "learning_rate": 0.009906924226326311, "loss": 2.9247, "step": 1034 }, { "crossentropy": 2.7860310077667236, "epoch": 0.05628211751271106, "grad_norm": 0.0604107603430748, "grad_norm_var": 1.0045152514041172e-05, "learning_rate": 0.009906684935190977, "loss": 2.786, "step": 1035 }, { "crossentropy": 3.0003668069839478, "epoch": 0.056336496370211264, "grad_norm": 0.05326448008418083, "grad_norm_var": 1.0108594354030221e-05, "learning_rate": 0.00990644533974762, "loss": 3.0004, "step": 1036 }, { "crossentropy": 2.9327585697174072, "epoch": 0.05639087522771147, "grad_norm": 0.05663957819342613, "grad_norm_var": 9.483900036322673e-06, "learning_rate": 0.009906205440011097, "loss": 2.9328, "step": 1037 }, { "crossentropy": 2.8264747858047485, "epoch": 0.05644525408521167, "grad_norm": 0.05568482354283333, "grad_norm_var": 8.73988317126219e-06, "learning_rate": 0.009905965235996286, "loss": 2.8265, "step": 1038 }, { "crossentropy": 2.924909234046936, "epoch": 0.056499632942711875, "grad_norm": 0.05612753704190254, "grad_norm_var": 8.556489157570078e-06, "learning_rate": 0.009905724727718085, "loss": 2.9249, "step": 1039 }, { "crossentropy": 2.8275959491729736, "epoch": 0.05655401180021208, "grad_norm": 0.055637214332818985, "grad_norm_var": 8.508073274415307e-06, "learning_rate": 0.00990548391519141, "loss": 2.8276, "step": 1040 }, { "crossentropy": 2.99234676361084, "epoch": 0.05660839065771228, "grad_norm": 0.05266078934073448, "grad_norm_var": 8.329140949246e-06, "learning_rate": 0.009905242798431196, "loss": 2.9923, "step": 1041 }, { "crossentropy": 2.8579734563827515, "epoch": 0.056662769515212486, "grad_norm": 0.054702285677194595, "grad_norm_var": 7.755777879962095e-06, "learning_rate": 0.009905001377452398, "loss": 2.858, "step": 1042 }, { "crossentropy": 2.929396152496338, "epoch": 0.05671714837271269, "grad_norm": 0.05951521545648575, "grad_norm_var": 8.510021647014678e-06, "learning_rate": 0.009904759652269985, "loss": 2.9294, "step": 1043 }, { "crossentropy": 2.994704484939575, "epoch": 0.05677152723021289, "grad_norm": 0.05450095236301422, "grad_norm_var": 8.512242861053304e-06, "learning_rate": 0.009904517622898953, "loss": 2.9947, "step": 1044 }, { "crossentropy": 2.8413360118865967, "epoch": 0.056825906087713096, "grad_norm": 0.05218162760138512, "grad_norm_var": 5.223002284430549e-06, "learning_rate": 0.00990427528935431, "loss": 2.8413, "step": 1045 }, { "crossentropy": 2.9436535835266113, "epoch": 0.0568802849452133, "grad_norm": 0.058487407863140106, "grad_norm_var": 5.801829327212587e-06, "learning_rate": 0.009904032651651086, "loss": 2.9437, "step": 1046 }, { "crossentropy": 2.991308808326721, "epoch": 0.0569346638027135, "grad_norm": 0.05722486600279808, "grad_norm_var": 5.993655163698206e-06, "learning_rate": 0.009903789709804328, "loss": 2.9913, "step": 1047 }, { "crossentropy": 2.8751128911972046, "epoch": 0.05698904266021371, "grad_norm": 0.062019508332014084, "grad_norm_var": 8.25184958471517e-06, "learning_rate": 0.009903546463829105, "loss": 2.8751, "step": 1048 }, { "crossentropy": 2.84070885181427, "epoch": 0.05704342151771391, "grad_norm": 0.06911378353834152, "grad_norm_var": 1.8476388227015894e-05, "learning_rate": 0.009903302913740501, "loss": 2.8407, "step": 1049 }, { "crossentropy": 2.771955728530884, "epoch": 0.057097800375214114, "grad_norm": 0.05888895317912102, "grad_norm_var": 1.77421389373553e-05, "learning_rate": 0.009903059059553623, "loss": 2.772, "step": 1050 }, { "crossentropy": 2.8804686069488525, "epoch": 0.05715217923271432, "grad_norm": 0.05385037884116173, "grad_norm_var": 1.7725217453826854e-05, "learning_rate": 0.009902814901283592, "loss": 2.8805, "step": 1051 }, { "crossentropy": 2.978422999382019, "epoch": 0.05720655809021453, "grad_norm": 0.05863388627767563, "grad_norm_var": 1.6919932992554415e-05, "learning_rate": 0.009902570438945553, "loss": 2.9784, "step": 1052 }, { "crossentropy": 2.940023899078369, "epoch": 0.05726093694771473, "grad_norm": 0.0570342056453228, "grad_norm_var": 1.6897979043291728e-05, "learning_rate": 0.009902325672554665, "loss": 2.94, "step": 1053 }, { "crossentropy": 3.0048539638519287, "epoch": 0.057315315805214935, "grad_norm": 0.05267374590039253, "grad_norm_var": 1.809963340162011e-05, "learning_rate": 0.009902080602126108, "loss": 3.0049, "step": 1054 }, { "crossentropy": 2.8445550203323364, "epoch": 0.05736969466271514, "grad_norm": 0.04953144118189812, "grad_norm_var": 2.1655065591377695e-05, "learning_rate": 0.009901835227675085, "loss": 2.8446, "step": 1055 }, { "crossentropy": 2.950771689414978, "epoch": 0.05742407352021534, "grad_norm": 0.054371777921915054, "grad_norm_var": 2.1928733143507645e-05, "learning_rate": 0.00990158954921681, "loss": 2.9508, "step": 1056 }, { "crossentropy": 2.9183539152145386, "epoch": 0.057478452377715546, "grad_norm": 0.0613798052072525, "grad_norm_var": 2.2115786792210414e-05, "learning_rate": 0.00990134356676652, "loss": 2.9184, "step": 1057 }, { "crossentropy": 2.9054521322250366, "epoch": 0.05753283123521575, "grad_norm": 0.059896279126405716, "grad_norm_var": 2.211932220136936e-05, "learning_rate": 0.009901097280339473, "loss": 2.9055, "step": 1058 }, { "crossentropy": 2.8782060146331787, "epoch": 0.05758721009271595, "grad_norm": 0.05203811824321747, "grad_norm_var": 2.356106955939107e-05, "learning_rate": 0.00990085068995094, "loss": 2.8782, "step": 1059 }, { "crossentropy": 2.7765235900878906, "epoch": 0.057641588950216156, "grad_norm": 0.053993109613657, "grad_norm_var": 2.3745671753186368e-05, "learning_rate": 0.009900603795616218, "loss": 2.7765, "step": 1060 }, { "crossentropy": 2.9164477586746216, "epoch": 0.05769596780771636, "grad_norm": 0.05271512269973755, "grad_norm_var": 2.342374462986352e-05, "learning_rate": 0.009900356597350618, "loss": 2.9164, "step": 1061 }, { "crossentropy": 2.9783726930618286, "epoch": 0.05775034666521656, "grad_norm": 0.061556439846754074, "grad_norm_var": 2.462485827874842e-05, "learning_rate": 0.009900109095169468, "loss": 2.9784, "step": 1062 }, { "crossentropy": 3.031753182411194, "epoch": 0.05780472552271677, "grad_norm": 0.06130367890000343, "grad_norm_var": 2.56876450253166e-05, "learning_rate": 0.009899861289088121, "loss": 3.0318, "step": 1063 }, { "crossentropy": 2.9535473585128784, "epoch": 0.05785910438021697, "grad_norm": 0.051064785569906235, "grad_norm_var": 2.64954208904317e-05, "learning_rate": 0.009899613179121946, "loss": 2.9535, "step": 1064 }, { "crossentropy": 2.794206380844116, "epoch": 0.057913483237717174, "grad_norm": 0.05771440267562866, "grad_norm_var": 1.582943169205897e-05, "learning_rate": 0.00989936476528633, "loss": 2.7942, "step": 1065 }, { "crossentropy": 2.988542318344116, "epoch": 0.05796786209521738, "grad_norm": 0.05655577406287193, "grad_norm_var": 1.52835012859527e-05, "learning_rate": 0.009899116047596676, "loss": 2.9885, "step": 1066 }, { "crossentropy": 2.784671664237976, "epoch": 0.05802224095271758, "grad_norm": 0.05913137272000313, "grad_norm_var": 1.5587183247444252e-05, "learning_rate": 0.009898867026068417, "loss": 2.7847, "step": 1067 }, { "crossentropy": 2.8843374252319336, "epoch": 0.058076619810217785, "grad_norm": 0.05789314582943916, "grad_norm_var": 1.538352480015895e-05, "learning_rate": 0.009898617700716988, "loss": 2.8843, "step": 1068 }, { "crossentropy": 3.050257444381714, "epoch": 0.05813099866771799, "grad_norm": 0.05617375299334526, "grad_norm_var": 1.533160587380081e-05, "learning_rate": 0.009898368071557858, "loss": 3.0503, "step": 1069 }, { "crossentropy": 2.7148643732070923, "epoch": 0.05818537752521819, "grad_norm": 0.058550089597702026, "grad_norm_var": 1.4786073530430726e-05, "learning_rate": 0.009898118138606507, "loss": 2.7149, "step": 1070 }, { "crossentropy": 2.8052570819854736, "epoch": 0.0582397563827184, "grad_norm": 0.07279899716377258, "grad_norm_var": 2.702880855872461e-05, "learning_rate": 0.009897867901878434, "loss": 2.8053, "step": 1071 }, { "crossentropy": 2.853406071662903, "epoch": 0.058294135240218606, "grad_norm": 0.054197657853364944, "grad_norm_var": 2.7113683539851246e-05, "learning_rate": 0.009897617361389163, "loss": 2.8534, "step": 1072 }, { "crossentropy": 3.0644842386245728, "epoch": 0.05834851409771881, "grad_norm": 0.05522150918841362, "grad_norm_var": 2.6655550908697735e-05, "learning_rate": 0.009897366517154227, "loss": 3.0645, "step": 1073 }, { "crossentropy": 2.9558730125427246, "epoch": 0.05840289295521901, "grad_norm": 0.05306914448738098, "grad_norm_var": 2.7433120572992186e-05, "learning_rate": 0.009897115369189184, "loss": 2.9559, "step": 1074 }, { "crossentropy": 2.9324827194213867, "epoch": 0.058457271812719216, "grad_norm": 0.053423214703798294, "grad_norm_var": 2.6613847719476834e-05, "learning_rate": 0.009896863917509616, "loss": 2.9325, "step": 1075 }, { "crossentropy": 2.8407888412475586, "epoch": 0.05851165067021942, "grad_norm": 0.055480822920799255, "grad_norm_var": 2.6114043066244318e-05, "learning_rate": 0.00989661216213111, "loss": 2.8408, "step": 1076 }, { "crossentropy": 2.819665789604187, "epoch": 0.058566029527719624, "grad_norm": 0.05370417982339859, "grad_norm_var": 2.5570143908196924e-05, "learning_rate": 0.009896360103069282, "loss": 2.8197, "step": 1077 }, { "crossentropy": 2.90505051612854, "epoch": 0.05862040838521983, "grad_norm": 0.0519905649125576, "grad_norm_var": 2.5943212456278758e-05, "learning_rate": 0.009896107740339766, "loss": 2.9051, "step": 1078 }, { "crossentropy": 2.8768398761749268, "epoch": 0.05867478724272003, "grad_norm": 0.05752614885568619, "grad_norm_var": 2.45501130903565e-05, "learning_rate": 0.009895855073958213, "loss": 2.8768, "step": 1079 }, { "crossentropy": 2.923078775405884, "epoch": 0.058729166100220234, "grad_norm": 0.05951155349612236, "grad_norm_var": 2.2853140886966843e-05, "learning_rate": 0.009895602103940294, "loss": 2.9231, "step": 1080 }, { "crossentropy": 2.9017776250839233, "epoch": 0.05878354495772044, "grad_norm": 0.06950833648443222, "grad_norm_var": 3.257749633082205e-05, "learning_rate": 0.009895348830301695, "loss": 2.9018, "step": 1081 }, { "crossentropy": 2.8604702949523926, "epoch": 0.05883792381522064, "grad_norm": 0.05649639666080475, "grad_norm_var": 3.258753566928521e-05, "learning_rate": 0.009895095253058126, "loss": 2.8605, "step": 1082 }, { "crossentropy": 2.890753984451294, "epoch": 0.058892302672720845, "grad_norm": 0.052322469651699066, "grad_norm_var": 3.426943102679171e-05, "learning_rate": 0.009894841372225315, "loss": 2.8908, "step": 1083 }, { "crossentropy": 2.86702561378479, "epoch": 0.05894668153022105, "grad_norm": 0.05399041995406151, "grad_norm_var": 3.494746770542802e-05, "learning_rate": 0.009894587187819001, "loss": 2.867, "step": 1084 }, { "crossentropy": 2.9127845764160156, "epoch": 0.05900106038772125, "grad_norm": 0.06585217267274857, "grad_norm_var": 3.957721531056557e-05, "learning_rate": 0.009894332699854956, "loss": 2.9128, "step": 1085 }, { "crossentropy": 2.9385393857955933, "epoch": 0.059055439245221455, "grad_norm": 0.05797111243009567, "grad_norm_var": 3.953468255559548e-05, "learning_rate": 0.00989407790834896, "loss": 2.9385, "step": 1086 }, { "crossentropy": 2.8376402854919434, "epoch": 0.05910981810272166, "grad_norm": 0.058510903269052505, "grad_norm_var": 2.3513139350559744e-05, "learning_rate": 0.009893822813316813, "loss": 2.8376, "step": 1087 }, { "crossentropy": 2.893911361694336, "epoch": 0.05916419696022186, "grad_norm": 0.049645356833934784, "grad_norm_var": 2.6387019857893047e-05, "learning_rate": 0.00989356741477434, "loss": 2.8939, "step": 1088 }, { "crossentropy": 2.8361960649490356, "epoch": 0.05921857581772207, "grad_norm": 0.05021583288908005, "grad_norm_var": 2.881572108730379e-05, "learning_rate": 0.009893311712737376, "loss": 2.8362, "step": 1089 }, { "crossentropy": 2.9098150730133057, "epoch": 0.05927295467522228, "grad_norm": 0.05090563744306564, "grad_norm_var": 3.001175503744193e-05, "learning_rate": 0.009893055707221782, "loss": 2.9098, "step": 1090 }, { "crossentropy": 3.0103721618652344, "epoch": 0.05932733353272248, "grad_norm": 0.06791631877422333, "grad_norm_var": 3.8033035079657976e-05, "learning_rate": 0.009892799398243437, "loss": 3.0104, "step": 1091 }, { "crossentropy": 2.926972985267639, "epoch": 0.059381712390222684, "grad_norm": 0.06745930016040802, "grad_norm_var": 4.461955241323127e-05, "learning_rate": 0.009892542785818234, "loss": 2.927, "step": 1092 }, { "crossentropy": 2.8445138931274414, "epoch": 0.05943609124772289, "grad_norm": 0.05911894142627716, "grad_norm_var": 4.355243307599006e-05, "learning_rate": 0.009892285869962087, "loss": 2.8445, "step": 1093 }, { "crossentropy": 2.8881677389144897, "epoch": 0.05949047010522309, "grad_norm": 0.058259766548871994, "grad_norm_var": 4.0936430280067573e-05, "learning_rate": 0.009892028650690934, "loss": 2.8882, "step": 1094 }, { "crossentropy": 3.0374823808670044, "epoch": 0.059544848962723294, "grad_norm": 0.05541932210326195, "grad_norm_var": 4.14735567451839e-05, "learning_rate": 0.009891771128020723, "loss": 3.0375, "step": 1095 }, { "crossentropy": 2.8809423446655273, "epoch": 0.0595992278202235, "grad_norm": 0.055207639932632446, "grad_norm_var": 4.194692757726326e-05, "learning_rate": 0.009891513301967428, "loss": 2.8809, "step": 1096 }, { "crossentropy": 3.0382239818573, "epoch": 0.0596536066777237, "grad_norm": 0.05332674831151962, "grad_norm_var": 3.359027738409903e-05, "learning_rate": 0.009891255172547037, "loss": 3.0382, "step": 1097 }, { "crossentropy": 2.917035222053528, "epoch": 0.059707985535223905, "grad_norm": 0.05184045806527138, "grad_norm_var": 3.5281761715963826e-05, "learning_rate": 0.009890996739775561, "loss": 2.917, "step": 1098 }, { "crossentropy": 2.8227332830429077, "epoch": 0.05976236439272411, "grad_norm": 0.06579136103391647, "grad_norm_var": 3.867298156086148e-05, "learning_rate": 0.009890738003669028, "loss": 2.8227, "step": 1099 }, { "crossentropy": 2.86754834651947, "epoch": 0.05981674325022431, "grad_norm": 0.061548277735710144, "grad_norm_var": 3.8616257367982826e-05, "learning_rate": 0.009890478964243483, "loss": 2.8675, "step": 1100 }, { "crossentropy": 2.914805054664612, "epoch": 0.059871122107724516, "grad_norm": 0.05541679263114929, "grad_norm_var": 3.458296012651685e-05, "learning_rate": 0.009890219621514993, "loss": 2.9148, "step": 1101 }, { "crossentropy": 2.960595488548279, "epoch": 0.05992550096522472, "grad_norm": 0.049860067665576935, "grad_norm_var": 3.80875267100872e-05, "learning_rate": 0.009889959975499641, "loss": 2.9606, "step": 1102 }, { "crossentropy": 2.834489583969116, "epoch": 0.05997987982272492, "grad_norm": 0.0545930415391922, "grad_norm_var": 3.820676799064313e-05, "learning_rate": 0.00988970002621353, "loss": 2.8345, "step": 1103 }, { "crossentropy": 2.9473878145217896, "epoch": 0.060034258680225126, "grad_norm": 0.05446445196866989, "grad_norm_var": 3.515242789413244e-05, "learning_rate": 0.00988943977367278, "loss": 2.9474, "step": 1104 }, { "crossentropy": 2.83147132396698, "epoch": 0.06008863753772533, "grad_norm": 0.05201788619160652, "grad_norm_var": 3.373518456975695e-05, "learning_rate": 0.009889179217893536, "loss": 2.8315, "step": 1105 }, { "crossentropy": 2.905728816986084, "epoch": 0.06014301639522553, "grad_norm": 0.060736656188964844, "grad_norm_var": 3.169334982271084e-05, "learning_rate": 0.009888918358891954, "loss": 2.9057, "step": 1106 }, { "crossentropy": 2.876832604408264, "epoch": 0.06019739525272574, "grad_norm": 0.05830924212932587, "grad_norm_var": 2.4357466534030704e-05, "learning_rate": 0.009888657196684213, "loss": 2.8768, "step": 1107 }, { "crossentropy": 2.9430984258651733, "epoch": 0.06025177411022595, "grad_norm": 0.05664456635713577, "grad_norm_var": 1.6708895167952957e-05, "learning_rate": 0.009888395731286512, "loss": 2.9431, "step": 1108 }, { "crossentropy": 2.922698140144348, "epoch": 0.06030615296772615, "grad_norm": 0.052545614540576935, "grad_norm_var": 1.7034938133160073e-05, "learning_rate": 0.009888133962715064, "loss": 2.9227, "step": 1109 }, { "crossentropy": 2.939542770385742, "epoch": 0.060360531825226355, "grad_norm": 0.0743401050567627, "grad_norm_var": 3.8043486411803015e-05, "learning_rate": 0.009887871890986104, "loss": 2.9395, "step": 1110 }, { "crossentropy": 3.00255286693573, "epoch": 0.06041491068272656, "grad_norm": 0.06450735777616501, "grad_norm_var": 4.128543030750091e-05, "learning_rate": 0.009887609516115888, "loss": 3.0026, "step": 1111 }, { "crossentropy": 2.841096878051758, "epoch": 0.06046928954022676, "grad_norm": 0.057943735271692276, "grad_norm_var": 4.0890809816948535e-05, "learning_rate": 0.009887346838120686, "loss": 2.8411, "step": 1112 }, { "crossentropy": 2.857108473777771, "epoch": 0.060523668397726965, "grad_norm": 0.06374946236610413, "grad_norm_var": 4.154326875762341e-05, "learning_rate": 0.00988708385701679, "loss": 2.8571, "step": 1113 }, { "crossentropy": 2.8082796335220337, "epoch": 0.06057804725522717, "grad_norm": 0.06053493916988373, "grad_norm_var": 3.867023984582066e-05, "learning_rate": 0.009886820572820507, "loss": 2.8083, "step": 1114 }, { "crossentropy": 2.8549927473068237, "epoch": 0.06063242611272737, "grad_norm": 0.05485367402434349, "grad_norm_var": 3.6152241527642676e-05, "learning_rate": 0.00988655698554817, "loss": 2.855, "step": 1115 }, { "crossentropy": 2.9778581857681274, "epoch": 0.060686804970227576, "grad_norm": 0.059113532304763794, "grad_norm_var": 3.545334820824978e-05, "learning_rate": 0.009886293095216126, "loss": 2.9779, "step": 1116 }, { "crossentropy": 2.8256407976150513, "epoch": 0.06074118382772778, "grad_norm": 0.061853691935539246, "grad_norm_var": 3.5738411749184663e-05, "learning_rate": 0.009886028901840737, "loss": 2.8256, "step": 1117 }, { "crossentropy": 2.895656704902649, "epoch": 0.06079556268522798, "grad_norm": 0.061603039503097534, "grad_norm_var": 3.082254411501528e-05, "learning_rate": 0.009885764405438393, "loss": 2.8957, "step": 1118 }, { "crossentropy": 2.844050407409668, "epoch": 0.060849941542728186, "grad_norm": 0.05144992843270302, "grad_norm_var": 3.3386687405534175e-05, "learning_rate": 0.009885499606025494, "loss": 2.8441, "step": 1119 }, { "crossentropy": 2.9195988178253174, "epoch": 0.06090432040022839, "grad_norm": 0.06065446510910988, "grad_norm_var": 3.200365518545989e-05, "learning_rate": 0.009885234503618464, "loss": 2.9196, "step": 1120 }, { "crossentropy": 2.9358668327331543, "epoch": 0.060958699257728594, "grad_norm": 0.06385622173547745, "grad_norm_var": 2.9065360502568648e-05, "learning_rate": 0.009884969098233744, "loss": 2.9359, "step": 1121 }, { "crossentropy": 2.84194552898407, "epoch": 0.0610130781152288, "grad_norm": 0.05884760618209839, "grad_norm_var": 2.9145292630022627e-05, "learning_rate": 0.009884703389887797, "loss": 2.8419, "step": 1122 }, { "crossentropy": 2.929875612258911, "epoch": 0.061067456972729, "grad_norm": 0.057115551084280014, "grad_norm_var": 2.951147716006615e-05, "learning_rate": 0.0098844373785971, "loss": 2.9299, "step": 1123 }, { "crossentropy": 2.8453893661499023, "epoch": 0.061121835830229204, "grad_norm": 0.0511397160589695, "grad_norm_var": 3.385052890388651e-05, "learning_rate": 0.00988417106437815, "loss": 2.8454, "step": 1124 }, { "crossentropy": 2.955026626586914, "epoch": 0.06117621468772941, "grad_norm": 0.1321335881948471, "grad_norm_var": 0.00035454429600107265, "learning_rate": 0.009883904447247462, "loss": 2.955, "step": 1125 }, { "crossentropy": 2.8982696533203125, "epoch": 0.06123059354522961, "grad_norm": 0.052006930112838745, "grad_norm_var": 0.0003567317830990085, "learning_rate": 0.009883637527221574, "loss": 2.8983, "step": 1126 }, { "crossentropy": 2.9657976627349854, "epoch": 0.06128497240272982, "grad_norm": 0.055323097854852676, "grad_norm_var": 0.00036041525791489403, "learning_rate": 0.009883370304317041, "loss": 2.9658, "step": 1127 }, { "crossentropy": 2.8815832138061523, "epoch": 0.061339351260230025, "grad_norm": 0.0570850595831871, "grad_norm_var": 0.00036099858123070535, "learning_rate": 0.009883102778550434, "loss": 2.8816, "step": 1128 }, { "crossentropy": 2.9171576499938965, "epoch": 0.06139373011773023, "grad_norm": 0.06141043081879616, "grad_norm_var": 0.00036097659126221773, "learning_rate": 0.009882834949938343, "loss": 2.9172, "step": 1129 }, { "crossentropy": 2.9121071100234985, "epoch": 0.06144810897523043, "grad_norm": 0.05712506175041199, "grad_norm_var": 0.000362567768676282, "learning_rate": 0.009882566818497385, "loss": 2.9121, "step": 1130 }, { "crossentropy": 2.9551690816879272, "epoch": 0.061502487832730636, "grad_norm": 0.05275676026940346, "grad_norm_var": 0.00036490302574720176, "learning_rate": 0.009882298384244181, "loss": 2.9552, "step": 1131 }, { "crossentropy": 2.848042130470276, "epoch": 0.06155686669023084, "grad_norm": 0.058044370263814926, "grad_norm_var": 0.0003653990891302981, "learning_rate": 0.009882029647195385, "loss": 2.848, "step": 1132 }, { "crossentropy": 2.8730580806732178, "epoch": 0.06161124554773104, "grad_norm": 0.05653269588947296, "grad_norm_var": 0.0003672904335506664, "learning_rate": 0.009881760607367663, "loss": 2.8731, "step": 1133 }, { "crossentropy": 2.924229621887207, "epoch": 0.06166562440523125, "grad_norm": 0.0659513920545578, "grad_norm_var": 0.00036842016284184634, "learning_rate": 0.009881491264777697, "loss": 2.9242, "step": 1134 }, { "crossentropy": 2.8456201553344727, "epoch": 0.06172000326273145, "grad_norm": 0.06017456203699112, "grad_norm_var": 0.00036094611324249026, "learning_rate": 0.009881221619442195, "loss": 2.8456, "step": 1135 }, { "crossentropy": 2.8522162437438965, "epoch": 0.061774382120231654, "grad_norm": 0.05785418301820755, "grad_norm_var": 0.00036212895666660355, "learning_rate": 0.009880951671377878, "loss": 2.8522, "step": 1136 }, { "crossentropy": 2.9658656120300293, "epoch": 0.06182876097773186, "grad_norm": 0.05152197927236557, "grad_norm_var": 0.000369135268674488, "learning_rate": 0.00988068142060149, "loss": 2.9659, "step": 1137 }, { "crossentropy": 2.8215084075927734, "epoch": 0.06188313983523206, "grad_norm": 0.0608871765434742, "grad_norm_var": 0.00036865657287953084, "learning_rate": 0.009880410867129789, "loss": 2.8215, "step": 1138 }, { "crossentropy": 3.0165354013442993, "epoch": 0.061937518692732264, "grad_norm": 0.06615185737609863, "grad_norm_var": 0.0003682468185383434, "learning_rate": 0.009880140010979557, "loss": 3.0165, "step": 1139 }, { "crossentropy": 2.8048681020736694, "epoch": 0.06199189755023247, "grad_norm": 0.06469455361366272, "grad_norm_var": 0.0003596392569405364, "learning_rate": 0.00987986885216759, "loss": 2.8049, "step": 1140 }, { "crossentropy": 2.9338009357452393, "epoch": 0.06204627640773267, "grad_norm": 0.05446870997548103, "grad_norm_var": 2.1798896877061232e-05, "learning_rate": 0.009879597390710709, "loss": 2.9338, "step": 1141 }, { "crossentropy": 2.7520523071289062, "epoch": 0.062100655265232875, "grad_norm": 0.054201722145080566, "grad_norm_var": 2.027320551529488e-05, "learning_rate": 0.009879325626625744, "loss": 2.7521, "step": 1142 }, { "crossentropy": 2.978917717933655, "epoch": 0.06215503412273308, "grad_norm": 0.05370227247476578, "grad_norm_var": 2.1099424455799964e-05, "learning_rate": 0.009879053559929555, "loss": 2.9789, "step": 1143 }, { "crossentropy": 2.9188872575759888, "epoch": 0.06220941298023328, "grad_norm": 0.05581683665513992, "grad_norm_var": 2.1402883841571165e-05, "learning_rate": 0.009878781190639013, "loss": 2.9189, "step": 1144 }, { "crossentropy": 2.81010103225708, "epoch": 0.06226379183773349, "grad_norm": 0.05570796877145767, "grad_norm_var": 2.099877584685305e-05, "learning_rate": 0.009878508518771008, "loss": 2.8101, "step": 1145 }, { "crossentropy": 2.9928869009017944, "epoch": 0.062318170695233696, "grad_norm": 0.0518442802131176, "grad_norm_var": 2.325177621094228e-05, "learning_rate": 0.009878235544342454, "loss": 2.9929, "step": 1146 }, { "crossentropy": 2.93398380279541, "epoch": 0.0623725495527339, "grad_norm": 0.049470625817775726, "grad_norm_var": 2.6013475497390035e-05, "learning_rate": 0.00987796226737028, "loss": 2.934, "step": 1147 }, { "crossentropy": 2.875606417655945, "epoch": 0.0624269284102341, "grad_norm": 0.05304339900612831, "grad_norm_var": 2.7089623930823258e-05, "learning_rate": 0.009877688687871434, "loss": 2.8756, "step": 1148 }, { "crossentropy": 2.9113155603408813, "epoch": 0.06248130726773431, "grad_norm": 0.051295842975378036, "grad_norm_var": 2.9131013674961865e-05, "learning_rate": 0.009877414805862883, "loss": 2.9113, "step": 1149 }, { "crossentropy": 2.924302101135254, "epoch": 0.06253568612523451, "grad_norm": 0.05314796790480614, "grad_norm_var": 2.353920035419061e-05, "learning_rate": 0.009877140621361611, "loss": 2.9243, "step": 1150 }, { "crossentropy": 2.8578433990478516, "epoch": 0.06259006498273471, "grad_norm": 0.05331416800618172, "grad_norm_var": 2.2546953005645313e-05, "learning_rate": 0.009876866134384626, "loss": 2.8578, "step": 1151 }, { "crossentropy": 2.97672963142395, "epoch": 0.06264444384023492, "grad_norm": 0.05935141071677208, "grad_norm_var": 2.3167960537005122e-05, "learning_rate": 0.00987659134494895, "loss": 2.9767, "step": 1152 }, { "crossentropy": 2.8633930683135986, "epoch": 0.06269882269773512, "grad_norm": 0.06099234148859978, "grad_norm_var": 2.370134818731693e-05, "learning_rate": 0.009876316253071627, "loss": 2.8634, "step": 1153 }, { "crossentropy": 2.9195396900177, "epoch": 0.06275320155523532, "grad_norm": 0.060189325362443924, "grad_norm_var": 2.3289210032135557e-05, "learning_rate": 0.009876040858769712, "loss": 2.9195, "step": 1154 }, { "crossentropy": 2.810256004333496, "epoch": 0.06280758041273553, "grad_norm": 0.05583415925502777, "grad_norm_var": 1.609659653836696e-05, "learning_rate": 0.009875765162060292, "loss": 2.8103, "step": 1155 }, { "crossentropy": 2.8382405042648315, "epoch": 0.06286195927023573, "grad_norm": 0.05166173353791237, "grad_norm_var": 1.063463690707811e-05, "learning_rate": 0.009875489162960462, "loss": 2.8382, "step": 1156 }, { "crossentropy": 2.801261067390442, "epoch": 0.06291633812773594, "grad_norm": 0.11539307236671448, "grad_norm_var": 0.00024132946345946458, "learning_rate": 0.00987521286148734, "loss": 2.8013, "step": 1157 }, { "crossentropy": 2.8335005044937134, "epoch": 0.06297071698523614, "grad_norm": 0.05756250396370888, "grad_norm_var": 0.00024013824250113298, "learning_rate": 0.00987493625765806, "loss": 2.8335, "step": 1158 }, { "crossentropy": 2.7148512601852417, "epoch": 0.06302509584273634, "grad_norm": 0.05612402409315109, "grad_norm_var": 0.00023890863016914806, "learning_rate": 0.00987465935148978, "loss": 2.7149, "step": 1159 }, { "crossentropy": 2.8250396251678467, "epoch": 0.06307947470023655, "grad_norm": 0.056767966598272324, "grad_norm_var": 0.00023858725287996627, "learning_rate": 0.009874382142999669, "loss": 2.825, "step": 1160 }, { "crossentropy": 2.8222216367721558, "epoch": 0.06313385355773675, "grad_norm": 0.052141331136226654, "grad_norm_var": 0.00024087950330608273, "learning_rate": 0.009874104632204924, "loss": 2.8222, "step": 1161 }, { "crossentropy": 2.8341174125671387, "epoch": 0.06318823241523695, "grad_norm": 0.05419588088989258, "grad_norm_var": 0.00023909642827492432, "learning_rate": 0.009873826819122753, "loss": 2.8341, "step": 1162 }, { "crossentropy": 2.8567333221435547, "epoch": 0.06324261127273716, "grad_norm": 0.06222447007894516, "grad_norm_var": 0.00023343139298807395, "learning_rate": 0.009873548703770387, "loss": 2.8567, "step": 1163 }, { "crossentropy": 2.921335220336914, "epoch": 0.06329699013023736, "grad_norm": 0.05801676586270332, "grad_norm_var": 0.00023064444411661386, "learning_rate": 0.009873270286165074, "loss": 2.9213, "step": 1164 }, { "crossentropy": 2.8935673236846924, "epoch": 0.06335136898773756, "grad_norm": 0.2217634916305542, "grad_norm_var": 0.0018515472871633372, "learning_rate": 0.00987299156632408, "loss": 2.8936, "step": 1165 }, { "crossentropy": 2.9877151250839233, "epoch": 0.06340574784523777, "grad_norm": 0.07973338663578033, "grad_norm_var": 0.00183406239269825, "learning_rate": 0.009872712544264695, "loss": 2.9877, "step": 1166 }, { "crossentropy": 2.7775025367736816, "epoch": 0.06346012670273797, "grad_norm": 0.06476324051618576, "grad_norm_var": 0.001813418636870441, "learning_rate": 0.009872433220004218, "loss": 2.7775, "step": 1167 }, { "crossentropy": 2.864116072654724, "epoch": 0.06351450556023817, "grad_norm": 0.05980224907398224, "grad_norm_var": 0.0018126157266487196, "learning_rate": 0.009872153593559975, "loss": 2.8641, "step": 1168 }, { "crossentropy": 2.7856342792510986, "epoch": 0.06356888441773838, "grad_norm": 0.06155199185013771, "grad_norm_var": 0.0018117431799629417, "learning_rate": 0.00987187366494931, "loss": 2.7856, "step": 1169 }, { "crossentropy": 2.816585421562195, "epoch": 0.06362326327523858, "grad_norm": 0.05891605466604233, "grad_norm_var": 0.0018140164553653636, "learning_rate": 0.009871593434189583, "loss": 2.8166, "step": 1170 }, { "crossentropy": 2.822769284248352, "epoch": 0.06367764213273878, "grad_norm": 0.05392030254006386, "grad_norm_var": 0.0018186010943324392, "learning_rate": 0.009871312901298173, "loss": 2.8228, "step": 1171 }, { "crossentropy": 2.814505100250244, "epoch": 0.06373202099023899, "grad_norm": 0.059148870408535004, "grad_norm_var": 0.0018010189758040339, "learning_rate": 0.009871032066292475, "loss": 2.8145, "step": 1172 }, { "crossentropy": 2.917352318763733, "epoch": 0.0637863998477392, "grad_norm": 0.06793096661567688, "grad_norm_var": 0.0016751266253000768, "learning_rate": 0.009870750929189914, "loss": 2.9174, "step": 1173 }, { "crossentropy": 2.8173693418502808, "epoch": 0.06384077870523941, "grad_norm": 0.05732974037528038, "grad_norm_var": 0.0016755248627736714, "learning_rate": 0.009870469490007918, "loss": 2.8174, "step": 1174 }, { "crossentropy": 2.8962243795394897, "epoch": 0.06389515756273961, "grad_norm": 0.06830921024084091, "grad_norm_var": 0.0016618208488819886, "learning_rate": 0.009870187748763947, "loss": 2.8962, "step": 1175 }, { "crossentropy": 2.8749712705612183, "epoch": 0.06394953642023982, "grad_norm": 0.07437759637832642, "grad_norm_var": 0.0016477102187586262, "learning_rate": 0.009869905705475471, "loss": 2.875, "step": 1176 }, { "crossentropy": 2.872851610183716, "epoch": 0.06400391527774002, "grad_norm": 0.06064312160015106, "grad_norm_var": 0.0016295659226892578, "learning_rate": 0.009869623360159984, "loss": 2.8729, "step": 1177 }, { "crossentropy": 2.8407528400421143, "epoch": 0.06405829413524022, "grad_norm": 0.054861631244421005, "grad_norm_var": 0.0016279542513876458, "learning_rate": 0.009869340712834995, "loss": 2.8408, "step": 1178 }, { "crossentropy": 2.824641466140747, "epoch": 0.06411267299274043, "grad_norm": 0.05219194293022156, "grad_norm_var": 0.001648265564269541, "learning_rate": 0.009869057763518038, "loss": 2.8246, "step": 1179 }, { "crossentropy": 2.9318565130233765, "epoch": 0.06416705185024063, "grad_norm": 0.053901318460702896, "grad_norm_var": 0.001657040321081786, "learning_rate": 0.009868774512226655, "loss": 2.9319, "step": 1180 }, { "crossentropy": 2.927098035812378, "epoch": 0.06422143070774083, "grad_norm": 0.05708802863955498, "grad_norm_var": 5.968177256937731e-05, "learning_rate": 0.009868490958978416, "loss": 2.9271, "step": 1181 }, { "crossentropy": 2.8480207920074463, "epoch": 0.06427580956524104, "grad_norm": 0.050410497933626175, "grad_norm_var": 4.2248614240421716e-05, "learning_rate": 0.009868207103790907, "loss": 2.848, "step": 1182 }, { "crossentropy": 2.8799761533737183, "epoch": 0.06433018842274124, "grad_norm": 0.05586903169751167, "grad_norm_var": 4.118438371411385e-05, "learning_rate": 0.009867922946681732, "loss": 2.88, "step": 1183 }, { "crossentropy": 2.8953651189804077, "epoch": 0.06438456728024144, "grad_norm": 0.05694405734539032, "grad_norm_var": 4.144288351480841e-05, "learning_rate": 0.009867638487668515, "loss": 2.8954, "step": 1184 }, { "crossentropy": 2.8401330709457397, "epoch": 0.06443894613774165, "grad_norm": 0.05564101040363312, "grad_norm_var": 4.158547875903606e-05, "learning_rate": 0.009867353726768896, "loss": 2.8401, "step": 1185 }, { "crossentropy": 2.8614429235458374, "epoch": 0.06449332499524185, "grad_norm": 0.06459540128707886, "grad_norm_var": 4.38462653949205e-05, "learning_rate": 0.009867068664000538, "loss": 2.8614, "step": 1186 }, { "crossentropy": 2.9237210750579834, "epoch": 0.06454770385274206, "grad_norm": 0.060593925416469574, "grad_norm_var": 4.215640865609452e-05, "learning_rate": 0.009866783299381118, "loss": 2.9237, "step": 1187 }, { "crossentropy": 2.8484925031661987, "epoch": 0.06460208271024226, "grad_norm": 0.05774626508355141, "grad_norm_var": 4.231974162982825e-05, "learning_rate": 0.009866497632928335, "loss": 2.8485, "step": 1188 }, { "crossentropy": 2.8685325384140015, "epoch": 0.06465646156774246, "grad_norm": 0.050800442695617676, "grad_norm_var": 4.08946546585515e-05, "learning_rate": 0.009866211664659905, "loss": 2.8685, "step": 1189 }, { "crossentropy": 2.8625359535217285, "epoch": 0.06471084042524267, "grad_norm": 0.05342230945825577, "grad_norm_var": 4.2305664028458036e-05, "learning_rate": 0.009865925394593567, "loss": 2.8625, "step": 1190 }, { "crossentropy": 2.8466087579727173, "epoch": 0.06476521928274287, "grad_norm": 0.0553150437772274, "grad_norm_var": 3.493198083896285e-05, "learning_rate": 0.00986563882274707, "loss": 2.8466, "step": 1191 }, { "crossentropy": 2.8140729665756226, "epoch": 0.06481959814024307, "grad_norm": 0.053285326808691025, "grad_norm_var": 1.4288291827916725e-05, "learning_rate": 0.009865351949138191, "loss": 2.8141, "step": 1192 }, { "crossentropy": 2.9065980911254883, "epoch": 0.06487397699774328, "grad_norm": 0.05153618007898331, "grad_norm_var": 1.3629667995910395e-05, "learning_rate": 0.00986506477378472, "loss": 2.9066, "step": 1193 }, { "crossentropy": 2.94264018535614, "epoch": 0.06492835585524348, "grad_norm": 0.05399647727608681, "grad_norm_var": 1.3722707865693184e-05, "learning_rate": 0.009864777296704466, "loss": 2.9426, "step": 1194 }, { "crossentropy": 2.818272113800049, "epoch": 0.06498273471274368, "grad_norm": 0.06078116223216057, "grad_norm_var": 1.4878886381855006e-05, "learning_rate": 0.00986448951791526, "loss": 2.8183, "step": 1195 }, { "crossentropy": 2.86555278301239, "epoch": 0.06503711357024389, "grad_norm": 0.05784397944808006, "grad_norm_var": 1.4881007999424135e-05, "learning_rate": 0.00986420143743495, "loss": 2.8656, "step": 1196 }, { "crossentropy": 2.906923294067383, "epoch": 0.06509149242774409, "grad_norm": 0.055469900369644165, "grad_norm_var": 1.4808146965174472e-05, "learning_rate": 0.009863913055281401, "loss": 2.9069, "step": 1197 }, { "crossentropy": 3.019923448562622, "epoch": 0.0651458712852443, "grad_norm": 0.05467040464282036, "grad_norm_var": 1.2829642379936594e-05, "learning_rate": 0.0098636243714725, "loss": 3.0199, "step": 1198 }, { "crossentropy": 2.7977683544158936, "epoch": 0.0652002501427445, "grad_norm": 0.05490119755268097, "grad_norm_var": 1.2925338327483768e-05, "learning_rate": 0.00986333538602615, "loss": 2.7978, "step": 1199 }, { "crossentropy": 2.8769619464874268, "epoch": 0.0652546290002447, "grad_norm": 0.056459397077560425, "grad_norm_var": 1.2885245284859475e-05, "learning_rate": 0.00986304609896027, "loss": 2.877, "step": 1200 }, { "crossentropy": 2.8334555625915527, "epoch": 0.0653090078577449, "grad_norm": 0.05450551211833954, "grad_norm_var": 1.3030196318190336e-05, "learning_rate": 0.00986275651029281, "loss": 2.8335, "step": 1201 }, { "crossentropy": 2.879557490348816, "epoch": 0.06536338671524511, "grad_norm": 0.051683828234672546, "grad_norm_var": 8.643844724123307e-06, "learning_rate": 0.009862466620041721, "loss": 2.8796, "step": 1202 }, { "crossentropy": 2.81711483001709, "epoch": 0.06541776557274531, "grad_norm": 0.05546850338578224, "grad_norm_var": 6.591506612744352e-06, "learning_rate": 0.009862176428224987, "loss": 2.8171, "step": 1203 }, { "crossentropy": 2.9018919467926025, "epoch": 0.06547214443024552, "grad_norm": 0.0592728853225708, "grad_norm_var": 7.323062559166406e-06, "learning_rate": 0.009861885934860605, "loss": 2.9019, "step": 1204 }, { "crossentropy": 2.868003249168396, "epoch": 0.06552652328774572, "grad_norm": 0.054320208728313446, "grad_norm_var": 6.143729103908329e-06, "learning_rate": 0.009861595139966586, "loss": 2.868, "step": 1205 }, { "crossentropy": 2.8585941791534424, "epoch": 0.06558090214524592, "grad_norm": 0.05086341127753258, "grad_norm_var": 7.153792353446301e-06, "learning_rate": 0.009861304043560972, "loss": 2.8586, "step": 1206 }, { "crossentropy": 2.7366620302200317, "epoch": 0.06563528100274613, "grad_norm": 0.05041535198688507, "grad_norm_var": 8.463659882546453e-06, "learning_rate": 0.009861012645661813, "loss": 2.7367, "step": 1207 }, { "crossentropy": 2.91241455078125, "epoch": 0.06568965986024633, "grad_norm": 0.055331602692604065, "grad_norm_var": 8.334720143121412e-06, "learning_rate": 0.009860720946287182, "loss": 2.9124, "step": 1208 }, { "crossentropy": 2.904722809791565, "epoch": 0.06574403871774653, "grad_norm": 0.050021104514598846, "grad_norm_var": 9.146601007834625e-06, "learning_rate": 0.009860428945455168, "loss": 2.9047, "step": 1209 }, { "crossentropy": 2.8779021501541138, "epoch": 0.06579841757524675, "grad_norm": 0.05343492701649666, "grad_norm_var": 9.222751516039378e-06, "learning_rate": 0.009860136643183883, "loss": 2.8779, "step": 1210 }, { "crossentropy": 2.8977768421173096, "epoch": 0.06585279643274695, "grad_norm": 0.05661629140377045, "grad_norm_var": 6.938365406027914e-06, "learning_rate": 0.009859844039491457, "loss": 2.8978, "step": 1211 }, { "crossentropy": 2.9003454446792603, "epoch": 0.06590717529024716, "grad_norm": 0.07421088218688965, "grad_norm_var": 3.1076401218609516e-05, "learning_rate": 0.00985955113439603, "loss": 2.9003, "step": 1212 }, { "crossentropy": 2.8788516521453857, "epoch": 0.06596155414774736, "grad_norm": 0.05018003657460213, "grad_norm_var": 3.283091595789872e-05, "learning_rate": 0.009859257927915775, "loss": 2.8789, "step": 1213 }, { "crossentropy": 2.7858545780181885, "epoch": 0.06601593300524757, "grad_norm": 0.05699348822236061, "grad_norm_var": 3.302051938819933e-05, "learning_rate": 0.009858964420068873, "loss": 2.7859, "step": 1214 }, { "crossentropy": 2.9505962133407593, "epoch": 0.06607031186274777, "grad_norm": 0.05445733293890953, "grad_norm_var": 3.305598584293704e-05, "learning_rate": 0.009858670610873529, "loss": 2.9506, "step": 1215 }, { "crossentropy": 2.8529624938964844, "epoch": 0.06612469072024797, "grad_norm": 0.05531461536884308, "grad_norm_var": 3.295553385482056e-05, "learning_rate": 0.009858376500347961, "loss": 2.853, "step": 1216 }, { "crossentropy": 2.8983116149902344, "epoch": 0.06617906957774818, "grad_norm": 0.056598030030727386, "grad_norm_var": 3.303735298379832e-05, "learning_rate": 0.009858082088510413, "loss": 2.8983, "step": 1217 }, { "crossentropy": 2.874754309654236, "epoch": 0.06623344843524838, "grad_norm": 0.056065578013658524, "grad_norm_var": 3.211068137810944e-05, "learning_rate": 0.009857787375379144, "loss": 2.8748, "step": 1218 }, { "crossentropy": 2.8785741329193115, "epoch": 0.06628782729274858, "grad_norm": 0.0571209117770195, "grad_norm_var": 3.2252855524776e-05, "learning_rate": 0.009857492360972428, "loss": 2.8786, "step": 1219 }, { "crossentropy": 2.7900971174240112, "epoch": 0.06634220615024879, "grad_norm": 0.06092371046543121, "grad_norm_var": 3.320938069664136e-05, "learning_rate": 0.009857197045308567, "loss": 2.7901, "step": 1220 }, { "crossentropy": 2.9132872819900513, "epoch": 0.06639658500774899, "grad_norm": 0.060716692358255386, "grad_norm_var": 3.450090982759357e-05, "learning_rate": 0.009856901428405873, "loss": 2.9133, "step": 1221 }, { "crossentropy": 2.8395477533340454, "epoch": 0.0664509638652492, "grad_norm": 0.06420651823282242, "grad_norm_var": 3.612698043019743e-05, "learning_rate": 0.009856605510282682, "loss": 2.8395, "step": 1222 }, { "crossentropy": 2.8803542852401733, "epoch": 0.0665053427227494, "grad_norm": 0.058129165321588516, "grad_norm_var": 3.303452282190037e-05, "learning_rate": 0.009856309290957344, "loss": 2.8804, "step": 1223 }, { "crossentropy": 2.8939863443374634, "epoch": 0.0665597215802496, "grad_norm": 0.05365801230072975, "grad_norm_var": 3.369792250513051e-05, "learning_rate": 0.00985601277044823, "loss": 2.894, "step": 1224 }, { "crossentropy": 2.722407579421997, "epoch": 0.0666141004377498, "grad_norm": 0.05208956077694893, "grad_norm_var": 3.192601043129993e-05, "learning_rate": 0.00985571594877373, "loss": 2.7224, "step": 1225 }, { "crossentropy": 2.910215377807617, "epoch": 0.06666847929525001, "grad_norm": 0.05233987420797348, "grad_norm_var": 3.2601017561473036e-05, "learning_rate": 0.009855418825952257, "loss": 2.9102, "step": 1226 }, { "crossentropy": 2.7997976541519165, "epoch": 0.06672285815275021, "grad_norm": 0.04910198971629143, "grad_norm_var": 3.699170546596533e-05, "learning_rate": 0.009855121402002235, "loss": 2.7998, "step": 1227 }, { "crossentropy": 2.7994476556777954, "epoch": 0.06677723701025041, "grad_norm": 0.05123300477862358, "grad_norm_var": 1.728173490175011e-05, "learning_rate": 0.009854823676942109, "loss": 2.7994, "step": 1228 }, { "crossentropy": 2.869547724723816, "epoch": 0.06683161586775062, "grad_norm": 0.05228763446211815, "grad_norm_var": 1.6044558226159752e-05, "learning_rate": 0.009854525650790345, "loss": 2.8695, "step": 1229 }, { "crossentropy": 2.92829966545105, "epoch": 0.06688599472525082, "grad_norm": 0.049537595361471176, "grad_norm_var": 1.8235317252758102e-05, "learning_rate": 0.009854227323565426, "loss": 2.9283, "step": 1230 }, { "crossentropy": 2.834198832511902, "epoch": 0.06694037358275103, "grad_norm": 0.05511878430843353, "grad_norm_var": 1.819396544181038e-05, "learning_rate": 0.009853928695285855, "loss": 2.8342, "step": 1231 }, { "crossentropy": 2.8688957691192627, "epoch": 0.06699475244025123, "grad_norm": 0.056628916412591934, "grad_norm_var": 1.8308412873687074e-05, "learning_rate": 0.00985362976597015, "loss": 2.8689, "step": 1232 }, { "crossentropy": 2.7498743534088135, "epoch": 0.06704913129775143, "grad_norm": 0.05228153616189957, "grad_norm_var": 1.876024913727639e-05, "learning_rate": 0.009853330535636851, "loss": 2.7499, "step": 1233 }, { "crossentropy": 2.9068745374679565, "epoch": 0.06710351015525164, "grad_norm": 0.05312067270278931, "grad_norm_var": 1.891920104609094e-05, "learning_rate": 0.009853031004304518, "loss": 2.9069, "step": 1234 }, { "crossentropy": 2.8867921829223633, "epoch": 0.06715788901275184, "grad_norm": 0.05149272829294205, "grad_norm_var": 1.9236788514069973e-05, "learning_rate": 0.009852731171991728, "loss": 2.8868, "step": 1235 }, { "crossentropy": 2.7404266595840454, "epoch": 0.06721226787025204, "grad_norm": 0.05595133826136589, "grad_norm_var": 1.6559158621322808e-05, "learning_rate": 0.009852431038717073, "loss": 2.7404, "step": 1236 }, { "crossentropy": 2.728303074836731, "epoch": 0.06726664672775225, "grad_norm": 0.0597456619143486, "grad_norm_var": 1.5779984973381065e-05, "learning_rate": 0.00985213060449917, "loss": 2.7283, "step": 1237 }, { "crossentropy": 2.800938129425049, "epoch": 0.06732102558525245, "grad_norm": 0.05017998442053795, "grad_norm_var": 9.329848927247299e-06, "learning_rate": 0.00985182986935665, "loss": 2.8009, "step": 1238 }, { "crossentropy": 2.8377538919448853, "epoch": 0.06737540444275265, "grad_norm": 0.05048380792140961, "grad_norm_var": 8.06645328280253e-06, "learning_rate": 0.009851528833308166, "loss": 2.8378, "step": 1239 }, { "crossentropy": 2.8156375885009766, "epoch": 0.06742978330025286, "grad_norm": 0.050005827099084854, "grad_norm_var": 8.496020086244613e-06, "learning_rate": 0.009851227496372386, "loss": 2.8156, "step": 1240 }, { "crossentropy": 2.7183873653411865, "epoch": 0.06748416215775306, "grad_norm": 0.062047164887189865, "grad_norm_var": 1.401552721893521e-05, "learning_rate": 0.009850925858568, "loss": 2.7184, "step": 1241 }, { "crossentropy": 2.885608434677124, "epoch": 0.06753854101525326, "grad_norm": 0.04971158131957054, "grad_norm_var": 1.4756502783194736e-05, "learning_rate": 0.009850623919913714, "loss": 2.8856, "step": 1242 }, { "crossentropy": 2.8297128677368164, "epoch": 0.06759291987275347, "grad_norm": 0.05573022738099098, "grad_norm_var": 1.4006152581879973e-05, "learning_rate": 0.009850321680428254, "loss": 2.8297, "step": 1243 }, { "crossentropy": 2.8821730613708496, "epoch": 0.06764729873025367, "grad_norm": 0.05604751408100128, "grad_norm_var": 1.4017403710593565e-05, "learning_rate": 0.009850019140130365, "loss": 2.8822, "step": 1244 }, { "crossentropy": 2.8008806705474854, "epoch": 0.06770167758775388, "grad_norm": 0.05356431007385254, "grad_norm_var": 1.3866396854177639e-05, "learning_rate": 0.009849716299038814, "loss": 2.8009, "step": 1245 }, { "crossentropy": 3.003320574760437, "epoch": 0.06775605644525408, "grad_norm": 0.05434524640440941, "grad_norm_var": 1.2544743855409195e-05, "learning_rate": 0.009849413157172377, "loss": 3.0033, "step": 1246 }, { "crossentropy": 2.8615610599517822, "epoch": 0.06781043530275428, "grad_norm": 0.051259737461805344, "grad_norm_var": 1.2978809575192075e-05, "learning_rate": 0.009849109714549857, "loss": 2.8616, "step": 1247 }, { "crossentropy": 2.8069599866867065, "epoch": 0.0678648141602545, "grad_norm": 0.04758740961551666, "grad_norm_var": 1.4813097217626868e-05, "learning_rate": 0.009848805971190075, "loss": 2.807, "step": 1248 }, { "crossentropy": 2.8143246173858643, "epoch": 0.0679191930177547, "grad_norm": 0.05535399913787842, "grad_norm_var": 1.4966548895122463e-05, "learning_rate": 0.009848501927111866, "loss": 2.8143, "step": 1249 }, { "crossentropy": 2.765677571296692, "epoch": 0.0679735718752549, "grad_norm": 0.05120581015944481, "grad_norm_var": 1.530257402954782e-05, "learning_rate": 0.009848197582334087, "loss": 2.7657, "step": 1250 }, { "crossentropy": 2.855466365814209, "epoch": 0.06802795073275511, "grad_norm": 0.05142369493842125, "grad_norm_var": 1.532060694882283e-05, "learning_rate": 0.009847892936875616, "loss": 2.8555, "step": 1251 }, { "crossentropy": 2.829630970954895, "epoch": 0.06808232959025531, "grad_norm": 0.052303772419691086, "grad_norm_var": 1.4918725643403574e-05, "learning_rate": 0.009847587990755342, "loss": 2.8296, "step": 1252 }, { "crossentropy": 2.839754581451416, "epoch": 0.06813670844775552, "grad_norm": 0.05337473005056381, "grad_norm_var": 1.1884417905374163e-05, "learning_rate": 0.009847282743992183, "loss": 2.8398, "step": 1253 }, { "crossentropy": 2.8752652406692505, "epoch": 0.06819108730525572, "grad_norm": 0.059076111763715744, "grad_norm_var": 1.373599027197266e-05, "learning_rate": 0.009846977196605065, "loss": 2.8753, "step": 1254 }, { "crossentropy": 2.9689226150512695, "epoch": 0.06824546616275592, "grad_norm": 0.054627254605293274, "grad_norm_var": 1.3228274142907762e-05, "learning_rate": 0.009846671348612942, "loss": 2.9689, "step": 1255 }, { "crossentropy": 2.7527188062667847, "epoch": 0.06829984502025613, "grad_norm": 0.048766765743494034, "grad_norm_var": 1.3918680348446825e-05, "learning_rate": 0.00984636520003478, "loss": 2.7527, "step": 1256 }, { "crossentropy": 2.862297534942627, "epoch": 0.06835422387775633, "grad_norm": 0.05413644015789032, "grad_norm_var": 8.842706296302976e-06, "learning_rate": 0.009846058750889565, "loss": 2.8623, "step": 1257 }, { "crossentropy": 2.896428346633911, "epoch": 0.06840860273525654, "grad_norm": 0.05449013039469719, "grad_norm_var": 8.154189604856789e-06, "learning_rate": 0.009845752001196304, "loss": 2.8964, "step": 1258 }, { "crossentropy": 2.80844247341156, "epoch": 0.06846298159275674, "grad_norm": 0.05274958908557892, "grad_norm_var": 7.75588451271958e-06, "learning_rate": 0.009845444950974024, "loss": 2.8084, "step": 1259 }, { "crossentropy": 2.8228477239608765, "epoch": 0.06851736045025694, "grad_norm": 0.05073492228984833, "grad_norm_var": 7.463547162029719e-06, "learning_rate": 0.009845137600241765, "loss": 2.8228, "step": 1260 }, { "crossentropy": 2.766956329345703, "epoch": 0.06857173930775715, "grad_norm": 0.05134870111942291, "grad_norm_var": 7.548257836768417e-06, "learning_rate": 0.009844829949018587, "loss": 2.767, "step": 1261 }, { "crossentropy": 2.967631220817566, "epoch": 0.06862611816525735, "grad_norm": 0.0513652004301548, "grad_norm_var": 7.439255644305319e-06, "learning_rate": 0.009844521997323575, "loss": 2.9676, "step": 1262 }, { "crossentropy": 2.850440263748169, "epoch": 0.06868049702275755, "grad_norm": 0.05104690417647362, "grad_norm_var": 7.476935506585634e-06, "learning_rate": 0.009844213745175825, "loss": 2.8504, "step": 1263 }, { "crossentropy": 2.716636300086975, "epoch": 0.06873487588025776, "grad_norm": 0.05592479184269905, "grad_norm_var": 6.388732351065608e-06, "learning_rate": 0.009843905192594453, "loss": 2.7166, "step": 1264 }, { "crossentropy": 2.8250157833099365, "epoch": 0.06878925473775796, "grad_norm": 0.055867455899715424, "grad_norm_var": 6.566671192396028e-06, "learning_rate": 0.009843596339598599, "loss": 2.825, "step": 1265 }, { "crossentropy": 2.8597627878189087, "epoch": 0.06884363359525816, "grad_norm": 0.053536295890808105, "grad_norm_var": 6.340018452060993e-06, "learning_rate": 0.009843287186207415, "loss": 2.8598, "step": 1266 }, { "crossentropy": 2.9031628370285034, "epoch": 0.06889801245275837, "grad_norm": 0.05692404508590698, "grad_norm_var": 6.9477605880468566e-06, "learning_rate": 0.009842977732440073, "loss": 2.9032, "step": 1267 }, { "crossentropy": 2.7655270099639893, "epoch": 0.06895239131025857, "grad_norm": 0.05221090838313103, "grad_norm_var": 6.963322458909162e-06, "learning_rate": 0.00984266797831577, "loss": 2.7655, "step": 1268 }, { "crossentropy": 2.866193175315857, "epoch": 0.06900677016775877, "grad_norm": 0.0544244721531868, "grad_norm_var": 7.0130846055185245e-06, "learning_rate": 0.009842357923853714, "loss": 2.8662, "step": 1269 }, { "crossentropy": 2.815509557723999, "epoch": 0.06906114902525898, "grad_norm": 0.050863705575466156, "grad_norm_var": 5.206714346239529e-06, "learning_rate": 0.009842047569073132, "loss": 2.8155, "step": 1270 }, { "crossentropy": 2.7846930027008057, "epoch": 0.06911552788275918, "grad_norm": 0.05611817538738251, "grad_norm_var": 5.656480377778245e-06, "learning_rate": 0.009841736913993274, "loss": 2.7847, "step": 1271 }, { "crossentropy": 2.7879570722579956, "epoch": 0.06916990674025938, "grad_norm": 0.0594671294093132, "grad_norm_var": 6.549289810892494e-06, "learning_rate": 0.009841425958633408, "loss": 2.788, "step": 1272 }, { "crossentropy": 2.7603511810302734, "epoch": 0.06922428559775959, "grad_norm": 0.051131244748830795, "grad_norm_var": 6.989170037900033e-06, "learning_rate": 0.009841114703012816, "loss": 2.7604, "step": 1273 }, { "crossentropy": 2.8533281087875366, "epoch": 0.06927866445525979, "grad_norm": 0.056908365339040756, "grad_norm_var": 7.62950205653104e-06, "learning_rate": 0.009840803147150806, "loss": 2.8533, "step": 1274 }, { "crossentropy": 3.0060839653015137, "epoch": 0.06933304331276, "grad_norm": 0.05537791550159454, "grad_norm_var": 7.697049315457995e-06, "learning_rate": 0.009840491291066695, "loss": 3.0061, "step": 1275 }, { "crossentropy": 2.848102331161499, "epoch": 0.0693874221702602, "grad_norm": 0.05494913458824158, "grad_norm_var": 6.9987232803638e-06, "learning_rate": 0.009840179134779829, "loss": 2.8481, "step": 1276 }, { "crossentropy": 2.916846990585327, "epoch": 0.0694418010277604, "grad_norm": 0.054905965924263, "grad_norm_var": 6.429390708791403e-06, "learning_rate": 0.009839866678309566, "loss": 2.9168, "step": 1277 }, { "crossentropy": 2.784703850746155, "epoch": 0.0694961798852606, "grad_norm": 0.056746453046798706, "grad_norm_var": 6.033908604941423e-06, "learning_rate": 0.009839553921675282, "loss": 2.7847, "step": 1278 }, { "crossentropy": 2.8390350341796875, "epoch": 0.06955055874276081, "grad_norm": 0.05323057994246483, "grad_norm_var": 5.246421844140997e-06, "learning_rate": 0.009839240864896376, "loss": 2.839, "step": 1279 }, { "crossentropy": 2.91703999042511, "epoch": 0.06960493760026101, "grad_norm": 0.05560838803648949, "grad_norm_var": 5.209937842739059e-06, "learning_rate": 0.009838927507992264, "loss": 2.917, "step": 1280 }, { "crossentropy": 2.72241473197937, "epoch": 0.06965931645776122, "grad_norm": 0.05155639722943306, "grad_norm_var": 5.810751427916492e-06, "learning_rate": 0.009838613850982378, "loss": 2.7224, "step": 1281 }, { "crossentropy": 2.877849578857422, "epoch": 0.06971369531526142, "grad_norm": 0.05353759974241257, "grad_norm_var": 5.810562709922714e-06, "learning_rate": 0.009838299893886173, "loss": 2.8778, "step": 1282 }, { "crossentropy": 2.8688489198684692, "epoch": 0.06976807417276162, "grad_norm": 0.08994371443986893, "grad_norm_var": 8.408692523602078e-05, "learning_rate": 0.00983798563672312, "loss": 2.8688, "step": 1283 }, { "crossentropy": 2.76124370098114, "epoch": 0.06982245303026183, "grad_norm": 0.04820568487048149, "grad_norm_var": 8.74795094819023e-05, "learning_rate": 0.009837671079512706, "loss": 2.7612, "step": 1284 }, { "crossentropy": 2.8292150497436523, "epoch": 0.06987683188776204, "grad_norm": 0.050707485526800156, "grad_norm_var": 8.933988517569088e-05, "learning_rate": 0.009837356222274446, "loss": 2.8292, "step": 1285 }, { "crossentropy": 2.7565606832504272, "epoch": 0.06993121074526225, "grad_norm": 0.051201462745666504, "grad_norm_var": 8.910653586411728e-05, "learning_rate": 0.00983704106502786, "loss": 2.7566, "step": 1286 }, { "crossentropy": 2.8997737169265747, "epoch": 0.06998558960276245, "grad_norm": 0.04918844625353813, "grad_norm_var": 9.220631104631208e-05, "learning_rate": 0.009836725607792496, "loss": 2.8998, "step": 1287 }, { "crossentropy": 2.7917137145996094, "epoch": 0.07003996846026266, "grad_norm": 0.049843497574329376, "grad_norm_var": 9.327847470104548e-05, "learning_rate": 0.00983640985058792, "loss": 2.7917, "step": 1288 }, { "crossentropy": 2.9129282236099243, "epoch": 0.07009434731776286, "grad_norm": 0.05926447734236717, "grad_norm_var": 9.30112183671968e-05, "learning_rate": 0.009836093793433713, "loss": 2.9129, "step": 1289 }, { "crossentropy": 2.819472312927246, "epoch": 0.07014872617526306, "grad_norm": 0.051081333309412, "grad_norm_var": 9.419335140944928e-05, "learning_rate": 0.00983577743634948, "loss": 2.8195, "step": 1290 }, { "crossentropy": 2.8701281547546387, "epoch": 0.07020310503276327, "grad_norm": 0.05150516331195831, "grad_norm_var": 9.510820938302666e-05, "learning_rate": 0.009835460779354837, "loss": 2.8701, "step": 1291 }, { "crossentropy": 2.8086379766464233, "epoch": 0.07025748389026347, "grad_norm": 0.051727619022130966, "grad_norm_var": 9.581831187701427e-05, "learning_rate": 0.009835143822469425, "loss": 2.8086, "step": 1292 }, { "crossentropy": 2.8576189279556274, "epoch": 0.07031186274776367, "grad_norm": 0.052461348474025726, "grad_norm_var": 9.61869081264494e-05, "learning_rate": 0.009834826565712902, "loss": 2.8576, "step": 1293 }, { "crossentropy": 2.8569040298461914, "epoch": 0.07036624160526388, "grad_norm": 0.04978404939174652, "grad_norm_var": 9.735220733965982e-05, "learning_rate": 0.009834509009104942, "loss": 2.8569, "step": 1294 }, { "crossentropy": 2.8086931705474854, "epoch": 0.07042062046276408, "grad_norm": 0.054112743586301804, "grad_norm_var": 9.727471116898612e-05, "learning_rate": 0.009834191152665239, "loss": 2.8087, "step": 1295 }, { "crossentropy": 2.822044253349304, "epoch": 0.07047499932026428, "grad_norm": 0.05298677831888199, "grad_norm_var": 9.726722374578922e-05, "learning_rate": 0.00983387299641351, "loss": 2.822, "step": 1296 }, { "crossentropy": 2.9132803678512573, "epoch": 0.07052937817776449, "grad_norm": 0.0558682344853878, "grad_norm_var": 9.691269492694647e-05, "learning_rate": 0.009833554540369483, "loss": 2.9133, "step": 1297 }, { "crossentropy": 2.89321506023407, "epoch": 0.07058375703526469, "grad_norm": 0.059052400290966034, "grad_norm_var": 9.813252108676155e-05, "learning_rate": 0.00983323578455291, "loss": 2.8932, "step": 1298 }, { "crossentropy": 2.977119207382202, "epoch": 0.0706381358927649, "grad_norm": 0.0529438816010952, "grad_norm_var": 1.0360825173752879e-05, "learning_rate": 0.009832916728983558, "loss": 2.9771, "step": 1299 }, { "crossentropy": 2.8458914756774902, "epoch": 0.0706925147502651, "grad_norm": 0.052878305315971375, "grad_norm_var": 9.052530588589635e-06, "learning_rate": 0.00983259737368122, "loss": 2.8459, "step": 1300 }, { "crossentropy": 2.915503144264221, "epoch": 0.0707468936077653, "grad_norm": 0.05556146800518036, "grad_norm_var": 9.178629386758357e-06, "learning_rate": 0.009832277718665695, "loss": 2.9155, "step": 1301 }, { "crossentropy": 2.878324508666992, "epoch": 0.0708012724652655, "grad_norm": 0.05201137438416481, "grad_norm_var": 9.015543771713024e-06, "learning_rate": 0.009831957763956813, "loss": 2.8783, "step": 1302 }, { "crossentropy": 2.816387414932251, "epoch": 0.07085565132276571, "grad_norm": 0.05591901019215584, "grad_norm_var": 8.298920938274585e-06, "learning_rate": 0.009831637509574413, "loss": 2.8164, "step": 1303 }, { "crossentropy": 2.845304846763611, "epoch": 0.07091003018026591, "grad_norm": 0.05988480895757675, "grad_norm_var": 9.621371015786778e-06, "learning_rate": 0.009831316955538362, "loss": 2.8453, "step": 1304 }, { "crossentropy": 2.9560707807540894, "epoch": 0.07096440903776612, "grad_norm": 0.06529605388641357, "grad_norm_var": 1.597591162158281e-05, "learning_rate": 0.009830996101868536, "loss": 2.9561, "step": 1305 }, { "crossentropy": 2.9769176244735718, "epoch": 0.07101878789526632, "grad_norm": 0.06542714685201645, "grad_norm_var": 2.2170955378018423e-05, "learning_rate": 0.009830674948584837, "loss": 2.9769, "step": 1306 }, { "crossentropy": 2.775087594985962, "epoch": 0.07107316675276652, "grad_norm": 0.05509290099143982, "grad_norm_var": 2.1081785772981513e-05, "learning_rate": 0.00983035349570718, "loss": 2.7751, "step": 1307 }, { "crossentropy": 2.884898066520691, "epoch": 0.07112754561026673, "grad_norm": 0.055003661662340164, "grad_norm_var": 2.0022643928363273e-05, "learning_rate": 0.009830031743255504, "loss": 2.8849, "step": 1308 }, { "crossentropy": 2.889712333679199, "epoch": 0.07118192446776693, "grad_norm": 0.06062480807304382, "grad_norm_var": 2.0452814451004295e-05, "learning_rate": 0.00982970969124976, "loss": 2.8897, "step": 1309 }, { "crossentropy": 2.8630553483963013, "epoch": 0.07123630332526713, "grad_norm": 0.0602746307849884, "grad_norm_var": 1.8072897090964934e-05, "learning_rate": 0.009829387339709925, "loss": 2.8631, "step": 1310 }, { "crossentropy": 2.8980231285095215, "epoch": 0.07129068218276734, "grad_norm": 0.05734618008136749, "grad_norm_var": 1.7456293246052336e-05, "learning_rate": 0.009829064688655988, "loss": 2.898, "step": 1311 }, { "crossentropy": 2.832553267478943, "epoch": 0.07134506104026754, "grad_norm": 0.061307940632104874, "grad_norm_var": 1.7042005219708445e-05, "learning_rate": 0.009828741738107964, "loss": 2.8326, "step": 1312 }, { "crossentropy": 2.946608304977417, "epoch": 0.07139943989776774, "grad_norm": 0.10637152940034866, "grad_norm_var": 0.00016357464607736575, "learning_rate": 0.009828418488085877, "loss": 2.9466, "step": 1313 }, { "crossentropy": 2.9160317182540894, "epoch": 0.07145381875526795, "grad_norm": 0.05834589898586273, "grad_norm_var": 0.00016378339634912174, "learning_rate": 0.00982809493860978, "loss": 2.916, "step": 1314 }, { "crossentropy": 2.8583056926727295, "epoch": 0.07150819761276815, "grad_norm": 0.06060076504945755, "grad_norm_var": 0.00015933213936555982, "learning_rate": 0.009827771089699733, "loss": 2.8583, "step": 1315 }, { "crossentropy": 2.82452130317688, "epoch": 0.07156257647026835, "grad_norm": 0.05947655066847801, "grad_norm_var": 0.00015458103111754175, "learning_rate": 0.009827446941375825, "loss": 2.8245, "step": 1316 }, { "crossentropy": 2.8095943927764893, "epoch": 0.07161695532776856, "grad_norm": 0.05821066349744797, "grad_norm_var": 0.0001528216943348, "learning_rate": 0.009827122493658158, "loss": 2.8096, "step": 1317 }, { "crossentropy": 2.8718881607055664, "epoch": 0.07167133418526876, "grad_norm": 0.05570404231548309, "grad_norm_var": 0.00014878077936962368, "learning_rate": 0.009826797746566855, "loss": 2.8719, "step": 1318 }, { "crossentropy": 2.802487015724182, "epoch": 0.07172571304276897, "grad_norm": 0.05639069899916649, "grad_norm_var": 0.00014840089392864237, "learning_rate": 0.009826472700122055, "loss": 2.8025, "step": 1319 }, { "crossentropy": 2.8103907108306885, "epoch": 0.07178009190026917, "grad_norm": 0.055239610373973846, "grad_norm_var": 0.0001511895741323689, "learning_rate": 0.009826147354343917, "loss": 2.8104, "step": 1320 }, { "crossentropy": 2.9017356634140015, "epoch": 0.07183447075776937, "grad_norm": 0.054658252745866776, "grad_norm_var": 0.00015347313138696587, "learning_rate": 0.009825821709252622, "loss": 2.9017, "step": 1321 }, { "crossentropy": 2.8776063919067383, "epoch": 0.07188884961526959, "grad_norm": 0.05525418743491173, "grad_norm_var": 0.00015428172288695918, "learning_rate": 0.009825495764868361, "loss": 2.8776, "step": 1322 }, { "crossentropy": 2.8562581539154053, "epoch": 0.0719432284727698, "grad_norm": 0.05470508709549904, "grad_norm_var": 0.00015457686383727967, "learning_rate": 0.009825169521211354, "loss": 2.8563, "step": 1323 }, { "crossentropy": 2.9546092748641968, "epoch": 0.07199760733027, "grad_norm": 0.05514082312583923, "grad_norm_var": 0.00015447579045754778, "learning_rate": 0.00982484297830183, "loss": 2.9546, "step": 1324 }, { "crossentropy": 2.880886673927307, "epoch": 0.0720519861877702, "grad_norm": 0.06776446849107742, "grad_norm_var": 0.00015768225437515562, "learning_rate": 0.009824516136160043, "loss": 2.8809, "step": 1325 }, { "crossentropy": 2.9421366453170776, "epoch": 0.0721063650452704, "grad_norm": 0.06293331831693649, "grad_norm_var": 0.00015784937325579728, "learning_rate": 0.009824188994806264, "loss": 2.9421, "step": 1326 }, { "crossentropy": 2.9288768768310547, "epoch": 0.07216074390277061, "grad_norm": 0.054611433297395706, "grad_norm_var": 0.00015972772808596855, "learning_rate": 0.00982386155426078, "loss": 2.9289, "step": 1327 }, { "crossentropy": 2.8676098585128784, "epoch": 0.07221512276027081, "grad_norm": 0.05153067782521248, "grad_norm_var": 0.0001653592433618793, "learning_rate": 0.009823533814543902, "loss": 2.8676, "step": 1328 }, { "crossentropy": 2.805366635322571, "epoch": 0.07226950161777101, "grad_norm": 0.055213287472724915, "grad_norm_var": 1.5585113670539958e-05, "learning_rate": 0.00982320577567595, "loss": 2.8054, "step": 1329 }, { "crossentropy": 2.769723415374756, "epoch": 0.07232388047527122, "grad_norm": 0.05108148604631424, "grad_norm_var": 1.7808537319424258e-05, "learning_rate": 0.009822877437677275, "loss": 2.7697, "step": 1330 }, { "crossentropy": 2.877426028251648, "epoch": 0.07237825933277142, "grad_norm": 0.051469430327415466, "grad_norm_var": 1.8370732472900755e-05, "learning_rate": 0.009822548800568237, "loss": 2.8774, "step": 1331 }, { "crossentropy": 2.7659629583358765, "epoch": 0.07243263819027163, "grad_norm": 0.0506872683763504, "grad_norm_var": 1.937262455798672e-05, "learning_rate": 0.00982221986436922, "loss": 2.766, "step": 1332 }, { "crossentropy": 2.8437914848327637, "epoch": 0.07248701704777183, "grad_norm": 0.05240754410624504, "grad_norm_var": 1.950549254823939e-05, "learning_rate": 0.00982189062910062, "loss": 2.8438, "step": 1333 }, { "crossentropy": 2.85073983669281, "epoch": 0.07254139590527203, "grad_norm": 0.049954887479543686, "grad_norm_var": 2.1261169479453815e-05, "learning_rate": 0.00982156109478286, "loss": 2.8507, "step": 1334 }, { "crossentropy": 2.8833104372024536, "epoch": 0.07259577476277224, "grad_norm": 0.049912530928850174, "grad_norm_var": 2.2631169118889764e-05, "learning_rate": 0.009821231261436376, "loss": 2.8833, "step": 1335 }, { "crossentropy": 2.860969305038452, "epoch": 0.07265015362027244, "grad_norm": 0.056044235825538635, "grad_norm_var": 2.274719720481248e-05, "learning_rate": 0.009820901129081624, "loss": 2.861, "step": 1336 }, { "crossentropy": 2.8192055225372314, "epoch": 0.07270453247777264, "grad_norm": 0.05534151941537857, "grad_norm_var": 2.2782998243425258e-05, "learning_rate": 0.009820570697739079, "loss": 2.8192, "step": 1337 }, { "crossentropy": 2.8766067028045654, "epoch": 0.07275891133527285, "grad_norm": 0.050143830478191376, "grad_norm_var": 2.3988738580035806e-05, "learning_rate": 0.009820239967429232, "loss": 2.8766, "step": 1338 }, { "crossentropy": 2.791110038757324, "epoch": 0.07281329019277305, "grad_norm": 0.04898379370570183, "grad_norm_var": 2.573230920106135e-05, "learning_rate": 0.009819908938172597, "loss": 2.7911, "step": 1339 }, { "crossentropy": 2.8929271697998047, "epoch": 0.07286766905027325, "grad_norm": 0.04968465864658356, "grad_norm_var": 2.6727540828324948e-05, "learning_rate": 0.009819577609989706, "loss": 2.8929, "step": 1340 }, { "crossentropy": 2.8533869981765747, "epoch": 0.07292204790777346, "grad_norm": 0.04933234676718712, "grad_norm_var": 1.317590514250764e-05, "learning_rate": 0.0098192459829011, "loss": 2.8534, "step": 1341 }, { "crossentropy": 2.8882375955581665, "epoch": 0.07297642676527366, "grad_norm": 0.0472409725189209, "grad_norm_var": 6.649425758148362e-06, "learning_rate": 0.009818914056927355, "loss": 2.8882, "step": 1342 }, { "crossentropy": 2.8563441038131714, "epoch": 0.07303080562277386, "grad_norm": 0.0582844540476799, "grad_norm_var": 9.02742158086107e-06, "learning_rate": 0.00981858183208905, "loss": 2.8563, "step": 1343 }, { "crossentropy": 2.707926392555237, "epoch": 0.07308518448027407, "grad_norm": 0.056471213698387146, "grad_norm_var": 1.0436789322388652e-05, "learning_rate": 0.009818249308406794, "loss": 2.7079, "step": 1344 }, { "crossentropy": 2.7602585554122925, "epoch": 0.07313956333777427, "grad_norm": 0.05014874041080475, "grad_norm_var": 9.880742708185709e-06, "learning_rate": 0.009817916485901208, "loss": 2.7603, "step": 1345 }, { "crossentropy": 2.8638161420822144, "epoch": 0.07319394219527448, "grad_norm": 0.05222850665450096, "grad_norm_var": 9.86848410249111e-06, "learning_rate": 0.009817583364592932, "loss": 2.8638, "step": 1346 }, { "crossentropy": 2.807239532470703, "epoch": 0.07324832105277468, "grad_norm": 0.05655786395072937, "grad_norm_var": 1.1282144421723844e-05, "learning_rate": 0.009817249944502629, "loss": 2.8072, "step": 1347 }, { "crossentropy": 2.8701019287109375, "epoch": 0.07330269991027488, "grad_norm": 0.061702910810709, "grad_norm_var": 1.680733779067332e-05, "learning_rate": 0.009816916225650974, "loss": 2.8701, "step": 1348 }, { "crossentropy": 2.821553587913513, "epoch": 0.07335707876777509, "grad_norm": 0.05879141017794609, "grad_norm_var": 1.903954656412848e-05, "learning_rate": 0.009816582208058665, "loss": 2.8216, "step": 1349 }, { "crossentropy": 2.9279727935791016, "epoch": 0.07341145762527529, "grad_norm": 0.06068059056997299, "grad_norm_var": 2.1622393948983235e-05, "learning_rate": 0.00981624789174642, "loss": 2.928, "step": 1350 }, { "crossentropy": 2.8545833826065063, "epoch": 0.07346583648277549, "grad_norm": 0.061396803706884384, "grad_norm_var": 2.3841056119331774e-05, "learning_rate": 0.009815913276734969, "loss": 2.8546, "step": 1351 }, { "crossentropy": 2.8380569219589233, "epoch": 0.0735202153402757, "grad_norm": 0.05573597177863121, "grad_norm_var": 2.378618013159957e-05, "learning_rate": 0.009815578363045068, "loss": 2.8381, "step": 1352 }, { "crossentropy": 2.686629295349121, "epoch": 0.0735745941977759, "grad_norm": 0.07329294085502625, "grad_norm_var": 4.5832677452275054e-05, "learning_rate": 0.009815243150697485, "loss": 2.6866, "step": 1353 }, { "crossentropy": 2.8425978422164917, "epoch": 0.0736289730552761, "grad_norm": 0.05501003935933113, "grad_norm_var": 4.3728887431752444e-05, "learning_rate": 0.009814907639713011, "loss": 2.8426, "step": 1354 }, { "crossentropy": 2.953587532043457, "epoch": 0.07368335191277631, "grad_norm": 0.053940240293741226, "grad_norm_var": 4.06464250163568e-05, "learning_rate": 0.009814571830112456, "loss": 2.9536, "step": 1355 }, { "crossentropy": 2.8053301572799683, "epoch": 0.07373773077027651, "grad_norm": 0.053699348121881485, "grad_norm_var": 3.8122692355910666e-05, "learning_rate": 0.009814235721916642, "loss": 2.8053, "step": 1356 }, { "crossentropy": 2.8144551515579224, "epoch": 0.07379210962777671, "grad_norm": 0.05541094020009041, "grad_norm_var": 3.459673561287211e-05, "learning_rate": 0.00981389931514642, "loss": 2.8145, "step": 1357 }, { "crossentropy": 2.820757269859314, "epoch": 0.07384648848527692, "grad_norm": 0.07510265707969666, "grad_norm_var": 4.718679175443972e-05, "learning_rate": 0.009813562609822649, "loss": 2.8208, "step": 1358 }, { "crossentropy": 2.8663949966430664, "epoch": 0.07390086734277714, "grad_norm": 0.047372594475746155, "grad_norm_var": 5.516539013840817e-05, "learning_rate": 0.009813225605966211, "loss": 2.8664, "step": 1359 }, { "crossentropy": 2.825021505355835, "epoch": 0.07395524620027734, "grad_norm": 0.05209556594491005, "grad_norm_var": 5.723728527980328e-05, "learning_rate": 0.009812888303598012, "loss": 2.825, "step": 1360 }, { "crossentropy": 2.8486346006393433, "epoch": 0.07400962505777754, "grad_norm": 0.050461310893297195, "grad_norm_var": 5.69287704014075e-05, "learning_rate": 0.009812550702738964, "loss": 2.8486, "step": 1361 }, { "crossentropy": 2.818665862083435, "epoch": 0.07406400391527775, "grad_norm": 0.05706482380628586, "grad_norm_var": 5.485112040165274e-05, "learning_rate": 0.00981221280341001, "loss": 2.8187, "step": 1362 }, { "crossentropy": 2.828900098800659, "epoch": 0.07411838277277795, "grad_norm": 0.06589964032173157, "grad_norm_var": 5.848453690227851e-05, "learning_rate": 0.009811874605632103, "loss": 2.8289, "step": 1363 }, { "crossentropy": 2.6396480798721313, "epoch": 0.07417276163027815, "grad_norm": 0.0619930624961853, "grad_norm_var": 5.860970089752582e-05, "learning_rate": 0.00981153610942622, "loss": 2.6396, "step": 1364 }, { "crossentropy": 2.765424609184265, "epoch": 0.07422714048777836, "grad_norm": 0.05590934306383133, "grad_norm_var": 5.906364761732984e-05, "learning_rate": 0.009811197314813353, "loss": 2.7654, "step": 1365 }, { "crossentropy": 2.7957394123077393, "epoch": 0.07428151934527856, "grad_norm": 0.054412174969911575, "grad_norm_var": 5.964815338741301e-05, "learning_rate": 0.009810858221814513, "loss": 2.7957, "step": 1366 }, { "crossentropy": 2.7904146909713745, "epoch": 0.07433589820277876, "grad_norm": 0.05128183588385582, "grad_norm_var": 6.152876670345951e-05, "learning_rate": 0.009810518830450735, "loss": 2.7904, "step": 1367 }, { "crossentropy": 2.800134778022766, "epoch": 0.07439027706027897, "grad_norm": 0.054793357849121094, "grad_norm_var": 6.179565641278468e-05, "learning_rate": 0.00981017914074306, "loss": 2.8001, "step": 1368 }, { "crossentropy": 2.8722338676452637, "epoch": 0.07444465591777917, "grad_norm": 0.05564859136939049, "grad_norm_var": 4.376687229291163e-05, "learning_rate": 0.00980983915271256, "loss": 2.8722, "step": 1369 }, { "crossentropy": 2.977061152458191, "epoch": 0.07449903477527937, "grad_norm": 0.053556736558675766, "grad_norm_var": 4.4140306530139556e-05, "learning_rate": 0.009809498866380321, "loss": 2.9771, "step": 1370 }, { "crossentropy": 2.849839925765991, "epoch": 0.07455341363277958, "grad_norm": 0.05874074622988701, "grad_norm_var": 4.415652487513155e-05, "learning_rate": 0.009809158281767445, "loss": 2.8498, "step": 1371 }, { "crossentropy": 2.8773605823516846, "epoch": 0.07460779249027978, "grad_norm": 0.04846900701522827, "grad_norm_var": 4.7795130116389437e-05, "learning_rate": 0.009808817398895059, "loss": 2.8774, "step": 1372 }, { "crossentropy": 2.9724403619766235, "epoch": 0.07466217134777998, "grad_norm": 0.0470668226480484, "grad_norm_var": 5.295584350994831e-05, "learning_rate": 0.009808476217784298, "loss": 2.9724, "step": 1373 }, { "crossentropy": 2.910130739212036, "epoch": 0.07471655020528019, "grad_norm": 0.052579235285520554, "grad_norm_var": 2.614385349066749e-05, "learning_rate": 0.009808134738456325, "loss": 2.9101, "step": 1374 }, { "crossentropy": 2.793554663658142, "epoch": 0.07477092906278039, "grad_norm": 0.05277736112475395, "grad_norm_var": 2.3042977291689615e-05, "learning_rate": 0.009807792960932317, "loss": 2.7936, "step": 1375 }, { "crossentropy": 2.835622191429138, "epoch": 0.0748253079202806, "grad_norm": 0.05197501182556152, "grad_norm_var": 2.3083287290273645e-05, "learning_rate": 0.009807450885233476, "loss": 2.8356, "step": 1376 }, { "crossentropy": 2.898525595664978, "epoch": 0.0748796867777808, "grad_norm": 0.04897845908999443, "grad_norm_var": 2.4026992411699948e-05, "learning_rate": 0.00980710851138101, "loss": 2.8985, "step": 1377 }, { "crossentropy": 2.881989598274231, "epoch": 0.074934065635281, "grad_norm": 0.049243535846471786, "grad_norm_var": 2.5119931913236173e-05, "learning_rate": 0.009806765839396157, "loss": 2.882, "step": 1378 }, { "crossentropy": 2.8100290298461914, "epoch": 0.0749884444927812, "grad_norm": 0.048183128237724304, "grad_norm_var": 1.652809044046978e-05, "learning_rate": 0.009806422869300167, "loss": 2.81, "step": 1379 }, { "crossentropy": 2.696121335029602, "epoch": 0.07504282335028141, "grad_norm": 0.05001327022910118, "grad_norm_var": 1.0894378644499038e-05, "learning_rate": 0.009806079601114313, "loss": 2.6961, "step": 1380 }, { "crossentropy": 2.838192582130432, "epoch": 0.07509720220778161, "grad_norm": 0.051777493208646774, "grad_norm_var": 9.863757635271825e-06, "learning_rate": 0.009805736034859883, "loss": 2.8382, "step": 1381 }, { "crossentropy": 2.9479113817214966, "epoch": 0.07515158106528182, "grad_norm": 0.04825770482420921, "grad_norm_var": 1.0123296459552065e-05, "learning_rate": 0.009805392170558183, "loss": 2.9479, "step": 1382 }, { "crossentropy": 2.8134098052978516, "epoch": 0.07520595992278202, "grad_norm": 0.050322141498327255, "grad_norm_var": 1.0203515961476559e-05, "learning_rate": 0.009805048008230541, "loss": 2.8134, "step": 1383 }, { "crossentropy": 2.8430687189102173, "epoch": 0.07526033878028222, "grad_norm": 0.05479322373867035, "grad_norm_var": 1.020345526519106e-05, "learning_rate": 0.009804703547898301, "loss": 2.8431, "step": 1384 }, { "crossentropy": 2.86301589012146, "epoch": 0.07531471763778243, "grad_norm": 0.054407499730587006, "grad_norm_var": 9.59649106438328e-06, "learning_rate": 0.009804358789582827, "loss": 2.863, "step": 1385 }, { "crossentropy": 2.8092647790908813, "epoch": 0.07536909649528263, "grad_norm": 0.0468100905418396, "grad_norm_var": 1.0430457318828743e-05, "learning_rate": 0.0098040137333055, "loss": 2.8093, "step": 1386 }, { "crossentropy": 2.831538438796997, "epoch": 0.07542347535278283, "grad_norm": 0.04990525171160698, "grad_norm_var": 6.072276039438493e-06, "learning_rate": 0.00980366837908772, "loss": 2.8315, "step": 1387 }, { "crossentropy": 2.6680960655212402, "epoch": 0.07547785421028304, "grad_norm": 0.056272052228450775, "grad_norm_var": 7.9233999194086e-06, "learning_rate": 0.009803322726950905, "loss": 2.6681, "step": 1388 }, { "crossentropy": 2.9032793045043945, "epoch": 0.07553223306778324, "grad_norm": 0.05354224145412445, "grad_norm_var": 7.290563803667658e-06, "learning_rate": 0.009802976776916493, "loss": 2.9033, "step": 1389 }, { "crossentropy": 2.850787043571472, "epoch": 0.07558661192528345, "grad_norm": 0.051985424011945724, "grad_norm_var": 7.206556929687374e-06, "learning_rate": 0.009802630529005941, "loss": 2.8508, "step": 1390 }, { "crossentropy": 2.8446186780929565, "epoch": 0.07564099078278365, "grad_norm": 0.056218188256025314, "grad_norm_var": 8.668911103565784e-06, "learning_rate": 0.009802283983240718, "loss": 2.8446, "step": 1391 }, { "crossentropy": 2.879874587059021, "epoch": 0.07569536964028385, "grad_norm": 0.047999802976846695, "grad_norm_var": 9.361213388260162e-06, "learning_rate": 0.00980193713964232, "loss": 2.8799, "step": 1392 }, { "crossentropy": 2.8105183839797974, "epoch": 0.07574974849778406, "grad_norm": 0.05315695330500603, "grad_norm_var": 9.231838309535964e-06, "learning_rate": 0.009801589998232258, "loss": 2.8105, "step": 1393 }, { "crossentropy": 2.8089765310287476, "epoch": 0.07580412735528426, "grad_norm": 0.05264408513903618, "grad_norm_var": 8.962987806051167e-06, "learning_rate": 0.009801242559032062, "loss": 2.809, "step": 1394 }, { "crossentropy": 2.765032172203064, "epoch": 0.07585850621278446, "grad_norm": 0.05634532496333122, "grad_norm_var": 9.36143742087386e-06, "learning_rate": 0.009800894822063278, "loss": 2.765, "step": 1395 }, { "crossentropy": 2.768213152885437, "epoch": 0.07591288507028468, "grad_norm": 0.054763294756412506, "grad_norm_var": 9.416330279843111e-06, "learning_rate": 0.009800546787347472, "loss": 2.7682, "step": 1396 }, { "crossentropy": 2.7128067016601562, "epoch": 0.07596726392778488, "grad_norm": 0.05017484724521637, "grad_norm_var": 9.720575597669107e-06, "learning_rate": 0.00980019845490623, "loss": 2.7128, "step": 1397 }, { "crossentropy": 2.7280203104019165, "epoch": 0.07602164278528509, "grad_norm": 0.05043831095099449, "grad_norm_var": 8.827975312651557e-06, "learning_rate": 0.009799849824761158, "loss": 2.728, "step": 1398 }, { "crossentropy": 2.7788448333740234, "epoch": 0.07607602164278529, "grad_norm": 0.04831528291106224, "grad_norm_var": 9.658746332892541e-06, "learning_rate": 0.009799500896933872, "loss": 2.7788, "step": 1399 }, { "crossentropy": 2.814368963241577, "epoch": 0.0761304005002855, "grad_norm": 0.05191189423203468, "grad_norm_var": 9.243120924153543e-06, "learning_rate": 0.009799151671446014, "loss": 2.8144, "step": 1400 }, { "crossentropy": 2.828707695007324, "epoch": 0.0761847793577857, "grad_norm": 0.045899368822574615, "grad_norm_var": 1.124122218233529e-05, "learning_rate": 0.009798802148319246, "loss": 2.8287, "step": 1401 }, { "crossentropy": 2.8502519130706787, "epoch": 0.0762391582152859, "grad_norm": 0.04805159196257591, "grad_norm_var": 1.0536569728107567e-05, "learning_rate": 0.009798452327575242, "loss": 2.8503, "step": 1402 }, { "crossentropy": 2.6786887645721436, "epoch": 0.0762935370727861, "grad_norm": 0.056993044912815094, "grad_norm_var": 1.1955224700377122e-05, "learning_rate": 0.009798102209235698, "loss": 2.6787, "step": 1403 }, { "crossentropy": 2.8295159339904785, "epoch": 0.07634791593028631, "grad_norm": 0.0664309561252594, "grad_norm_var": 2.3962448609601837e-05, "learning_rate": 0.009797751793322327, "loss": 2.8295, "step": 1404 }, { "crossentropy": 2.894326686859131, "epoch": 0.07640229478778651, "grad_norm": 0.059332866221666336, "grad_norm_var": 2.662782189468641e-05, "learning_rate": 0.009797401079856865, "loss": 2.8943, "step": 1405 }, { "crossentropy": 2.7663328647613525, "epoch": 0.07645667364528672, "grad_norm": 0.05332333594560623, "grad_norm_var": 2.652903813594454e-05, "learning_rate": 0.009797050068861057, "loss": 2.7663, "step": 1406 }, { "crossentropy": 2.9113097190856934, "epoch": 0.07651105250278692, "grad_norm": 0.055347688496112823, "grad_norm_var": 2.6231884953267262e-05, "learning_rate": 0.00979669876035668, "loss": 2.9113, "step": 1407 }, { "crossentropy": 2.8690141439437866, "epoch": 0.07656543136028712, "grad_norm": 0.052173513919115067, "grad_norm_var": 2.4429225485195158e-05, "learning_rate": 0.009796347154365516, "loss": 2.869, "step": 1408 }, { "crossentropy": 2.9224082231521606, "epoch": 0.07661981021778733, "grad_norm": 0.05297943577170372, "grad_norm_var": 2.443828256070557e-05, "learning_rate": 0.009795995250909373, "loss": 2.9224, "step": 1409 }, { "crossentropy": 2.7337167263031006, "epoch": 0.07667418907528753, "grad_norm": 0.07701408118009567, "grad_norm_var": 5.8953404413060694e-05, "learning_rate": 0.009795643050010077, "loss": 2.7337, "step": 1410 }, { "crossentropy": 2.922888994216919, "epoch": 0.07672856793278773, "grad_norm": 0.049144912511110306, "grad_norm_var": 6.087187825029409e-05, "learning_rate": 0.009795290551689468, "loss": 2.9229, "step": 1411 }, { "crossentropy": 2.928156018257141, "epoch": 0.07678294679028794, "grad_norm": 0.05146577209234238, "grad_norm_var": 6.144380964500374e-05, "learning_rate": 0.009794937755969412, "loss": 2.9282, "step": 1412 }, { "crossentropy": 2.7852741479873657, "epoch": 0.07683732564778814, "grad_norm": 0.05527331680059433, "grad_norm_var": 6.025583115639328e-05, "learning_rate": 0.009794584662871786, "loss": 2.7853, "step": 1413 }, { "crossentropy": 2.83932888507843, "epoch": 0.07689170450528834, "grad_norm": 0.060792483389377594, "grad_norm_var": 6.116819731377763e-05, "learning_rate": 0.009794231272418488, "loss": 2.8393, "step": 1414 }, { "crossentropy": 2.802941918373108, "epoch": 0.07694608336278855, "grad_norm": 0.05579822510480881, "grad_norm_var": 5.7720871179687644e-05, "learning_rate": 0.009793877584631436, "loss": 2.8029, "step": 1415 }, { "crossentropy": 2.8107024431228638, "epoch": 0.07700046222028875, "grad_norm": 0.04912371188402176, "grad_norm_var": 5.963202025369794e-05, "learning_rate": 0.009793523599532568, "loss": 2.8107, "step": 1416 }, { "crossentropy": 2.8198466300964355, "epoch": 0.07705484107778895, "grad_norm": 0.052072636783123016, "grad_norm_var": 5.405267755378736e-05, "learning_rate": 0.009793169317143833, "loss": 2.8198, "step": 1417 }, { "crossentropy": 2.841365694999695, "epoch": 0.07710921993528916, "grad_norm": 0.05142118036746979, "grad_norm_var": 5.1210424436637135e-05, "learning_rate": 0.009792814737487207, "loss": 2.8414, "step": 1418 }, { "crossentropy": 2.84302818775177, "epoch": 0.07716359879278936, "grad_norm": 0.05829497054219246, "grad_norm_var": 5.1459591295963517e-05, "learning_rate": 0.009792459860584678, "loss": 2.843, "step": 1419 }, { "crossentropy": 2.8833532333374023, "epoch": 0.07721797765028957, "grad_norm": 0.05750735476613045, "grad_norm_var": 4.432225691595344e-05, "learning_rate": 0.009792104686458258, "loss": 2.8834, "step": 1420 }, { "crossentropy": 2.748688817024231, "epoch": 0.07727235650778977, "grad_norm": 0.056578248739242554, "grad_norm_var": 4.345912639041923e-05, "learning_rate": 0.009791749215129972, "loss": 2.7487, "step": 1421 }, { "crossentropy": 2.8521134853363037, "epoch": 0.07732673536528997, "grad_norm": 0.060213152319192886, "grad_norm_var": 4.440855052991466e-05, "learning_rate": 0.009791393446621867, "loss": 2.8521, "step": 1422 }, { "crossentropy": 2.7780669927597046, "epoch": 0.07738111422279018, "grad_norm": 0.053309302777051926, "grad_norm_var": 4.4831949770326485e-05, "learning_rate": 0.009791037380956007, "loss": 2.7781, "step": 1423 }, { "crossentropy": 2.7625454664230347, "epoch": 0.07743549308029038, "grad_norm": 0.08026927709579468, "grad_norm_var": 8.049768262613685e-05, "learning_rate": 0.009790681018154477, "loss": 2.7625, "step": 1424 }, { "crossentropy": 2.867573618888855, "epoch": 0.07748987193779058, "grad_norm": 0.04739046469330788, "grad_norm_var": 8.587727095422452e-05, "learning_rate": 0.009790324358239375, "loss": 2.8676, "step": 1425 }, { "crossentropy": 2.793244242668152, "epoch": 0.07754425079529079, "grad_norm": 0.05221177637577057, "grad_norm_var": 5.8896719892626014e-05, "learning_rate": 0.009789967401232823, "loss": 2.7932, "step": 1426 }, { "crossentropy": 2.7236448526382446, "epoch": 0.07759862965279099, "grad_norm": 0.05648549646139145, "grad_norm_var": 5.5869107882898913e-05, "learning_rate": 0.00978961014715696, "loss": 2.7236, "step": 1427 }, { "crossentropy": 2.94861102104187, "epoch": 0.0776530085102912, "grad_norm": 0.060115184634923935, "grad_norm_var": 5.51566541395522e-05, "learning_rate": 0.009789252596033938, "loss": 2.9486, "step": 1428 }, { "crossentropy": 2.804589867591858, "epoch": 0.0777073873677914, "grad_norm": 0.05335908755660057, "grad_norm_var": 5.574432942870063e-05, "learning_rate": 0.009788894747885937, "loss": 2.8046, "step": 1429 }, { "crossentropy": 2.8346316814422607, "epoch": 0.0777617662252916, "grad_norm": 0.06204890459775925, "grad_norm_var": 5.65522118234823e-05, "learning_rate": 0.009788536602735147, "loss": 2.8346, "step": 1430 }, { "crossentropy": 2.800776243209839, "epoch": 0.0778161450827918, "grad_norm": 0.059098850935697556, "grad_norm_var": 5.68637722178957e-05, "learning_rate": 0.009788178160603782, "loss": 2.8008, "step": 1431 }, { "crossentropy": 2.7733763456344604, "epoch": 0.07787052394029201, "grad_norm": 0.06095198914408684, "grad_norm_var": 5.343277035454678e-05, "learning_rate": 0.009787819421514072, "loss": 2.7734, "step": 1432 }, { "crossentropy": 2.815526247024536, "epoch": 0.07792490279779221, "grad_norm": 0.050237029790878296, "grad_norm_var": 5.499200746626459e-05, "learning_rate": 0.009787460385488264, "loss": 2.8155, "step": 1433 }, { "crossentropy": 2.8830018043518066, "epoch": 0.07797928165529243, "grad_norm": 0.0554475374519825, "grad_norm_var": 5.2758865453556014e-05, "learning_rate": 0.009787101052548628, "loss": 2.883, "step": 1434 }, { "crossentropy": 2.805662989616394, "epoch": 0.07803366051279263, "grad_norm": 0.05179246887564659, "grad_norm_var": 5.4902949509264695e-05, "learning_rate": 0.009786741422717446, "loss": 2.8057, "step": 1435 }, { "crossentropy": 2.8368337154388428, "epoch": 0.07808803937029284, "grad_norm": 0.0698234885931015, "grad_norm_var": 6.470172241530656e-05, "learning_rate": 0.009786381496017025, "loss": 2.8368, "step": 1436 }, { "crossentropy": 2.795819401741028, "epoch": 0.07814241822779304, "grad_norm": 0.05425078421831131, "grad_norm_var": 6.55073405909307e-05, "learning_rate": 0.009786021272469685, "loss": 2.7958, "step": 1437 }, { "crossentropy": 2.763054609298706, "epoch": 0.07819679708529324, "grad_norm": 0.05013075843453407, "grad_norm_var": 6.880195709302172e-05, "learning_rate": 0.009785660752097768, "loss": 2.7631, "step": 1438 }, { "crossentropy": 2.7621030807495117, "epoch": 0.07825117594279345, "grad_norm": 0.05465337261557579, "grad_norm_var": 6.819832375369033e-05, "learning_rate": 0.009785299934923632, "loss": 2.7621, "step": 1439 }, { "crossentropy": 2.821943759918213, "epoch": 0.07830555480029365, "grad_norm": 0.06507784128189087, "grad_norm_var": 3.628286495260715e-05, "learning_rate": 0.009784938820969657, "loss": 2.8219, "step": 1440 }, { "crossentropy": 2.871613621711731, "epoch": 0.07835993365779385, "grad_norm": 0.06629450619220734, "grad_norm_var": 3.5802814253501024e-05, "learning_rate": 0.009784577410258235, "loss": 2.8716, "step": 1441 }, { "crossentropy": 2.8593106269836426, "epoch": 0.07841431251529406, "grad_norm": 0.06409355252981186, "grad_norm_var": 3.605259552691698e-05, "learning_rate": 0.009784215702811785, "loss": 2.8593, "step": 1442 }, { "crossentropy": 2.914143681526184, "epoch": 0.07846869137279426, "grad_norm": 0.04957013204693794, "grad_norm_var": 4.077568240906052e-05, "learning_rate": 0.009783853698652737, "loss": 2.9141, "step": 1443 }, { "crossentropy": 2.7534639835357666, "epoch": 0.07852307023029446, "grad_norm": 0.05010642483830452, "grad_norm_var": 4.412596738999069e-05, "learning_rate": 0.009783491397803542, "loss": 2.7535, "step": 1444 }, { "crossentropy": 2.955861449241638, "epoch": 0.07857744908779467, "grad_norm": 0.05205395072698593, "grad_norm_var": 4.4919706521850954e-05, "learning_rate": 0.00978312880028667, "loss": 2.9559, "step": 1445 }, { "crossentropy": 2.9493616819381714, "epoch": 0.07863182794529487, "grad_norm": 0.0537540540099144, "grad_norm_var": 4.3887032055478506e-05, "learning_rate": 0.009782765906124608, "loss": 2.9494, "step": 1446 }, { "crossentropy": 2.7279820442199707, "epoch": 0.07868620680279508, "grad_norm": 0.09873710572719574, "grad_norm_var": 0.00015471948500230345, "learning_rate": 0.009782402715339866, "loss": 2.728, "step": 1447 }, { "crossentropy": 2.72326922416687, "epoch": 0.07874058566029528, "grad_norm": 0.051630742847919464, "grad_norm_var": 0.00015795492978128433, "learning_rate": 0.009782039227954967, "loss": 2.7233, "step": 1448 }, { "crossentropy": 2.8181376457214355, "epoch": 0.07879496451779548, "grad_norm": 0.050769805908203125, "grad_norm_var": 0.0001573783530176509, "learning_rate": 0.00978167544399245, "loss": 2.8181, "step": 1449 }, { "crossentropy": 2.8385292291641235, "epoch": 0.07884934337529569, "grad_norm": 0.049105823040008545, "grad_norm_var": 0.00016258853568170643, "learning_rate": 0.009781311363474881, "loss": 2.8385, "step": 1450 }, { "crossentropy": 2.8334503173828125, "epoch": 0.07890372223279589, "grad_norm": 0.049741532653570175, "grad_norm_var": 0.00016461464415078585, "learning_rate": 0.00978094698642484, "loss": 2.8335, "step": 1451 }, { "crossentropy": 2.782662868499756, "epoch": 0.07895810109029609, "grad_norm": 0.08890360593795776, "grad_norm_var": 0.00021716173600521964, "learning_rate": 0.009780582312864923, "loss": 2.7827, "step": 1452 }, { "crossentropy": 2.8795337677001953, "epoch": 0.0790124799477963, "grad_norm": 0.05288102850317955, "grad_norm_var": 0.00021820200398458094, "learning_rate": 0.009780217342817747, "loss": 2.8795, "step": 1453 }, { "crossentropy": 2.874595522880554, "epoch": 0.0790668588052965, "grad_norm": 0.05368025600910187, "grad_norm_var": 0.0002146882714276182, "learning_rate": 0.009779852076305951, "loss": 2.8746, "step": 1454 }, { "crossentropy": 2.7819048166275024, "epoch": 0.0791212376627967, "grad_norm": 0.048776861280202866, "grad_norm_var": 0.0002205977699734277, "learning_rate": 0.009779486513352184, "loss": 2.7819, "step": 1455 }, { "crossentropy": 2.810363292694092, "epoch": 0.07917561652029691, "grad_norm": 0.05055335909128189, "grad_norm_var": 0.0002221549614295631, "learning_rate": 0.00977912065397912, "loss": 2.8104, "step": 1456 }, { "crossentropy": 2.8288326263427734, "epoch": 0.07922999537779711, "grad_norm": 0.051736656576395035, "grad_norm_var": 0.0002196224432610229, "learning_rate": 0.009778754498209447, "loss": 2.8288, "step": 1457 }, { "crossentropy": 2.892397880554199, "epoch": 0.07928437423529731, "grad_norm": 0.048519931733608246, "grad_norm_var": 0.00022058284343669867, "learning_rate": 0.009778388046065876, "loss": 2.8924, "step": 1458 }, { "crossentropy": 2.8593324422836304, "epoch": 0.07933875309279752, "grad_norm": 0.049671970307826996, "grad_norm_var": 0.00022049234709707686, "learning_rate": 0.009778021297571132, "loss": 2.8593, "step": 1459 }, { "crossentropy": 2.8157119750976562, "epoch": 0.07939313195029772, "grad_norm": 0.04928646609187126, "grad_norm_var": 0.00022121028932571695, "learning_rate": 0.009777654252747963, "loss": 2.8157, "step": 1460 }, { "crossentropy": 2.750983715057373, "epoch": 0.07944751080779792, "grad_norm": 0.050350021570920944, "grad_norm_var": 0.00022234225784835114, "learning_rate": 0.009777286911619132, "loss": 2.751, "step": 1461 }, { "crossentropy": 2.7380080223083496, "epoch": 0.07950188966529813, "grad_norm": 0.05662574619054794, "grad_norm_var": 0.00022194747996406096, "learning_rate": 0.00977691927420742, "loss": 2.738, "step": 1462 }, { "crossentropy": 2.7589824199676514, "epoch": 0.07955626852279833, "grad_norm": 0.052786558866500854, "grad_norm_var": 9.397762827892189e-05, "learning_rate": 0.009776551340535629, "loss": 2.759, "step": 1463 }, { "crossentropy": 2.7509015798568726, "epoch": 0.07961064738029854, "grad_norm": 0.055136002600193024, "grad_norm_var": 9.390054081383469e-05, "learning_rate": 0.009776183110626575, "loss": 2.7509, "step": 1464 }, { "crossentropy": 2.743883967399597, "epoch": 0.07966502623779874, "grad_norm": 0.04834837466478348, "grad_norm_var": 9.519942611776651e-05, "learning_rate": 0.0097758145845031, "loss": 2.7439, "step": 1465 }, { "crossentropy": 2.864224672317505, "epoch": 0.07971940509529894, "grad_norm": 0.05347128212451935, "grad_norm_var": 9.382903272153094e-05, "learning_rate": 0.009775445762188055, "loss": 2.8642, "step": 1466 }, { "crossentropy": 2.871294856071472, "epoch": 0.07977378395279915, "grad_norm": 0.050814833492040634, "grad_norm_var": 9.332319162238257e-05, "learning_rate": 0.009775076643704314, "loss": 2.8713, "step": 1467 }, { "crossentropy": 2.8200795650482178, "epoch": 0.07982816281029935, "grad_norm": 0.05357770621776581, "grad_norm_var": 6.194658932195951e-06, "learning_rate": 0.009774707229074775, "loss": 2.8201, "step": 1468 }, { "crossentropy": 2.8399828672409058, "epoch": 0.07988254166779955, "grad_norm": 0.05472591519355774, "grad_norm_var": 6.71301137863117e-06, "learning_rate": 0.009774337518322344, "loss": 2.84, "step": 1469 }, { "crossentropy": 2.8254072666168213, "epoch": 0.07993692052529976, "grad_norm": 0.05726778879761696, "grad_norm_var": 8.438873252674996e-06, "learning_rate": 0.009773967511469951, "loss": 2.8254, "step": 1470 }, { "crossentropy": 2.7787410020828247, "epoch": 0.07999129938279997, "grad_norm": 0.05081938952207565, "grad_norm_var": 7.827804419008726e-06, "learning_rate": 0.009773597208540543, "loss": 2.7787, "step": 1471 }, { "crossentropy": 2.8607126474380493, "epoch": 0.08004567824030018, "grad_norm": 0.05660722032189369, "grad_norm_var": 8.865320284764944e-06, "learning_rate": 0.009773226609557088, "loss": 2.8607, "step": 1472 }, { "crossentropy": 2.849833130836487, "epoch": 0.08010005709780038, "grad_norm": 0.04676948860287666, "grad_norm_var": 1.0902402183386265e-05, "learning_rate": 0.009772855714542568, "loss": 2.8498, "step": 1473 }, { "crossentropy": 2.8458656072616577, "epoch": 0.08015443595530058, "grad_norm": 0.0485859140753746, "grad_norm_var": 1.0870530006488558e-05, "learning_rate": 0.009772484523519987, "loss": 2.8459, "step": 1474 }, { "crossentropy": 2.7569265365600586, "epoch": 0.08020881481280079, "grad_norm": 0.04705284908413887, "grad_norm_var": 1.2174340837344382e-05, "learning_rate": 0.009772113036512365, "loss": 2.7569, "step": 1475 }, { "crossentropy": 2.785817503929138, "epoch": 0.08026319367030099, "grad_norm": 0.04918979853391647, "grad_norm_var": 1.2210081336107523e-05, "learning_rate": 0.009771741253542741, "loss": 2.7858, "step": 1476 }, { "crossentropy": 2.8739787340164185, "epoch": 0.0803175725278012, "grad_norm": 0.05078190937638283, "grad_norm_var": 1.2126261320364687e-05, "learning_rate": 0.009771369174634172, "loss": 2.874, "step": 1477 }, { "crossentropy": 2.921887755393982, "epoch": 0.0803719513853014, "grad_norm": 0.05337782949209213, "grad_norm_var": 1.0797544387188459e-05, "learning_rate": 0.009770996799809736, "loss": 2.9219, "step": 1478 }, { "crossentropy": 2.930216670036316, "epoch": 0.0804263302428016, "grad_norm": 0.05869373306632042, "grad_norm_var": 1.373025392854271e-05, "learning_rate": 0.009770624129092529, "loss": 2.9302, "step": 1479 }, { "crossentropy": 2.9220893383026123, "epoch": 0.0804807091003018, "grad_norm": 0.06597685813903809, "grad_norm_var": 2.531754058334486e-05, "learning_rate": 0.009770251162505659, "loss": 2.9221, "step": 1480 }, { "crossentropy": 2.840362787246704, "epoch": 0.08053508795780201, "grad_norm": 0.060674943029880524, "grad_norm_var": 2.7368102895773624e-05, "learning_rate": 0.00976987790007226, "loss": 2.8404, "step": 1481 }, { "crossentropy": 2.84689998626709, "epoch": 0.08058946681530221, "grad_norm": 0.05396105721592903, "grad_norm_var": 2.7371475683405506e-05, "learning_rate": 0.009769504341815479, "loss": 2.8469, "step": 1482 }, { "crossentropy": 2.803867816925049, "epoch": 0.08064384567280242, "grad_norm": 0.052650824189186096, "grad_norm_var": 2.688080765827812e-05, "learning_rate": 0.009769130487758486, "loss": 2.8039, "step": 1483 }, { "crossentropy": 2.763050675392151, "epoch": 0.08069822453030262, "grad_norm": 0.04924619570374489, "grad_norm_var": 2.8178681768590656e-05, "learning_rate": 0.009768756337924466, "loss": 2.7631, "step": 1484 }, { "crossentropy": 2.8000845909118652, "epoch": 0.08075260338780282, "grad_norm": 0.052765823900699615, "grad_norm_var": 2.810465166765581e-05, "learning_rate": 0.009768381892336624, "loss": 2.8001, "step": 1485 }, { "crossentropy": 2.8258391618728638, "epoch": 0.08080698224530303, "grad_norm": 0.05141754820942879, "grad_norm_var": 2.72277886837681e-05, "learning_rate": 0.009768007151018183, "loss": 2.8258, "step": 1486 }, { "crossentropy": 2.762916922569275, "epoch": 0.08086136110280323, "grad_norm": 0.05444495379924774, "grad_norm_var": 2.6977944462050484e-05, "learning_rate": 0.009767632113992383, "loss": 2.7629, "step": 1487 }, { "crossentropy": 2.7945839166641235, "epoch": 0.08091573996030343, "grad_norm": 0.05045272409915924, "grad_norm_var": 2.6600476041183963e-05, "learning_rate": 0.009767256781282485, "loss": 2.7946, "step": 1488 }, { "crossentropy": 2.8897032737731934, "epoch": 0.08097011881780364, "grad_norm": 0.0500558465719223, "grad_norm_var": 2.4599003300338623e-05, "learning_rate": 0.009766881152911766, "loss": 2.8897, "step": 1489 }, { "crossentropy": 2.8116140365600586, "epoch": 0.08102449767530384, "grad_norm": 0.054914455860853195, "grad_norm_var": 2.330744707491102e-05, "learning_rate": 0.009766505228903523, "loss": 2.8116, "step": 1490 }, { "crossentropy": 2.8206005096435547, "epoch": 0.08107887653280405, "grad_norm": 0.05264493450522423, "grad_norm_var": 2.047080935339603e-05, "learning_rate": 0.009766129009281069, "loss": 2.8206, "step": 1491 }, { "crossentropy": 2.8873807191848755, "epoch": 0.08113325539030425, "grad_norm": 0.053396157920360565, "grad_norm_var": 1.897527459714058e-05, "learning_rate": 0.009765752494067735, "loss": 2.8874, "step": 1492 }, { "crossentropy": 2.865382671356201, "epoch": 0.08118763424780445, "grad_norm": 0.0520896352827549, "grad_norm_var": 1.850517654935531e-05, "learning_rate": 0.009765375683286876, "loss": 2.8654, "step": 1493 }, { "crossentropy": 2.8248658180236816, "epoch": 0.08124201310530466, "grad_norm": 0.05462419614195824, "grad_norm_var": 1.8470169233987763e-05, "learning_rate": 0.00976499857696186, "loss": 2.8249, "step": 1494 }, { "crossentropy": 2.786145567893982, "epoch": 0.08129639196280486, "grad_norm": 0.04986833781003952, "grad_norm_var": 1.810984474595641e-05, "learning_rate": 0.009764621175116076, "loss": 2.7861, "step": 1495 }, { "crossentropy": 2.8646578788757324, "epoch": 0.08135077082030506, "grad_norm": 0.05874275416135788, "grad_norm_var": 9.538067047681378e-06, "learning_rate": 0.009764243477772928, "loss": 2.8647, "step": 1496 }, { "crossentropy": 2.8327022790908813, "epoch": 0.08140514967780527, "grad_norm": 0.05734017863869667, "grad_norm_var": 6.930337786283334e-06, "learning_rate": 0.00976386548495584, "loss": 2.8327, "step": 1497 }, { "crossentropy": 2.9438847303390503, "epoch": 0.08145952853530547, "grad_norm": 0.05438923463225365, "grad_norm_var": 6.994466710292616e-06, "learning_rate": 0.009763487196688257, "loss": 2.9439, "step": 1498 }, { "crossentropy": 2.732704281806946, "epoch": 0.08151390739280567, "grad_norm": 0.05147240683436394, "grad_norm_var": 7.146372019026642e-06, "learning_rate": 0.009763108612993638, "loss": 2.7327, "step": 1499 }, { "crossentropy": 2.857415199279785, "epoch": 0.08156828625030588, "grad_norm": 0.04995227977633476, "grad_norm_var": 6.8249235835357e-06, "learning_rate": 0.009762729733895463, "loss": 2.8574, "step": 1500 }, { "crossentropy": 2.930940866470337, "epoch": 0.08162266510780608, "grad_norm": 0.061258889734745026, "grad_norm_var": 1.1027554995504154e-05, "learning_rate": 0.009762350559417233, "loss": 2.9309, "step": 1501 }, { "crossentropy": 2.8784000873565674, "epoch": 0.08167704396530628, "grad_norm": 0.05538707599043846, "grad_norm_var": 1.0874982746043351e-05, "learning_rate": 0.009761971089582457, "loss": 2.8784, "step": 1502 }, { "crossentropy": 2.8854280710220337, "epoch": 0.08173142282280649, "grad_norm": 0.052481696009635925, "grad_norm_var": 1.0950882854889448e-05, "learning_rate": 0.009761591324414677, "loss": 2.8854, "step": 1503 }, { "crossentropy": 2.863037943840027, "epoch": 0.08178580168030669, "grad_norm": 0.05589492246508598, "grad_norm_var": 1.04515280191247e-05, "learning_rate": 0.009761211263937442, "loss": 2.863, "step": 1504 }, { "crossentropy": 2.770443081855774, "epoch": 0.0818401805378069, "grad_norm": 0.05359388887882233, "grad_norm_var": 9.358150887267315e-06, "learning_rate": 0.009760830908174324, "loss": 2.7704, "step": 1505 }, { "crossentropy": 2.8828662633895874, "epoch": 0.0818945593953071, "grad_norm": 0.056286025792360306, "grad_norm_var": 9.596655738543698e-06, "learning_rate": 0.00976045025714891, "loss": 2.8829, "step": 1506 }, { "crossentropy": 2.752739429473877, "epoch": 0.0819489382528073, "grad_norm": 0.05131813883781433, "grad_norm_var": 1.0006355093016411e-05, "learning_rate": 0.009760069310884809, "loss": 2.7527, "step": 1507 }, { "crossentropy": 2.8537824153900146, "epoch": 0.08200331711030752, "grad_norm": 0.04892314225435257, "grad_norm_var": 1.1769651776860102e-05, "learning_rate": 0.009759688069405649, "loss": 2.8538, "step": 1508 }, { "crossentropy": 2.886916995048523, "epoch": 0.08205769596780772, "grad_norm": 0.04926470294594765, "grad_norm_var": 1.2979090793711842e-05, "learning_rate": 0.009759306532735068, "loss": 2.8869, "step": 1509 }, { "crossentropy": 2.7308491468429565, "epoch": 0.08211207482530793, "grad_norm": 0.050078246742486954, "grad_norm_var": 1.3771046439562708e-05, "learning_rate": 0.009758924700896737, "loss": 2.7308, "step": 1510 }, { "crossentropy": 2.8337957859039307, "epoch": 0.08216645368280813, "grad_norm": 0.05361699312925339, "grad_norm_var": 1.2826272787917238e-05, "learning_rate": 0.009758542573914331, "loss": 2.8338, "step": 1511 }, { "crossentropy": 2.7395695447921753, "epoch": 0.08222083254030833, "grad_norm": 0.048197709023952484, "grad_norm_var": 1.2756354637043626e-05, "learning_rate": 0.009758160151811553, "loss": 2.7396, "step": 1512 }, { "crossentropy": 2.8028717041015625, "epoch": 0.08227521139780854, "grad_norm": 0.04835567623376846, "grad_norm_var": 1.2711165697413632e-05, "learning_rate": 0.009757777434612116, "loss": 2.8029, "step": 1513 }, { "crossentropy": 2.844079613685608, "epoch": 0.08232959025530874, "grad_norm": 0.0509040467441082, "grad_norm_var": 1.2606092636027744e-05, "learning_rate": 0.009757394422339761, "loss": 2.8441, "step": 1514 }, { "crossentropy": 2.903067111968994, "epoch": 0.08238396911280894, "grad_norm": 0.0566403903067112, "grad_norm_var": 1.3697077375187059e-05, "learning_rate": 0.009757011115018237, "loss": 2.9031, "step": 1515 }, { "crossentropy": 2.828970789909363, "epoch": 0.08243834797030915, "grad_norm": 0.058553628623485565, "grad_norm_var": 1.5244801786539787e-05, "learning_rate": 0.009756627512671319, "loss": 2.829, "step": 1516 }, { "crossentropy": 2.6954123973846436, "epoch": 0.08249272682780935, "grad_norm": 0.06271179765462875, "grad_norm_var": 1.6943298018624356e-05, "learning_rate": 0.009756243615322797, "loss": 2.6954, "step": 1517 }, { "crossentropy": 2.8634501695632935, "epoch": 0.08254710568530955, "grad_norm": 0.05499769747257233, "grad_norm_var": 1.684249831892873e-05, "learning_rate": 0.00975585942299648, "loss": 2.8635, "step": 1518 }, { "crossentropy": 2.8281028270721436, "epoch": 0.08260148454280976, "grad_norm": 0.06059771776199341, "grad_norm_var": 2.0140213545815956e-05, "learning_rate": 0.009755474935716197, "loss": 2.8281, "step": 1519 }, { "crossentropy": 2.916847825050354, "epoch": 0.08265586340030996, "grad_norm": 0.06399433314800262, "grad_norm_var": 2.656099535856404e-05, "learning_rate": 0.00975509015350579, "loss": 2.9168, "step": 1520 }, { "crossentropy": 2.8661224842071533, "epoch": 0.08271024225781017, "grad_norm": 0.05925806611776352, "grad_norm_var": 2.8069054973468786e-05, "learning_rate": 0.009754705076389128, "loss": 2.8661, "step": 1521 }, { "crossentropy": 2.750257968902588, "epoch": 0.08276462111531037, "grad_norm": 0.054077114909887314, "grad_norm_var": 2.787924936768765e-05, "learning_rate": 0.009754319704390087, "loss": 2.7503, "step": 1522 }, { "crossentropy": 2.78438401222229, "epoch": 0.08281899997281057, "grad_norm": 0.048641059547662735, "grad_norm_var": 2.9451526482728493e-05, "learning_rate": 0.009753934037532571, "loss": 2.7844, "step": 1523 }, { "crossentropy": 2.8369290828704834, "epoch": 0.08287337883031078, "grad_norm": 0.048935666680336, "grad_norm_var": 2.9442556059527715e-05, "learning_rate": 0.0097535480758405, "loss": 2.8369, "step": 1524 }, { "crossentropy": 2.709890842437744, "epoch": 0.08292775768781098, "grad_norm": 0.04946891590952873, "grad_norm_var": 2.9308017151009428e-05, "learning_rate": 0.009753161819337806, "loss": 2.7099, "step": 1525 }, { "crossentropy": 2.7601613998413086, "epoch": 0.08298213654531118, "grad_norm": 0.057595618069171906, "grad_norm_var": 2.859406563576235e-05, "learning_rate": 0.009752775268048448, "loss": 2.7602, "step": 1526 }, { "crossentropy": 2.877411127090454, "epoch": 0.08303651540281139, "grad_norm": 0.05320907011628151, "grad_norm_var": 2.8667947169282634e-05, "learning_rate": 0.0097523884219964, "loss": 2.8774, "step": 1527 }, { "crossentropy": 2.823796033859253, "epoch": 0.08309089426031159, "grad_norm": 0.053033698350191116, "grad_norm_var": 2.58991319476875e-05, "learning_rate": 0.009752001281205654, "loss": 2.8238, "step": 1528 }, { "crossentropy": 2.957275152206421, "epoch": 0.0831452731178118, "grad_norm": 0.05753427371382713, "grad_norm_var": 2.2958600215016353e-05, "learning_rate": 0.009751613845700216, "loss": 2.9573, "step": 1529 }, { "crossentropy": 2.878862500190735, "epoch": 0.083199651975312, "grad_norm": 0.053457312285900116, "grad_norm_var": 2.175561094900537e-05, "learning_rate": 0.009751226115504118, "loss": 2.8789, "step": 1530 }, { "crossentropy": 2.7699904441833496, "epoch": 0.0832540308328122, "grad_norm": 0.0614764578640461, "grad_norm_var": 2.3762997745202922e-05, "learning_rate": 0.009750838090641407, "loss": 2.77, "step": 1531 }, { "crossentropy": 2.8132389783859253, "epoch": 0.0833084096903124, "grad_norm": 0.05951071158051491, "grad_norm_var": 2.4133817558359923e-05, "learning_rate": 0.009750449771136146, "loss": 2.8132, "step": 1532 }, { "crossentropy": 2.919834017753601, "epoch": 0.08336278854781261, "grad_norm": 0.061666082590818405, "grad_norm_var": 2.3288126973797634e-05, "learning_rate": 0.00975006115701242, "loss": 2.9198, "step": 1533 }, { "crossentropy": 2.7724716663360596, "epoch": 0.08341716740531281, "grad_norm": 0.06378570944070816, "grad_norm_var": 2.6834049759168605e-05, "learning_rate": 0.009749672248294329, "loss": 2.7725, "step": 1534 }, { "crossentropy": 2.7988808155059814, "epoch": 0.08347154626281302, "grad_norm": 0.05616573616862297, "grad_norm_var": 2.5723032703917995e-05, "learning_rate": 0.009749283045005993, "loss": 2.7989, "step": 1535 }, { "crossentropy": 2.8005266189575195, "epoch": 0.08352592512031322, "grad_norm": 0.052586473524570465, "grad_norm_var": 2.2249286525712295e-05, "learning_rate": 0.00974889354717155, "loss": 2.8005, "step": 1536 }, { "crossentropy": 2.9157419204711914, "epoch": 0.08358030397781342, "grad_norm": 0.04981338232755661, "grad_norm_var": 2.3280963762946592e-05, "learning_rate": 0.009748503754815155, "loss": 2.9157, "step": 1537 }, { "crossentropy": 2.9041953086853027, "epoch": 0.08363468283531363, "grad_norm": 0.05022025480866432, "grad_norm_var": 2.4716033759876004e-05, "learning_rate": 0.009748113667960988, "loss": 2.9042, "step": 1538 }, { "crossentropy": 2.833001494407654, "epoch": 0.08368906169281383, "grad_norm": 0.04874745011329651, "grad_norm_var": 2.4629107755192296e-05, "learning_rate": 0.009747723286633234, "loss": 2.833, "step": 1539 }, { "crossentropy": 2.8033416271209717, "epoch": 0.08374344055031403, "grad_norm": 0.055734265595674515, "grad_norm_var": 2.217896891875959e-05, "learning_rate": 0.009747332610856111, "loss": 2.8033, "step": 1540 }, { "crossentropy": 2.832807183265686, "epoch": 0.08379781940781424, "grad_norm": 0.06133899465203285, "grad_norm_var": 2.1835016834555417e-05, "learning_rate": 0.009746941640653843, "loss": 2.8328, "step": 1541 }, { "crossentropy": 2.8173707723617554, "epoch": 0.08385219826531444, "grad_norm": 0.06427813321352005, "grad_norm_var": 2.6054649896445116e-05, "learning_rate": 0.009746550376050682, "loss": 2.8174, "step": 1542 }, { "crossentropy": 2.829363465309143, "epoch": 0.08390657712281464, "grad_norm": 0.05383697524666786, "grad_norm_var": 2.5811317842155726e-05, "learning_rate": 0.009746158817070891, "loss": 2.8294, "step": 1543 }, { "crossentropy": 2.7346521615982056, "epoch": 0.08396095598031485, "grad_norm": 0.04961591213941574, "grad_norm_var": 2.8097820646633536e-05, "learning_rate": 0.009745766963738757, "loss": 2.7347, "step": 1544 }, { "crossentropy": 2.6983834505081177, "epoch": 0.08401533483781506, "grad_norm": 0.05044328421354294, "grad_norm_var": 3.0012515868878495e-05, "learning_rate": 0.009745374816078579, "loss": 2.6984, "step": 1545 }, { "crossentropy": 2.8023711442947388, "epoch": 0.08406971369531527, "grad_norm": 0.05533235892653465, "grad_norm_var": 2.9648486667213125e-05, "learning_rate": 0.009744982374114678, "loss": 2.8024, "step": 1546 }, { "crossentropy": 2.7864813804626465, "epoch": 0.08412409255281547, "grad_norm": 0.056201495230197906, "grad_norm_var": 2.7472172314102485e-05, "learning_rate": 0.009744589637871394, "loss": 2.7865, "step": 1547 }, { "crossentropy": 2.916491746902466, "epoch": 0.08417847141031568, "grad_norm": 0.05915655195713043, "grad_norm_var": 2.7294390179261455e-05, "learning_rate": 0.009744196607373086, "loss": 2.9165, "step": 1548 }, { "crossentropy": 2.7908159494400024, "epoch": 0.08423285026781588, "grad_norm": 0.06445211172103882, "grad_norm_var": 3.0048600065696318e-05, "learning_rate": 0.009743803282644126, "loss": 2.7908, "step": 1549 }, { "crossentropy": 2.8461592197418213, "epoch": 0.08428722912531608, "grad_norm": 0.058447737246751785, "grad_norm_var": 2.6097278677244574e-05, "learning_rate": 0.009743409663708909, "loss": 2.8462, "step": 1550 }, { "crossentropy": 2.8435171842575073, "epoch": 0.08434160798281629, "grad_norm": 0.04848671332001686, "grad_norm_var": 2.8996878328913823e-05, "learning_rate": 0.009743015750591846, "loss": 2.8435, "step": 1551 }, { "crossentropy": 2.7829248905181885, "epoch": 0.08439598684031649, "grad_norm": 0.04935314133763313, "grad_norm_var": 3.065553757327551e-05, "learning_rate": 0.009742621543317368, "loss": 2.7829, "step": 1552 }, { "crossentropy": 2.836017608642578, "epoch": 0.08445036569781669, "grad_norm": 0.050730686634778976, "grad_norm_var": 3.010848126809845e-05, "learning_rate": 0.009742227041909926, "loss": 2.836, "step": 1553 }, { "crossentropy": 2.797895312309265, "epoch": 0.0845047445553169, "grad_norm": 0.049316611140966415, "grad_norm_var": 3.070811900145922e-05, "learning_rate": 0.009741832246393982, "loss": 2.7979, "step": 1554 }, { "crossentropy": 2.8811157941818237, "epoch": 0.0845591234128171, "grad_norm": 0.047622792422771454, "grad_norm_var": 3.1682336420725485e-05, "learning_rate": 0.009741437156794024, "loss": 2.8811, "step": 1555 }, { "crossentropy": 2.7424817085266113, "epoch": 0.0846135022703173, "grad_norm": 0.04919581115245819, "grad_norm_var": 3.340619578232525e-05, "learning_rate": 0.009741041773134554, "loss": 2.7425, "step": 1556 }, { "crossentropy": 2.7910940647125244, "epoch": 0.08466788112781751, "grad_norm": 0.04745101183652878, "grad_norm_var": 3.231196939810031e-05, "learning_rate": 0.009740646095440094, "loss": 2.7911, "step": 1557 }, { "crossentropy": 2.8649582862854004, "epoch": 0.08472225998531771, "grad_norm": 0.05087706446647644, "grad_norm_var": 2.404565548406821e-05, "learning_rate": 0.00974025012373518, "loss": 2.865, "step": 1558 }, { "crossentropy": 2.8035471439361572, "epoch": 0.08477663884281791, "grad_norm": 0.0580778568983078, "grad_norm_var": 2.5907330340181842e-05, "learning_rate": 0.009739853858044375, "loss": 2.8035, "step": 1559 }, { "crossentropy": 2.7290455102920532, "epoch": 0.08483101770031812, "grad_norm": 0.06665053963661194, "grad_norm_var": 3.681703825211731e-05, "learning_rate": 0.009739457298392252, "loss": 2.729, "step": 1560 }, { "crossentropy": 2.7905560731887817, "epoch": 0.08488539655781832, "grad_norm": 0.05427725240588188, "grad_norm_var": 3.5987991856039195e-05, "learning_rate": 0.009739060444803405, "loss": 2.7906, "step": 1561 }, { "crossentropy": 2.8112361431121826, "epoch": 0.08493977541531852, "grad_norm": 0.06286913901567459, "grad_norm_var": 4.077471753231125e-05, "learning_rate": 0.00973866329730245, "loss": 2.8112, "step": 1562 }, { "crossentropy": 2.847270131111145, "epoch": 0.08499415427281873, "grad_norm": 0.05649733543395996, "grad_norm_var": 4.084442786039047e-05, "learning_rate": 0.009738265855914013, "loss": 2.8473, "step": 1563 }, { "crossentropy": 2.8086212873458862, "epoch": 0.08504853313031893, "grad_norm": 0.05008277669548988, "grad_norm_var": 4.04671735385229e-05, "learning_rate": 0.009737868120662746, "loss": 2.8086, "step": 1564 }, { "crossentropy": 2.768716335296631, "epoch": 0.08510291198781914, "grad_norm": 0.04898761957883835, "grad_norm_var": 3.391261103287176e-05, "learning_rate": 0.009737470091573315, "loss": 2.7687, "step": 1565 }, { "crossentropy": 2.804634928703308, "epoch": 0.08515729084531934, "grad_norm": 0.047276902943849564, "grad_norm_var": 3.368375327051498e-05, "learning_rate": 0.009737071768670405, "loss": 2.8046, "step": 1566 }, { "crossentropy": 2.8320960998535156, "epoch": 0.08521166970281954, "grad_norm": 0.04573485627770424, "grad_norm_var": 3.5578057578848574e-05, "learning_rate": 0.00973667315197872, "loss": 2.8321, "step": 1567 }, { "crossentropy": 2.804649233818054, "epoch": 0.08526604856031975, "grad_norm": 0.05874865874648094, "grad_norm_var": 3.7544480187851e-05, "learning_rate": 0.009736274241522984, "loss": 2.8046, "step": 1568 }, { "crossentropy": 2.9342143535614014, "epoch": 0.08532042741781995, "grad_norm": 0.059812262654304504, "grad_norm_var": 4.022399094453418e-05, "learning_rate": 0.009735875037327932, "loss": 2.9342, "step": 1569 }, { "crossentropy": 2.7705018520355225, "epoch": 0.08537480627532015, "grad_norm": 0.05307386815547943, "grad_norm_var": 3.908950956215262e-05, "learning_rate": 0.009735475539418328, "loss": 2.7705, "step": 1570 }, { "crossentropy": 2.7418869733810425, "epoch": 0.08542918513282036, "grad_norm": 0.0515885204076767, "grad_norm_var": 3.692395407619419e-05, "learning_rate": 0.009735075747818947, "loss": 2.7419, "step": 1571 }, { "crossentropy": 2.8974298238754272, "epoch": 0.08548356399032056, "grad_norm": 0.05389607697725296, "grad_norm_var": 3.540355500921682e-05, "learning_rate": 0.009734675662554579, "loss": 2.8974, "step": 1572 }, { "crossentropy": 2.815464735031128, "epoch": 0.08553794284782076, "grad_norm": 0.05008720979094505, "grad_norm_var": 3.3494199271688385e-05, "learning_rate": 0.009734275283650043, "loss": 2.8155, "step": 1573 }, { "crossentropy": 2.8899677991867065, "epoch": 0.08559232170532097, "grad_norm": 0.05315548554062843, "grad_norm_var": 3.278377332990527e-05, "learning_rate": 0.009733874611130167, "loss": 2.89, "step": 1574 }, { "crossentropy": 2.8131470680236816, "epoch": 0.08564670056282117, "grad_norm": 0.05669204518198967, "grad_norm_var": 3.222903565599488e-05, "learning_rate": 0.009733473645019803, "loss": 2.8131, "step": 1575 }, { "crossentropy": 2.756526231765747, "epoch": 0.08570107942032137, "grad_norm": 0.050033148378133774, "grad_norm_var": 2.22104588914917e-05, "learning_rate": 0.009733072385343815, "loss": 2.7565, "step": 1576 }, { "crossentropy": 2.8381742238998413, "epoch": 0.08575545827782158, "grad_norm": 0.049035459756851196, "grad_norm_var": 2.324530078441703e-05, "learning_rate": 0.00973267083212709, "loss": 2.8382, "step": 1577 }, { "crossentropy": 2.6909432411193848, "epoch": 0.08580983713532178, "grad_norm": 0.05136178061366081, "grad_norm_var": 1.6338040481676288e-05, "learning_rate": 0.009732268985394532, "loss": 2.6909, "step": 1578 }, { "crossentropy": 2.8509007692337036, "epoch": 0.08586421599282199, "grad_norm": 0.05032481998205185, "grad_norm_var": 1.5227013731846686e-05, "learning_rate": 0.009731866845171064, "loss": 2.8509, "step": 1579 }, { "crossentropy": 2.664581537246704, "epoch": 0.08591859485032219, "grad_norm": 0.04767747223377228, "grad_norm_var": 1.616121089418332e-05, "learning_rate": 0.009731464411481625, "loss": 2.6646, "step": 1580 }, { "crossentropy": 2.940685749053955, "epoch": 0.08597297370782239, "grad_norm": 0.05184026435017586, "grad_norm_var": 1.5631345483677003e-05, "learning_rate": 0.009731061684351174, "loss": 2.9407, "step": 1581 }, { "crossentropy": 2.7766363620758057, "epoch": 0.0860273525653226, "grad_norm": 0.0657946839928627, "grad_norm_var": 2.5657948177647734e-05, "learning_rate": 0.00973065866380469, "loss": 2.7766, "step": 1582 }, { "crossentropy": 2.8716992139816284, "epoch": 0.08608173142282281, "grad_norm": 0.06735976785421371, "grad_norm_var": 3.378313463247937e-05, "learning_rate": 0.009730255349867165, "loss": 2.8717, "step": 1583 }, { "crossentropy": 2.820469617843628, "epoch": 0.08613611028032302, "grad_norm": 0.05304461717605591, "grad_norm_var": 3.2513191401909586e-05, "learning_rate": 0.009729851742563614, "loss": 2.8205, "step": 1584 }, { "crossentropy": 2.8751180171966553, "epoch": 0.08619048913782322, "grad_norm": 0.05173273757100105, "grad_norm_var": 3.038408294408623e-05, "learning_rate": 0.009729447841919066, "loss": 2.8751, "step": 1585 }, { "crossentropy": 2.828620433807373, "epoch": 0.08624486799532342, "grad_norm": 0.09269986301660538, "grad_norm_var": 0.00012604086931376483, "learning_rate": 0.009729043647958574, "loss": 2.8286, "step": 1586 }, { "crossentropy": 2.737037420272827, "epoch": 0.08629924685282363, "grad_norm": 0.05732359364628792, "grad_norm_var": 0.00012470772396275785, "learning_rate": 0.009728639160707202, "loss": 2.737, "step": 1587 }, { "crossentropy": 2.799222946166992, "epoch": 0.08635362571032383, "grad_norm": 0.05421348288655281, "grad_norm_var": 0.00012460895451738696, "learning_rate": 0.009728234380190038, "loss": 2.7992, "step": 1588 }, { "crossentropy": 2.8877118825912476, "epoch": 0.08640800456782403, "grad_norm": 0.05311749503016472, "grad_norm_var": 0.00012263285662372127, "learning_rate": 0.009727829306432187, "loss": 2.8877, "step": 1589 }, { "crossentropy": 2.7552934885025024, "epoch": 0.08646238342532424, "grad_norm": 0.052501656115055084, "grad_norm_var": 0.00012295880513600287, "learning_rate": 0.00972742393945877, "loss": 2.7553, "step": 1590 }, { "crossentropy": 2.767495274543762, "epoch": 0.08651676228282444, "grad_norm": 0.06360308080911636, "grad_norm_var": 0.00012607755983602969, "learning_rate": 0.009727018279294926, "loss": 2.7675, "step": 1591 }, { "crossentropy": 2.8395068645477295, "epoch": 0.08657114114032465, "grad_norm": 0.055429499596357346, "grad_norm_var": 0.00012289996720303605, "learning_rate": 0.009726612325965818, "loss": 2.8395, "step": 1592 }, { "crossentropy": 2.8599371910095215, "epoch": 0.08662551999782485, "grad_norm": 0.059536971151828766, "grad_norm_var": 0.0001181977769383483, "learning_rate": 0.009726206079496617, "loss": 2.8599, "step": 1593 }, { "crossentropy": 2.8401989936828613, "epoch": 0.08667989885532505, "grad_norm": 0.049724094569683075, "grad_norm_var": 0.0001198089316705469, "learning_rate": 0.009725799539912525, "loss": 2.8402, "step": 1594 }, { "crossentropy": 2.843074321746826, "epoch": 0.08673427771282526, "grad_norm": 0.05419322848320007, "grad_norm_var": 0.00011685237421802848, "learning_rate": 0.009725392707238748, "loss": 2.8431, "step": 1595 }, { "crossentropy": 2.6795053482055664, "epoch": 0.08678865657032546, "grad_norm": 0.06340300291776657, "grad_norm_var": 0.00011042961241438271, "learning_rate": 0.009724985581500523, "loss": 2.6795, "step": 1596 }, { "crossentropy": 2.8070662021636963, "epoch": 0.08684303542782566, "grad_norm": 0.06404880434274673, "grad_norm_var": 0.00010793604257665756, "learning_rate": 0.009724578162723095, "loss": 2.8071, "step": 1597 }, { "crossentropy": 2.8243380784988403, "epoch": 0.08689741428532587, "grad_norm": 0.05598133057355881, "grad_norm_var": 0.00010618695837840969, "learning_rate": 0.009724170450931737, "loss": 2.8243, "step": 1598 }, { "crossentropy": 2.903603434562683, "epoch": 0.08695179314282607, "grad_norm": 0.049336835741996765, "grad_norm_var": 0.00010698731637925103, "learning_rate": 0.009723762446151729, "loss": 2.9036, "step": 1599 }, { "crossentropy": 2.65511417388916, "epoch": 0.08700617200032627, "grad_norm": 0.05219573527574539, "grad_norm_var": 0.00010760659718146616, "learning_rate": 0.009723354148408381, "loss": 2.6551, "step": 1600 }, { "crossentropy": 2.879418969154358, "epoch": 0.08706055085782648, "grad_norm": 0.05334387719631195, "grad_norm_var": 0.00010640852605251472, "learning_rate": 0.00972294555772701, "loss": 2.8794, "step": 1601 }, { "crossentropy": 2.6703141927719116, "epoch": 0.08711492971532668, "grad_norm": 0.053631775081157684, "grad_norm_var": 2.191252210758827e-05, "learning_rate": 0.00972253667413296, "loss": 2.6703, "step": 1602 }, { "crossentropy": 2.8599653244018555, "epoch": 0.08716930857282688, "grad_norm": 0.05348401144146919, "grad_norm_var": 2.201503356466123e-05, "learning_rate": 0.009722127497651587, "loss": 2.86, "step": 1603 }, { "crossentropy": 2.7381056547164917, "epoch": 0.08722368743032709, "grad_norm": 0.05625239387154579, "grad_norm_var": 2.1929444794107033e-05, "learning_rate": 0.009721718028308271, "loss": 2.7381, "step": 1604 }, { "crossentropy": 2.881499171257019, "epoch": 0.08727806628782729, "grad_norm": 0.05812377855181694, "grad_norm_var": 2.1831123667661e-05, "learning_rate": 0.009721308266128404, "loss": 2.8815, "step": 1605 }, { "crossentropy": 2.9469014406204224, "epoch": 0.0873324451453275, "grad_norm": 0.06360791623592377, "grad_norm_var": 2.447194919245425e-05, "learning_rate": 0.0097208982111374, "loss": 2.9469, "step": 1606 }, { "crossentropy": 2.9343011379241943, "epoch": 0.0873868240028277, "grad_norm": 0.05171488970518112, "grad_norm_var": 2.2233846254337183e-05, "learning_rate": 0.009720487863360688, "loss": 2.9343, "step": 1607 }, { "crossentropy": 2.812281847000122, "epoch": 0.0874412028603279, "grad_norm": 0.05476374551653862, "grad_norm_var": 2.230113905002847e-05, "learning_rate": 0.009720077222823722, "loss": 2.8123, "step": 1608 }, { "crossentropy": 2.8483842611312866, "epoch": 0.0874955817178281, "grad_norm": 0.057443518191576004, "grad_norm_var": 2.1541420599432923e-05, "learning_rate": 0.009719666289551968, "loss": 2.8484, "step": 1609 }, { "crossentropy": 2.864544630050659, "epoch": 0.08754996057532831, "grad_norm": 0.06274258345365524, "grad_norm_var": 2.1755709488196066e-05, "learning_rate": 0.009719255063570909, "loss": 2.8645, "step": 1610 }, { "crossentropy": 2.7007553577423096, "epoch": 0.08760433943282851, "grad_norm": 0.05102647840976715, "grad_norm_var": 2.336353174046997e-05, "learning_rate": 0.00971884354490605, "loss": 2.7008, "step": 1611 }, { "crossentropy": 2.782903790473938, "epoch": 0.08765871829032872, "grad_norm": 0.05263988673686981, "grad_norm_var": 2.0437399327190704e-05, "learning_rate": 0.009718431733582914, "loss": 2.7829, "step": 1612 }, { "crossentropy": 2.879874110221863, "epoch": 0.08771309714782892, "grad_norm": 0.056323494762182236, "grad_norm_var": 1.5512289965398616e-05, "learning_rate": 0.009718019629627045, "loss": 2.8799, "step": 1613 }, { "crossentropy": 2.7345346212387085, "epoch": 0.08776747600532912, "grad_norm": 0.05075310543179512, "grad_norm_var": 1.6650415766960615e-05, "learning_rate": 0.009717607233063993, "loss": 2.7345, "step": 1614 }, { "crossentropy": 2.8623063564300537, "epoch": 0.08782185486282933, "grad_norm": 0.05114017426967621, "grad_norm_var": 1.5531299721432448e-05, "learning_rate": 0.00971719454391934, "loss": 2.8623, "step": 1615 }, { "crossentropy": 2.7463663816452026, "epoch": 0.08787623372032953, "grad_norm": 0.051764070987701416, "grad_norm_var": 1.5701422509090527e-05, "learning_rate": 0.00971678156221868, "loss": 2.7464, "step": 1616 }, { "crossentropy": 2.841691017150879, "epoch": 0.08793061257782973, "grad_norm": 0.056488242000341415, "grad_norm_var": 1.565763909820601e-05, "learning_rate": 0.009716368287987623, "loss": 2.8417, "step": 1617 }, { "crossentropy": 2.854263186454773, "epoch": 0.08798499143532994, "grad_norm": 0.0631515383720398, "grad_norm_var": 1.943433238972651e-05, "learning_rate": 0.009715954721251804, "loss": 2.8543, "step": 1618 }, { "crossentropy": 2.8379178047180176, "epoch": 0.08803937029283014, "grad_norm": 0.05579383671283722, "grad_norm_var": 1.908108397121183e-05, "learning_rate": 0.009715540862036867, "loss": 2.8379, "step": 1619 }, { "crossentropy": 2.864417791366577, "epoch": 0.08809374915033036, "grad_norm": 0.06949912011623383, "grad_norm_var": 3.074472659023696e-05, "learning_rate": 0.009715126710368484, "loss": 2.8644, "step": 1620 }, { "crossentropy": 2.77848744392395, "epoch": 0.08814812800783056, "grad_norm": 0.04896979779005051, "grad_norm_var": 3.4227112812475144e-05, "learning_rate": 0.009714712266272338, "loss": 2.7785, "step": 1621 }, { "crossentropy": 2.881782054901123, "epoch": 0.08820250686533077, "grad_norm": 0.04913540184497833, "grad_norm_var": 3.2856999495142235e-05, "learning_rate": 0.009714297529774133, "loss": 2.8818, "step": 1622 }, { "crossentropy": 2.883259415626526, "epoch": 0.08825688572283097, "grad_norm": 0.051901452243328094, "grad_norm_var": 3.27722496843249e-05, "learning_rate": 0.00971388250089959, "loss": 2.8833, "step": 1623 }, { "crossentropy": 2.857697010040283, "epoch": 0.08831126458033117, "grad_norm": 0.05788838118314743, "grad_norm_var": 3.319194681326761e-05, "learning_rate": 0.009713467179674449, "loss": 2.8577, "step": 1624 }, { "crossentropy": 2.914333939552307, "epoch": 0.08836564343783138, "grad_norm": 0.05820401385426521, "grad_norm_var": 3.343365087847457e-05, "learning_rate": 0.009713051566124467, "loss": 2.9143, "step": 1625 }, { "crossentropy": 2.806437134742737, "epoch": 0.08842002229533158, "grad_norm": 0.05410994961857796, "grad_norm_var": 2.9713345149326385e-05, "learning_rate": 0.009712635660275424, "loss": 2.8064, "step": 1626 }, { "crossentropy": 2.7899104356765747, "epoch": 0.08847440115283178, "grad_norm": 0.05312270298600197, "grad_norm_var": 2.889854954865508e-05, "learning_rate": 0.009712219462153107, "loss": 2.7899, "step": 1627 }, { "crossentropy": 2.849815249443054, "epoch": 0.08852878001033199, "grad_norm": 0.049672823399305344, "grad_norm_var": 3.0404333108300223e-05, "learning_rate": 0.009711802971783335, "loss": 2.8498, "step": 1628 }, { "crossentropy": 2.8184159994125366, "epoch": 0.08858315886783219, "grad_norm": 0.04962197318673134, "grad_norm_var": 3.191237657507214e-05, "learning_rate": 0.009711386189191934, "loss": 2.8184, "step": 1629 }, { "crossentropy": 2.763826847076416, "epoch": 0.0886375377253324, "grad_norm": 0.049844056367874146, "grad_norm_var": 3.241223814938743e-05, "learning_rate": 0.009710969114404755, "loss": 2.7638, "step": 1630 }, { "crossentropy": 2.8372256755828857, "epoch": 0.0886919165828326, "grad_norm": 0.054638393223285675, "grad_norm_var": 3.1659301777891884e-05, "learning_rate": 0.009710551747447662, "loss": 2.8372, "step": 1631 }, { "crossentropy": 2.847567558288574, "epoch": 0.0887462954403328, "grad_norm": 0.04751363396644592, "grad_norm_var": 3.4402919761984904e-05, "learning_rate": 0.009710134088346542, "loss": 2.8476, "step": 1632 }, { "crossentropy": 2.853932738304138, "epoch": 0.088800674297833, "grad_norm": 0.04956610128283501, "grad_norm_var": 3.542159930924304e-05, "learning_rate": 0.009709716137127298, "loss": 2.8539, "step": 1633 }, { "crossentropy": 2.7225054502487183, "epoch": 0.08885505315533321, "grad_norm": 0.05422375723719597, "grad_norm_var": 2.9407765639353988e-05, "learning_rate": 0.00970929789381585, "loss": 2.7225, "step": 1634 }, { "crossentropy": 2.902502655982971, "epoch": 0.08890943201283341, "grad_norm": 0.06337086111307144, "grad_norm_var": 3.545825169419484e-05, "learning_rate": 0.009708879358438137, "loss": 2.9025, "step": 1635 }, { "crossentropy": 2.7269861698150635, "epoch": 0.08896381087033362, "grad_norm": 0.06072870269417763, "grad_norm_var": 2.194264584045961e-05, "learning_rate": 0.009708460531020115, "loss": 2.727, "step": 1636 }, { "crossentropy": 2.76858913898468, "epoch": 0.08901818972783382, "grad_norm": 0.055574096739292145, "grad_norm_var": 2.08714838491747e-05, "learning_rate": 0.009708041411587761, "loss": 2.7686, "step": 1637 }, { "crossentropy": 2.8039112091064453, "epoch": 0.08907256858533402, "grad_norm": 0.0496731735765934, "grad_norm_var": 2.056263892909003e-05, "learning_rate": 0.009707622000167069, "loss": 2.8039, "step": 1638 }, { "crossentropy": 2.7949838638305664, "epoch": 0.08912694744283423, "grad_norm": 0.04995034635066986, "grad_norm_var": 2.1275835299251234e-05, "learning_rate": 0.009707202296784049, "loss": 2.795, "step": 1639 }, { "crossentropy": 2.79144823551178, "epoch": 0.08918132630033443, "grad_norm": 0.04760877788066864, "grad_norm_var": 2.201133171681509e-05, "learning_rate": 0.009706782301464729, "loss": 2.7914, "step": 1640 }, { "crossentropy": 2.814406633377075, "epoch": 0.08923570515783463, "grad_norm": 0.051094986498355865, "grad_norm_var": 2.0203082301635765e-05, "learning_rate": 0.009706362014235161, "loss": 2.8144, "step": 1641 }, { "crossentropy": 2.753352403640747, "epoch": 0.08929008401533484, "grad_norm": 0.05075931176543236, "grad_norm_var": 2.0194284814462986e-05, "learning_rate": 0.009705941435121409, "loss": 2.7534, "step": 1642 }, { "crossentropy": 2.8489338159561157, "epoch": 0.08934446287283504, "grad_norm": 0.05374240502715111, "grad_norm_var": 2.0285418789568975e-05, "learning_rate": 0.009705520564149554, "loss": 2.8489, "step": 1643 }, { "crossentropy": 2.8524221181869507, "epoch": 0.08939884173033524, "grad_norm": 0.04786904156208038, "grad_norm_var": 2.1132393415313327e-05, "learning_rate": 0.009705099401345703, "loss": 2.8524, "step": 1644 }, { "crossentropy": 2.7225197553634644, "epoch": 0.08945322058783545, "grad_norm": 0.04928544536232948, "grad_norm_var": 2.1256774116507466e-05, "learning_rate": 0.009704677946735974, "loss": 2.7225, "step": 1645 }, { "crossentropy": 2.7844226360321045, "epoch": 0.08950759944533565, "grad_norm": 0.04772092029452324, "grad_norm_var": 2.22097385999392e-05, "learning_rate": 0.009704256200346505, "loss": 2.7844, "step": 1646 }, { "crossentropy": 2.9030044078826904, "epoch": 0.08956197830283585, "grad_norm": 0.05282063037157059, "grad_norm_var": 2.1796786572828714e-05, "learning_rate": 0.00970383416220345, "loss": 2.903, "step": 1647 }, { "crossentropy": 2.81594717502594, "epoch": 0.08961635716033606, "grad_norm": 0.06093398854136467, "grad_norm_var": 2.5081263020322078e-05, "learning_rate": 0.009703411832332988, "loss": 2.8159, "step": 1648 }, { "crossentropy": 2.8206206560134888, "epoch": 0.08967073601783626, "grad_norm": 0.06506194919347763, "grad_norm_var": 3.33914194108581e-05, "learning_rate": 0.009702989210761306, "loss": 2.8206, "step": 1649 }, { "crossentropy": 2.914011240005493, "epoch": 0.08972511487533646, "grad_norm": 0.07453739643096924, "grad_norm_var": 6.039400406118531e-05, "learning_rate": 0.00970256629751462, "loss": 2.914, "step": 1650 }, { "crossentropy": 2.889344334602356, "epoch": 0.08977949373283667, "grad_norm": 0.06758254021406174, "grad_norm_var": 6.617766914042741e-05, "learning_rate": 0.009702143092619155, "loss": 2.8893, "step": 1651 }, { "crossentropy": 2.756690263748169, "epoch": 0.08983387259033687, "grad_norm": 0.04898671805858612, "grad_norm_var": 6.630970327249168e-05, "learning_rate": 0.009701719596101162, "loss": 2.7567, "step": 1652 }, { "crossentropy": 2.7977795600891113, "epoch": 0.08988825144783708, "grad_norm": 0.05466704070568085, "grad_norm_var": 6.624030668519627e-05, "learning_rate": 0.0097012958079869, "loss": 2.7978, "step": 1653 }, { "crossentropy": 2.779194474220276, "epoch": 0.08994263030533728, "grad_norm": 0.05203116312623024, "grad_norm_var": 6.506447607718471e-05, "learning_rate": 0.009700871728302658, "loss": 2.7792, "step": 1654 }, { "crossentropy": 2.875272274017334, "epoch": 0.08999700916283748, "grad_norm": 0.05476108938455582, "grad_norm_var": 6.348628989521487e-05, "learning_rate": 0.00970044735707473, "loss": 2.8753, "step": 1655 }, { "crossentropy": 2.880563974380493, "epoch": 0.09005138802033769, "grad_norm": 0.054166849702596664, "grad_norm_var": 5.974067888994631e-05, "learning_rate": 0.009700022694329444, "loss": 2.8806, "step": 1656 }, { "crossentropy": 2.768418312072754, "epoch": 0.0901057668778379, "grad_norm": 0.0479884073138237, "grad_norm_var": 6.21172387901112e-05, "learning_rate": 0.009699597740093128, "loss": 2.7684, "step": 1657 }, { "crossentropy": 2.7650299072265625, "epoch": 0.09016014573533811, "grad_norm": 0.04691300168633461, "grad_norm_var": 6.531010035250393e-05, "learning_rate": 0.009699172494392144, "loss": 2.765, "step": 1658 }, { "crossentropy": 2.736670732498169, "epoch": 0.09021452459283831, "grad_norm": 0.047714266926050186, "grad_norm_var": 6.854525851623793e-05, "learning_rate": 0.009698746957252863, "loss": 2.7367, "step": 1659 }, { "crossentropy": 2.795124053955078, "epoch": 0.09026890345033851, "grad_norm": 0.04991744086146355, "grad_norm_var": 6.697869754150667e-05, "learning_rate": 0.009698321128701675, "loss": 2.7951, "step": 1660 }, { "crossentropy": 2.815779447555542, "epoch": 0.09032328230783872, "grad_norm": 0.04955606162548065, "grad_norm_var": 6.678815640420022e-05, "learning_rate": 0.009697895008764992, "loss": 2.8158, "step": 1661 }, { "crossentropy": 2.806054949760437, "epoch": 0.09037766116533892, "grad_norm": 0.05006379261612892, "grad_norm_var": 6.494796301081725e-05, "learning_rate": 0.00969746859746924, "loss": 2.8061, "step": 1662 }, { "crossentropy": 2.6984288692474365, "epoch": 0.09043204002283912, "grad_norm": 0.05114574730396271, "grad_norm_var": 6.557791268612707e-05, "learning_rate": 0.009697041894840866, "loss": 2.6984, "step": 1663 }, { "crossentropy": 2.8244248628616333, "epoch": 0.09048641888033933, "grad_norm": 0.04928571358323097, "grad_norm_var": 6.445634874868196e-05, "learning_rate": 0.009696614900906331, "loss": 2.8244, "step": 1664 }, { "crossentropy": 2.8590433597564697, "epoch": 0.09054079773783953, "grad_norm": 0.059138648211956024, "grad_norm_var": 5.7931474801995306e-05, "learning_rate": 0.009696187615692119, "loss": 2.859, "step": 1665 }, { "crossentropy": 2.8727071285247803, "epoch": 0.09059517659533974, "grad_norm": 0.05649294704198837, "grad_norm_var": 2.8036469403068936e-05, "learning_rate": 0.00969576003922473, "loss": 2.8727, "step": 1666 }, { "crossentropy": 2.870721459388733, "epoch": 0.09064955545283994, "grad_norm": 0.05736904218792915, "grad_norm_var": 1.405180916933535e-05, "learning_rate": 0.00969533217153068, "loss": 2.8707, "step": 1667 }, { "crossentropy": 2.873858332633972, "epoch": 0.09070393431034014, "grad_norm": 0.05591116100549698, "grad_norm_var": 1.4370499772101836e-05, "learning_rate": 0.009694904012636507, "loss": 2.8739, "step": 1668 }, { "crossentropy": 2.811476469039917, "epoch": 0.09075831316784035, "grad_norm": 0.057314857840538025, "grad_norm_var": 1.5637235491897798e-05, "learning_rate": 0.009694475562568763, "loss": 2.8115, "step": 1669 }, { "crossentropy": 2.8012970685958862, "epoch": 0.09081269202534055, "grad_norm": 0.05733868107199669, "grad_norm_var": 1.707622763462979e-05, "learning_rate": 0.009694046821354023, "loss": 2.8013, "step": 1670 }, { "crossentropy": 2.7023032903671265, "epoch": 0.09086707088284075, "grad_norm": 0.05179508402943611, "grad_norm_var": 1.6857369025292425e-05, "learning_rate": 0.009693617789018876, "loss": 2.7023, "step": 1671 }, { "crossentropy": 2.6719053983688354, "epoch": 0.09092144974034096, "grad_norm": 0.047476932406425476, "grad_norm_var": 1.8285470547910188e-05, "learning_rate": 0.009693188465589928, "loss": 2.6719, "step": 1672 }, { "crossentropy": 2.7771881818771362, "epoch": 0.09097582859784116, "grad_norm": 0.05706935375928879, "grad_norm_var": 1.832329493360041e-05, "learning_rate": 0.009692758851093808, "loss": 2.7772, "step": 1673 }, { "crossentropy": 2.626819372177124, "epoch": 0.09103020745534136, "grad_norm": 0.05416644737124443, "grad_norm_var": 1.5936073376785626e-05, "learning_rate": 0.009692328945557157, "loss": 2.6268, "step": 1674 }, { "crossentropy": 2.775224804878235, "epoch": 0.09108458631284157, "grad_norm": 0.05699647217988968, "grad_norm_var": 1.4488717086388823e-05, "learning_rate": 0.00969189874900664, "loss": 2.7752, "step": 1675 }, { "crossentropy": 2.68954861164093, "epoch": 0.09113896517034177, "grad_norm": 0.06681185960769653, "grad_norm_var": 2.3548181783466144e-05, "learning_rate": 0.009691468261468938, "loss": 2.6895, "step": 1676 }, { "crossentropy": 2.7804473638534546, "epoch": 0.09119334402784197, "grad_norm": 0.05656377598643303, "grad_norm_var": 2.165154633690791e-05, "learning_rate": 0.009691037482970748, "loss": 2.7804, "step": 1677 }, { "crossentropy": 2.7996522188186646, "epoch": 0.09124772288534218, "grad_norm": 0.054977770894765854, "grad_norm_var": 1.972424321858192e-05, "learning_rate": 0.009690606413538787, "loss": 2.7997, "step": 1678 }, { "crossentropy": 2.706040620803833, "epoch": 0.09130210174284238, "grad_norm": 0.05547996610403061, "grad_norm_var": 1.831504794670835e-05, "learning_rate": 0.009690175053199789, "loss": 2.706, "step": 1679 }, { "crossentropy": 2.786052107810974, "epoch": 0.09135648060034258, "grad_norm": 0.05327170342206955, "grad_norm_var": 1.579980956337527e-05, "learning_rate": 0.009689743401980507, "loss": 2.7861, "step": 1680 }, { "crossentropy": 2.816587209701538, "epoch": 0.09141085945784279, "grad_norm": 0.05246156081557274, "grad_norm_var": 1.591301330387257e-05, "learning_rate": 0.00968931145990771, "loss": 2.8166, "step": 1681 }, { "crossentropy": 2.823183298110962, "epoch": 0.09146523831534299, "grad_norm": 0.04949123039841652, "grad_norm_var": 1.825411532577666e-05, "learning_rate": 0.009688879227008191, "loss": 2.8232, "step": 1682 }, { "crossentropy": 2.864630103111267, "epoch": 0.0915196171728432, "grad_norm": 0.050930820405483246, "grad_norm_var": 1.905234148419684e-05, "learning_rate": 0.009688446703308751, "loss": 2.8646, "step": 1683 }, { "crossentropy": 2.792740821838379, "epoch": 0.0915739960303434, "grad_norm": 0.04931843653321266, "grad_norm_var": 2.086119470090057e-05, "learning_rate": 0.009688013888836219, "loss": 2.7927, "step": 1684 }, { "crossentropy": 2.7968865633010864, "epoch": 0.0916283748878436, "grad_norm": 0.048549044877290726, "grad_norm_var": 2.233464199904855e-05, "learning_rate": 0.009687580783617436, "loss": 2.7969, "step": 1685 }, { "crossentropy": 2.8043864965438843, "epoch": 0.0916827537453438, "grad_norm": 0.049091704189777374, "grad_norm_var": 2.2824826452035504e-05, "learning_rate": 0.009687147387679262, "loss": 2.8044, "step": 1686 }, { "crossentropy": 2.798310160636902, "epoch": 0.09173713260284401, "grad_norm": 0.050888676196336746, "grad_norm_var": 2.3070530028275933e-05, "learning_rate": 0.009686713701048578, "loss": 2.7983, "step": 1687 }, { "crossentropy": 2.890635848045349, "epoch": 0.09179151146034421, "grad_norm": 0.054488129913806915, "grad_norm_var": 2.0655706778808376e-05, "learning_rate": 0.009686279723752281, "loss": 2.8906, "step": 1688 }, { "crossentropy": 2.875780940055847, "epoch": 0.09184589031784442, "grad_norm": 0.054844316095113754, "grad_norm_var": 1.999070003735424e-05, "learning_rate": 0.009685845455817282, "loss": 2.8758, "step": 1689 }, { "crossentropy": 2.8012114763259888, "epoch": 0.09190026917534462, "grad_norm": 0.05133819580078125, "grad_norm_var": 2.029428088018703e-05, "learning_rate": 0.009685410897270517, "loss": 2.8012, "step": 1690 }, { "crossentropy": 2.89277982711792, "epoch": 0.09195464803284482, "grad_norm": 0.048014722764492035, "grad_norm_var": 2.1111861772715734e-05, "learning_rate": 0.009684976048138935, "loss": 2.8928, "step": 1691 }, { "crossentropy": 2.761112689971924, "epoch": 0.09200902689034503, "grad_norm": 0.054412029683589935, "grad_norm_var": 7.733570022847719e-06, "learning_rate": 0.00968454090844951, "loss": 2.7611, "step": 1692 }, { "crossentropy": 2.7259790897369385, "epoch": 0.09206340574784523, "grad_norm": 0.07031150907278061, "grad_norm_var": 2.76685081737143e-05, "learning_rate": 0.009684105478229224, "loss": 2.726, "step": 1693 }, { "crossentropy": 2.8627140522003174, "epoch": 0.09211778460534545, "grad_norm": 0.05753017216920853, "grad_norm_var": 2.8751524507535562e-05, "learning_rate": 0.009683669757505082, "loss": 2.8627, "step": 1694 }, { "crossentropy": 2.782580852508545, "epoch": 0.09217216346284565, "grad_norm": 0.05413956567645073, "grad_norm_var": 2.8447653094307242e-05, "learning_rate": 0.009683233746304108, "loss": 2.7826, "step": 1695 }, { "crossentropy": 2.8259445428848267, "epoch": 0.09222654232034586, "grad_norm": 0.05045616254210472, "grad_norm_var": 2.8866491075322382e-05, "learning_rate": 0.009682797444653344, "loss": 2.8259, "step": 1696 }, { "crossentropy": 2.8689048290252686, "epoch": 0.09228092117784606, "grad_norm": 0.057496748864650726, "grad_norm_var": 3.0162322159319777e-05, "learning_rate": 0.009682360852579848, "loss": 2.8689, "step": 1697 }, { "crossentropy": 2.7862573862075806, "epoch": 0.09233530003534626, "grad_norm": 0.058169543743133545, "grad_norm_var": 3.057060579721808e-05, "learning_rate": 0.009681923970110697, "loss": 2.7863, "step": 1698 }, { "crossentropy": 2.8869540691375732, "epoch": 0.09238967889284647, "grad_norm": 0.055824022740125656, "grad_norm_var": 3.0228585856654997e-05, "learning_rate": 0.009681486797272987, "loss": 2.887, "step": 1699 }, { "crossentropy": 2.8550649881362915, "epoch": 0.09244405775034667, "grad_norm": 0.0515267476439476, "grad_norm_var": 2.913886413553404e-05, "learning_rate": 0.00968104933409383, "loss": 2.8551, "step": 1700 }, { "crossentropy": 2.7288402318954468, "epoch": 0.09249843660784687, "grad_norm": 0.05525725707411766, "grad_norm_var": 2.6903632910258805e-05, "learning_rate": 0.009680611580600358, "loss": 2.7288, "step": 1701 }, { "crossentropy": 2.6781636476516724, "epoch": 0.09255281546534708, "grad_norm": 0.05616280809044838, "grad_norm_var": 2.4824200512688482e-05, "learning_rate": 0.00968017353681972, "loss": 2.6782, "step": 1702 }, { "crossentropy": 2.8464890718460083, "epoch": 0.09260719432284728, "grad_norm": 0.059412624686956406, "grad_norm_var": 2.4631546828280243e-05, "learning_rate": 0.00967973520277908, "loss": 2.8465, "step": 1703 }, { "crossentropy": 2.8127994537353516, "epoch": 0.09266157318034748, "grad_norm": 0.0485215038061142, "grad_norm_var": 2.7730422180181382e-05, "learning_rate": 0.009679296578505628, "loss": 2.8128, "step": 1704 }, { "crossentropy": 2.8009207248687744, "epoch": 0.09271595203784769, "grad_norm": 0.04962156340479851, "grad_norm_var": 2.969241530088821e-05, "learning_rate": 0.009678857664026564, "loss": 2.8009, "step": 1705 }, { "crossentropy": 2.7592201232910156, "epoch": 0.09277033089534789, "grad_norm": 0.04687638580799103, "grad_norm_var": 3.3047979736576004e-05, "learning_rate": 0.00967841845936911, "loss": 2.7592, "step": 1706 }, { "crossentropy": 2.740188956260681, "epoch": 0.0928247097528481, "grad_norm": 0.05378790944814682, "grad_norm_var": 3.0055597395626554e-05, "learning_rate": 0.009677978964560505, "loss": 2.7402, "step": 1707 }, { "crossentropy": 2.8307039737701416, "epoch": 0.0928790886103483, "grad_norm": 0.059964846819639206, "grad_norm_var": 3.157022308398668e-05, "learning_rate": 0.009677539179628006, "loss": 2.8307, "step": 1708 }, { "crossentropy": 2.728952407836914, "epoch": 0.0929334674678485, "grad_norm": 0.06556438654661179, "grad_norm_var": 2.348740220790291e-05, "learning_rate": 0.009677099104598887, "loss": 2.729, "step": 1709 }, { "crossentropy": 2.833549737930298, "epoch": 0.0929878463253487, "grad_norm": 0.059534139931201935, "grad_norm_var": 2.4409231784774943e-05, "learning_rate": 0.009676658739500443, "loss": 2.8335, "step": 1710 }, { "crossentropy": 2.7207202911376953, "epoch": 0.09304222518284891, "grad_norm": 0.06065579876303673, "grad_norm_var": 2.6189715483467455e-05, "learning_rate": 0.009676218084359983, "loss": 2.7207, "step": 1711 }, { "crossentropy": 2.759713649749756, "epoch": 0.09309660404034911, "grad_norm": 0.054664235562086105, "grad_norm_var": 2.4437287965649776e-05, "learning_rate": 0.009675777139204837, "loss": 2.7597, "step": 1712 }, { "crossentropy": 2.7232621908187866, "epoch": 0.09315098289784932, "grad_norm": 0.06131504476070404, "grad_norm_var": 2.6204671436650533e-05, "learning_rate": 0.009675335904062353, "loss": 2.7233, "step": 1713 }, { "crossentropy": 2.8420937061309814, "epoch": 0.09320536175534952, "grad_norm": 0.05357611924409866, "grad_norm_var": 2.6227516170476693e-05, "learning_rate": 0.009674894378959895, "loss": 2.8421, "step": 1714 }, { "crossentropy": 2.7562334537506104, "epoch": 0.09325974061284972, "grad_norm": 0.04983758181333542, "grad_norm_var": 2.842151371695665e-05, "learning_rate": 0.009674452563924844, "loss": 2.7562, "step": 1715 }, { "crossentropy": 2.8011972904205322, "epoch": 0.09331411947034993, "grad_norm": 0.04829499498009682, "grad_norm_var": 3.074000367405475e-05, "learning_rate": 0.009674010458984606, "loss": 2.8012, "step": 1716 }, { "crossentropy": 2.6987547874450684, "epoch": 0.09336849832785013, "grad_norm": 0.046733956784009933, "grad_norm_var": 3.5204496982386285e-05, "learning_rate": 0.009673568064166596, "loss": 2.6988, "step": 1717 }, { "crossentropy": 2.792245626449585, "epoch": 0.09342287718535033, "grad_norm": 0.05232008546590805, "grad_norm_var": 3.535626501418813e-05, "learning_rate": 0.009673125379498252, "loss": 2.7922, "step": 1718 }, { "crossentropy": 2.812083125114441, "epoch": 0.09347725604285054, "grad_norm": 0.046969763934612274, "grad_norm_var": 3.6745784002101034e-05, "learning_rate": 0.009672682405007026, "loss": 2.8121, "step": 1719 }, { "crossentropy": 2.7249146699905396, "epoch": 0.09353163490035074, "grad_norm": 0.046102944761514664, "grad_norm_var": 3.876192402596229e-05, "learning_rate": 0.009672239140720396, "loss": 2.7249, "step": 1720 }, { "crossentropy": 2.8195377588272095, "epoch": 0.09358601375785094, "grad_norm": 0.048853348940610886, "grad_norm_var": 3.91949175587639e-05, "learning_rate": 0.00967179558666585, "loss": 2.8195, "step": 1721 }, { "crossentropy": 2.8102258443832397, "epoch": 0.09364039261535115, "grad_norm": 0.04874482750892639, "grad_norm_var": 3.777776590880247e-05, "learning_rate": 0.009671351742870897, "loss": 2.8102, "step": 1722 }, { "crossentropy": 2.7988606691360474, "epoch": 0.09369477147285135, "grad_norm": 0.05176515877246857, "grad_norm_var": 3.797134422999221e-05, "learning_rate": 0.009670907609363065, "loss": 2.7989, "step": 1723 }, { "crossentropy": 2.872697591781616, "epoch": 0.09374915033035155, "grad_norm": 0.060904957354068756, "grad_norm_var": 3.884557764320025e-05, "learning_rate": 0.009670463186169896, "loss": 2.8727, "step": 1724 }, { "crossentropy": 2.7475814819335938, "epoch": 0.09380352918785176, "grad_norm": 0.06509105861186981, "grad_norm_var": 3.8097550370017746e-05, "learning_rate": 0.009670018473318956, "loss": 2.7476, "step": 1725 }, { "crossentropy": 2.867823839187622, "epoch": 0.09385790804535196, "grad_norm": 0.06911280751228333, "grad_norm_var": 5.1589281547301244e-05, "learning_rate": 0.009669573470837824, "loss": 2.8678, "step": 1726 }, { "crossentropy": 2.6358367204666138, "epoch": 0.09391228690285217, "grad_norm": 0.05572453513741493, "grad_norm_var": 4.877165538706221e-05, "learning_rate": 0.009669128178754099, "loss": 2.6358, "step": 1727 }, { "crossentropy": 2.755694031715393, "epoch": 0.09396666576035237, "grad_norm": 0.0520213283598423, "grad_norm_var": 4.888630153369507e-05, "learning_rate": 0.009668682597095395, "loss": 2.7557, "step": 1728 }, { "crossentropy": 2.940595865249634, "epoch": 0.09402104461785257, "grad_norm": 0.05497172102332115, "grad_norm_var": 4.4863721152216204e-05, "learning_rate": 0.009668236725889352, "loss": 2.9406, "step": 1729 }, { "crossentropy": 2.803196668624878, "epoch": 0.09407542347535278, "grad_norm": 0.059724535793066025, "grad_norm_var": 4.754370534758486e-05, "learning_rate": 0.009667790565163618, "loss": 2.8032, "step": 1730 }, { "crossentropy": 2.780025362968445, "epoch": 0.094129802332853, "grad_norm": 0.0595095194876194, "grad_norm_var": 4.8572737842788165e-05, "learning_rate": 0.009667344114945865, "loss": 2.78, "step": 1731 }, { "crossentropy": 2.722377061843872, "epoch": 0.0941841811903532, "grad_norm": 0.058352991938591, "grad_norm_var": 4.700615054041822e-05, "learning_rate": 0.00966689737526378, "loss": 2.7224, "step": 1732 }, { "crossentropy": 2.8386130332946777, "epoch": 0.0942385600478534, "grad_norm": 0.060826774686574936, "grad_norm_var": 4.425052330347378e-05, "learning_rate": 0.009666450346145073, "loss": 2.8386, "step": 1733 }, { "crossentropy": 2.7435059547424316, "epoch": 0.0942929389053536, "grad_norm": 0.04946010559797287, "grad_norm_var": 4.604575270654327e-05, "learning_rate": 0.009666003027617465, "loss": 2.7435, "step": 1734 }, { "crossentropy": 2.8984854221343994, "epoch": 0.09434731776285381, "grad_norm": 0.05065309256315231, "grad_norm_var": 4.270021012190405e-05, "learning_rate": 0.009665555419708701, "loss": 2.8985, "step": 1735 }, { "crossentropy": 2.751681923866272, "epoch": 0.09440169662035401, "grad_norm": 0.05271737277507782, "grad_norm_var": 3.693659698893453e-05, "learning_rate": 0.009665107522446537, "loss": 2.7517, "step": 1736 }, { "crossentropy": 2.7765839099884033, "epoch": 0.09445607547785421, "grad_norm": 0.05720099061727524, "grad_norm_var": 3.316810699192559e-05, "learning_rate": 0.009664659335858755, "loss": 2.7766, "step": 1737 }, { "crossentropy": 2.7292507886886597, "epoch": 0.09451045433535442, "grad_norm": 0.05238810181617737, "grad_norm_var": 3.0146011552398265e-05, "learning_rate": 0.009664210859973151, "loss": 2.7293, "step": 1738 }, { "crossentropy": 2.874269127845764, "epoch": 0.09456483319285462, "grad_norm": 0.061035674065351486, "grad_norm_var": 2.916846298625138e-05, "learning_rate": 0.009663762094817535, "loss": 2.8743, "step": 1739 }, { "crossentropy": 2.7992258071899414, "epoch": 0.09461921205035483, "grad_norm": 0.053031276911497116, "grad_norm_var": 2.9448559522333532e-05, "learning_rate": 0.009663313040419744, "loss": 2.7992, "step": 1740 }, { "crossentropy": 2.8014657497406006, "epoch": 0.09467359090785503, "grad_norm": 0.0474003441631794, "grad_norm_var": 2.9897506729848327e-05, "learning_rate": 0.009662863696807624, "loss": 2.8015, "step": 1741 }, { "crossentropy": 2.7658143043518066, "epoch": 0.09472796976535523, "grad_norm": 0.049697183072566986, "grad_norm_var": 1.9209761336979637e-05, "learning_rate": 0.009662414064009047, "loss": 2.7658, "step": 1742 }, { "crossentropy": 2.771151900291443, "epoch": 0.09478234862285544, "grad_norm": 0.04987055063247681, "grad_norm_var": 2.052826741883613e-05, "learning_rate": 0.009661964142051895, "loss": 2.7712, "step": 1743 }, { "crossentropy": 2.8095537424087524, "epoch": 0.09483672748035564, "grad_norm": 0.047863755375146866, "grad_norm_var": 2.2873904515642815e-05, "learning_rate": 0.009661513930964075, "loss": 2.8096, "step": 1744 }, { "crossentropy": 2.7945111989974976, "epoch": 0.09489110633785584, "grad_norm": 0.05479374900460243, "grad_norm_var": 2.2853869677936602e-05, "learning_rate": 0.009661063430773505, "loss": 2.7945, "step": 1745 }, { "crossentropy": 2.772676706314087, "epoch": 0.09494548519535605, "grad_norm": 0.05185467004776001, "grad_norm_var": 2.075244756140019e-05, "learning_rate": 0.009660612641508127, "loss": 2.7727, "step": 1746 }, { "crossentropy": 2.8108381032943726, "epoch": 0.09499986405285625, "grad_norm": 0.047888319939374924, "grad_norm_var": 1.9945048992034364e-05, "learning_rate": 0.0096601615631959, "loss": 2.8108, "step": 1747 }, { "crossentropy": 2.846801280975342, "epoch": 0.09505424291035645, "grad_norm": 0.04636041074991226, "grad_norm_var": 2.0078110788222746e-05, "learning_rate": 0.009659710195864795, "loss": 2.8468, "step": 1748 }, { "crossentropy": 2.81865131855011, "epoch": 0.09510862176785666, "grad_norm": 0.04889548569917679, "grad_norm_var": 1.503700641106905e-05, "learning_rate": 0.009659258539542808, "loss": 2.8187, "step": 1749 }, { "crossentropy": 2.9294790029525757, "epoch": 0.09516300062535686, "grad_norm": 0.04708269238471985, "grad_norm_var": 1.5979650602774424e-05, "learning_rate": 0.009658806594257953, "loss": 2.9295, "step": 1750 }, { "crossentropy": 2.835949420928955, "epoch": 0.09521737948285706, "grad_norm": 0.048463232815265656, "grad_norm_var": 1.6430545223424196e-05, "learning_rate": 0.009658354360038254, "loss": 2.8359, "step": 1751 }, { "crossentropy": 2.737813353538513, "epoch": 0.09527175834035727, "grad_norm": 0.06341466307640076, "grad_norm_var": 2.5983567026236638e-05, "learning_rate": 0.00965790183691176, "loss": 2.7378, "step": 1752 }, { "crossentropy": 2.8549681901931763, "epoch": 0.09532613719785747, "grad_norm": 0.04968925192952156, "grad_norm_var": 2.400317791325466e-05, "learning_rate": 0.009657449024906537, "loss": 2.855, "step": 1753 }, { "crossentropy": 2.7497905492782593, "epoch": 0.09538051605535768, "grad_norm": 0.05021553114056587, "grad_norm_var": 2.396360114722158e-05, "learning_rate": 0.009656995924050669, "loss": 2.7498, "step": 1754 }, { "crossentropy": 2.8506247997283936, "epoch": 0.09543489491285788, "grad_norm": 0.048284053802490234, "grad_norm_var": 1.7228955951444948e-05, "learning_rate": 0.009656542534372255, "loss": 2.8506, "step": 1755 }, { "crossentropy": 2.7191120386123657, "epoch": 0.09548927377035808, "grad_norm": 0.04910937324166298, "grad_norm_var": 1.6762217310176997e-05, "learning_rate": 0.009656088855899412, "loss": 2.7191, "step": 1756 }, { "crossentropy": 2.8134357929229736, "epoch": 0.09554365262785829, "grad_norm": 0.05257868766784668, "grad_norm_var": 1.660513294268658e-05, "learning_rate": 0.009655634888660282, "loss": 2.8134, "step": 1757 }, { "crossentropy": 2.794527292251587, "epoch": 0.09559803148535849, "grad_norm": 0.05419006571173668, "grad_norm_var": 1.7458403929973035e-05, "learning_rate": 0.009655180632683016, "loss": 2.7945, "step": 1758 }, { "crossentropy": 2.8136500120162964, "epoch": 0.09565241034285869, "grad_norm": 0.053099799901247025, "grad_norm_var": 1.7770394791503248e-05, "learning_rate": 0.009654726087995787, "loss": 2.8137, "step": 1759 }, { "crossentropy": 2.722229480743408, "epoch": 0.0957067892003589, "grad_norm": 0.05933433026075363, "grad_norm_var": 2.140901935856748e-05, "learning_rate": 0.009654271254626786, "loss": 2.7222, "step": 1760 }, { "crossentropy": 2.8044538497924805, "epoch": 0.0957611680578591, "grad_norm": 0.06418539583683014, "grad_norm_var": 3.09480381865695e-05, "learning_rate": 0.00965381613260422, "loss": 2.8045, "step": 1761 }, { "crossentropy": 2.8546929359436035, "epoch": 0.0958155469153593, "grad_norm": 0.053653739392757416, "grad_norm_var": 3.1075798745981245e-05, "learning_rate": 0.009653360721956317, "loss": 2.8547, "step": 1762 }, { "crossentropy": 2.7128487825393677, "epoch": 0.09586992577285951, "grad_norm": 0.050532810389995575, "grad_norm_var": 2.996515162763154e-05, "learning_rate": 0.009652905022711319, "loss": 2.7128, "step": 1763 }, { "crossentropy": 2.774527072906494, "epoch": 0.09592430463035971, "grad_norm": 0.04844684153795242, "grad_norm_var": 2.854507952187394e-05, "learning_rate": 0.00965244903489749, "loss": 2.7745, "step": 1764 }, { "crossentropy": 2.8872413635253906, "epoch": 0.09597868348785991, "grad_norm": 0.05057397484779358, "grad_norm_var": 2.789802873915109e-05, "learning_rate": 0.009651992758543112, "loss": 2.8872, "step": 1765 }, { "crossentropy": 2.7507225275039673, "epoch": 0.09603306234536012, "grad_norm": 0.04794019088149071, "grad_norm_var": 2.730421012221614e-05, "learning_rate": 0.009651536193676476, "loss": 2.7507, "step": 1766 }, { "crossentropy": 2.7839205265045166, "epoch": 0.09608744120286032, "grad_norm": 0.04624839872121811, "grad_norm_var": 2.887141700211223e-05, "learning_rate": 0.009651079340325903, "loss": 2.7839, "step": 1767 }, { "crossentropy": 2.845107913017273, "epoch": 0.09614182006036052, "grad_norm": 0.04743890091776848, "grad_norm_var": 2.1772948579493133e-05, "learning_rate": 0.009650622198519728, "loss": 2.8451, "step": 1768 }, { "crossentropy": 2.754263401031494, "epoch": 0.09619619891786074, "grad_norm": 0.04582292214035988, "grad_norm_var": 2.3689706984824677e-05, "learning_rate": 0.009650164768286298, "loss": 2.7543, "step": 1769 }, { "crossentropy": 2.8076107501983643, "epoch": 0.09625057777536095, "grad_norm": 0.0549427792429924, "grad_norm_var": 2.436916391248267e-05, "learning_rate": 0.009649707049653984, "loss": 2.8076, "step": 1770 }, { "crossentropy": 2.764182448387146, "epoch": 0.09630495663286115, "grad_norm": 0.0490960031747818, "grad_norm_var": 2.4046090723067737e-05, "learning_rate": 0.009649249042651175, "loss": 2.7642, "step": 1771 }, { "crossentropy": 2.8030630350112915, "epoch": 0.09635933549036135, "grad_norm": 0.07133092731237411, "grad_norm_var": 4.723380343428455e-05, "learning_rate": 0.009648790747306273, "loss": 2.8031, "step": 1772 }, { "crossentropy": 2.828161358833313, "epoch": 0.09641371434786156, "grad_norm": 0.05395901948213577, "grad_norm_var": 4.725906032430147e-05, "learning_rate": 0.009648332163647706, "loss": 2.8282, "step": 1773 }, { "crossentropy": 2.821336507797241, "epoch": 0.09646809320536176, "grad_norm": 0.04971258342266083, "grad_norm_var": 4.7905913442069145e-05, "learning_rate": 0.009647873291703909, "loss": 2.8213, "step": 1774 }, { "crossentropy": 2.789875030517578, "epoch": 0.09652247206286196, "grad_norm": 0.05271347984671593, "grad_norm_var": 4.790468758238571e-05, "learning_rate": 0.009647414131503345, "loss": 2.7899, "step": 1775 }, { "crossentropy": 2.7551770210266113, "epoch": 0.09657685092036217, "grad_norm": 0.06162187457084656, "grad_norm_var": 5.020316576503185e-05, "learning_rate": 0.009646954683074489, "loss": 2.7552, "step": 1776 }, { "crossentropy": 2.832037091255188, "epoch": 0.09663122977786237, "grad_norm": 0.05084842070937157, "grad_norm_var": 4.145420141583999e-05, "learning_rate": 0.009646494946445837, "loss": 2.832, "step": 1777 }, { "crossentropy": 2.715542793273926, "epoch": 0.09668560863536257, "grad_norm": 0.05242261663079262, "grad_norm_var": 4.130704589710964e-05, "learning_rate": 0.009646034921645898, "loss": 2.7155, "step": 1778 }, { "crossentropy": 2.775670647621155, "epoch": 0.09673998749286278, "grad_norm": 0.04950670152902603, "grad_norm_var": 4.1587708852036435e-05, "learning_rate": 0.009645574608703206, "loss": 2.7757, "step": 1779 }, { "crossentropy": 2.7955867052078247, "epoch": 0.09679436635036298, "grad_norm": 0.046539679169654846, "grad_norm_var": 4.2728508030614086e-05, "learning_rate": 0.009645114007646307, "loss": 2.7956, "step": 1780 }, { "crossentropy": 2.872834324836731, "epoch": 0.09684874520786318, "grad_norm": 0.051924850791692734, "grad_norm_var": 4.2600137605434905e-05, "learning_rate": 0.00964465311850377, "loss": 2.8728, "step": 1781 }, { "crossentropy": 2.772592544555664, "epoch": 0.09690312406536339, "grad_norm": 0.0520038977265358, "grad_norm_var": 4.1430179706223455e-05, "learning_rate": 0.009644191941304175, "loss": 2.7726, "step": 1782 }, { "crossentropy": 2.631656527519226, "epoch": 0.09695750292286359, "grad_norm": 0.05951991677284241, "grad_norm_var": 4.1803740815417785e-05, "learning_rate": 0.009643730476076126, "loss": 2.6317, "step": 1783 }, { "crossentropy": 2.7780168056488037, "epoch": 0.0970118817803638, "grad_norm": 0.05435341224074364, "grad_norm_var": 3.9583991158347505e-05, "learning_rate": 0.009643268722848241, "loss": 2.778, "step": 1784 }, { "crossentropy": 2.790835380554199, "epoch": 0.097066260637864, "grad_norm": 0.05050176382064819, "grad_norm_var": 3.615046148307954e-05, "learning_rate": 0.009642806681649161, "loss": 2.7908, "step": 1785 }, { "crossentropy": 2.8429617881774902, "epoch": 0.0971206394953642, "grad_norm": 0.04518480598926544, "grad_norm_var": 4.0630855878827773e-05, "learning_rate": 0.009642344352507538, "loss": 2.843, "step": 1786 }, { "crossentropy": 2.633596897125244, "epoch": 0.0971750183528644, "grad_norm": 0.05080467090010643, "grad_norm_var": 3.987777632065241e-05, "learning_rate": 0.009641881735452048, "loss": 2.6336, "step": 1787 }, { "crossentropy": 2.739920735359192, "epoch": 0.09722939721036461, "grad_norm": 0.05140996724367142, "grad_norm_var": 1.6812788144809908e-05, "learning_rate": 0.009641418830511377, "loss": 2.7399, "step": 1788 }, { "crossentropy": 2.7524677515029907, "epoch": 0.09728377606786481, "grad_norm": 0.04792977124452591, "grad_norm_var": 1.7561556007456015e-05, "learning_rate": 0.009640955637714239, "loss": 2.7525, "step": 1789 }, { "crossentropy": 2.7289657592773438, "epoch": 0.09733815492536502, "grad_norm": 0.052978139370679855, "grad_norm_var": 1.736819677304411e-05, "learning_rate": 0.00964049215708936, "loss": 2.729, "step": 1790 }, { "crossentropy": 2.7298126220703125, "epoch": 0.09739253378286522, "grad_norm": 0.052181873470544815, "grad_norm_var": 1.7327596837103145e-05, "learning_rate": 0.009640028388665482, "loss": 2.7298, "step": 1791 }, { "crossentropy": 2.913740396499634, "epoch": 0.09744691264036542, "grad_norm": 0.04840648174285889, "grad_norm_var": 1.1039031814511804e-05, "learning_rate": 0.009639564332471371, "loss": 2.9137, "step": 1792 }, { "crossentropy": 2.8101696968078613, "epoch": 0.09750129149786563, "grad_norm": 0.051356859505176544, "grad_norm_var": 1.1042722457185739e-05, "learning_rate": 0.009639099988535805, "loss": 2.8102, "step": 1793 }, { "crossentropy": 2.8531090021133423, "epoch": 0.09755567035536583, "grad_norm": 0.0541493222117424, "grad_norm_var": 1.1541837494406272e-05, "learning_rate": 0.009638635356887582, "loss": 2.8531, "step": 1794 }, { "crossentropy": 2.775728702545166, "epoch": 0.09761004921286603, "grad_norm": 0.05465330183506012, "grad_norm_var": 1.2054550953393729e-05, "learning_rate": 0.00963817043755552, "loss": 2.7757, "step": 1795 }, { "crossentropy": 2.815353274345398, "epoch": 0.09766442807036624, "grad_norm": 0.0940457433462143, "grad_norm_var": 0.00012172690581906069, "learning_rate": 0.00963770523056845, "loss": 2.8154, "step": 1796 }, { "crossentropy": 2.8541529178619385, "epoch": 0.09771880692786644, "grad_norm": 0.06003357842564583, "grad_norm_var": 0.00012309243526422055, "learning_rate": 0.009637239735955226, "loss": 2.8542, "step": 1797 }, { "crossentropy": 2.8477513790130615, "epoch": 0.09777318578536665, "grad_norm": 0.0581706278026104, "grad_norm_var": 0.0001230307337046467, "learning_rate": 0.009636773953744718, "loss": 2.8478, "step": 1798 }, { "crossentropy": 2.6062116622924805, "epoch": 0.09782756464286685, "grad_norm": 0.05201615393161774, "grad_norm_var": 0.00012238289564352813, "learning_rate": 0.00963630788396581, "loss": 2.6062, "step": 1799 }, { "crossentropy": 2.8250404596328735, "epoch": 0.09788194350036705, "grad_norm": 0.04775311425328255, "grad_norm_var": 0.0001255743658981114, "learning_rate": 0.00963584152664741, "loss": 2.825, "step": 1800 }, { "crossentropy": 2.852368950843811, "epoch": 0.09793632235786726, "grad_norm": 0.05061030387878418, "grad_norm_var": 0.0001255176230519785, "learning_rate": 0.009635374881818442, "loss": 2.8524, "step": 1801 }, { "crossentropy": 2.8408446311950684, "epoch": 0.09799070121536746, "grad_norm": 0.04933939874172211, "grad_norm_var": 0.00012144721710856584, "learning_rate": 0.009634907949507845, "loss": 2.8408, "step": 1802 }, { "crossentropy": 2.903504490852356, "epoch": 0.09804508007286766, "grad_norm": 0.047258540987968445, "grad_norm_var": 0.00012409382825569556, "learning_rate": 0.009634440729744578, "loss": 2.9035, "step": 1803 }, { "crossentropy": 2.816786289215088, "epoch": 0.09809945893036787, "grad_norm": 0.04906730353832245, "grad_norm_var": 0.0001254077439898577, "learning_rate": 0.009633973222557617, "loss": 2.8168, "step": 1804 }, { "crossentropy": 2.828432321548462, "epoch": 0.09815383778786807, "grad_norm": 0.052448615431785583, "grad_norm_var": 0.00012280252325808598, "learning_rate": 0.009633505427975957, "loss": 2.8284, "step": 1805 }, { "crossentropy": 2.769035220146179, "epoch": 0.09820821664536829, "grad_norm": 0.055100955069065094, "grad_norm_var": 0.0001226097360212647, "learning_rate": 0.00963303734602861, "loss": 2.769, "step": 1806 }, { "crossentropy": 2.8444936275482178, "epoch": 0.09826259550286849, "grad_norm": 0.04956435784697533, "grad_norm_var": 0.00012394714615852968, "learning_rate": 0.009632568976744606, "loss": 2.8445, "step": 1807 }, { "crossentropy": 2.682267904281616, "epoch": 0.0983169743603687, "grad_norm": 0.048254091292619705, "grad_norm_var": 0.000124074917777923, "learning_rate": 0.009632100320152994, "loss": 2.6823, "step": 1808 }, { "crossentropy": 2.767850875854492, "epoch": 0.0983713532178689, "grad_norm": 0.047616779804229736, "grad_norm_var": 0.00012657338806614464, "learning_rate": 0.009631631376282838, "loss": 2.7679, "step": 1809 }, { "crossentropy": 2.7905423641204834, "epoch": 0.0984257320753691, "grad_norm": 0.053867775946855545, "grad_norm_var": 0.00012658700699859916, "learning_rate": 0.009631162145163224, "loss": 2.7905, "step": 1810 }, { "crossentropy": 2.818086624145508, "epoch": 0.0984801109328693, "grad_norm": 0.05784064158797264, "grad_norm_var": 0.00012734552042813386, "learning_rate": 0.00963069262682325, "loss": 2.8181, "step": 1811 }, { "crossentropy": 2.7801597118377686, "epoch": 0.09853448979036951, "grad_norm": 0.053616564720869064, "grad_norm_var": 1.666219695043783e-05, "learning_rate": 0.009630222821292037, "loss": 2.7802, "step": 1812 }, { "crossentropy": 2.6845245361328125, "epoch": 0.09858886864786971, "grad_norm": 0.05614815652370453, "grad_norm_var": 1.34619760882854e-05, "learning_rate": 0.00962975272859872, "loss": 2.6845, "step": 1813 }, { "crossentropy": 2.853392004966736, "epoch": 0.09864324750536992, "grad_norm": 0.05677619203925133, "grad_norm_var": 1.2397575436499422e-05, "learning_rate": 0.009629282348772457, "loss": 2.8534, "step": 1814 }, { "crossentropy": 2.6689577102661133, "epoch": 0.09869762636287012, "grad_norm": 0.05440452694892883, "grad_norm_var": 1.2853203664067204e-05, "learning_rate": 0.009628811681842415, "loss": 2.669, "step": 1815 }, { "crossentropy": 2.7464189529418945, "epoch": 0.09875200522037032, "grad_norm": 0.05590160936117172, "grad_norm_var": 1.2547378606617837e-05, "learning_rate": 0.009628340727837791, "loss": 2.7464, "step": 1816 }, { "crossentropy": 2.7225382328033447, "epoch": 0.09880638407787053, "grad_norm": 0.056154537945985794, "grad_norm_var": 1.3172528136870937e-05, "learning_rate": 0.00962786948678779, "loss": 2.7225, "step": 1817 }, { "crossentropy": 2.763778805732727, "epoch": 0.09886076293537073, "grad_norm": 0.05704398825764656, "grad_norm_var": 1.34200222453205e-05, "learning_rate": 0.009627397958721638, "loss": 2.7638, "step": 1818 }, { "crossentropy": 2.9177403450012207, "epoch": 0.09891514179287093, "grad_norm": 0.06501185894012451, "grad_norm_var": 1.9074735572614335e-05, "learning_rate": 0.009626926143668578, "loss": 2.9177, "step": 1819 }, { "crossentropy": 2.7962642908096313, "epoch": 0.09896952065037114, "grad_norm": 0.05042863264679909, "grad_norm_var": 1.8240568338483085e-05, "learning_rate": 0.009626454041657871, "loss": 2.7963, "step": 1820 }, { "crossentropy": 2.7565062046051025, "epoch": 0.09902389950787134, "grad_norm": 0.04962906241416931, "grad_norm_var": 1.946585409407873e-05, "learning_rate": 0.0096259816527188, "loss": 2.7565, "step": 1821 }, { "crossentropy": 2.806275486946106, "epoch": 0.09907827836537154, "grad_norm": 0.057374995201826096, "grad_norm_var": 2.0059205187464986e-05, "learning_rate": 0.009625508976880658, "loss": 2.8063, "step": 1822 }, { "crossentropy": 2.800838589668274, "epoch": 0.09913265722287175, "grad_norm": 0.050673987716436386, "grad_norm_var": 1.9427808944725427e-05, "learning_rate": 0.009625036014172762, "loss": 2.8008, "step": 1823 }, { "crossentropy": 2.809632182121277, "epoch": 0.09918703608037195, "grad_norm": 0.052221961319446564, "grad_norm_var": 1.714896478675152e-05, "learning_rate": 0.009624562764624445, "loss": 2.8096, "step": 1824 }, { "crossentropy": 2.7744616270065308, "epoch": 0.09924141493787215, "grad_norm": 0.0559486486017704, "grad_norm_var": 1.3652789152246436e-05, "learning_rate": 0.009624089228265057, "loss": 2.7745, "step": 1825 }, { "crossentropy": 2.8310779333114624, "epoch": 0.09929579379537236, "grad_norm": 0.05469218268990517, "grad_norm_var": 1.3549905427257905e-05, "learning_rate": 0.009623615405123964, "loss": 2.8311, "step": 1826 }, { "crossentropy": 2.82760226726532, "epoch": 0.09935017265287256, "grad_norm": 0.047399114817380905, "grad_norm_var": 1.6745772439206534e-05, "learning_rate": 0.009623141295230555, "loss": 2.8276, "step": 1827 }, { "crossentropy": 2.765963554382324, "epoch": 0.09940455151037277, "grad_norm": 0.0718027800321579, "grad_norm_var": 3.5058629010153014e-05, "learning_rate": 0.009622666898614234, "loss": 2.766, "step": 1828 }, { "crossentropy": 2.839385509490967, "epoch": 0.09945893036787297, "grad_norm": 0.04746914282441139, "grad_norm_var": 3.9277666363331475e-05, "learning_rate": 0.009622192215304422, "loss": 2.8394, "step": 1829 }, { "crossentropy": 2.8352742195129395, "epoch": 0.09951330922537317, "grad_norm": 0.05113564431667328, "grad_norm_var": 4.006820120911417e-05, "learning_rate": 0.009621717245330556, "loss": 2.8353, "step": 1830 }, { "crossentropy": 2.7509093284606934, "epoch": 0.09956768808287338, "grad_norm": 0.056232601404190063, "grad_norm_var": 4.0173167958018095e-05, "learning_rate": 0.009621241988722098, "loss": 2.7509, "step": 1831 }, { "crossentropy": 2.7861016988754272, "epoch": 0.09962206694037358, "grad_norm": 0.05393685773015022, "grad_norm_var": 4.016384580003765e-05, "learning_rate": 0.009620766445508518, "loss": 2.7861, "step": 1832 }, { "crossentropy": 2.7444273233413696, "epoch": 0.09967644579787378, "grad_norm": 0.0520319864153862, "grad_norm_var": 4.049373670916238e-05, "learning_rate": 0.009620290615719312, "loss": 2.7444, "step": 1833 }, { "crossentropy": 2.756873369216919, "epoch": 0.09973082465537399, "grad_norm": 0.05259179323911667, "grad_norm_var": 4.026077916082689e-05, "learning_rate": 0.00961981449938399, "loss": 2.7569, "step": 1834 }, { "crossentropy": 2.777822732925415, "epoch": 0.09978520351287419, "grad_norm": 0.04907522723078728, "grad_norm_var": 3.3343782326891666e-05, "learning_rate": 0.009619338096532079, "loss": 2.7778, "step": 1835 }, { "crossentropy": 2.8019200563430786, "epoch": 0.0998395823703744, "grad_norm": 0.04858791083097458, "grad_norm_var": 3.425788339810239e-05, "learning_rate": 0.009618861407193126, "loss": 2.8019, "step": 1836 }, { "crossentropy": 2.776302456855774, "epoch": 0.0998939612278746, "grad_norm": 0.04749985411763191, "grad_norm_var": 3.5547970002018124e-05, "learning_rate": 0.009618384431396693, "loss": 2.7763, "step": 1837 }, { "crossentropy": 2.6125855445861816, "epoch": 0.0999483400853748, "grad_norm": 0.049110665917396545, "grad_norm_var": 3.504227809611252e-05, "learning_rate": 0.009617907169172366, "loss": 2.6126, "step": 1838 }, { "crossentropy": 2.7273309230804443, "epoch": 0.100002718942875, "grad_norm": 0.052689678966999054, "grad_norm_var": 3.479856638728632e-05, "learning_rate": 0.009617429620549742, "loss": 2.7273, "step": 1839 }, { "crossentropy": 2.779359817504883, "epoch": 0.10005709780037521, "grad_norm": 0.048906002193689346, "grad_norm_var": 3.567575813289425e-05, "learning_rate": 0.009616951785558435, "loss": 2.7794, "step": 1840 }, { "crossentropy": 2.793626308441162, "epoch": 0.10011147665787541, "grad_norm": 0.04943098872900009, "grad_norm_var": 3.5285467773542704e-05, "learning_rate": 0.009616473664228084, "loss": 2.7936, "step": 1841 }, { "crossentropy": 2.72604238986969, "epoch": 0.10016585551537562, "grad_norm": 0.051422927528619766, "grad_norm_var": 3.479608539813563e-05, "learning_rate": 0.009615995256588341, "loss": 2.726, "step": 1842 }, { "crossentropy": 2.85772705078125, "epoch": 0.10022023437287583, "grad_norm": 0.04938843101263046, "grad_norm_var": 3.386744826633463e-05, "learning_rate": 0.009615516562668874, "loss": 2.8577, "step": 1843 }, { "crossentropy": 2.801988124847412, "epoch": 0.10027461323037604, "grad_norm": 0.050601642578840256, "grad_norm_var": 5.86013489901286e-06, "learning_rate": 0.009615037582499375, "loss": 2.802, "step": 1844 }, { "crossentropy": 2.730757713317871, "epoch": 0.10032899208787624, "grad_norm": 0.04615844413638115, "grad_norm_var": 6.520238911810784e-06, "learning_rate": 0.009614558316109547, "loss": 2.7308, "step": 1845 }, { "crossentropy": 2.6952402591705322, "epoch": 0.10038337094537644, "grad_norm": 0.058006253093481064, "grad_norm_var": 1.0007028123741138e-05, "learning_rate": 0.009614078763529113, "loss": 2.6952, "step": 1846 }, { "crossentropy": 2.773687243461609, "epoch": 0.10043774980287665, "grad_norm": 0.047364283353090286, "grad_norm_var": 8.710925512912746e-06, "learning_rate": 0.009613598924787817, "loss": 2.7737, "step": 1847 }, { "crossentropy": 2.664523124694824, "epoch": 0.10049212866037685, "grad_norm": 0.053061217069625854, "grad_norm_var": 8.348851955127431e-06, "learning_rate": 0.009613118799915417, "loss": 2.6645, "step": 1848 }, { "crossentropy": 2.700393557548523, "epoch": 0.10054650751787705, "grad_norm": 0.048073332756757736, "grad_norm_var": 8.451296178599705e-06, "learning_rate": 0.009612638388941688, "loss": 2.7004, "step": 1849 }, { "crossentropy": 2.7773948907852173, "epoch": 0.10060088637537726, "grad_norm": 0.057097043842077255, "grad_norm_var": 1.1202856156157746e-05, "learning_rate": 0.009612157691896429, "loss": 2.7774, "step": 1850 }, { "crossentropy": 2.7835681438446045, "epoch": 0.10065526523287746, "grad_norm": 0.059108614921569824, "grad_norm_var": 1.5716220092641618e-05, "learning_rate": 0.00961167670880945, "loss": 2.7836, "step": 1851 }, { "crossentropy": 2.7902190685272217, "epoch": 0.10070964409037766, "grad_norm": 0.0487109012901783, "grad_norm_var": 1.5677090381527084e-05, "learning_rate": 0.009611195439710578, "loss": 2.7902, "step": 1852 }, { "crossentropy": 2.6920214891433716, "epoch": 0.10076402294787787, "grad_norm": 0.048746347427368164, "grad_norm_var": 1.5185931342307352e-05, "learning_rate": 0.009610713884629666, "loss": 2.692, "step": 1853 }, { "crossentropy": 2.6492836475372314, "epoch": 0.10081840180537807, "grad_norm": 0.046095944941043854, "grad_norm_var": 1.6560556868081406e-05, "learning_rate": 0.009610232043596576, "loss": 2.6493, "step": 1854 }, { "crossentropy": 2.71435284614563, "epoch": 0.10087278066287828, "grad_norm": 0.04492281749844551, "grad_norm_var": 1.8507362855220653e-05, "learning_rate": 0.009609749916641194, "loss": 2.7144, "step": 1855 }, { "crossentropy": 2.7898064851760864, "epoch": 0.10092715952037848, "grad_norm": 0.09097357839345932, "grad_norm_var": 0.00012048886562073299, "learning_rate": 0.00960926750379342, "loss": 2.7898, "step": 1856 }, { "crossentropy": 2.746904969215393, "epoch": 0.10098153837787868, "grad_norm": 0.04873042553663254, "grad_norm_var": 0.00012085970391769677, "learning_rate": 0.009608784805083172, "loss": 2.7469, "step": 1857 }, { "crossentropy": 2.7095375061035156, "epoch": 0.10103591723537889, "grad_norm": 0.05316327512264252, "grad_norm_var": 0.00012067634730594778, "learning_rate": 0.009608301820540389, "loss": 2.7095, "step": 1858 }, { "crossentropy": 2.7797330617904663, "epoch": 0.10109029609287909, "grad_norm": 0.054290033876895905, "grad_norm_var": 0.00011972765696875975, "learning_rate": 0.00960781855019502, "loss": 2.7797, "step": 1859 }, { "crossentropy": 2.743208646774292, "epoch": 0.10114467495037929, "grad_norm": 0.05405738577246666, "grad_norm_var": 0.00011916437656977636, "learning_rate": 0.009607334994077043, "loss": 2.7432, "step": 1860 }, { "crossentropy": 2.8539074659347534, "epoch": 0.1011990538078795, "grad_norm": 0.05405791476368904, "grad_norm_var": 0.0001151633761178624, "learning_rate": 0.009606851152216446, "loss": 2.8539, "step": 1861 }, { "crossentropy": 2.6674153804779053, "epoch": 0.1012534326653797, "grad_norm": 0.04868529364466667, "grad_norm_var": 0.00011580547495700459, "learning_rate": 0.009606367024643233, "loss": 2.6674, "step": 1862 }, { "crossentropy": 2.650430679321289, "epoch": 0.1013078115228799, "grad_norm": 0.05059133097529411, "grad_norm_var": 0.00011378569222076894, "learning_rate": 0.009605882611387432, "loss": 2.6504, "step": 1863 }, { "crossentropy": 2.705017328262329, "epoch": 0.10136219038038011, "grad_norm": 0.05914732813835144, "grad_norm_var": 0.00011552326923686106, "learning_rate": 0.009605397912479085, "loss": 2.705, "step": 1864 }, { "crossentropy": 2.7554832696914673, "epoch": 0.10141656923788031, "grad_norm": 0.04779069498181343, "grad_norm_var": 0.0001157573828826743, "learning_rate": 0.009604912927948255, "loss": 2.7555, "step": 1865 }, { "crossentropy": 2.7301347255706787, "epoch": 0.10147094809538051, "grad_norm": 0.046919964253902435, "grad_norm_var": 0.00011821212217496402, "learning_rate": 0.009604427657825017, "loss": 2.7301, "step": 1866 }, { "crossentropy": 2.821101665496826, "epoch": 0.10152532695288072, "grad_norm": 0.04727109894156456, "grad_norm_var": 0.00011811696656351378, "learning_rate": 0.00960394210213947, "loss": 2.8211, "step": 1867 }, { "crossentropy": 2.830947518348694, "epoch": 0.10157970581038092, "grad_norm": 0.04869259521365166, "grad_norm_var": 0.00011812686972537309, "learning_rate": 0.009603456260921723, "loss": 2.8309, "step": 1868 }, { "crossentropy": 2.797854781150818, "epoch": 0.10163408466788112, "grad_norm": 0.045062050223350525, "grad_norm_var": 0.00012094617683256944, "learning_rate": 0.009602970134201914, "loss": 2.7979, "step": 1869 }, { "crossentropy": 2.728033661842346, "epoch": 0.10168846352538133, "grad_norm": 0.049003444612026215, "grad_norm_var": 0.00011898094048629071, "learning_rate": 0.009602483722010186, "loss": 2.728, "step": 1870 }, { "crossentropy": 2.728308320045471, "epoch": 0.10174284238288153, "grad_norm": 0.04748770594596863, "grad_norm_var": 0.00011672902210783602, "learning_rate": 0.00960199702437671, "loss": 2.7283, "step": 1871 }, { "crossentropy": 2.763628840446472, "epoch": 0.10179722124038174, "grad_norm": 0.05852512642741203, "grad_norm_var": 1.7682895700965594e-05, "learning_rate": 0.009601510041331668, "loss": 2.7636, "step": 1872 }, { "crossentropy": 2.801420211791992, "epoch": 0.10185160009788194, "grad_norm": 0.052981600165367126, "grad_norm_var": 1.7615406527852346e-05, "learning_rate": 0.009601022772905263, "loss": 2.8014, "step": 1873 }, { "crossentropy": 2.774663805961609, "epoch": 0.10190597895538214, "grad_norm": 0.05143493041396141, "grad_norm_var": 1.7328458478000665e-05, "learning_rate": 0.009600535219127716, "loss": 2.7747, "step": 1874 }, { "crossentropy": 2.9431389570236206, "epoch": 0.10196035781288235, "grad_norm": 0.05440687760710716, "grad_norm_var": 1.7380569196160518e-05, "learning_rate": 0.009600047380029264, "loss": 2.9431, "step": 1875 }, { "crossentropy": 2.82229220867157, "epoch": 0.10201473667038255, "grad_norm": 0.04686345160007477, "grad_norm_var": 1.7689409347182842e-05, "learning_rate": 0.00959955925564016, "loss": 2.8223, "step": 1876 }, { "crossentropy": 2.8443777561187744, "epoch": 0.10206911552788275, "grad_norm": 0.05091649293899536, "grad_norm_var": 1.6840058818786096e-05, "learning_rate": 0.009599070845990683, "loss": 2.8444, "step": 1877 }, { "crossentropy": 2.8208402395248413, "epoch": 0.10212349438538296, "grad_norm": 0.04435805231332779, "grad_norm_var": 1.897734092788968e-05, "learning_rate": 0.009598582151111116, "loss": 2.8208, "step": 1878 }, { "crossentropy": 2.737797737121582, "epoch": 0.10217787324288316, "grad_norm": 0.05360972136259079, "grad_norm_var": 1.9748199568601446e-05, "learning_rate": 0.009598093171031774, "loss": 2.7378, "step": 1879 }, { "crossentropy": 2.6847373247146606, "epoch": 0.10223225210038338, "grad_norm": 0.049323730170726776, "grad_norm_var": 1.416437395230762e-05, "learning_rate": 0.00959760390578298, "loss": 2.6847, "step": 1880 }, { "crossentropy": 2.7896273136138916, "epoch": 0.10228663095788358, "grad_norm": 0.047492269426584244, "grad_norm_var": 1.4244537545027072e-05, "learning_rate": 0.009597114355395077, "loss": 2.7896, "step": 1881 }, { "crossentropy": 2.8541516065597534, "epoch": 0.10234100981538378, "grad_norm": 0.04988555237650871, "grad_norm_var": 1.3715976507574998e-05, "learning_rate": 0.009596624519898426, "loss": 2.8542, "step": 1882 }, { "crossentropy": 2.7220144271850586, "epoch": 0.10239538867288399, "grad_norm": 0.0477876253426075, "grad_norm_var": 1.3556270122753953e-05, "learning_rate": 0.00959613439932341, "loss": 2.722, "step": 1883 }, { "crossentropy": 2.7621015310287476, "epoch": 0.10244976753038419, "grad_norm": 0.049749188125133514, "grad_norm_var": 1.346095438413871e-05, "learning_rate": 0.009595643993700425, "loss": 2.7621, "step": 1884 }, { "crossentropy": 2.7650173902511597, "epoch": 0.1025041463878844, "grad_norm": 0.04375430941581726, "grad_norm_var": 1.4416728430737454e-05, "learning_rate": 0.00959515330305988, "loss": 2.765, "step": 1885 }, { "crossentropy": 2.7011619806289673, "epoch": 0.1025585252453846, "grad_norm": 0.04519682005047798, "grad_norm_var": 1.575141502257593e-05, "learning_rate": 0.009594662327432214, "loss": 2.7012, "step": 1886 }, { "crossentropy": 2.850680351257324, "epoch": 0.1026129041028848, "grad_norm": 0.04544328898191452, "grad_norm_var": 1.6591385637803172e-05, "learning_rate": 0.009594171066847875, "loss": 2.8507, "step": 1887 }, { "crossentropy": 2.835735559463501, "epoch": 0.102667282960385, "grad_norm": 0.046727463603019714, "grad_norm_var": 1.106707939483471e-05, "learning_rate": 0.009593679521337327, "loss": 2.8357, "step": 1888 }, { "crossentropy": 2.8449771404266357, "epoch": 0.10272166181788521, "grad_norm": 0.04845660179853439, "grad_norm_var": 9.791152746786251e-06, "learning_rate": 0.009593187690931058, "loss": 2.845, "step": 1889 }, { "crossentropy": 2.7428457736968994, "epoch": 0.10277604067538541, "grad_norm": 0.048251014202833176, "grad_norm_var": 9.163041974600643e-06, "learning_rate": 0.009592695575659572, "loss": 2.7428, "step": 1890 }, { "crossentropy": 2.7391178607940674, "epoch": 0.10283041953288562, "grad_norm": 0.0488116480410099, "grad_norm_var": 6.536857534762917e-06, "learning_rate": 0.009592203175553387, "loss": 2.7391, "step": 1891 }, { "crossentropy": 2.7705514430999756, "epoch": 0.10288479839038582, "grad_norm": 0.04998971149325371, "grad_norm_var": 6.70971222312035e-06, "learning_rate": 0.009591710490643043, "loss": 2.7706, "step": 1892 }, { "crossentropy": 2.83771288394928, "epoch": 0.10293917724788602, "grad_norm": 0.048836346715688705, "grad_norm_var": 6.20164863002944e-06, "learning_rate": 0.009591217520959094, "loss": 2.8377, "step": 1893 }, { "crossentropy": 2.7781835794448853, "epoch": 0.10299355610538623, "grad_norm": 0.05576420947909355, "grad_norm_var": 8.825223735926292e-06, "learning_rate": 0.009590724266532118, "loss": 2.7782, "step": 1894 }, { "crossentropy": 2.7065213918685913, "epoch": 0.10304793496288643, "grad_norm": 0.056659847497940063, "grad_norm_var": 1.1406443488836716e-05, "learning_rate": 0.0095902307273927, "loss": 2.7065, "step": 1895 }, { "crossentropy": 2.830206513404846, "epoch": 0.10310231382038663, "grad_norm": 0.054622091352939606, "grad_norm_var": 1.3472264132396936e-05, "learning_rate": 0.009589736903571454, "loss": 2.8302, "step": 1896 }, { "crossentropy": 2.8333081007003784, "epoch": 0.10315669267788684, "grad_norm": 0.051143888384103775, "grad_norm_var": 1.3467257309499914e-05, "learning_rate": 0.009589242795099002, "loss": 2.8333, "step": 1897 }, { "crossentropy": 2.8648860454559326, "epoch": 0.10321107153538704, "grad_norm": 0.051654987037181854, "grad_norm_var": 1.3767471259297735e-05, "learning_rate": 0.00958874840200599, "loss": 2.8649, "step": 1898 }, { "crossentropy": 2.7712481021881104, "epoch": 0.10326545039288725, "grad_norm": 0.35659676790237427, "grad_norm_var": 0.005901269187796158, "learning_rate": 0.009588253724323082, "loss": 2.7712, "step": 1899 }, { "crossentropy": 2.746821641921997, "epoch": 0.10331982925038745, "grad_norm": 0.05182763561606407, "grad_norm_var": 0.0058962448385876534, "learning_rate": 0.009587758762080954, "loss": 2.7468, "step": 1900 }, { "crossentropy": 2.896252155303955, "epoch": 0.10337420810788765, "grad_norm": 0.055373597890138626, "grad_norm_var": 0.005865596736525246, "learning_rate": 0.009587263515310307, "loss": 2.8963, "step": 1901 }, { "crossentropy": 2.897267699241638, "epoch": 0.10342858696538786, "grad_norm": 0.07182365655899048, "grad_norm_var": 0.005822881626921256, "learning_rate": 0.009586767984041853, "loss": 2.8973, "step": 1902 }, { "crossentropy": 2.8809573650360107, "epoch": 0.10348296582288806, "grad_norm": 0.0760304257273674, "grad_norm_var": 0.005775602425083634, "learning_rate": 0.009586272168306322, "loss": 2.881, "step": 1903 }, { "crossentropy": 2.9680451154708862, "epoch": 0.10353734468038826, "grad_norm": 0.07147582620382309, "grad_norm_var": 0.005726246403367663, "learning_rate": 0.00958577606813447, "loss": 2.968, "step": 1904 }, { "crossentropy": 2.764005184173584, "epoch": 0.10359172353788847, "grad_norm": 0.05259893089532852, "grad_norm_var": 0.0057127512074763515, "learning_rate": 0.00958527968355706, "loss": 2.764, "step": 1905 }, { "crossentropy": 2.7978545427322388, "epoch": 0.10364610239538867, "grad_norm": 0.048027921468019485, "grad_norm_var": 0.0057135527007595496, "learning_rate": 0.00958478301460488, "loss": 2.7979, "step": 1906 }, { "crossentropy": 2.793905258178711, "epoch": 0.10370048125288887, "grad_norm": 0.05012962967157364, "grad_norm_var": 0.005709045574140389, "learning_rate": 0.009584286061308732, "loss": 2.7939, "step": 1907 }, { "crossentropy": 2.8147082328796387, "epoch": 0.10375486011038908, "grad_norm": 0.054608382284641266, "grad_norm_var": 0.005694878568992871, "learning_rate": 0.009583788823699436, "loss": 2.8147, "step": 1908 }, { "crossentropy": 2.763296365737915, "epoch": 0.10380923896788928, "grad_norm": 0.05206925421953201, "grad_norm_var": 0.005684060565653567, "learning_rate": 0.009583291301807828, "loss": 2.7633, "step": 1909 }, { "crossentropy": 2.817496418952942, "epoch": 0.10386361782538948, "grad_norm": 0.05226299911737442, "grad_norm_var": 0.005694110170814729, "learning_rate": 0.00958279349566477, "loss": 2.8175, "step": 1910 }, { "crossentropy": 2.840063691139221, "epoch": 0.10391799668288969, "grad_norm": 0.06481116265058517, "grad_norm_var": 0.005677860971142928, "learning_rate": 0.00958229540530113, "loss": 2.8401, "step": 1911 }, { "crossentropy": 2.778799057006836, "epoch": 0.10397237554038989, "grad_norm": 0.05464106425642967, "grad_norm_var": 0.00567780706257869, "learning_rate": 0.009581797030747801, "loss": 2.7788, "step": 1912 }, { "crossentropy": 2.7973748445510864, "epoch": 0.1040267543978901, "grad_norm": 0.04897255077958107, "grad_norm_var": 0.005685281149463535, "learning_rate": 0.009581298372035694, "loss": 2.7974, "step": 1913 }, { "crossentropy": 2.8189523220062256, "epoch": 0.1040811332553903, "grad_norm": 0.08597058057785034, "grad_norm_var": 0.005648375286773955, "learning_rate": 0.009580799429195732, "loss": 2.819, "step": 1914 }, { "crossentropy": 2.7114369869232178, "epoch": 0.1041355121128905, "grad_norm": 0.049296487122774124, "grad_norm_var": 0.0001334215666624947, "learning_rate": 0.009580300202258859, "loss": 2.7114, "step": 1915 }, { "crossentropy": 2.8964933156967163, "epoch": 0.1041898909703907, "grad_norm": 0.05883818864822388, "grad_norm_var": 0.00013002736140861748, "learning_rate": 0.009579800691256038, "loss": 2.8965, "step": 1916 }, { "crossentropy": 2.8006949424743652, "epoch": 0.10424426982789091, "grad_norm": 0.05236596614122391, "grad_norm_var": 0.00013212043073953887, "learning_rate": 0.009579300896218248, "loss": 2.8007, "step": 1917 }, { "crossentropy": 2.7835357189178467, "epoch": 0.10429864868539113, "grad_norm": 0.050160352140665054, "grad_norm_var": 0.00012439733716055698, "learning_rate": 0.009578800817176486, "loss": 2.7835, "step": 1918 }, { "crossentropy": 2.7946078777313232, "epoch": 0.10435302754289133, "grad_norm": 0.054833460599184036, "grad_norm_var": 0.00010050661731893877, "learning_rate": 0.009578300454161765, "loss": 2.7946, "step": 1919 }, { "crossentropy": 2.7591497898101807, "epoch": 0.10440740640039153, "grad_norm": 0.04966677725315094, "grad_norm_var": 8.615215658282655e-05, "learning_rate": 0.00957779980720512, "loss": 2.7591, "step": 1920 }, { "crossentropy": 2.7388275861740112, "epoch": 0.10446178525789174, "grad_norm": 0.056705914437770844, "grad_norm_var": 8.591708527079037e-05, "learning_rate": 0.0095772988763376, "loss": 2.7388, "step": 1921 }, { "crossentropy": 2.7551056146621704, "epoch": 0.10451616411539194, "grad_norm": 0.04670703038573265, "grad_norm_var": 8.729103909053959e-05, "learning_rate": 0.009576797661590267, "loss": 2.7551, "step": 1922 }, { "crossentropy": 3.015592336654663, "epoch": 0.10457054297289214, "grad_norm": 0.05259637534618378, "grad_norm_var": 8.602754866619337e-05, "learning_rate": 0.009576296162994212, "loss": 3.0156, "step": 1923 }, { "crossentropy": 2.7123358249664307, "epoch": 0.10462492183039235, "grad_norm": 0.05019841343164444, "grad_norm_var": 8.763892173108851e-05, "learning_rate": 0.009575794380580536, "loss": 2.7123, "step": 1924 }, { "crossentropy": 2.8088375329971313, "epoch": 0.10467930068789255, "grad_norm": 0.054560642689466476, "grad_norm_var": 8.705130500430141e-05, "learning_rate": 0.009575292314380358, "loss": 2.8088, "step": 1925 }, { "crossentropy": 2.824997305870056, "epoch": 0.10473367954539275, "grad_norm": 0.06216596066951752, "grad_norm_var": 8.9353102508613e-05, "learning_rate": 0.009574789964424816, "loss": 2.825, "step": 1926 }, { "crossentropy": 2.6944611072540283, "epoch": 0.10478805840289296, "grad_norm": 0.04838257282972336, "grad_norm_var": 8.644068979970585e-05, "learning_rate": 0.009574287330745066, "loss": 2.6945, "step": 1927 }, { "crossentropy": 2.7301464080810547, "epoch": 0.10484243726039316, "grad_norm": 0.048591095954179764, "grad_norm_var": 8.881933927395811e-05, "learning_rate": 0.00957378441337228, "loss": 2.7301, "step": 1928 }, { "crossentropy": 2.7800345420837402, "epoch": 0.10489681611789337, "grad_norm": 0.04998498782515526, "grad_norm_var": 8.815401391814344e-05, "learning_rate": 0.009573281212337649, "loss": 2.78, "step": 1929 }, { "crossentropy": 2.664522886276245, "epoch": 0.10495119497539357, "grad_norm": 0.04495321959257126, "grad_norm_var": 2.08601535902246e-05, "learning_rate": 0.009572777727672382, "loss": 2.6645, "step": 1930 }, { "crossentropy": 2.720433831214905, "epoch": 0.10500557383289377, "grad_norm": 0.044649768620729446, "grad_norm_var": 2.3807491234562455e-05, "learning_rate": 0.009572273959407702, "loss": 2.7204, "step": 1931 }, { "crossentropy": 2.747573494911194, "epoch": 0.10505995269039398, "grad_norm": 0.05937587842345238, "grad_norm_var": 2.434555275560001e-05, "learning_rate": 0.009571769907574853, "loss": 2.7476, "step": 1932 }, { "crossentropy": 2.743692994117737, "epoch": 0.10511433154789418, "grad_norm": 0.04901600629091263, "grad_norm_var": 2.471314544003599e-05, "learning_rate": 0.009571265572205098, "loss": 2.7437, "step": 1933 }, { "crossentropy": 2.696113348007202, "epoch": 0.10516871040539438, "grad_norm": 0.057899314910173416, "grad_norm_var": 2.7167647675605672e-05, "learning_rate": 0.009570760953329713, "loss": 2.6961, "step": 1934 }, { "crossentropy": 2.8467551469802856, "epoch": 0.10522308926289459, "grad_norm": 0.047073300927877426, "grad_norm_var": 2.7888905835464033e-05, "learning_rate": 0.009570256050979995, "loss": 2.8468, "step": 1935 }, { "crossentropy": 2.7466412782669067, "epoch": 0.10527746812039479, "grad_norm": 0.045818544924259186, "grad_norm_var": 2.970785513389413e-05, "learning_rate": 0.009569750865187259, "loss": 2.7466, "step": 1936 }, { "crossentropy": 2.740467071533203, "epoch": 0.105331846977895, "grad_norm": 0.04769982025027275, "grad_norm_var": 2.81265428009468e-05, "learning_rate": 0.009569245395982836, "loss": 2.7405, "step": 1937 }, { "crossentropy": 2.7876415252685547, "epoch": 0.1053862258353952, "grad_norm": 0.05859410762786865, "grad_norm_var": 3.078059355967888e-05, "learning_rate": 0.00956873964339807, "loss": 2.7876, "step": 1938 }, { "crossentropy": 2.8882495164871216, "epoch": 0.1054406046928954, "grad_norm": 0.05168019235134125, "grad_norm_var": 3.068049581278024e-05, "learning_rate": 0.009568233607464331, "loss": 2.8882, "step": 1939 }, { "crossentropy": 2.8266454935073853, "epoch": 0.1054949835503956, "grad_norm": 0.055122215300798416, "grad_norm_var": 3.147894369294e-05, "learning_rate": 0.009567727288213004, "loss": 2.8266, "step": 1940 }, { "crossentropy": 2.7795530557632446, "epoch": 0.10554936240789581, "grad_norm": 0.051063574850559235, "grad_norm_var": 3.086186727452843e-05, "learning_rate": 0.00956722068567549, "loss": 2.7796, "step": 1941 }, { "crossentropy": 2.707252860069275, "epoch": 0.10560374126539601, "grad_norm": 0.04726020246744156, "grad_norm_var": 2.33106570172457e-05, "learning_rate": 0.009566713799883206, "loss": 2.7073, "step": 1942 }, { "crossentropy": 2.7125600576400757, "epoch": 0.10565812012289622, "grad_norm": 0.049345873296260834, "grad_norm_var": 2.3103395822564784e-05, "learning_rate": 0.009566206630867588, "loss": 2.7126, "step": 1943 }, { "crossentropy": 2.8502360582351685, "epoch": 0.10571249898039642, "grad_norm": 0.08398249745368958, "grad_norm_var": 9.234223209056697e-05, "learning_rate": 0.009565699178660094, "loss": 2.8502, "step": 1944 }, { "crossentropy": 2.7251886129379272, "epoch": 0.10576687783789662, "grad_norm": 0.048401180654764175, "grad_norm_var": 9.307656763340346e-05, "learning_rate": 0.009565191443292194, "loss": 2.7252, "step": 1945 }, { "crossentropy": 2.7038224935531616, "epoch": 0.10582125669539683, "grad_norm": 0.04603249579668045, "grad_norm_var": 9.204595227944908e-05, "learning_rate": 0.009564683424795375, "loss": 2.7038, "step": 1946 }, { "crossentropy": 2.8658403158187866, "epoch": 0.10587563555289703, "grad_norm": 0.04596657305955887, "grad_norm_var": 9.07429452854415e-05, "learning_rate": 0.009564175123201147, "loss": 2.8658, "step": 1947 }, { "crossentropy": 2.763969898223877, "epoch": 0.10593001441039723, "grad_norm": 0.04690735042095184, "grad_norm_var": 8.947860369447422e-05, "learning_rate": 0.009563666538541031, "loss": 2.764, "step": 1948 }, { "crossentropy": 2.814732551574707, "epoch": 0.10598439326789744, "grad_norm": 0.05626387149095535, "grad_norm_var": 8.988640718983007e-05, "learning_rate": 0.009563157670846574, "loss": 2.8147, "step": 1949 }, { "crossentropy": 2.813769817352295, "epoch": 0.10603877212539764, "grad_norm": 0.050467658787965775, "grad_norm_var": 8.793308915844746e-05, "learning_rate": 0.009562648520149332, "loss": 2.8138, "step": 1950 }, { "crossentropy": 2.797763705253601, "epoch": 0.10609315098289784, "grad_norm": 0.04647338017821312, "grad_norm_var": 8.834806459182455e-05, "learning_rate": 0.00956213908648088, "loss": 2.7978, "step": 1951 }, { "crossentropy": 2.8509762287139893, "epoch": 0.10614752984039805, "grad_norm": 0.055628057569265366, "grad_norm_var": 8.635252562932312e-05, "learning_rate": 0.009561629369872817, "loss": 2.851, "step": 1952 }, { "crossentropy": 2.8442413806915283, "epoch": 0.10620190869789825, "grad_norm": 0.04965139925479889, "grad_norm_var": 8.532705079557078e-05, "learning_rate": 0.009561119370356752, "loss": 2.8442, "step": 1953 }, { "crossentropy": 2.8048458099365234, "epoch": 0.10625628755539845, "grad_norm": 0.04532419517636299, "grad_norm_var": 8.586439952317758e-05, "learning_rate": 0.009560609087964318, "loss": 2.8048, "step": 1954 }, { "crossentropy": 2.807821273803711, "epoch": 0.10631066641289867, "grad_norm": 0.04751812666654587, "grad_norm_var": 8.704029174803526e-05, "learning_rate": 0.009560098522727158, "loss": 2.8078, "step": 1955 }, { "crossentropy": 2.7225018739700317, "epoch": 0.10636504527039888, "grad_norm": 0.050071123987436295, "grad_norm_var": 8.6254695198786e-05, "learning_rate": 0.009559587674676939, "loss": 2.7225, "step": 1956 }, { "crossentropy": 2.7945470809936523, "epoch": 0.10641942412789908, "grad_norm": 0.046503063291311264, "grad_norm_var": 8.768153485695692e-05, "learning_rate": 0.009559076543845345, "loss": 2.7945, "step": 1957 }, { "crossentropy": 2.7461875677108765, "epoch": 0.10647380298539928, "grad_norm": 0.04605920612812042, "grad_norm_var": 8.836851759987842e-05, "learning_rate": 0.009558565130264074, "loss": 2.7462, "step": 1958 }, { "crossentropy": 2.7947553396224976, "epoch": 0.10652818184289949, "grad_norm": 0.06212038919329643, "grad_norm_var": 9.589981741334278e-05, "learning_rate": 0.009558053433964844, "loss": 2.7948, "step": 1959 }, { "crossentropy": 2.7497496604919434, "epoch": 0.10658256070039969, "grad_norm": 0.07387124747037888, "grad_norm_var": 5.878183873967412e-05, "learning_rate": 0.009557541454979388, "loss": 2.7497, "step": 1960 }, { "crossentropy": 2.7649885416030884, "epoch": 0.10663693955789989, "grad_norm": 0.05816664919257164, "grad_norm_var": 6.125580507376031e-05, "learning_rate": 0.00955702919333946, "loss": 2.765, "step": 1961 }, { "crossentropy": 2.8230360746383667, "epoch": 0.1066913184154001, "grad_norm": 0.05390574410557747, "grad_norm_var": 5.9191997541053974e-05, "learning_rate": 0.009556516649076831, "loss": 2.823, "step": 1962 }, { "crossentropy": 2.7433876991271973, "epoch": 0.1067456972729003, "grad_norm": 0.06646063923835754, "grad_norm_var": 6.846088801600585e-05, "learning_rate": 0.009556003822223286, "loss": 2.7434, "step": 1963 }, { "crossentropy": 2.742140293121338, "epoch": 0.1068000761304005, "grad_norm": 0.04483608156442642, "grad_norm_var": 7.053921665159663e-05, "learning_rate": 0.009555490712810633, "loss": 2.7421, "step": 1964 }, { "crossentropy": 2.8531434535980225, "epoch": 0.10685445498790071, "grad_norm": 0.05250449851155281, "grad_norm_var": 6.995319891197038e-05, "learning_rate": 0.009554977320870692, "loss": 2.8531, "step": 1965 }, { "crossentropy": 2.7143993377685547, "epoch": 0.10690883384540091, "grad_norm": 0.062023311853408813, "grad_norm_var": 7.424694045983637e-05, "learning_rate": 0.009554463646435305, "loss": 2.7144, "step": 1966 }, { "crossentropy": 2.757106065750122, "epoch": 0.10696321270290111, "grad_norm": 0.04722164571285248, "grad_norm_var": 7.354898930354176e-05, "learning_rate": 0.00955394968953633, "loss": 2.7571, "step": 1967 }, { "crossentropy": 2.6746283769607544, "epoch": 0.10701759156040132, "grad_norm": 0.04808494448661804, "grad_norm_var": 7.533355194111139e-05, "learning_rate": 0.00955343545020564, "loss": 2.6746, "step": 1968 }, { "crossentropy": 2.807852268218994, "epoch": 0.10707197041790152, "grad_norm": 0.06910844147205353, "grad_norm_var": 8.928229559533729e-05, "learning_rate": 0.009552920928475126, "loss": 2.8079, "step": 1969 }, { "crossentropy": 2.6857242584228516, "epoch": 0.10712634927540172, "grad_norm": 0.04748031497001648, "grad_norm_var": 8.690299428266875e-05, "learning_rate": 0.009552406124376703, "loss": 2.6857, "step": 1970 }, { "crossentropy": 2.735579490661621, "epoch": 0.10718072813290193, "grad_norm": 0.07112430036067963, "grad_norm_var": 9.89816638360369e-05, "learning_rate": 0.009551891037942295, "loss": 2.7356, "step": 1971 }, { "crossentropy": 2.88913094997406, "epoch": 0.10723510699040213, "grad_norm": 0.09166543930768967, "grad_norm_var": 0.00017300351298009658, "learning_rate": 0.00955137566920385, "loss": 2.8891, "step": 1972 }, { "crossentropy": 2.706210136413574, "epoch": 0.10728948584790234, "grad_norm": 0.04970374330878258, "grad_norm_var": 0.0001683870173828899, "learning_rate": 0.009550860018193328, "loss": 2.7062, "step": 1973 }, { "crossentropy": 2.8263434171676636, "epoch": 0.10734386470540254, "grad_norm": 0.051958803087472916, "grad_norm_var": 0.00016036640117757494, "learning_rate": 0.009550344084942713, "loss": 2.8263, "step": 1974 }, { "crossentropy": 2.8413363695144653, "epoch": 0.10739824356290274, "grad_norm": 0.07124235481023788, "grad_norm_var": 0.00016888820039881173, "learning_rate": 0.009549827869483997, "loss": 2.8413, "step": 1975 }, { "crossentropy": 2.7220897674560547, "epoch": 0.10745262242040295, "grad_norm": 0.054922521114349365, "grad_norm_var": 0.0001561820762941835, "learning_rate": 0.0095493113718492, "loss": 2.7221, "step": 1976 }, { "crossentropy": 2.7282748222351074, "epoch": 0.10750700127790315, "grad_norm": 0.04918321594595909, "grad_norm_var": 0.00016195533921269605, "learning_rate": 0.009548794592070354, "loss": 2.7283, "step": 1977 }, { "crossentropy": 2.8402334451675415, "epoch": 0.10756138013540335, "grad_norm": 0.05035458132624626, "grad_norm_var": 0.00016478347977561715, "learning_rate": 0.009548277530179505, "loss": 2.8402, "step": 1978 }, { "crossentropy": 2.812433123588562, "epoch": 0.10761575899290356, "grad_norm": 0.05710932984948158, "grad_norm_var": 0.0001596900888176505, "learning_rate": 0.009547760186208728, "loss": 2.8124, "step": 1979 }, { "crossentropy": 2.8326419591903687, "epoch": 0.10767013785040376, "grad_norm": 0.05197388678789139, "grad_norm_var": 0.00015090983395800334, "learning_rate": 0.009547242560190102, "loss": 2.8326, "step": 1980 }, { "crossentropy": 2.893388032913208, "epoch": 0.10772451670790396, "grad_norm": 0.06366521120071411, "grad_norm_var": 0.0001507346096944033, "learning_rate": 0.00954672465215573, "loss": 2.8934, "step": 1981 }, { "crossentropy": 2.8055107593536377, "epoch": 0.10777889556540417, "grad_norm": 0.04813741147518158, "grad_norm_var": 0.00015635762506707444, "learning_rate": 0.009546206462137736, "loss": 2.8055, "step": 1982 }, { "crossentropy": 2.7824209928512573, "epoch": 0.10783327442290437, "grad_norm": 0.05058975890278816, "grad_norm_var": 0.00015236840472286883, "learning_rate": 0.009545687990168255, "loss": 2.7824, "step": 1983 }, { "crossentropy": 2.841901183128357, "epoch": 0.10788765328040457, "grad_norm": 0.047225385904312134, "grad_norm_var": 0.00015353877851371743, "learning_rate": 0.009545169236279445, "loss": 2.8419, "step": 1984 }, { "crossentropy": 2.8316361904144287, "epoch": 0.10794203213790478, "grad_norm": 0.04802901670336723, "grad_norm_var": 0.00014964001981034863, "learning_rate": 0.009544650200503473, "loss": 2.8316, "step": 1985 }, { "crossentropy": 2.7689191102981567, "epoch": 0.10799641099540498, "grad_norm": 0.0472852848470211, "grad_norm_var": 0.00014987753880824813, "learning_rate": 0.009544130882872535, "loss": 2.7689, "step": 1986 }, { "crossentropy": 2.6527788639068604, "epoch": 0.10805078985290519, "grad_norm": 0.04969659820199013, "grad_norm_var": 0.00013682256937230005, "learning_rate": 0.009543611283418836, "loss": 2.6528, "step": 1987 }, { "crossentropy": 2.7487473487854004, "epoch": 0.10810516871040539, "grad_norm": 0.052329275757074356, "grad_norm_var": 4.212624317637507e-05, "learning_rate": 0.0095430914021746, "loss": 2.7487, "step": 1988 }, { "crossentropy": 2.8128116130828857, "epoch": 0.10815954756790559, "grad_norm": 0.045143432915210724, "grad_norm_var": 4.525571112179386e-05, "learning_rate": 0.00954257123917207, "loss": 2.8128, "step": 1989 }, { "crossentropy": 2.852656364440918, "epoch": 0.1082139264254058, "grad_norm": 0.05009634047746658, "grad_norm_var": 4.55889939790464e-05, "learning_rate": 0.009542050794443509, "loss": 2.8527, "step": 1990 }, { "crossentropy": 2.7889420986175537, "epoch": 0.108268305282906, "grad_norm": 0.04678786173462868, "grad_norm_var": 2.1239375489236615e-05, "learning_rate": 0.00954153006802119, "loss": 2.7889, "step": 1991 }, { "crossentropy": 2.7070255279541016, "epoch": 0.10832268414040622, "grad_norm": 0.04493130370974541, "grad_norm_var": 2.1963980692448157e-05, "learning_rate": 0.009541009059937414, "loss": 2.707, "step": 1992 }, { "crossentropy": 2.692754030227661, "epoch": 0.10837706299790642, "grad_norm": 0.04428020492196083, "grad_norm_var": 2.410410509733433e-05, "learning_rate": 0.009540487770224487, "loss": 2.6928, "step": 1993 }, { "crossentropy": 2.7984561920166016, "epoch": 0.10843144185540662, "grad_norm": 0.04596096649765968, "grad_norm_var": 2.5016281584923593e-05, "learning_rate": 0.009539966198914741, "loss": 2.7985, "step": 1994 }, { "crossentropy": 2.7924124002456665, "epoch": 0.10848582071290683, "grad_norm": 0.05357922241091728, "grad_norm_var": 2.2250083456188766e-05, "learning_rate": 0.009539444346040524, "loss": 2.7924, "step": 1995 }, { "crossentropy": 2.611176371574402, "epoch": 0.10854019957040703, "grad_norm": 0.05148400738835335, "grad_norm_var": 2.2094151044048788e-05, "learning_rate": 0.009538922211634202, "loss": 2.6112, "step": 1996 }, { "crossentropy": 2.9042190313339233, "epoch": 0.10859457842790723, "grad_norm": 0.06143448129296303, "grad_norm_var": 1.8140338027383685e-05, "learning_rate": 0.009538399795728156, "loss": 2.9042, "step": 1997 }, { "crossentropy": 2.6991515159606934, "epoch": 0.10864895728540744, "grad_norm": 0.07075159251689911, "grad_norm_var": 4.693844004599601e-05, "learning_rate": 0.009537877098354786, "loss": 2.6992, "step": 1998 }, { "crossentropy": 2.842332124710083, "epoch": 0.10870333614290764, "grad_norm": 0.043959055095911026, "grad_norm_var": 4.969564528921518e-05, "learning_rate": 0.009537354119546508, "loss": 2.8423, "step": 1999 }, { "crossentropy": 2.7328356504440308, "epoch": 0.10875771500040785, "grad_norm": 0.04932745173573494, "grad_norm_var": 4.914205990828585e-05, "learning_rate": 0.009536830859335758, "loss": 2.7328, "step": 2000 }, { "crossentropy": 2.8630950450897217, "epoch": 0.10881209385790805, "grad_norm": 0.053765520453453064, "grad_norm_var": 4.944857799556978e-05, "learning_rate": 0.009536307317754988, "loss": 2.8631, "step": 2001 }, { "crossentropy": 2.844437003135681, "epoch": 0.10886647271540825, "grad_norm": 0.06577242910861969, "grad_norm_var": 6.245205621452452e-05, "learning_rate": 0.009535783494836667, "loss": 2.8444, "step": 2002 }, { "crossentropy": 2.7995342016220093, "epoch": 0.10892085157290846, "grad_norm": 0.06355582922697067, "grad_norm_var": 7.051236088650714e-05, "learning_rate": 0.009535259390613282, "loss": 2.7995, "step": 2003 }, { "crossentropy": 2.7878568172454834, "epoch": 0.10897523043040866, "grad_norm": 0.049859121441841125, "grad_norm_var": 7.101496978228755e-05, "learning_rate": 0.009534735005117338, "loss": 2.7879, "step": 2004 }, { "crossentropy": 2.7625256776809692, "epoch": 0.10902960928790886, "grad_norm": 0.04676175117492676, "grad_norm_var": 6.958199606531031e-05, "learning_rate": 0.009534210338381359, "loss": 2.7625, "step": 2005 }, { "crossentropy": 2.8578633069992065, "epoch": 0.10908398814540907, "grad_norm": 0.04992765933275223, "grad_norm_var": 6.964107776097575e-05, "learning_rate": 0.00953368539043788, "loss": 2.8579, "step": 2006 }, { "crossentropy": 2.814407229423523, "epoch": 0.10913836700290927, "grad_norm": 0.04912444204092026, "grad_norm_var": 6.816108160318685e-05, "learning_rate": 0.00953316016131946, "loss": 2.8144, "step": 2007 }, { "crossentropy": 2.758547306060791, "epoch": 0.10919274586040947, "grad_norm": 0.04788427799940109, "grad_norm_var": 6.56159408252888e-05, "learning_rate": 0.009532634651058674, "loss": 2.7585, "step": 2008 }, { "crossentropy": 2.8037891387939453, "epoch": 0.10924712471790968, "grad_norm": 0.049476414918899536, "grad_norm_var": 6.12869280939704e-05, "learning_rate": 0.009532108859688112, "loss": 2.8038, "step": 2009 }, { "crossentropy": 2.8862324953079224, "epoch": 0.10930150357540988, "grad_norm": 0.04413054510951042, "grad_norm_var": 6.328478604435948e-05, "learning_rate": 0.009531582787240383, "loss": 2.8862, "step": 2010 }, { "crossentropy": 2.8469799757003784, "epoch": 0.10935588243291008, "grad_norm": 0.051597535610198975, "grad_norm_var": 6.342332070999179e-05, "learning_rate": 0.009531056433748115, "loss": 2.847, "step": 2011 }, { "crossentropy": 2.773140549659729, "epoch": 0.10941026129041029, "grad_norm": 0.055351853370666504, "grad_norm_var": 6.355034233961309e-05, "learning_rate": 0.00953052979924395, "loss": 2.7731, "step": 2012 }, { "crossentropy": 2.775185227394104, "epoch": 0.10946464014791049, "grad_norm": 0.05842769145965576, "grad_norm_var": 6.085122687368294e-05, "learning_rate": 0.009530002883760552, "loss": 2.7752, "step": 2013 }, { "crossentropy": 2.8050297498703003, "epoch": 0.1095190190054107, "grad_norm": 0.06063608080148697, "grad_norm_var": 4.3445299649545684e-05, "learning_rate": 0.0095294756873306, "loss": 2.805, "step": 2014 }, { "crossentropy": 2.6585901975631714, "epoch": 0.1095733978629109, "grad_norm": 0.05889374390244484, "grad_norm_var": 4.043314574928147e-05, "learning_rate": 0.009528948209986786, "loss": 2.6586, "step": 2015 }, { "crossentropy": 2.7744945287704468, "epoch": 0.1096277767204111, "grad_norm": 0.053430959582328796, "grad_norm_var": 3.9254180209815034e-05, "learning_rate": 0.009528420451761829, "loss": 2.7745, "step": 2016 }, { "crossentropy": 2.722477078437805, "epoch": 0.1096821555779113, "grad_norm": 0.049984369426965714, "grad_norm_var": 4.009568044309458e-05, "learning_rate": 0.009527892412688454, "loss": 2.7225, "step": 2017 }, { "crossentropy": 2.8836625814437866, "epoch": 0.10973653443541151, "grad_norm": 0.05100119113922119, "grad_norm_var": 2.9416090598229027e-05, "learning_rate": 0.009527364092799416, "loss": 2.8837, "step": 2018 }, { "crossentropy": 2.7352218627929688, "epoch": 0.10979091329291171, "grad_norm": 0.048143792897462845, "grad_norm_var": 2.154830659280178e-05, "learning_rate": 0.009526835492127475, "loss": 2.7352, "step": 2019 }, { "crossentropy": 2.8335996866226196, "epoch": 0.10984529215041192, "grad_norm": 0.0468759685754776, "grad_norm_var": 2.2772869306998908e-05, "learning_rate": 0.00952630661070542, "loss": 2.8336, "step": 2020 }, { "crossentropy": 2.890078544616699, "epoch": 0.10989967100791212, "grad_norm": 0.051481232047080994, "grad_norm_var": 2.1275843996250664e-05, "learning_rate": 0.009525777448566046, "loss": 2.8901, "step": 2021 }, { "crossentropy": 2.8558952808380127, "epoch": 0.10995404986541232, "grad_norm": 0.04781372472643852, "grad_norm_var": 2.204002639639892e-05, "learning_rate": 0.009525248005742176, "loss": 2.8559, "step": 2022 }, { "crossentropy": 2.739243268966675, "epoch": 0.11000842872291253, "grad_norm": 0.04585226625204086, "grad_norm_var": 2.375257602336324e-05, "learning_rate": 0.009524718282266644, "loss": 2.7392, "step": 2023 }, { "crossentropy": 2.776674509048462, "epoch": 0.11006280758041273, "grad_norm": 0.051218681037425995, "grad_norm_var": 2.2923833015496033e-05, "learning_rate": 0.0095241882781723, "loss": 2.7767, "step": 2024 }, { "crossentropy": 2.8172924518585205, "epoch": 0.11011718643791293, "grad_norm": 0.06103160232305527, "grad_norm_var": 2.812082605930747e-05, "learning_rate": 0.00952365799349202, "loss": 2.8173, "step": 2025 }, { "crossentropy": 2.723988175392151, "epoch": 0.11017156529541314, "grad_norm": 0.04844589903950691, "grad_norm_var": 2.461757275843499e-05, "learning_rate": 0.009523127428258686, "loss": 2.724, "step": 2026 }, { "crossentropy": 2.7604968547821045, "epoch": 0.11022594415291334, "grad_norm": 0.046312879770994186, "grad_norm_var": 2.7007159424028565e-05, "learning_rate": 0.009522596582505207, "loss": 2.7605, "step": 2027 }, { "crossentropy": 2.755025267601013, "epoch": 0.11028032301041354, "grad_norm": 0.09106031805276871, "grad_norm_var": 0.00012179563680902357, "learning_rate": 0.009522065456264507, "loss": 2.755, "step": 2028 }, { "crossentropy": 2.774956464767456, "epoch": 0.11033470186791376, "grad_norm": 0.047054413706064224, "grad_norm_var": 0.00012379230299263173, "learning_rate": 0.009521534049569519, "loss": 2.775, "step": 2029 }, { "crossentropy": 2.8111021518707275, "epoch": 0.11038908072541397, "grad_norm": 0.049573738127946854, "grad_norm_var": 0.00012121361565059496, "learning_rate": 0.009521002362453205, "loss": 2.8111, "step": 2030 }, { "crossentropy": 2.7173893451690674, "epoch": 0.11044345958291417, "grad_norm": 0.0463222861289978, "grad_norm_var": 0.00012123046260552415, "learning_rate": 0.009520470394948542, "loss": 2.7174, "step": 2031 }, { "crossentropy": 2.794579267501831, "epoch": 0.11049783844041437, "grad_norm": 0.04676072672009468, "grad_norm_var": 0.00012293886021021624, "learning_rate": 0.009519938147088519, "loss": 2.7946, "step": 2032 }, { "crossentropy": 2.7986494302749634, "epoch": 0.11055221729791458, "grad_norm": 0.044858526438474655, "grad_norm_var": 0.0001258275718081086, "learning_rate": 0.009519405618906145, "loss": 2.7986, "step": 2033 }, { "crossentropy": 2.7345775365829468, "epoch": 0.11060659615541478, "grad_norm": 0.04413679614663124, "grad_norm_var": 0.0001292180767539577, "learning_rate": 0.00951887281043445, "loss": 2.7346, "step": 2034 }, { "crossentropy": 2.8043267726898193, "epoch": 0.11066097501291498, "grad_norm": 0.04542164131999016, "grad_norm_var": 0.0001307392672858731, "learning_rate": 0.009518339721706473, "loss": 2.8043, "step": 2035 }, { "crossentropy": 2.838055372238159, "epoch": 0.11071535387041519, "grad_norm": 0.08849797397851944, "grad_norm_var": 0.00021674414802391455, "learning_rate": 0.00951780635275528, "loss": 2.8381, "step": 2036 }, { "crossentropy": 2.7802799940109253, "epoch": 0.11076973272791539, "grad_norm": 0.046212587505578995, "grad_norm_var": 0.00021989031152168753, "learning_rate": 0.00951727270361395, "loss": 2.7803, "step": 2037 }, { "crossentropy": 2.739150643348694, "epoch": 0.1108241115854156, "grad_norm": 0.0456799641251564, "grad_norm_var": 0.00022169614281625268, "learning_rate": 0.009516738774315577, "loss": 2.7392, "step": 2038 }, { "crossentropy": 2.7083078622817993, "epoch": 0.1108784904429158, "grad_norm": 0.044966381043195724, "grad_norm_var": 0.00022259271903328216, "learning_rate": 0.009516204564893278, "loss": 2.7083, "step": 2039 }, { "crossentropy": 2.752472162246704, "epoch": 0.110932869300416, "grad_norm": 0.05105699226260185, "grad_norm_var": 0.00022263215517429643, "learning_rate": 0.00951567007538018, "loss": 2.7525, "step": 2040 }, { "crossentropy": 2.729441523551941, "epoch": 0.1109872481579162, "grad_norm": 0.075676828622818, "grad_norm_var": 0.0002517947192782541, "learning_rate": 0.009515135305809436, "loss": 2.7294, "step": 2041 }, { "crossentropy": 2.6897388696670532, "epoch": 0.11104162701541641, "grad_norm": 0.05262896418571472, "grad_norm_var": 0.000249858985672041, "learning_rate": 0.00951460025621421, "loss": 2.6897, "step": 2042 }, { "crossentropy": 2.873960018157959, "epoch": 0.11109600587291661, "grad_norm": 0.054703932255506516, "grad_norm_var": 0.00024550388621368983, "learning_rate": 0.009514064926627683, "loss": 2.874, "step": 2043 }, { "crossentropy": 2.8211594820022583, "epoch": 0.11115038473041682, "grad_norm": 0.0471484400331974, "grad_norm_var": 0.00015291791149244684, "learning_rate": 0.00951352931708306, "loss": 2.8212, "step": 2044 }, { "crossentropy": 2.59524405002594, "epoch": 0.11120476358791702, "grad_norm": 0.18334272503852844, "grad_norm_var": 0.001225430556714574, "learning_rate": 0.009512993427613555, "loss": 2.5952, "step": 2045 }, { "crossentropy": 2.8241628408432007, "epoch": 0.11125914244541722, "grad_norm": 0.05173807218670845, "grad_norm_var": 0.0012225884941187279, "learning_rate": 0.009512457258252406, "loss": 2.8242, "step": 2046 }, { "crossentropy": 2.979233980178833, "epoch": 0.11131352130291743, "grad_norm": 0.18533529341220856, "grad_norm_var": 0.002166256625056511, "learning_rate": 0.009511920809032863, "loss": 2.9792, "step": 2047 }, { "crossentropy": 2.912995219230652, "epoch": 0.11136790016041763, "grad_norm": 0.4874607026576996, "grad_norm_var": 0.012982707232005983, "learning_rate": 0.0095113840799882, "loss": 2.913, "step": 2048 }, { "crossentropy": 2.8120940923690796, "epoch": 0.11142227901791783, "grad_norm": 0.08575120568275452, "grad_norm_var": 0.012803994533900635, "learning_rate": 0.0095108470711517, "loss": 2.8121, "step": 2049 }, { "crossentropy": 2.803041934967041, "epoch": 0.11147665787541804, "grad_norm": 0.05859273672103882, "grad_norm_var": 0.012710615156065343, "learning_rate": 0.009510309782556672, "loss": 2.803, "step": 2050 }, { "crossentropy": 2.880671501159668, "epoch": 0.11153103673291824, "grad_norm": 0.06134667992591858, "grad_norm_var": 0.012610017962473177, "learning_rate": 0.009509772214236436, "loss": 2.8807, "step": 2051 }, { "crossentropy": 2.77207350730896, "epoch": 0.11158541559041844, "grad_norm": 0.06979472935199738, "grad_norm_var": 0.012663703480082233, "learning_rate": 0.009509234366224332, "loss": 2.7721, "step": 2052 }, { "crossentropy": 2.8328607082366943, "epoch": 0.11163979444791865, "grad_norm": 0.05420541763305664, "grad_norm_var": 0.012610278830093531, "learning_rate": 0.009508696238553715, "loss": 2.8329, "step": 2053 }, { "crossentropy": 2.918425679206848, "epoch": 0.11169417330541885, "grad_norm": 0.0578208826482296, "grad_norm_var": 0.012530604785778186, "learning_rate": 0.009508157831257963, "loss": 2.9184, "step": 2054 }, { "crossentropy": 2.8291555643081665, "epoch": 0.11174855216291905, "grad_norm": 0.09021959453821182, "grad_norm_var": 0.012318401612299835, "learning_rate": 0.009507619144370465, "loss": 2.8292, "step": 2055 }, { "crossentropy": 2.779375433921814, "epoch": 0.11180293102041926, "grad_norm": 0.060418885201215744, "grad_norm_var": 0.012257572937311195, "learning_rate": 0.00950708017792463, "loss": 2.7794, "step": 2056 }, { "crossentropy": 2.8311619758605957, "epoch": 0.11185730987791946, "grad_norm": 0.05026558041572571, "grad_norm_var": 0.012396475096233567, "learning_rate": 0.009506540931953881, "loss": 2.8312, "step": 2057 }, { "crossentropy": 2.732128858566284, "epoch": 0.11191168873541966, "grad_norm": 0.051452238112688065, "grad_norm_var": 0.012404491894255585, "learning_rate": 0.009506001406491668, "loss": 2.7321, "step": 2058 }, { "crossentropy": 2.7992353439331055, "epoch": 0.11196606759291987, "grad_norm": 0.05296521261334419, "grad_norm_var": 0.012415900424902402, "learning_rate": 0.009505461601571449, "loss": 2.7992, "step": 2059 }, { "crossentropy": 2.8012615442276, "epoch": 0.11202044645042007, "grad_norm": 0.05564180016517639, "grad_norm_var": 0.01235717003186986, "learning_rate": 0.009504921517226702, "loss": 2.8013, "step": 2060 }, { "crossentropy": 2.8944756984710693, "epoch": 0.11207482530792028, "grad_norm": 0.051210690289735794, "grad_norm_var": 0.012042099451203232, "learning_rate": 0.00950438115349092, "loss": 2.8945, "step": 2061 }, { "crossentropy": 2.822766900062561, "epoch": 0.11212920416542048, "grad_norm": 0.054455604404211044, "grad_norm_var": 0.012026790027943981, "learning_rate": 0.009503840510397621, "loss": 2.8228, "step": 2062 }, { "crossentropy": 2.8536810874938965, "epoch": 0.11218358302292068, "grad_norm": 0.04913630336523056, "grad_norm_var": 0.011553572294425593, "learning_rate": 0.00950329958798033, "loss": 2.8537, "step": 2063 }, { "crossentropy": 2.81355881690979, "epoch": 0.11223796188042089, "grad_norm": 0.06547041237354279, "grad_norm_var": 0.00014680268741436574, "learning_rate": 0.009502758386272599, "loss": 2.8136, "step": 2064 }, { "crossentropy": 2.835824131965637, "epoch": 0.11229234073792109, "grad_norm": 0.05476335808634758, "grad_norm_var": 0.00010268052169604731, "learning_rate": 0.009502216905307989, "loss": 2.8358, "step": 2065 }, { "crossentropy": 2.8995319604873657, "epoch": 0.1123467195954213, "grad_norm": 0.049590859562158585, "grad_norm_var": 0.00010776586341595673, "learning_rate": 0.009501675145120085, "loss": 2.8995, "step": 2066 }, { "crossentropy": 2.793468952178955, "epoch": 0.11240109845292151, "grad_norm": 0.06921844184398651, "grad_norm_var": 0.00011510148254977685, "learning_rate": 0.009501133105742486, "loss": 2.7935, "step": 2067 }, { "crossentropy": 2.7489415407180786, "epoch": 0.11245547731042171, "grad_norm": 0.047032665461301804, "grad_norm_var": 0.00011332411061359306, "learning_rate": 0.009500590787208807, "loss": 2.7489, "step": 2068 }, { "crossentropy": 2.687772274017334, "epoch": 0.11250985616792192, "grad_norm": 0.049518827348947525, "grad_norm_var": 0.00011651609616117701, "learning_rate": 0.009500048189552685, "loss": 2.6878, "step": 2069 }, { "crossentropy": 2.704944133758545, "epoch": 0.11256423502542212, "grad_norm": 0.058421626687049866, "grad_norm_var": 0.00011661851474070583, "learning_rate": 0.009499505312807768, "loss": 2.7049, "step": 2070 }, { "crossentropy": 2.8810375928878784, "epoch": 0.11261861388292232, "grad_norm": 0.046666115522384644, "grad_norm_var": 4.145961181594663e-05, "learning_rate": 0.009498962157007726, "loss": 2.881, "step": 2071 }, { "crossentropy": 2.7835181951522827, "epoch": 0.11267299274042253, "grad_norm": 0.05001365393400192, "grad_norm_var": 3.951432718774679e-05, "learning_rate": 0.009498418722186248, "loss": 2.7835, "step": 2072 }, { "crossentropy": 2.789811372756958, "epoch": 0.11272737159792273, "grad_norm": 0.04854585975408554, "grad_norm_var": 4.043827589698771e-05, "learning_rate": 0.009497875008377033, "loss": 2.7898, "step": 2073 }, { "crossentropy": 2.7860517501831055, "epoch": 0.11278175045542294, "grad_norm": 0.04772868752479553, "grad_norm_var": 4.226264468503987e-05, "learning_rate": 0.009497331015613803, "loss": 2.7861, "step": 2074 }, { "crossentropy": 2.850027918815613, "epoch": 0.11283612931292314, "grad_norm": 0.9366596341133118, "grad_norm_var": 0.04882787571874572, "learning_rate": 0.009496786743930298, "loss": 2.85, "step": 2075 }, { "crossentropy": 3.200363039970398, "epoch": 0.11289050817042334, "grad_norm": 1.268100380897522, "grad_norm_var": 0.13218070128272366, "learning_rate": 0.009496242193360269, "loss": 3.2004, "step": 2076 }, { "crossentropy": 2.9916290044784546, "epoch": 0.11294488702792355, "grad_norm": 0.2400466501712799, "grad_norm_var": 0.13106201757886685, "learning_rate": 0.009495697363937492, "loss": 2.9916, "step": 2077 }, { "crossentropy": 3.037353992462158, "epoch": 0.11299926588542375, "grad_norm": 0.16416305303573608, "grad_norm_var": 0.12974436388307592, "learning_rate": 0.009495152255695756, "loss": 3.0374, "step": 2078 }, { "crossentropy": 2.9214701652526855, "epoch": 0.11305364474292395, "grad_norm": 0.08844956755638123, "grad_norm_var": 0.12903539955172197, "learning_rate": 0.009494606868668869, "loss": 2.9215, "step": 2079 }, { "crossentropy": 2.9970911741256714, "epoch": 0.11310802360042416, "grad_norm": 0.08313699811697006, "grad_norm_var": 0.1287255918554062, "learning_rate": 0.009494061202890652, "loss": 2.9971, "step": 2080 }, { "crossentropy": 2.921916365623474, "epoch": 0.11316240245792436, "grad_norm": 0.11235897988080978, "grad_norm_var": 0.12776860436839277, "learning_rate": 0.00949351525839495, "loss": 2.9219, "step": 2081 }, { "crossentropy": 2.8396964073181152, "epoch": 0.11321678131542456, "grad_norm": 1.9770301580429077, "grad_norm_var": 0.31873923368730855, "learning_rate": 0.009492969035215622, "loss": 2.8397, "step": 2082 }, { "crossentropy": 2.9167640209198, "epoch": 0.11327116017292477, "grad_norm": 0.1045333594083786, "grad_norm_var": 0.3175871626805649, "learning_rate": 0.009492422533386542, "loss": 2.9168, "step": 2083 }, { "crossentropy": 2.907270908355713, "epoch": 0.11332553903042497, "grad_norm": 0.44564539194107056, "grad_norm_var": 0.3123378047078529, "learning_rate": 0.009491875752941605, "loss": 2.9073, "step": 2084 }, { "crossentropy": 2.9810914993286133, "epoch": 0.11337991788792517, "grad_norm": 0.07636361569166183, "grad_norm_var": 0.3112802583439422, "learning_rate": 0.009491328693914723, "loss": 2.9811, "step": 2085 }, { "crossentropy": 2.909327745437622, "epoch": 0.11343429674542538, "grad_norm": 0.12970861792564392, "grad_norm_var": 0.3087386007082078, "learning_rate": 0.00949078135633982, "loss": 2.9093, "step": 2086 }, { "crossentropy": 2.9440020322799683, "epoch": 0.11348867560292558, "grad_norm": 0.1267697513103485, "grad_norm_var": 0.3057535950556213, "learning_rate": 0.009490233740250846, "loss": 2.944, "step": 2087 }, { "crossentropy": 2.9567081928253174, "epoch": 0.11354305446042579, "grad_norm": 0.12018924951553345, "grad_norm_var": 0.3030794844364166, "learning_rate": 0.009489685845681762, "loss": 2.9567, "step": 2088 }, { "crossentropy": 2.954847812652588, "epoch": 0.11359743331792599, "grad_norm": 0.1141863539814949, "grad_norm_var": 0.30050835038437274, "learning_rate": 0.009489137672666546, "loss": 2.9548, "step": 2089 }, { "crossentropy": 2.9293969869613647, "epoch": 0.11365181217542619, "grad_norm": 0.09348026663064957, "grad_norm_var": 0.2986293808388278, "learning_rate": 0.009488589221239197, "loss": 2.9294, "step": 2090 }, { "crossentropy": 2.8388094902038574, "epoch": 0.1137061910329264, "grad_norm": 0.07124944776296616, "grad_norm_var": 0.28121187782482265, "learning_rate": 0.009488040491433728, "loss": 2.8388, "step": 2091 }, { "crossentropy": 2.93247389793396, "epoch": 0.1137605698904266, "grad_norm": 0.07391330599784851, "grad_norm_var": 0.22033031810048176, "learning_rate": 0.009487491483284176, "loss": 2.9325, "step": 2092 }, { "crossentropy": 2.939478635787964, "epoch": 0.1138149487479268, "grad_norm": 0.07614821195602417, "grad_norm_var": 0.22225573774547602, "learning_rate": 0.009486942196824583, "loss": 2.9395, "step": 2093 }, { "crossentropy": 2.8978688716888428, "epoch": 0.113869327605427, "grad_norm": 0.06685760617256165, "grad_norm_var": 0.22384547223956608, "learning_rate": 0.009486392632089018, "loss": 2.8979, "step": 2094 }, { "crossentropy": 2.841892123222351, "epoch": 0.11392370646292721, "grad_norm": 0.12487170100212097, "grad_norm_var": 0.22321668605173806, "learning_rate": 0.009485842789111566, "loss": 2.8419, "step": 2095 }, { "crossentropy": 2.8985496759414673, "epoch": 0.11397808532042741, "grad_norm": 0.08473096787929535, "grad_norm_var": 0.22318408543469015, "learning_rate": 0.009485292667926325, "loss": 2.8985, "step": 2096 }, { "crossentropy": 2.8687684535980225, "epoch": 0.11403246417792762, "grad_norm": 0.06521760672330856, "grad_norm_var": 0.22410878452788888, "learning_rate": 0.009484742268567416, "loss": 2.8688, "step": 2097 }, { "crossentropy": 2.976984739303589, "epoch": 0.11408684303542782, "grad_norm": 0.07732041925191879, "grad_norm_var": 0.008273834434077304, "learning_rate": 0.009484191591068971, "loss": 2.977, "step": 2098 }, { "crossentropy": 2.897068977355957, "epoch": 0.11414122189292802, "grad_norm": 0.06339573115110397, "grad_norm_var": 0.008440847834471759, "learning_rate": 0.009483640635465146, "loss": 2.8971, "step": 2099 }, { "crossentropy": 2.7725813388824463, "epoch": 0.11419560075042823, "grad_norm": 0.06968480348587036, "grad_norm_var": 0.0006065384434311203, "learning_rate": 0.009483089401790106, "loss": 2.7726, "step": 2100 }, { "crossentropy": 2.94451367855072, "epoch": 0.11424997960792843, "grad_norm": 0.31161704659461975, "grad_norm_var": 0.00364940615715017, "learning_rate": 0.009482537890078042, "loss": 2.9445, "step": 2101 }, { "crossentropy": 2.9563589096069336, "epoch": 0.11430435846542863, "grad_norm": 0.22530107200145721, "grad_norm_var": 0.004543944573649675, "learning_rate": 0.009481986100363158, "loss": 2.9564, "step": 2102 }, { "crossentropy": 2.8643985986709595, "epoch": 0.11435873732292884, "grad_norm": 0.07735291123390198, "grad_norm_var": 0.0045881083299227935, "learning_rate": 0.009481434032679674, "loss": 2.8644, "step": 2103 }, { "crossentropy": 2.8870162963867188, "epoch": 0.11441311618042906, "grad_norm": 0.08481944352388382, "grad_norm_var": 0.004605133660911662, "learning_rate": 0.009480881687061831, "loss": 2.887, "step": 2104 }, { "crossentropy": 2.8791340589523315, "epoch": 0.11446749503792926, "grad_norm": 0.0634978786110878, "grad_norm_var": 0.004703692669045382, "learning_rate": 0.009480329063543881, "loss": 2.8791, "step": 2105 }, { "crossentropy": 2.949547290802002, "epoch": 0.11452187389542946, "grad_norm": 0.07269595563411713, "grad_norm_var": 0.004753861923168009, "learning_rate": 0.0094797761621601, "loss": 2.9495, "step": 2106 }, { "crossentropy": 2.9937727451324463, "epoch": 0.11457625275292967, "grad_norm": 0.15655401349067688, "grad_norm_var": 0.004875492939309389, "learning_rate": 0.009479222982944778, "loss": 2.9938, "step": 2107 }, { "crossentropy": 2.8167532682418823, "epoch": 0.11463063161042987, "grad_norm": 0.08103574812412262, "grad_norm_var": 0.004848312070137332, "learning_rate": 0.009478669525932223, "loss": 2.8168, "step": 2108 }, { "crossentropy": 2.9547911882400513, "epoch": 0.11468501046793007, "grad_norm": 0.2055923491716385, "grad_norm_var": 0.0053748275044323, "learning_rate": 0.009478115791156759, "loss": 2.9548, "step": 2109 }, { "crossentropy": 2.873998522758484, "epoch": 0.11473938932543028, "grad_norm": 0.06987489759922028, "grad_norm_var": 0.005356266287482899, "learning_rate": 0.00947756177865273, "loss": 2.874, "step": 2110 }, { "crossentropy": 2.8001710176467896, "epoch": 0.11479376818293048, "grad_norm": 0.06383734196424484, "grad_norm_var": 0.005505481573905564, "learning_rate": 0.009477007488454493, "loss": 2.8002, "step": 2111 }, { "crossentropy": 2.905833601951599, "epoch": 0.11484814704043068, "grad_norm": 0.05632232502102852, "grad_norm_var": 0.005654602688358663, "learning_rate": 0.009476452920596425, "loss": 2.9058, "step": 2112 }, { "crossentropy": 2.9526705741882324, "epoch": 0.11490252589793089, "grad_norm": 0.07754094153642654, "grad_norm_var": 0.005592142602700255, "learning_rate": 0.00947589807511292, "loss": 2.9527, "step": 2113 }, { "crossentropy": 2.9172762632369995, "epoch": 0.11495690475543109, "grad_norm": 0.07146690040826797, "grad_norm_var": 0.005619615974453914, "learning_rate": 0.00947534295203839, "loss": 2.9173, "step": 2114 }, { "crossentropy": 2.724813222885132, "epoch": 0.1150112836129313, "grad_norm": 0.117667056620121, "grad_norm_var": 0.005470721362826989, "learning_rate": 0.009474787551407262, "loss": 2.7248, "step": 2115 }, { "crossentropy": 2.9537153244018555, "epoch": 0.1150656624704315, "grad_norm": 0.31084147095680237, "grad_norm_var": 0.007719047434960815, "learning_rate": 0.009474231873253983, "loss": 2.9537, "step": 2116 }, { "crossentropy": 2.8951624631881714, "epoch": 0.1151200413279317, "grad_norm": 0.059363286942243576, "grad_norm_var": 0.005516131757598331, "learning_rate": 0.009473675917613012, "loss": 2.8952, "step": 2117 }, { "crossentropy": 2.967040181159973, "epoch": 0.1151744201854319, "grad_norm": 0.0512191466987133, "grad_norm_var": 0.004782899965674792, "learning_rate": 0.009473119684518834, "loss": 2.967, "step": 2118 }, { "crossentropy": 2.8733952045440674, "epoch": 0.11522879904293211, "grad_norm": 0.08620316535234451, "grad_norm_var": 0.00475961950630453, "learning_rate": 0.009472563174005941, "loss": 2.8734, "step": 2119 }, { "crossentropy": 2.964674472808838, "epoch": 0.11528317790043231, "grad_norm": 0.07530087232589722, "grad_norm_var": 0.004786811693240855, "learning_rate": 0.009472006386108853, "loss": 2.9647, "step": 2120 }, { "crossentropy": 2.9154410362243652, "epoch": 0.11533755675793252, "grad_norm": 0.060242779552936554, "grad_norm_var": 0.004803832078228436, "learning_rate": 0.009471449320862095, "loss": 2.9154, "step": 2121 }, { "crossentropy": 2.682347536087036, "epoch": 0.11539193561543272, "grad_norm": 0.058877166360616684, "grad_norm_var": 0.0048678895228058735, "learning_rate": 0.00947089197830022, "loss": 2.6823, "step": 2122 }, { "crossentropy": 2.809948205947876, "epoch": 0.11544631447293292, "grad_norm": 0.04929093271493912, "grad_norm_var": 0.004779887630622489, "learning_rate": 0.009470334358457793, "loss": 2.8099, "step": 2123 }, { "crossentropy": 2.8447229862213135, "epoch": 0.11550069333043313, "grad_norm": 0.09155218303203583, "grad_norm_var": 0.004769438575428464, "learning_rate": 0.009469776461369397, "loss": 2.8447, "step": 2124 }, { "crossentropy": 2.9923747777938843, "epoch": 0.11555507218793333, "grad_norm": 0.05937037244439125, "grad_norm_var": 0.00393156234542839, "learning_rate": 0.009469218287069632, "loss": 2.9924, "step": 2125 }, { "crossentropy": 2.8511734008789062, "epoch": 0.11560945104543353, "grad_norm": 0.05746982991695404, "grad_norm_var": 0.0039660908677613465, "learning_rate": 0.009468659835593116, "loss": 2.8512, "step": 2126 }, { "crossentropy": 2.8233343362808228, "epoch": 0.11566382990293374, "grad_norm": 0.05683409050107002, "grad_norm_var": 0.003988133174916498, "learning_rate": 0.009468101106974483, "loss": 2.8233, "step": 2127 }, { "crossentropy": 2.8916302919387817, "epoch": 0.11571820876043394, "grad_norm": 0.06410728394985199, "grad_norm_var": 0.003963479628922098, "learning_rate": 0.009467542101248384, "loss": 2.8916, "step": 2128 }, { "crossentropy": 2.949025511741638, "epoch": 0.11577258761793414, "grad_norm": 0.06467711180448532, "grad_norm_var": 0.003985259286606093, "learning_rate": 0.00946698281844949, "loss": 2.949, "step": 2129 }, { "crossentropy": 2.915905714035034, "epoch": 0.11582696647543435, "grad_norm": 0.056870393455028534, "grad_norm_var": 0.004021809796977767, "learning_rate": 0.009466423258612484, "loss": 2.9159, "step": 2130 }, { "crossentropy": 2.7957425117492676, "epoch": 0.11588134533293455, "grad_norm": 0.07321950793266296, "grad_norm_var": 0.003936830103376378, "learning_rate": 0.009465863421772074, "loss": 2.7957, "step": 2131 }, { "crossentropy": 2.9050204753875732, "epoch": 0.11593572419043476, "grad_norm": 0.05100365728139877, "grad_norm_var": 0.00014917355981619763, "learning_rate": 0.009465303307962978, "loss": 2.905, "step": 2132 }, { "crossentropy": 3.0354889631271362, "epoch": 0.11599010304793496, "grad_norm": 0.059391412883996964, "grad_norm_var": 0.00014915818940045884, "learning_rate": 0.009464742917219932, "loss": 3.0355, "step": 2133 }, { "crossentropy": 2.9370228052139282, "epoch": 0.11604448190543516, "grad_norm": 0.048276644200086594, "grad_norm_var": 0.00015450845164743696, "learning_rate": 0.009464182249577693, "loss": 2.937, "step": 2134 }, { "crossentropy": 2.860231399536133, "epoch": 0.11609886076293537, "grad_norm": 0.05735395848751068, "grad_norm_var": 0.00011840025773098925, "learning_rate": 0.009463621305071035, "loss": 2.8602, "step": 2135 }, { "crossentropy": 2.9025262594223022, "epoch": 0.11615323962043557, "grad_norm": 0.05410788580775261, "grad_norm_var": 0.00010744553915864203, "learning_rate": 0.009463060083734742, "loss": 2.9025, "step": 2136 }, { "crossentropy": 2.8300459384918213, "epoch": 0.11620761847793577, "grad_norm": 0.046337395906448364, "grad_norm_var": 0.00011938691654925226, "learning_rate": 0.009462498585603624, "loss": 2.83, "step": 2137 }, { "crossentropy": 2.8359177112579346, "epoch": 0.11626199733543598, "grad_norm": 0.04508952051401138, "grad_norm_var": 0.0001320385188329937, "learning_rate": 0.009461936810712506, "loss": 2.8359, "step": 2138 }, { "crossentropy": 2.8740779161453247, "epoch": 0.11631637619293618, "grad_norm": 0.04880150407552719, "grad_norm_var": 0.00013265017400452636, "learning_rate": 0.009461374759096227, "loss": 2.8741, "step": 2139 }, { "crossentropy": 2.8770627975463867, "epoch": 0.11637075505043638, "grad_norm": 0.0456230603158474, "grad_norm_var": 6.149687050174615e-05, "learning_rate": 0.009460812430789644, "loss": 2.8771, "step": 2140 }, { "crossentropy": 2.829633116722107, "epoch": 0.1164251339079366, "grad_norm": 0.050839707255363464, "grad_norm_var": 6.168082429323877e-05, "learning_rate": 0.009460249825827632, "loss": 2.8296, "step": 2141 }, { "crossentropy": 2.799512267112732, "epoch": 0.1164795127654368, "grad_norm": 0.05150776728987694, "grad_norm_var": 6.193923774115428e-05, "learning_rate": 0.009459686944245085, "loss": 2.7995, "step": 2142 }, { "crossentropy": 2.9238271713256836, "epoch": 0.11653389162293701, "grad_norm": 0.04961966350674629, "grad_norm_var": 6.306971777589321e-05, "learning_rate": 0.00945912378607691, "loss": 2.9238, "step": 2143 }, { "crossentropy": 2.848636269569397, "epoch": 0.11658827048043721, "grad_norm": 0.0467463955283165, "grad_norm_var": 5.891997277447727e-05, "learning_rate": 0.00945856035135804, "loss": 2.8486, "step": 2144 }, { "crossentropy": 2.9212448596954346, "epoch": 0.11664264933793742, "grad_norm": 0.0460808165371418, "grad_norm_var": 5.180751155200811e-05, "learning_rate": 0.009457996640123407, "loss": 2.9212, "step": 2145 }, { "crossentropy": 2.8697092533111572, "epoch": 0.11669702819543762, "grad_norm": 0.05580781027674675, "grad_norm_var": 5.1178040549369056e-05, "learning_rate": 0.009457432652407982, "loss": 2.8697, "step": 2146 }, { "crossentropy": 2.9107238054275513, "epoch": 0.11675140705293782, "grad_norm": 0.059697531163692474, "grad_norm_var": 2.4101342039697938e-05, "learning_rate": 0.00945686838824674, "loss": 2.9107, "step": 2147 }, { "crossentropy": 2.8955368995666504, "epoch": 0.11680578591043803, "grad_norm": 0.0576779767870903, "grad_norm_var": 2.6872918943390833e-05, "learning_rate": 0.009456303847674674, "loss": 2.8955, "step": 2148 }, { "crossentropy": 2.9388025999069214, "epoch": 0.11686016476793823, "grad_norm": 0.04436039552092552, "grad_norm_var": 2.5047787006271295e-05, "learning_rate": 0.009455739030726799, "loss": 2.9388, "step": 2149 }, { "crossentropy": 2.707465171813965, "epoch": 0.11691454362543843, "grad_norm": 0.047072477638721466, "grad_norm_var": 2.5494663002872896e-05, "learning_rate": 0.009455173937438143, "loss": 2.7075, "step": 2150 }, { "crossentropy": 2.791179656982422, "epoch": 0.11696892248293864, "grad_norm": 0.10261647403240204, "grad_norm_var": 0.0001953831154992961, "learning_rate": 0.009454608567843754, "loss": 2.7912, "step": 2151 }, { "crossentropy": 2.9254162311553955, "epoch": 0.11702330134043884, "grad_norm": 0.051599886268377304, "grad_norm_var": 0.00019548908278059165, "learning_rate": 0.009454042921978694, "loss": 2.9254, "step": 2152 }, { "crossentropy": 2.8221906423568726, "epoch": 0.11707768019793904, "grad_norm": 0.046853091567754745, "grad_norm_var": 0.00019504123405748308, "learning_rate": 0.009453476999878044, "loss": 2.8222, "step": 2153 }, { "crossentropy": 2.7921698093414307, "epoch": 0.11713205905543925, "grad_norm": 0.04788936302065849, "grad_norm_var": 0.00019253157349860035, "learning_rate": 0.009452910801576902, "loss": 2.7922, "step": 2154 }, { "crossentropy": 2.7707326412200928, "epoch": 0.11718643791293945, "grad_norm": 0.04825650155544281, "grad_norm_var": 0.00019287700233827683, "learning_rate": 0.009452344327110385, "loss": 2.7707, "step": 2155 }, { "crossentropy": 2.8084789514541626, "epoch": 0.11724081677043965, "grad_norm": 0.04540414735674858, "grad_norm_var": 0.00019310306973847237, "learning_rate": 0.009451777576513623, "loss": 2.8085, "step": 2156 }, { "crossentropy": 2.810156464576721, "epoch": 0.11729519562793986, "grad_norm": 0.05199994146823883, "grad_norm_var": 0.00019281404637886315, "learning_rate": 0.009451210549821768, "loss": 2.8102, "step": 2157 }, { "crossentropy": 2.8690011501312256, "epoch": 0.11734957448544006, "grad_norm": 0.05035027116537094, "grad_norm_var": 0.00019317814820366704, "learning_rate": 0.00945064324706998, "loss": 2.869, "step": 2158 }, { "crossentropy": 2.929554581642151, "epoch": 0.11740395334294026, "grad_norm": 0.23942719399929047, "grad_norm_var": 0.002352932160874469, "learning_rate": 0.00945007566829345, "loss": 2.9296, "step": 2159 }, { "crossentropy": 2.8557852506637573, "epoch": 0.11745833220044047, "grad_norm": 0.0639311820268631, "grad_norm_var": 0.0023293013542733227, "learning_rate": 0.009449507813527377, "loss": 2.8558, "step": 2160 }, { "crossentropy": 2.855257987976074, "epoch": 0.11751271105794067, "grad_norm": 0.0690789446234703, "grad_norm_var": 0.002300698192334887, "learning_rate": 0.009448939682806976, "loss": 2.8553, "step": 2161 }, { "crossentropy": 2.8737372159957886, "epoch": 0.11756708991544088, "grad_norm": 0.08802711963653564, "grad_norm_var": 0.0023148066509791204, "learning_rate": 0.009448371276167485, "loss": 2.8737, "step": 2162 }, { "crossentropy": 2.9556052684783936, "epoch": 0.11762146877294108, "grad_norm": 0.0805511549115181, "grad_norm_var": 0.0023143409511373675, "learning_rate": 0.009447802593644153, "loss": 2.9556, "step": 2163 }, { "crossentropy": 2.9102703332901, "epoch": 0.11767584763044128, "grad_norm": 0.05422380939126015, "grad_norm_var": 0.0023211961708646534, "learning_rate": 0.00944723363527225, "loss": 2.9103, "step": 2164 }, { "crossentropy": 2.7535213232040405, "epoch": 0.11773022648794149, "grad_norm": 0.06383194774389267, "grad_norm_var": 0.0022764377276519596, "learning_rate": 0.009446664401087067, "loss": 2.7535, "step": 2165 }, { "crossentropy": 2.8477317094802856, "epoch": 0.11778460534544169, "grad_norm": 0.049689434468746185, "grad_norm_var": 0.0022681871892349403, "learning_rate": 0.009446094891123899, "loss": 2.8477, "step": 2166 }, { "crossentropy": 2.7821215391159058, "epoch": 0.1178389842029419, "grad_norm": 0.046875230967998505, "grad_norm_var": 0.002235637184039762, "learning_rate": 0.009445525105418073, "loss": 2.7821, "step": 2167 }, { "crossentropy": 2.787624716758728, "epoch": 0.1178933630604421, "grad_norm": 0.05006309971213341, "grad_norm_var": 0.0022392731818316114, "learning_rate": 0.009444955044004926, "loss": 2.7876, "step": 2168 }, { "crossentropy": 2.829988718032837, "epoch": 0.1179477419179423, "grad_norm": 0.05152711644768715, "grad_norm_var": 0.002227130541089815, "learning_rate": 0.00944438470691981, "loss": 2.83, "step": 2169 }, { "crossentropy": 2.943776249885559, "epoch": 0.1180021207754425, "grad_norm": 0.046964406967163086, "grad_norm_var": 0.002229765384936669, "learning_rate": 0.009443814094198097, "loss": 2.9438, "step": 2170 }, { "crossentropy": 2.8090137243270874, "epoch": 0.11805649963294271, "grad_norm": 0.05138426274061203, "grad_norm_var": 0.0022218250609365568, "learning_rate": 0.009443243205875177, "loss": 2.809, "step": 2171 }, { "crossentropy": 2.7843780517578125, "epoch": 0.11811087849044291, "grad_norm": 0.04376081004738808, "grad_norm_var": 0.0022271547863961746, "learning_rate": 0.009442672041986457, "loss": 2.7844, "step": 2172 }, { "crossentropy": 2.8748821020126343, "epoch": 0.11816525734794311, "grad_norm": 0.06835553795099258, "grad_norm_var": 0.0022071164698729455, "learning_rate": 0.009442100602567357, "loss": 2.8749, "step": 2173 }, { "crossentropy": 2.790615200996399, "epoch": 0.11821963620544332, "grad_norm": 0.044494710862636566, "grad_norm_var": 0.0022245052337985976, "learning_rate": 0.009441528887653322, "loss": 2.7906, "step": 2174 }, { "crossentropy": 2.8176294565200806, "epoch": 0.11827401506294352, "grad_norm": 0.04806335270404816, "grad_norm_var": 0.00017783652808945686, "learning_rate": 0.009440956897279803, "loss": 2.8176, "step": 2175 }, { "crossentropy": 2.8201191425323486, "epoch": 0.11832839392044373, "grad_norm": 0.04789638891816139, "grad_norm_var": 0.00018026635472919526, "learning_rate": 0.00944038463148228, "loss": 2.8201, "step": 2176 }, { "crossentropy": 2.7317886352539062, "epoch": 0.11838277277794393, "grad_norm": 0.1052369475364685, "grad_norm_var": 0.00032238562856089405, "learning_rate": 0.00943981209029624, "loss": 2.7318, "step": 2177 }, { "crossentropy": 2.7574896812438965, "epoch": 0.11843715163544415, "grad_norm": 0.06601474434137344, "grad_norm_var": 0.0002669152190351441, "learning_rate": 0.009439239273757194, "loss": 2.7575, "step": 2178 }, { "crossentropy": 2.79841947555542, "epoch": 0.11849153049294435, "grad_norm": 0.05109216272830963, "grad_norm_var": 0.0002303509434218369, "learning_rate": 0.009438666181900668, "loss": 2.7984, "step": 2179 }, { "crossentropy": 2.7522170543670654, "epoch": 0.11854590935044455, "grad_norm": 0.04843165725469589, "grad_norm_var": 0.00023350448809999086, "learning_rate": 0.009438092814762203, "loss": 2.7522, "step": 2180 }, { "crossentropy": 2.888813018798828, "epoch": 0.11860028820794476, "grad_norm": 0.044288940727710724, "grad_norm_var": 0.00023496096328769084, "learning_rate": 0.009437519172377357, "loss": 2.8888, "step": 2181 }, { "crossentropy": 2.7994043827056885, "epoch": 0.11865466706544496, "grad_norm": 0.06102760136127472, "grad_norm_var": 0.0002364659536554135, "learning_rate": 0.009436945254781712, "loss": 2.7994, "step": 2182 }, { "crossentropy": 2.7988197803497314, "epoch": 0.11870904592294516, "grad_norm": 0.05397450178861618, "grad_norm_var": 0.00023219285849596742, "learning_rate": 0.009436371062010857, "loss": 2.7988, "step": 2183 }, { "crossentropy": 2.851779341697693, "epoch": 0.11876342478044537, "grad_norm": 0.05159972608089447, "grad_norm_var": 0.00023129595594893095, "learning_rate": 0.009435796594100406, "loss": 2.8518, "step": 2184 }, { "crossentropy": 3.371079444885254, "epoch": 0.11881780363794557, "grad_norm": 0.7956311702728271, "grad_norm_var": 0.03446691205698444, "learning_rate": 0.009435221851085987, "loss": 3.3711, "step": 2185 }, { "crossentropy": 2.9894721508026123, "epoch": 0.11887218249544577, "grad_norm": 0.37985891103744507, "grad_norm_var": 0.03896077250614955, "learning_rate": 0.00943464683300324, "loss": 2.9895, "step": 2186 }, { "crossentropy": 2.9851841926574707, "epoch": 0.11892656135294598, "grad_norm": 0.1989319771528244, "grad_norm_var": 0.03892098950558309, "learning_rate": 0.009434071539887835, "loss": 2.9852, "step": 2187 }, { "crossentropy": 3.0014199018478394, "epoch": 0.11898094021044618, "grad_norm": 0.17616750299930573, "grad_norm_var": 0.03846260196704317, "learning_rate": 0.009433495971775443, "loss": 3.0014, "step": 2188 }, { "crossentropy": 2.930147886276245, "epoch": 0.11903531906794639, "grad_norm": 0.14368586242198944, "grad_norm_var": 0.03809699892224139, "learning_rate": 0.009432920128701767, "loss": 2.9301, "step": 2189 }, { "crossentropy": 2.759582996368408, "epoch": 0.11908969792544659, "grad_norm": 0.08372559398412704, "grad_norm_var": 0.03766864699145597, "learning_rate": 0.009432344010702516, "loss": 2.7596, "step": 2190 }, { "crossentropy": 2.907257914543152, "epoch": 0.11914407678294679, "grad_norm": 0.09359876066446304, "grad_norm_var": 0.03719618000795979, "learning_rate": 0.009431767617813423, "loss": 2.9073, "step": 2191 }, { "crossentropy": 2.9492493867874146, "epoch": 0.119198455640447, "grad_norm": 0.09447309374809265, "grad_norm_var": 0.036697229021315586, "learning_rate": 0.009431190950070233, "loss": 2.9492, "step": 2192 }, { "crossentropy": 2.9508137702941895, "epoch": 0.1192528344979472, "grad_norm": 0.0821499451994896, "grad_norm_var": 0.03687751936871944, "learning_rate": 0.009430614007508713, "loss": 2.9508, "step": 2193 }, { "crossentropy": 2.8722869157791138, "epoch": 0.1193072133554474, "grad_norm": 0.06780539453029633, "grad_norm_var": 0.03685730014711895, "learning_rate": 0.00943003679016464, "loss": 2.8723, "step": 2194 }, { "crossentropy": 2.8641302585601807, "epoch": 0.1193615922129476, "grad_norm": 0.0936751440167427, "grad_norm_var": 0.03639967651823325, "learning_rate": 0.009429459298073817, "loss": 2.8641, "step": 2195 }, { "crossentropy": 2.871585965156555, "epoch": 0.11941597107044781, "grad_norm": 0.056245435029268265, "grad_norm_var": 0.036293180201117194, "learning_rate": 0.00942888153127206, "loss": 2.8716, "step": 2196 }, { "crossentropy": 2.96267569065094, "epoch": 0.11947034992794801, "grad_norm": 0.06567955762147903, "grad_norm_var": 0.036006583924000586, "learning_rate": 0.009428303489795196, "loss": 2.9627, "step": 2197 }, { "crossentropy": 2.9391520023345947, "epoch": 0.11952472878544822, "grad_norm": 0.061334699392318726, "grad_norm_var": 0.036002695332748524, "learning_rate": 0.009427725173679081, "loss": 2.9392, "step": 2198 }, { "crossentropy": 2.914541244506836, "epoch": 0.11957910764294842, "grad_norm": 0.05423929542303085, "grad_norm_var": 0.035999092022526935, "learning_rate": 0.009427146582959576, "loss": 2.9145, "step": 2199 }, { "crossentropy": 2.9251681566238403, "epoch": 0.11963348650044862, "grad_norm": 0.05365186929702759, "grad_norm_var": 0.035970741401011176, "learning_rate": 0.00942656771767257, "loss": 2.9252, "step": 2200 }, { "crossentropy": 2.9576514959335327, "epoch": 0.11968786535794883, "grad_norm": 0.05053821951150894, "grad_norm_var": 0.007153977337415282, "learning_rate": 0.009425988577853957, "loss": 2.9577, "step": 2201 }, { "crossentropy": 2.818699836730957, "epoch": 0.11974224421544903, "grad_norm": 0.05419899523258209, "grad_norm_var": 0.002053242209934157, "learning_rate": 0.009425409163539662, "loss": 2.8187, "step": 2202 }, { "crossentropy": 2.8457618951797485, "epoch": 0.11979662307294923, "grad_norm": 0.06372056156396866, "grad_norm_var": 0.0012208754721220192, "learning_rate": 0.009424829474765617, "loss": 2.8458, "step": 2203 }, { "crossentropy": 2.9508776664733887, "epoch": 0.11985100193044944, "grad_norm": 0.06797622889280319, "grad_norm_var": 0.0005786200069434602, "learning_rate": 0.009424249511567772, "loss": 2.9509, "step": 2204 }, { "crossentropy": 2.9040348529815674, "epoch": 0.11990538078794964, "grad_norm": 0.04126675799489021, "grad_norm_var": 0.0002849060335634636, "learning_rate": 0.009423669273982097, "loss": 2.904, "step": 2205 }, { "crossentropy": 2.909958004951477, "epoch": 0.11995975964544985, "grad_norm": 0.04354604706168175, "grad_norm_var": 0.000300313768221079, "learning_rate": 0.009423088762044578, "loss": 2.91, "step": 2206 }, { "crossentropy": 2.802647113800049, "epoch": 0.12001413850295005, "grad_norm": 0.049459151923656464, "grad_norm_var": 0.0002552791921684042, "learning_rate": 0.009422507975791217, "loss": 2.8026, "step": 2207 }, { "crossentropy": 2.918286681175232, "epoch": 0.12006851736045025, "grad_norm": 0.073923259973526, "grad_norm_var": 0.00019406032113596737, "learning_rate": 0.009421926915258035, "loss": 2.9183, "step": 2208 }, { "crossentropy": 2.8220763206481934, "epoch": 0.12012289621795046, "grad_norm": 0.04493788257241249, "grad_norm_var": 0.0001767262939985109, "learning_rate": 0.009421345580481069, "loss": 2.8221, "step": 2209 }, { "crossentropy": 2.821146249771118, "epoch": 0.12017727507545066, "grad_norm": 0.04223363474011421, "grad_norm_var": 0.0001871894879639739, "learning_rate": 0.009420763971496372, "loss": 2.8211, "step": 2210 }, { "crossentropy": 2.926906704902649, "epoch": 0.12023165393295086, "grad_norm": 0.06632624566555023, "grad_norm_var": 0.00010125495867000691, "learning_rate": 0.009420182088340013, "loss": 2.9269, "step": 2211 }, { "crossentropy": 2.946037769317627, "epoch": 0.12028603279045107, "grad_norm": 0.048614177852869034, "grad_norm_var": 0.0001042174960297075, "learning_rate": 0.009419599931048083, "loss": 2.946, "step": 2212 }, { "crossentropy": 2.875656485557556, "epoch": 0.12034041164795127, "grad_norm": 0.045896146446466446, "grad_norm_var": 0.00010078000412927848, "learning_rate": 0.009419017499656686, "loss": 2.8757, "step": 2213 }, { "crossentropy": 2.697214722633362, "epoch": 0.12039479050545147, "grad_norm": 0.05663526803255081, "grad_norm_var": 9.748075731772221e-05, "learning_rate": 0.009418434794201944, "loss": 2.6972, "step": 2214 }, { "crossentropy": 2.7779048681259155, "epoch": 0.12044916936295169, "grad_norm": 0.051680177450180054, "grad_norm_var": 9.766263383658505e-05, "learning_rate": 0.009417851814719993, "loss": 2.7779, "step": 2215 }, { "crossentropy": 2.8700636625289917, "epoch": 0.1205035482204519, "grad_norm": 0.061861440539360046, "grad_norm_var": 0.00010213664963517402, "learning_rate": 0.009417268561246995, "loss": 2.8701, "step": 2216 }, { "crossentropy": 2.8082425594329834, "epoch": 0.1205579270779521, "grad_norm": 0.05888569355010986, "grad_norm_var": 0.00010272120738325508, "learning_rate": 0.009416685033819115, "loss": 2.8082, "step": 2217 }, { "crossentropy": 2.8826366662979126, "epoch": 0.1206123059354523, "grad_norm": 0.04235955700278282, "grad_norm_var": 0.00011187442827770237, "learning_rate": 0.00941610123247255, "loss": 2.8826, "step": 2218 }, { "crossentropy": 2.872837543487549, "epoch": 0.1206666847929525, "grad_norm": 0.14639060199260712, "grad_norm_var": 0.0006493895590599785, "learning_rate": 0.009415517157243503, "loss": 2.8728, "step": 2219 }, { "crossentropy": 2.790416121482849, "epoch": 0.12072106365045271, "grad_norm": 0.05372565984725952, "grad_norm_var": 0.0006447880357378246, "learning_rate": 0.009414932808168198, "loss": 2.7904, "step": 2220 }, { "crossentropy": 2.8818737268447876, "epoch": 0.12077544250795291, "grad_norm": 0.043669361621141434, "grad_norm_var": 0.0006397935424113182, "learning_rate": 0.009414348185282875, "loss": 2.8819, "step": 2221 }, { "crossentropy": 2.8466269969940186, "epoch": 0.12082982136545312, "grad_norm": 0.154111847281456, "grad_norm_var": 0.0011887858626651287, "learning_rate": 0.009413763288623795, "loss": 2.8466, "step": 2222 }, { "crossentropy": 2.8395906686782837, "epoch": 0.12088420022295332, "grad_norm": 0.07123851776123047, "grad_norm_var": 0.001173173971849998, "learning_rate": 0.00941317811822723, "loss": 2.8396, "step": 2223 }, { "crossentropy": 2.8879709243774414, "epoch": 0.12093857908045352, "grad_norm": 0.04820388928055763, "grad_norm_var": 0.0011887368901422528, "learning_rate": 0.009412592674129472, "loss": 2.888, "step": 2224 }, { "crossentropy": 2.93408739566803, "epoch": 0.12099295793795373, "grad_norm": 0.04451332986354828, "grad_norm_var": 0.0011898723851106775, "learning_rate": 0.009412006956366832, "loss": 2.9341, "step": 2225 }, { "crossentropy": 2.826335906982422, "epoch": 0.12104733679545393, "grad_norm": 0.04704746976494789, "grad_norm_var": 0.001176854827263146, "learning_rate": 0.009411420964975633, "loss": 2.8263, "step": 2226 }, { "crossentropy": 2.821158766746521, "epoch": 0.12110171565295413, "grad_norm": 0.048422589898109436, "grad_norm_var": 0.0011938956568567913, "learning_rate": 0.009410834699992217, "loss": 2.8212, "step": 2227 }, { "crossentropy": 2.8863608837127686, "epoch": 0.12115609451045434, "grad_norm": 0.046605516225099564, "grad_norm_var": 0.0011982560234419124, "learning_rate": 0.009410248161452946, "loss": 2.8864, "step": 2228 }, { "crossentropy": 2.821031093597412, "epoch": 0.12121047336795454, "grad_norm": 0.043547872453927994, "grad_norm_var": 0.001204215175308626, "learning_rate": 0.009409661349394196, "loss": 2.821, "step": 2229 }, { "crossentropy": 2.8142969608306885, "epoch": 0.12126485222545474, "grad_norm": 0.04927245154976845, "grad_norm_var": 0.001214520395839854, "learning_rate": 0.00940907426385236, "loss": 2.8143, "step": 2230 }, { "crossentropy": 2.791648507118225, "epoch": 0.12131923108295495, "grad_norm": 0.04389585182070732, "grad_norm_var": 0.0012302859632337191, "learning_rate": 0.00940848690486385, "loss": 2.7916, "step": 2231 }, { "crossentropy": 2.8288965225219727, "epoch": 0.12137360994045515, "grad_norm": 0.04689272865653038, "grad_norm_var": 0.001246032292332269, "learning_rate": 0.009407899272465091, "loss": 2.8289, "step": 2232 }, { "crossentropy": 2.8763798475265503, "epoch": 0.12142798879795536, "grad_norm": 0.04674411937594414, "grad_norm_var": 0.0012599620824149502, "learning_rate": 0.009407311366692529, "loss": 2.8764, "step": 2233 }, { "crossentropy": 2.8465341329574585, "epoch": 0.12148236765545556, "grad_norm": 0.04622102901339531, "grad_norm_var": 0.0012512761061867549, "learning_rate": 0.009406723187582624, "loss": 2.8465, "step": 2234 }, { "crossentropy": 2.806950092315674, "epoch": 0.12153674651295576, "grad_norm": 0.04448737949132919, "grad_norm_var": 0.0007439062226979783, "learning_rate": 0.009406134735171856, "loss": 2.807, "step": 2235 }, { "crossentropy": 2.77016019821167, "epoch": 0.12159112537045597, "grad_norm": 0.04417360574007034, "grad_norm_var": 0.0007511203687495469, "learning_rate": 0.00940554600949672, "loss": 2.7702, "step": 2236 }, { "crossentropy": 2.9578120708465576, "epoch": 0.12164550422795617, "grad_norm": 0.054780155420303345, "grad_norm_var": 0.0007430644117478387, "learning_rate": 0.009404957010593728, "loss": 2.9578, "step": 2237 }, { "crossentropy": 2.8667707443237305, "epoch": 0.12169988308545637, "grad_norm": 0.0450904555618763, "grad_norm_var": 4.5354065070435144e-05, "learning_rate": 0.00940436773849941, "loss": 2.8668, "step": 2238 }, { "crossentropy": 2.8336082696914673, "epoch": 0.12175426194295658, "grad_norm": 0.05266309157013893, "grad_norm_var": 9.849672543321639e-06, "learning_rate": 0.009403778193250308, "loss": 2.8336, "step": 2239 }, { "crossentropy": 2.836042642593384, "epoch": 0.12180864080045678, "grad_norm": 0.05883004143834114, "grad_norm_var": 1.8562836958153858e-05, "learning_rate": 0.009403188374882989, "loss": 2.836, "step": 2240 }, { "crossentropy": 2.8175312280654907, "epoch": 0.12186301965795698, "grad_norm": 0.05127064511179924, "grad_norm_var": 1.854625080885385e-05, "learning_rate": 0.009402598283434031, "loss": 2.8175, "step": 2241 }, { "crossentropy": 2.791464328765869, "epoch": 0.12191739851545719, "grad_norm": 0.0423603430390358, "grad_norm_var": 2.0590577740627e-05, "learning_rate": 0.009402007918940034, "loss": 2.7915, "step": 2242 }, { "crossentropy": 2.744749903678894, "epoch": 0.12197177737295739, "grad_norm": 0.04713468253612518, "grad_norm_var": 2.0592249264199334e-05, "learning_rate": 0.00940141728143761, "loss": 2.7448, "step": 2243 }, { "crossentropy": 2.7558188438415527, "epoch": 0.1220261562304576, "grad_norm": 0.049301162362098694, "grad_norm_var": 2.0635730859115918e-05, "learning_rate": 0.00940082637096339, "loss": 2.7558, "step": 2244 }, { "crossentropy": 2.8938353061676025, "epoch": 0.1220805350879578, "grad_norm": 0.04397781938314438, "grad_norm_var": 2.039684140341848e-05, "learning_rate": 0.00940023518755402, "loss": 2.8938, "step": 2245 }, { "crossentropy": 2.7029471397399902, "epoch": 0.122134913945458, "grad_norm": 0.042637161910533905, "grad_norm_var": 2.1972778515902402e-05, "learning_rate": 0.009399643731246165, "loss": 2.7029, "step": 2246 }, { "crossentropy": 2.84119713306427, "epoch": 0.1221892928029582, "grad_norm": 0.046221815049648285, "grad_norm_var": 2.1184239783654005e-05, "learning_rate": 0.00939905200207651, "loss": 2.8412, "step": 2247 }, { "crossentropy": 2.854127049446106, "epoch": 0.12224367166045841, "grad_norm": 0.04728970304131508, "grad_norm_var": 2.1152729051084467e-05, "learning_rate": 0.00939846000008175, "loss": 2.8541, "step": 2248 }, { "crossentropy": 2.928267478942871, "epoch": 0.12229805051795861, "grad_norm": 0.04288195073604584, "grad_norm_var": 2.2576696622757474e-05, "learning_rate": 0.009397867725298601, "loss": 2.9283, "step": 2249 }, { "crossentropy": 2.864618182182312, "epoch": 0.12235242937545882, "grad_norm": 0.04758854955434799, "grad_norm_var": 2.246811351877305e-05, "learning_rate": 0.009397275177763799, "loss": 2.8646, "step": 2250 }, { "crossentropy": 2.774232029914856, "epoch": 0.12240680823295902, "grad_norm": 0.04961799457669258, "grad_norm_var": 2.202299516623855e-05, "learning_rate": 0.009396682357514086, "loss": 2.7742, "step": 2251 }, { "crossentropy": 2.812814712524414, "epoch": 0.12246118709045922, "grad_norm": 0.04517243802547455, "grad_norm_var": 2.1593911440583266e-05, "learning_rate": 0.009396089264586233, "loss": 2.8128, "step": 2252 }, { "crossentropy": 2.7526957988739014, "epoch": 0.12251556594795944, "grad_norm": 0.0424431748688221, "grad_norm_var": 1.9832075136309798e-05, "learning_rate": 0.009395495899017024, "loss": 2.7527, "step": 2253 }, { "crossentropy": 2.8866764307022095, "epoch": 0.12256994480545964, "grad_norm": 0.3203655183315277, "grad_norm_var": 0.0046800765466452395, "learning_rate": 0.009394902260843255, "loss": 2.8867, "step": 2254 }, { "crossentropy": 2.734421133995056, "epoch": 0.12262432366295985, "grad_norm": 0.06810655444860458, "grad_norm_var": 0.0046708978976015055, "learning_rate": 0.009394308350101745, "loss": 2.7344, "step": 2255 }, { "crossentropy": 2.770306348800659, "epoch": 0.12267870252046005, "grad_norm": 0.057187844067811966, "grad_norm_var": 0.004672488576115954, "learning_rate": 0.00939371416682933, "loss": 2.7703, "step": 2256 }, { "crossentropy": 2.783417820930481, "epoch": 0.12273308137796025, "grad_norm": 0.06446049362421036, "grad_norm_var": 0.004658825742778242, "learning_rate": 0.009393119711062858, "loss": 2.7834, "step": 2257 }, { "crossentropy": 2.8379584550857544, "epoch": 0.12278746023546046, "grad_norm": 0.06856094300746918, "grad_norm_var": 0.004618983971358561, "learning_rate": 0.009392524982839196, "loss": 2.838, "step": 2258 }, { "crossentropy": 2.902474880218506, "epoch": 0.12284183909296066, "grad_norm": 0.05535658448934555, "grad_norm_var": 0.00460068142698374, "learning_rate": 0.009391929982195232, "loss": 2.9025, "step": 2259 }, { "crossentropy": 2.8589812517166138, "epoch": 0.12289621795046086, "grad_norm": 0.04826858267188072, "grad_norm_var": 0.004603349745911231, "learning_rate": 0.009391334709167865, "loss": 2.859, "step": 2260 }, { "crossentropy": 2.899470090866089, "epoch": 0.12295059680796107, "grad_norm": 0.0454040952026844, "grad_norm_var": 0.004598883185503912, "learning_rate": 0.009390739163794012, "loss": 2.8995, "step": 2261 }, { "crossentropy": 2.82949960231781, "epoch": 0.12300497566546127, "grad_norm": 0.0771709531545639, "grad_norm_var": 0.004555610797182084, "learning_rate": 0.009390143346110609, "loss": 2.8295, "step": 2262 }, { "crossentropy": 2.9046950340270996, "epoch": 0.12305935452296148, "grad_norm": 0.05076846852898598, "grad_norm_var": 0.004542256963218652, "learning_rate": 0.00938954725615461, "loss": 2.9047, "step": 2263 }, { "crossentropy": 2.8312032222747803, "epoch": 0.12311373338046168, "grad_norm": 0.04715791717171669, "grad_norm_var": 0.004542668790759571, "learning_rate": 0.009388950893962982, "loss": 2.8312, "step": 2264 }, { "crossentropy": 2.723353147506714, "epoch": 0.12316811223796188, "grad_norm": 0.045330651104450226, "grad_norm_var": 0.004533975178046395, "learning_rate": 0.00938835425957271, "loss": 2.7234, "step": 2265 }, { "crossentropy": 2.8242433071136475, "epoch": 0.12322249109546209, "grad_norm": 0.04593806713819504, "grad_norm_var": 0.0045392556570529805, "learning_rate": 0.0093877573530208, "loss": 2.8242, "step": 2266 }, { "crossentropy": 2.8188211917877197, "epoch": 0.12327686995296229, "grad_norm": 0.0428752601146698, "grad_norm_var": 0.004561056764833549, "learning_rate": 0.009387160174344268, "loss": 2.8188, "step": 2267 }, { "crossentropy": 2.708863854408264, "epoch": 0.1233312488104625, "grad_norm": 0.0639052465558052, "grad_norm_var": 0.00452026412705116, "learning_rate": 0.009386562723580153, "loss": 2.7089, "step": 2268 }, { "crossentropy": 2.7244813442230225, "epoch": 0.1233856276679627, "grad_norm": 0.05901774391531944, "grad_norm_var": 0.0044733166203470434, "learning_rate": 0.009385965000765509, "loss": 2.7245, "step": 2269 }, { "crossentropy": 2.7931222915649414, "epoch": 0.1234400065254629, "grad_norm": 0.047919731587171555, "grad_norm_var": 0.00010821299888469863, "learning_rate": 0.009385367005937404, "loss": 2.7931, "step": 2270 }, { "crossentropy": 2.7707358598709106, "epoch": 0.1234943853829631, "grad_norm": 0.04688471555709839, "grad_norm_var": 0.0001005887096975364, "learning_rate": 0.009384768739132925, "loss": 2.7707, "step": 2271 }, { "crossentropy": 2.752439498901367, "epoch": 0.12354876424046331, "grad_norm": 0.08418815582990646, "grad_norm_var": 0.000157131985297837, "learning_rate": 0.009384170200389176, "loss": 2.7524, "step": 2272 }, { "crossentropy": 2.7939388751983643, "epoch": 0.12360314309796351, "grad_norm": 0.04406420886516571, "grad_norm_var": 0.00015964954076900771, "learning_rate": 0.00938357138974328, "loss": 2.7939, "step": 2273 }, { "crossentropy": 2.8127949237823486, "epoch": 0.12365752195546371, "grad_norm": 0.04933779686689377, "grad_norm_var": 0.00014683568515790242, "learning_rate": 0.009382972307232371, "loss": 2.8128, "step": 2274 }, { "crossentropy": 2.897245407104492, "epoch": 0.12371190081296392, "grad_norm": 0.044619303196668625, "grad_norm_var": 0.0001511675014082891, "learning_rate": 0.009382372952893608, "loss": 2.8972, "step": 2275 }, { "crossentropy": 2.790053963661194, "epoch": 0.12376627967046412, "grad_norm": 0.042153168469667435, "grad_norm_var": 0.00015710043069403761, "learning_rate": 0.00938177332676416, "loss": 2.7901, "step": 2276 }, { "crossentropy": 2.7982579469680786, "epoch": 0.12382065852796433, "grad_norm": 0.04290295019745827, "grad_norm_var": 0.0001597897565877817, "learning_rate": 0.009381173428881215, "loss": 2.7983, "step": 2277 }, { "crossentropy": 2.8307689428329468, "epoch": 0.12387503738546453, "grad_norm": 0.044433001428842545, "grad_norm_var": 0.00011751243653465767, "learning_rate": 0.009380573259281978, "loss": 2.8308, "step": 2278 }, { "crossentropy": 2.894057869911194, "epoch": 0.12392941624296473, "grad_norm": 0.04550347104668617, "grad_norm_var": 0.00011877113821833948, "learning_rate": 0.009379972818003672, "loss": 2.8941, "step": 2279 }, { "crossentropy": 2.816710591316223, "epoch": 0.12398379510046494, "grad_norm": 0.041688524186611176, "grad_norm_var": 0.00012254160837684013, "learning_rate": 0.009379372105083534, "loss": 2.8167, "step": 2280 }, { "crossentropy": 2.778288960456848, "epoch": 0.12403817395796514, "grad_norm": 0.043631091713905334, "grad_norm_var": 0.00012364941339167707, "learning_rate": 0.009378771120558823, "loss": 2.7783, "step": 2281 }, { "crossentropy": 2.8409969806671143, "epoch": 0.12409255281546534, "grad_norm": 0.0855029821395874, "grad_norm_var": 0.00020366401382315984, "learning_rate": 0.009378169864466808, "loss": 2.841, "step": 2282 }, { "crossentropy": 2.857813835144043, "epoch": 0.12414693167296555, "grad_norm": 0.045054711401462555, "grad_norm_var": 0.00020137055352916066, "learning_rate": 0.00937756833684478, "loss": 2.8578, "step": 2283 }, { "crossentropy": 2.9161611795425415, "epoch": 0.12420131053046575, "grad_norm": 0.04577336460351944, "grad_norm_var": 0.00019295614738730151, "learning_rate": 0.009376966537730045, "loss": 2.9162, "step": 2284 }, { "crossentropy": 2.6826655864715576, "epoch": 0.12425568938796595, "grad_norm": 0.046184711158275604, "grad_norm_var": 0.00018917454822545862, "learning_rate": 0.009376364467159927, "loss": 2.6827, "step": 2285 }, { "crossentropy": 2.700788140296936, "epoch": 0.12431006824546616, "grad_norm": 0.04935026541352272, "grad_norm_var": 0.0001889075488721396, "learning_rate": 0.009375762125171765, "loss": 2.7008, "step": 2286 }, { "crossentropy": 2.7785463333129883, "epoch": 0.12436444710296636, "grad_norm": 0.05259003862738609, "grad_norm_var": 0.000188511644611806, "learning_rate": 0.009375159511802914, "loss": 2.7785, "step": 2287 }, { "crossentropy": 2.774510979652405, "epoch": 0.12441882596046656, "grad_norm": 0.043108273297548294, "grad_norm_var": 0.00010911325927526171, "learning_rate": 0.009374556627090748, "loss": 2.7745, "step": 2288 }, { "crossentropy": 2.6886175870895386, "epoch": 0.12447320481796677, "grad_norm": 0.040468621999025345, "grad_norm_var": 0.00011174515162407356, "learning_rate": 0.009373953471072661, "loss": 2.6886, "step": 2289 }, { "crossentropy": 2.879429578781128, "epoch": 0.12452758367546699, "grad_norm": 0.045450203120708466, "grad_norm_var": 0.00011181170978747096, "learning_rate": 0.009373350043786055, "loss": 2.8794, "step": 2290 }, { "crossentropy": 2.7302550077438354, "epoch": 0.12458196253296719, "grad_norm": 0.045355986803770065, "grad_norm_var": 0.00011157240607510919, "learning_rate": 0.009372746345268359, "loss": 2.7303, "step": 2291 }, { "crossentropy": 2.7531075477600098, "epoch": 0.12463634139046739, "grad_norm": 0.04394815117120743, "grad_norm_var": 0.00011050681010939996, "learning_rate": 0.00937214237555701, "loss": 2.7531, "step": 2292 }, { "crossentropy": 2.7813111543655396, "epoch": 0.1246907202479676, "grad_norm": 0.04089430347084999, "grad_norm_var": 0.00011200599699759487, "learning_rate": 0.009371538134689468, "loss": 2.7813, "step": 2293 }, { "crossentropy": 2.8725310564041138, "epoch": 0.1247450991054678, "grad_norm": 0.0426507294178009, "grad_norm_var": 0.00011291758039804796, "learning_rate": 0.009370933622703206, "loss": 2.8725, "step": 2294 }, { "crossentropy": 2.8243449926376343, "epoch": 0.124799477962968, "grad_norm": 0.10177087783813477, "grad_norm_var": 0.000297149101194649, "learning_rate": 0.009370328839635716, "loss": 2.8243, "step": 2295 }, { "crossentropy": 2.8498798608779907, "epoch": 0.1248538568204682, "grad_norm": 0.04353140667080879, "grad_norm_var": 0.0002951129490092139, "learning_rate": 0.009369723785524505, "loss": 2.8499, "step": 2296 }, { "crossentropy": 2.7252964973449707, "epoch": 0.12490823567796841, "grad_norm": 0.041524264961481094, "grad_norm_var": 0.00029744747899981805, "learning_rate": 0.009369118460407099, "loss": 2.7253, "step": 2297 }, { "crossentropy": 2.866705298423767, "epoch": 0.12496261453546861, "grad_norm": 0.041323766112327576, "grad_norm_var": 0.00021514723167910603, "learning_rate": 0.009368512864321039, "loss": 2.8667, "step": 2298 }, { "crossentropy": 2.9126824140548706, "epoch": 0.12501699339296882, "grad_norm": 0.049376167356967926, "grad_norm_var": 0.00021458207980877436, "learning_rate": 0.009367906997303884, "loss": 2.9127, "step": 2299 }, { "crossentropy": 2.777941107749939, "epoch": 0.12507137225046902, "grad_norm": 0.04297281429171562, "grad_norm_var": 0.00021602743042185886, "learning_rate": 0.009367300859393211, "loss": 2.7779, "step": 2300 }, { "crossentropy": 2.6824910640716553, "epoch": 0.12512575110796922, "grad_norm": 0.0431038998067379, "grad_norm_var": 0.00021743051633368407, "learning_rate": 0.009366694450626609, "loss": 2.6825, "step": 2301 }, { "crossentropy": 2.7485066652297974, "epoch": 0.12518012996546943, "grad_norm": 0.19141440093517303, "grad_norm_var": 0.001505082665939647, "learning_rate": 0.009366087771041687, "loss": 2.7485, "step": 2302 }, { "crossentropy": 2.817706823348999, "epoch": 0.12523450882296963, "grad_norm": 0.043551404029130936, "grad_norm_var": 0.0015153138766162038, "learning_rate": 0.009365480820676073, "loss": 2.8177, "step": 2303 }, { "crossentropy": 2.664399027824402, "epoch": 0.12528888768046983, "grad_norm": 0.04283010587096214, "grad_norm_var": 0.0015158071582254508, "learning_rate": 0.009364873599567409, "loss": 2.6644, "step": 2304 }, { "crossentropy": 2.738315224647522, "epoch": 0.12534326653797004, "grad_norm": 0.04320099949836731, "grad_norm_var": 0.0015105205469390997, "learning_rate": 0.009364266107753354, "loss": 2.7383, "step": 2305 }, { "crossentropy": 2.852235794067383, "epoch": 0.12539764539547024, "grad_norm": 0.04192904382944107, "grad_norm_var": 0.0015164509105922723, "learning_rate": 0.009363658345271583, "loss": 2.8522, "step": 2306 }, { "crossentropy": 2.7064690589904785, "epoch": 0.12545202425297045, "grad_norm": 0.04393624886870384, "grad_norm_var": 0.0015186317530887798, "learning_rate": 0.00936305031215979, "loss": 2.7065, "step": 2307 }, { "crossentropy": 2.7381690740585327, "epoch": 0.12550640311047065, "grad_norm": 0.08794784545898438, "grad_norm_var": 0.0015682082406102396, "learning_rate": 0.009362442008455684, "loss": 2.7382, "step": 2308 }, { "crossentropy": 2.746302843093872, "epoch": 0.12556078196797085, "grad_norm": 0.09488429874181747, "grad_norm_var": 0.0016209724567684911, "learning_rate": 0.00936183343419699, "loss": 2.7463, "step": 2309 }, { "crossentropy": 2.9462941884994507, "epoch": 0.12561516082547106, "grad_norm": 0.04035681113600731, "grad_norm_var": 0.001627294896799894, "learning_rate": 0.009361224589421456, "loss": 2.9463, "step": 2310 }, { "crossentropy": 2.810101628303528, "epoch": 0.12566953968297126, "grad_norm": 0.044590696692466736, "grad_norm_var": 0.001529217386175019, "learning_rate": 0.009360615474166839, "loss": 2.8101, "step": 2311 }, { "crossentropy": 2.7312601804733276, "epoch": 0.12572391854047146, "grad_norm": 0.8092495799064636, "grad_norm_var": 0.03664323188023453, "learning_rate": 0.009360006088470913, "loss": 2.7313, "step": 2312 }, { "crossentropy": 2.906070351600647, "epoch": 0.12577829739797167, "grad_norm": 0.044596631079912186, "grad_norm_var": 0.03661725089333793, "learning_rate": 0.009359396432371476, "loss": 2.9061, "step": 2313 }, { "crossentropy": 2.8636475801467896, "epoch": 0.12583267625547187, "grad_norm": 0.08702483028173447, "grad_norm_var": 0.03635015617486298, "learning_rate": 0.009358786505906337, "loss": 2.8636, "step": 2314 }, { "crossentropy": 2.8028982877731323, "epoch": 0.12588705511297207, "grad_norm": 0.05014200508594513, "grad_norm_var": 0.03634406008537061, "learning_rate": 0.009358176309113321, "loss": 2.8029, "step": 2315 }, { "crossentropy": 2.861294150352478, "epoch": 0.12594143397047228, "grad_norm": 0.07034264504909515, "grad_norm_var": 0.03614816215110412, "learning_rate": 0.009357565842030274, "loss": 2.8613, "step": 2316 }, { "crossentropy": 2.7751035690307617, "epoch": 0.12599581282797248, "grad_norm": 0.34502536058425903, "grad_norm_var": 0.039104405770882866, "learning_rate": 0.009356955104695057, "loss": 2.7751, "step": 2317 }, { "crossentropy": 2.7164978981018066, "epoch": 0.12605019168547268, "grad_norm": 0.05840982869267464, "grad_norm_var": 0.0391220585632196, "learning_rate": 0.009356344097145544, "loss": 2.7165, "step": 2318 }, { "crossentropy": 2.9129652976989746, "epoch": 0.1261045705429729, "grad_norm": 0.0699547678232193, "grad_norm_var": 0.03889033149992858, "learning_rate": 0.009355732819419633, "loss": 2.913, "step": 2319 }, { "crossentropy": 2.8206602334976196, "epoch": 0.1261589494004731, "grad_norm": 0.16843672096729279, "grad_norm_var": 0.038527024423249304, "learning_rate": 0.009355121271555234, "loss": 2.8207, "step": 2320 }, { "crossentropy": 2.9006417989730835, "epoch": 0.1262133282579733, "grad_norm": 0.0754302442073822, "grad_norm_var": 0.03821357004444082, "learning_rate": 0.009354509453590273, "loss": 2.9006, "step": 2321 }, { "crossentropy": 2.9302194118499756, "epoch": 0.1262677071154735, "grad_norm": 0.05203067138791084, "grad_norm_var": 0.03809692733807079, "learning_rate": 0.009353897365562698, "loss": 2.9302, "step": 2322 }, { "crossentropy": 2.821760892868042, "epoch": 0.1263220859729737, "grad_norm": 0.07257585972547531, "grad_norm_var": 0.03780466437594966, "learning_rate": 0.009353285007510466, "loss": 2.8218, "step": 2323 }, { "crossentropy": 2.8169476985931396, "epoch": 0.1263764648304739, "grad_norm": 0.05167527496814728, "grad_norm_var": 0.038117780554161686, "learning_rate": 0.009352672379471554, "loss": 2.8169, "step": 2324 }, { "crossentropy": 2.6719452142715454, "epoch": 0.1264308436879741, "grad_norm": 0.049491506069898605, "grad_norm_var": 0.03847979692409953, "learning_rate": 0.009352059481483961, "loss": 2.6719, "step": 2325 }, { "crossentropy": 2.8593426942825317, "epoch": 0.1264852225454743, "grad_norm": 0.04762560874223709, "grad_norm_var": 0.03839565402121205, "learning_rate": 0.009351446313585697, "loss": 2.8593, "step": 2326 }, { "crossentropy": 2.8066651821136475, "epoch": 0.12653960140297452, "grad_norm": 0.04840053245425224, "grad_norm_var": 0.038352648045374624, "learning_rate": 0.009350832875814787, "loss": 2.8067, "step": 2327 }, { "crossentropy": 2.8921083211898804, "epoch": 0.12659398026047472, "grad_norm": 0.04377233609557152, "grad_norm_var": 0.005778389821756049, "learning_rate": 0.009350219168209281, "loss": 2.8921, "step": 2328 }, { "crossentropy": 2.7766581773757935, "epoch": 0.12664835911797492, "grad_norm": 0.04525953158736229, "grad_norm_var": 0.005774984629059164, "learning_rate": 0.009349605190807238, "loss": 2.7767, "step": 2329 }, { "crossentropy": 2.757716417312622, "epoch": 0.12670273797547513, "grad_norm": 0.33295226097106934, "grad_norm_var": 0.009671408202302013, "learning_rate": 0.009348990943646733, "loss": 2.7577, "step": 2330 }, { "crossentropy": 2.8391621112823486, "epoch": 0.12675711683297533, "grad_norm": 0.05032606050372124, "grad_norm_var": 0.009670215104994223, "learning_rate": 0.009348376426765868, "loss": 2.8392, "step": 2331 }, { "crossentropy": 2.8529430627822876, "epoch": 0.12681149569047553, "grad_norm": 0.04729501157999039, "grad_norm_var": 0.009791039277339085, "learning_rate": 0.009347761640202748, "loss": 2.8529, "step": 2332 }, { "crossentropy": 2.862080693244934, "epoch": 0.12686587454797574, "grad_norm": 0.04993967339396477, "grad_norm_var": 0.005491145220875202, "learning_rate": 0.009347146583995507, "loss": 2.8621, "step": 2333 }, { "crossentropy": 2.876030921936035, "epoch": 0.12692025340547594, "grad_norm": 0.044755611568689346, "grad_norm_var": 0.0055402350054799205, "learning_rate": 0.009346531258182286, "loss": 2.876, "step": 2334 }, { "crossentropy": 2.821781873703003, "epoch": 0.12697463226297614, "grad_norm": 0.04613468050956726, "grad_norm_var": 0.0056016304948108015, "learning_rate": 0.00934591566280125, "loss": 2.8218, "step": 2335 }, { "crossentropy": 2.812955856323242, "epoch": 0.12702901112047635, "grad_norm": 0.04717385396361351, "grad_norm_var": 0.005036328829527293, "learning_rate": 0.009345299797890574, "loss": 2.813, "step": 2336 }, { "crossentropy": 2.8981106281280518, "epoch": 0.12708338997797655, "grad_norm": 0.07871188968420029, "grad_norm_var": 0.005039792538799068, "learning_rate": 0.009344683663488458, "loss": 2.8981, "step": 2337 }, { "crossentropy": 2.8380935192108154, "epoch": 0.12713776883547676, "grad_norm": 0.3390405476093292, "grad_norm_var": 0.009528972492622792, "learning_rate": 0.009344067259633111, "loss": 2.8381, "step": 2338 }, { "crossentropy": 2.8252551555633545, "epoch": 0.12719214769297696, "grad_norm": 0.05352651700377464, "grad_norm_var": 0.009588785294326945, "learning_rate": 0.009343450586362764, "loss": 2.8253, "step": 2339 }, { "crossentropy": 2.8028712272644043, "epoch": 0.12724652655047716, "grad_norm": 0.06255984306335449, "grad_norm_var": 0.009546367929598697, "learning_rate": 0.009342833643715657, "loss": 2.8029, "step": 2340 }, { "crossentropy": 2.883394718170166, "epoch": 0.12730090540797737, "grad_norm": 0.13888975977897644, "grad_norm_var": 0.00960252905623262, "learning_rate": 0.00934221643173006, "loss": 2.8834, "step": 2341 }, { "crossentropy": 2.8683382272720337, "epoch": 0.12735528426547757, "grad_norm": 0.05681135505437851, "grad_norm_var": 0.009553120390316227, "learning_rate": 0.009341598950444245, "loss": 2.8683, "step": 2342 }, { "crossentropy": 2.794377326965332, "epoch": 0.12740966312297777, "grad_norm": 0.09169819951057434, "grad_norm_var": 0.009413698856176187, "learning_rate": 0.009340981199896514, "loss": 2.7944, "step": 2343 }, { "crossentropy": 2.9912984371185303, "epoch": 0.12746404198047798, "grad_norm": 0.12406697869300842, "grad_norm_var": 0.009262289968699398, "learning_rate": 0.009340363180125174, "loss": 2.9913, "step": 2344 }, { "crossentropy": 2.9250032901763916, "epoch": 0.12751842083797818, "grad_norm": 0.07167592644691467, "grad_norm_var": 0.009111085538715175, "learning_rate": 0.009339744891168556, "loss": 2.925, "step": 2345 }, { "crossentropy": 2.8104212284088135, "epoch": 0.1275727996954784, "grad_norm": 0.054716337472200394, "grad_norm_var": 0.00538989203141052, "learning_rate": 0.009339126333065008, "loss": 2.8104, "step": 2346 }, { "crossentropy": 2.74973726272583, "epoch": 0.12762717855297862, "grad_norm": 0.06036510691046715, "grad_norm_var": 0.005350002513832738, "learning_rate": 0.009338507505852888, "loss": 2.7497, "step": 2347 }, { "crossentropy": 2.828366756439209, "epoch": 0.12768155741047882, "grad_norm": 0.23015742003917694, "grad_norm_var": 0.006509391261188914, "learning_rate": 0.009337888409570578, "loss": 2.8284, "step": 2348 }, { "crossentropy": 2.9098150730133057, "epoch": 0.12773593626797902, "grad_norm": 0.05114998668432236, "grad_norm_var": 0.0065019063653203, "learning_rate": 0.009337269044256474, "loss": 2.9098, "step": 2349 }, { "crossentropy": 2.808016061782837, "epoch": 0.12779031512547923, "grad_norm": 0.04717903584241867, "grad_norm_var": 0.006485403481219121, "learning_rate": 0.009336649409948985, "loss": 2.808, "step": 2350 }, { "crossentropy": 2.6947836875915527, "epoch": 0.12784469398297943, "grad_norm": 0.06445373594760895, "grad_norm_var": 0.006381853549590701, "learning_rate": 0.009336029506686546, "loss": 2.6948, "step": 2351 }, { "crossentropy": 2.811453938484192, "epoch": 0.12789907284047963, "grad_norm": 0.04452107101678848, "grad_norm_var": 0.0064003631370642775, "learning_rate": 0.009335409334507596, "loss": 2.8115, "step": 2352 }, { "crossentropy": 2.7557398080825806, "epoch": 0.12795345169797984, "grad_norm": 0.0486205630004406, "grad_norm_var": 0.006534725531215052, "learning_rate": 0.009334788893450604, "loss": 2.7557, "step": 2353 }, { "crossentropy": 2.8547332286834717, "epoch": 0.12800783055548004, "grad_norm": 0.0429450087249279, "grad_norm_var": 0.0024276340651649803, "learning_rate": 0.009334168183554044, "loss": 2.8547, "step": 2354 }, { "crossentropy": 2.7413039207458496, "epoch": 0.12806220941298024, "grad_norm": 0.07558693736791611, "grad_norm_var": 0.0023869216659351774, "learning_rate": 0.009333547204856414, "loss": 2.7413, "step": 2355 }, { "crossentropy": 2.76566219329834, "epoch": 0.12811658827048045, "grad_norm": 0.14786465466022491, "grad_norm_var": 0.0026537453850633886, "learning_rate": 0.009332925957396227, "loss": 2.7657, "step": 2356 }, { "crossentropy": 2.7997101545333862, "epoch": 0.12817096712798065, "grad_norm": 0.04312985762953758, "grad_norm_var": 0.002531384329168263, "learning_rate": 0.009332304441212013, "loss": 2.7997, "step": 2357 }, { "crossentropy": 2.898971199989319, "epoch": 0.12822534598548085, "grad_norm": 0.10816052556037903, "grad_norm_var": 0.002548140541134448, "learning_rate": 0.009331682656342315, "loss": 2.899, "step": 2358 }, { "crossentropy": 2.8886590003967285, "epoch": 0.12827972484298106, "grad_norm": 0.07394123077392578, "grad_norm_var": 0.0025440412589079345, "learning_rate": 0.009331060602825698, "loss": 2.8887, "step": 2359 }, { "crossentropy": 2.8295544385910034, "epoch": 0.12833410370048126, "grad_norm": 0.04621848836541176, "grad_norm_var": 0.0024709456752563565, "learning_rate": 0.009330438280700739, "loss": 2.8296, "step": 2360 }, { "crossentropy": 2.862494945526123, "epoch": 0.12838848255798146, "grad_norm": 0.05793515965342522, "grad_norm_var": 0.002490059862140489, "learning_rate": 0.009329815690006037, "loss": 2.8625, "step": 2361 }, { "crossentropy": 2.696905016899109, "epoch": 0.12844286141548167, "grad_norm": 0.048485469073057175, "grad_norm_var": 0.0025091790343065583, "learning_rate": 0.009329192830780202, "loss": 2.6969, "step": 2362 }, { "crossentropy": 2.7616788148880005, "epoch": 0.12849724027298187, "grad_norm": 0.0460202656686306, "grad_norm_var": 0.002548921279495232, "learning_rate": 0.009328569703061862, "loss": 2.7617, "step": 2363 }, { "crossentropy": 2.9019583463668823, "epoch": 0.12855161913048208, "grad_norm": 0.05971517413854599, "grad_norm_var": 0.000804966919855439, "learning_rate": 0.009327946306889667, "loss": 2.902, "step": 2364 }, { "crossentropy": 2.8430073261260986, "epoch": 0.12860599798798228, "grad_norm": 0.05998629331588745, "grad_norm_var": 0.0007960381944826724, "learning_rate": 0.009327322642302275, "loss": 2.843, "step": 2365 }, { "crossentropy": 2.7689871788024902, "epoch": 0.12866037684548248, "grad_norm": 0.046866513788700104, "grad_norm_var": 0.0007967211666692263, "learning_rate": 0.009326698709338369, "loss": 2.769, "step": 2366 }, { "crossentropy": 2.7993048429489136, "epoch": 0.12871475570298269, "grad_norm": 0.04349156841635704, "grad_norm_var": 0.0008212482005897375, "learning_rate": 0.009326074508036642, "loss": 2.7993, "step": 2367 }, { "crossentropy": 2.754772186279297, "epoch": 0.1287691345604829, "grad_norm": 0.044686853885650635, "grad_norm_var": 0.0008208615006182594, "learning_rate": 0.009325450038435806, "loss": 2.7548, "step": 2368 }, { "crossentropy": 2.8160977363586426, "epoch": 0.1288235134179831, "grad_norm": 0.04454195499420166, "grad_norm_var": 0.0008292333573218532, "learning_rate": 0.009324825300574591, "loss": 2.8161, "step": 2369 }, { "crossentropy": 2.821397542953491, "epoch": 0.1288778922754833, "grad_norm": 0.04666294530034065, "grad_norm_var": 0.0008207263685763063, "learning_rate": 0.009324200294491745, "loss": 2.8214, "step": 2370 }, { "crossentropy": 2.7995922565460205, "epoch": 0.1289322711329835, "grad_norm": 0.046331748366355896, "grad_norm_var": 0.0008215349849787889, "learning_rate": 0.009323575020226028, "loss": 2.7996, "step": 2371 }, { "crossentropy": 2.913392186164856, "epoch": 0.1289866499904837, "grad_norm": 0.04698646441102028, "grad_norm_var": 0.000279138726936447, "learning_rate": 0.009322949477816218, "loss": 2.9134, "step": 2372 }, { "crossentropy": 2.851479649543762, "epoch": 0.1290410288479839, "grad_norm": 0.04330335184931755, "grad_norm_var": 0.00027889036766981116, "learning_rate": 0.00932232366730111, "loss": 2.8515, "step": 2373 }, { "crossentropy": 2.7252429723739624, "epoch": 0.1290954077054841, "grad_norm": 0.043190766125917435, "grad_norm_var": 7.317379995919188e-05, "learning_rate": 0.009321697588719521, "loss": 2.7252, "step": 2374 }, { "crossentropy": 2.9084205627441406, "epoch": 0.12914978656298431, "grad_norm": 0.06496061384677887, "grad_norm_var": 4.942449732968224e-05, "learning_rate": 0.009321071242110275, "loss": 2.9084, "step": 2375 }, { "crossentropy": 2.787988543510437, "epoch": 0.12920416542048452, "grad_norm": 0.06634576618671417, "grad_norm_var": 6.637615469598859e-05, "learning_rate": 0.009320444627512222, "loss": 2.788, "step": 2376 }, { "crossentropy": 2.8226025104522705, "epoch": 0.12925854427798472, "grad_norm": 0.04921640455722809, "grad_norm_var": 6.259359674617524e-05, "learning_rate": 0.009319817744964218, "loss": 2.8226, "step": 2377 }, { "crossentropy": 2.860546112060547, "epoch": 0.12931292313548493, "grad_norm": 0.1949421465396881, "grad_norm_var": 0.0013726491042269538, "learning_rate": 0.009319190594505145, "loss": 2.8605, "step": 2378 }, { "crossentropy": 2.8594565391540527, "epoch": 0.12936730199298513, "grad_norm": 0.04068831726908684, "grad_norm_var": 0.001383797950339181, "learning_rate": 0.009318563176173899, "loss": 2.8595, "step": 2379 }, { "crossentropy": 2.8926254510879517, "epoch": 0.12942168085048533, "grad_norm": 0.04275861382484436, "grad_norm_var": 0.001399856985661768, "learning_rate": 0.00931793549000939, "loss": 2.8926, "step": 2380 }, { "crossentropy": 2.831834077835083, "epoch": 0.12947605970798554, "grad_norm": 0.06866542994976044, "grad_norm_var": 0.001407083371890577, "learning_rate": 0.009317307536050548, "loss": 2.8318, "step": 2381 }, { "crossentropy": 2.7931638956069946, "epoch": 0.12953043856548574, "grad_norm": 0.04603968933224678, "grad_norm_var": 0.00140839234815687, "learning_rate": 0.009316679314336319, "loss": 2.7932, "step": 2382 }, { "crossentropy": 2.85239839553833, "epoch": 0.12958481742298594, "grad_norm": 0.07148108631372452, "grad_norm_var": 0.0014020886058565486, "learning_rate": 0.009316050824905662, "loss": 2.8524, "step": 2383 }, { "crossentropy": 2.725277543067932, "epoch": 0.12963919628048615, "grad_norm": 0.04824483394622803, "grad_norm_var": 0.0013955915077374047, "learning_rate": 0.009315422067797556, "loss": 2.7253, "step": 2384 }, { "crossentropy": 2.8038134574890137, "epoch": 0.12969357513798635, "grad_norm": 0.041665785014629364, "grad_norm_var": 0.001402141029204549, "learning_rate": 0.009314793043050997, "loss": 2.8038, "step": 2385 }, { "crossentropy": 2.673764944076538, "epoch": 0.12974795399548655, "grad_norm": 0.08461590111255646, "grad_norm_var": 0.0014242076148117853, "learning_rate": 0.009314163750704998, "loss": 2.6738, "step": 2386 }, { "crossentropy": 2.8934178352355957, "epoch": 0.12980233285298676, "grad_norm": 0.04565844684839249, "grad_norm_var": 0.0014256842699981128, "learning_rate": 0.009313534190798584, "loss": 2.8934, "step": 2387 }, { "crossentropy": 2.9551193714141846, "epoch": 0.12985671171048696, "grad_norm": 0.04414461553096771, "grad_norm_var": 0.001432038029618741, "learning_rate": 0.0093129043633708, "loss": 2.9551, "step": 2388 }, { "crossentropy": 2.6209100484848022, "epoch": 0.12991109056798716, "grad_norm": 0.09209585934877396, "grad_norm_var": 0.0014576035419709764, "learning_rate": 0.009312274268460709, "loss": 2.6209, "step": 2389 }, { "crossentropy": 2.816053628921509, "epoch": 0.12996546942548737, "grad_norm": 0.04390706866979599, "grad_norm_var": 0.0014555245350295802, "learning_rate": 0.00931164390610739, "loss": 2.8161, "step": 2390 }, { "crossentropy": 2.716471552848816, "epoch": 0.13001984828298757, "grad_norm": 0.04456675797700882, "grad_norm_var": 0.0014825488881858695, "learning_rate": 0.009311013276349934, "loss": 2.7165, "step": 2391 }, { "crossentropy": 2.8158442974090576, "epoch": 0.13007422714048777, "grad_norm": 0.0511024110019207, "grad_norm_var": 0.0014924354269620483, "learning_rate": 0.009310382379227454, "loss": 2.8158, "step": 2392 }, { "crossentropy": 2.754544496536255, "epoch": 0.13012860599798798, "grad_norm": 0.04496997594833374, "grad_norm_var": 0.001501430038960089, "learning_rate": 0.009309751214779076, "loss": 2.7545, "step": 2393 }, { "crossentropy": 2.7412240505218506, "epoch": 0.13018298485548818, "grad_norm": 0.06372513622045517, "grad_norm_var": 0.0002664594642465369, "learning_rate": 0.009309119783043949, "loss": 2.7412, "step": 2394 }, { "crossentropy": 2.902130961418152, "epoch": 0.13023736371298839, "grad_norm": 0.0436849445104599, "grad_norm_var": 0.00026144405548826334, "learning_rate": 0.009308488084061227, "loss": 2.9021, "step": 2395 }, { "crossentropy": 2.878165364265442, "epoch": 0.1302917425704886, "grad_norm": 0.05641939491033554, "grad_norm_var": 0.0002511150391617552, "learning_rate": 0.009307856117870092, "loss": 2.8782, "step": 2396 }, { "crossentropy": 2.8636984825134277, "epoch": 0.1303461214279888, "grad_norm": 0.050302062183618546, "grad_norm_var": 0.00024041313097567744, "learning_rate": 0.009307223884509736, "loss": 2.8637, "step": 2397 }, { "crossentropy": 2.736552357673645, "epoch": 0.130400500285489, "grad_norm": 0.047148291021585464, "grad_norm_var": 0.00023923363026460786, "learning_rate": 0.009306591384019371, "loss": 2.7366, "step": 2398 }, { "crossentropy": 2.837515711784363, "epoch": 0.1304548791429892, "grad_norm": 0.04111701250076294, "grad_norm_var": 0.0002285469294909266, "learning_rate": 0.009305958616438225, "loss": 2.8375, "step": 2399 }, { "crossentropy": 2.847339630126953, "epoch": 0.1305092580004894, "grad_norm": 0.04247697442770004, "grad_norm_var": 0.0002340605273419953, "learning_rate": 0.009305325581805537, "loss": 2.8473, "step": 2400 }, { "crossentropy": 2.6393861770629883, "epoch": 0.1305636368579896, "grad_norm": 0.07721129059791565, "grad_norm_var": 0.0002623912475800867, "learning_rate": 0.009304692280160573, "loss": 2.6394, "step": 2401 }, { "crossentropy": 2.8252495527267456, "epoch": 0.1306180157154898, "grad_norm": 0.04456373304128647, "grad_norm_var": 0.00020220718617097727, "learning_rate": 0.009304058711542607, "loss": 2.8252, "step": 2402 }, { "crossentropy": 2.833463430404663, "epoch": 0.13067239457299, "grad_norm": 0.1376725733280182, "grad_norm_var": 0.0006527291472395353, "learning_rate": 0.009303424875990932, "loss": 2.8335, "step": 2403 }, { "crossentropy": 2.7558711767196655, "epoch": 0.13072677343049022, "grad_norm": 0.03919254243373871, "grad_norm_var": 0.0006632908792598886, "learning_rate": 0.009302790773544857, "loss": 2.7559, "step": 2404 }, { "crossentropy": 2.760709047317505, "epoch": 0.13078115228799042, "grad_norm": 0.04426700249314308, "grad_norm_var": 0.0005857039857943148, "learning_rate": 0.00930215640424371, "loss": 2.7607, "step": 2405 }, { "crossentropy": 2.7935131788253784, "epoch": 0.13083553114549062, "grad_norm": 0.07105227559804916, "grad_norm_var": 0.0005933442258372425, "learning_rate": 0.009301521768126835, "loss": 2.7935, "step": 2406 }, { "crossentropy": 2.9190657138824463, "epoch": 0.13088991000299083, "grad_norm": 0.06971026957035065, "grad_norm_var": 0.0005937993414978693, "learning_rate": 0.00930088686523359, "loss": 2.9191, "step": 2407 }, { "crossentropy": 2.7395756244659424, "epoch": 0.13094428886049103, "grad_norm": 0.04420628771185875, "grad_norm_var": 0.0006029193633157487, "learning_rate": 0.00930025169560335, "loss": 2.7396, "step": 2408 }, { "crossentropy": 2.843909502029419, "epoch": 0.13099866771799124, "grad_norm": 0.04642517492175102, "grad_norm_var": 0.0006006482080961947, "learning_rate": 0.00929961625927551, "loss": 2.8439, "step": 2409 }, { "crossentropy": 2.8271759748458862, "epoch": 0.13105304657549144, "grad_norm": 0.047406770288944244, "grad_norm_var": 0.0006036345411929289, "learning_rate": 0.009298980556289476, "loss": 2.8272, "step": 2410 }, { "crossentropy": 2.8271061182022095, "epoch": 0.13110742543299164, "grad_norm": 0.043559689074754715, "grad_norm_var": 0.0006038483489891898, "learning_rate": 0.009298344586684677, "loss": 2.8271, "step": 2411 }, { "crossentropy": 2.862902879714966, "epoch": 0.13116180429049185, "grad_norm": 0.04458336532115936, "grad_norm_var": 0.000612606147590557, "learning_rate": 0.009297708350500556, "loss": 2.8629, "step": 2412 }, { "crossentropy": 2.786189079284668, "epoch": 0.13121618314799205, "grad_norm": 0.054039232432842255, "grad_norm_var": 0.000610798804013359, "learning_rate": 0.009297071847776568, "loss": 2.7862, "step": 2413 }, { "crossentropy": 2.7576788663864136, "epoch": 0.13127056200549225, "grad_norm": 0.06414233148097992, "grad_norm_var": 0.0006089854655512333, "learning_rate": 0.00929643507855219, "loss": 2.7577, "step": 2414 }, { "crossentropy": 2.7950347661972046, "epoch": 0.13132494086299246, "grad_norm": 0.053977541625499725, "grad_norm_var": 0.0005921274170542075, "learning_rate": 0.009295798042866916, "loss": 2.795, "step": 2415 }, { "crossentropy": 2.778708815574646, "epoch": 0.13137931972049266, "grad_norm": 0.05611639469861984, "grad_norm_var": 0.0005759238072006919, "learning_rate": 0.009295160740760252, "loss": 2.7787, "step": 2416 }, { "crossentropy": 2.77916944026947, "epoch": 0.13143369857799286, "grad_norm": 0.04628373309969902, "grad_norm_var": 0.0005590947072736926, "learning_rate": 0.00929452317227172, "loss": 2.7792, "step": 2417 }, { "crossentropy": 2.7430835962295532, "epoch": 0.13148807743549307, "grad_norm": 0.04311070218682289, "grad_norm_var": 0.0005615778997079211, "learning_rate": 0.009293885337440869, "loss": 2.7431, "step": 2418 }, { "crossentropy": 2.8185616731643677, "epoch": 0.13154245629299327, "grad_norm": 0.12742158770561218, "grad_norm_var": 0.00045734819335656204, "learning_rate": 0.009293247236307249, "loss": 2.8186, "step": 2419 }, { "crossentropy": 2.743346095085144, "epoch": 0.1315968351504935, "grad_norm": 0.04289286956191063, "grad_norm_var": 0.0004499271327753758, "learning_rate": 0.00929260886891044, "loss": 2.7433, "step": 2420 }, { "crossentropy": 2.7614519596099854, "epoch": 0.1316512140079937, "grad_norm": 0.044517502188682556, "grad_norm_var": 0.0004495325029861691, "learning_rate": 0.00929197023529003, "loss": 2.7615, "step": 2421 }, { "crossentropy": 2.7053186893463135, "epoch": 0.1317055928654939, "grad_norm": 0.04397192969918251, "grad_norm_var": 0.0004417947256573961, "learning_rate": 0.009291331335485627, "loss": 2.7053, "step": 2422 }, { "crossentropy": 2.864442467689514, "epoch": 0.1317599717229941, "grad_norm": 0.05591510981321335, "grad_norm_var": 0.0004257538020223577, "learning_rate": 0.009290692169536856, "loss": 2.8644, "step": 2423 }, { "crossentropy": 2.623328685760498, "epoch": 0.13181435058049432, "grad_norm": 0.04684527590870857, "grad_norm_var": 0.0004228624120449932, "learning_rate": 0.00929005273748336, "loss": 2.6233, "step": 2424 }, { "crossentropy": 2.8862427473068237, "epoch": 0.13186872943799452, "grad_norm": 0.05259185656905174, "grad_norm_var": 0.00041915437101000677, "learning_rate": 0.009289413039364787, "loss": 2.8862, "step": 2425 }, { "crossentropy": 2.7817325592041016, "epoch": 0.13192310829549472, "grad_norm": 0.04508078843355179, "grad_norm_var": 0.0004216027078641368, "learning_rate": 0.00928877307522082, "loss": 2.7817, "step": 2426 }, { "crossentropy": 2.777704358100891, "epoch": 0.13197748715299493, "grad_norm": 0.04069865867495537, "grad_norm_var": 0.00042612200608928465, "learning_rate": 0.009288132845091144, "loss": 2.7777, "step": 2427 }, { "crossentropy": 2.745232105255127, "epoch": 0.13203186601049513, "grad_norm": 0.04214991256594658, "grad_norm_var": 0.00042951070930254224, "learning_rate": 0.009287492349015467, "loss": 2.7452, "step": 2428 }, { "crossentropy": 2.7820730209350586, "epoch": 0.13208624486799533, "grad_norm": 0.04331039637327194, "grad_norm_var": 0.0004362693376262087, "learning_rate": 0.009286851587033514, "loss": 2.7821, "step": 2429 }, { "crossentropy": 2.9330002069473267, "epoch": 0.13214062372549554, "grad_norm": 0.06911107897758484, "grad_norm_var": 0.00044515164949849396, "learning_rate": 0.009286210559185019, "loss": 2.933, "step": 2430 }, { "crossentropy": 2.8897950649261475, "epoch": 0.13219500258299574, "grad_norm": 0.04353334382176399, "grad_norm_var": 0.00045112974779423537, "learning_rate": 0.009285569265509743, "loss": 2.8898, "step": 2431 }, { "crossentropy": 2.809619426727295, "epoch": 0.13224938144049594, "grad_norm": 0.040806908160448074, "grad_norm_var": 0.00045884955308152486, "learning_rate": 0.009284927706047455, "loss": 2.8096, "step": 2432 }, { "crossentropy": 2.6830071210861206, "epoch": 0.13230376029799615, "grad_norm": 0.04822305962443352, "grad_norm_var": 0.00045766725922741203, "learning_rate": 0.009284285880837946, "loss": 2.683, "step": 2433 }, { "crossentropy": 2.792411684989929, "epoch": 0.13235813915549635, "grad_norm": 0.04111611843109131, "grad_norm_var": 0.0004602497319754418, "learning_rate": 0.009283643789921022, "loss": 2.7924, "step": 2434 }, { "crossentropy": 2.7765374183654785, "epoch": 0.13241251801299656, "grad_norm": 0.042862772941589355, "grad_norm_var": 5.410818065844456e-05, "learning_rate": 0.009283001433336503, "loss": 2.7765, "step": 2435 }, { "crossentropy": 2.79086434841156, "epoch": 0.13246689687049676, "grad_norm": 0.04184960573911667, "grad_norm_var": 5.46747263591126e-05, "learning_rate": 0.009282358811124228, "loss": 2.7909, "step": 2436 }, { "crossentropy": 2.777595639228821, "epoch": 0.13252127572799696, "grad_norm": 0.03909631446003914, "grad_norm_var": 5.7880599521793234e-05, "learning_rate": 0.009281715923324052, "loss": 2.7776, "step": 2437 }, { "crossentropy": 2.7785714864730835, "epoch": 0.13257565458549717, "grad_norm": 0.041656494140625, "grad_norm_var": 5.88642355562959e-05, "learning_rate": 0.009281072769975847, "loss": 2.7786, "step": 2438 }, { "crossentropy": 2.841466784477234, "epoch": 0.13263003344299737, "grad_norm": 0.047918763011693954, "grad_norm_var": 5.221251036207902e-05, "learning_rate": 0.0092804293511195, "loss": 2.8415, "step": 2439 }, { "crossentropy": 2.686657667160034, "epoch": 0.13268441230049757, "grad_norm": 0.04444372281432152, "grad_norm_var": 5.211922166193691e-05, "learning_rate": 0.009279785666794915, "loss": 2.6867, "step": 2440 }, { "crossentropy": 2.8258174657821655, "epoch": 0.13273879115799778, "grad_norm": 0.08123154193162918, "grad_norm_var": 0.00013131214038298843, "learning_rate": 0.009279141717042012, "loss": 2.8258, "step": 2441 }, { "crossentropy": 2.7551066875457764, "epoch": 0.13279317001549798, "grad_norm": 0.047515545040369034, "grad_norm_var": 0.0001310374959702482, "learning_rate": 0.00927849750190073, "loss": 2.7551, "step": 2442 }, { "crossentropy": 2.791785717010498, "epoch": 0.13284754887299818, "grad_norm": 0.04624488577246666, "grad_norm_var": 0.0001281373278483636, "learning_rate": 0.009277853021411023, "loss": 2.7918, "step": 2443 }, { "crossentropy": 2.785132884979248, "epoch": 0.1329019277304984, "grad_norm": 0.06266270577907562, "grad_norm_var": 0.00013962006236937566, "learning_rate": 0.009277208275612858, "loss": 2.7851, "step": 2444 }, { "crossentropy": 2.811016798019409, "epoch": 0.1329563065879986, "grad_norm": 0.04003177955746651, "grad_norm_var": 0.00014271306961541788, "learning_rate": 0.009276563264546223, "loss": 2.811, "step": 2445 }, { "crossentropy": 2.6727945804595947, "epoch": 0.1330106854454988, "grad_norm": 0.04086865484714508, "grad_norm_var": 0.00011549338781119822, "learning_rate": 0.009275917988251123, "loss": 2.6728, "step": 2446 }, { "crossentropy": 2.751089334487915, "epoch": 0.133065064302999, "grad_norm": 0.04512159898877144, "grad_norm_var": 0.00011494257031539375, "learning_rate": 0.009275272446767575, "loss": 2.7511, "step": 2447 }, { "crossentropy": 2.7220455408096313, "epoch": 0.1331194431604992, "grad_norm": 0.042118143290281296, "grad_norm_var": 0.0001139711015888486, "learning_rate": 0.009274626640135616, "loss": 2.722, "step": 2448 }, { "crossentropy": 2.7397890090942383, "epoch": 0.1331738220179994, "grad_norm": 0.04278823360800743, "grad_norm_var": 0.00011497445867428563, "learning_rate": 0.009273980568395297, "loss": 2.7398, "step": 2449 }, { "crossentropy": 2.7911518812179565, "epoch": 0.1332282008754996, "grad_norm": 0.04329027235507965, "grad_norm_var": 0.00011364527461636265, "learning_rate": 0.00927333423158669, "loss": 2.7912, "step": 2450 }, { "crossentropy": 2.8216261863708496, "epoch": 0.1332825797329998, "grad_norm": 0.04285157844424248, "grad_norm_var": 0.00011365124320759068, "learning_rate": 0.009272687629749875, "loss": 2.8216, "step": 2451 }, { "crossentropy": 2.7907280921936035, "epoch": 0.13333695859050002, "grad_norm": 0.04247051849961281, "grad_norm_var": 0.00011326089966250392, "learning_rate": 0.009272040762924958, "loss": 2.7907, "step": 2452 }, { "crossentropy": 2.76883065700531, "epoch": 0.13339133744800022, "grad_norm": 0.042861372232437134, "grad_norm_var": 0.00011023216834360194, "learning_rate": 0.009271393631152057, "loss": 2.7688, "step": 2453 }, { "crossentropy": 2.780067205429077, "epoch": 0.13344571630550042, "grad_norm": 0.049381937831640244, "grad_norm_var": 0.0001083245582429138, "learning_rate": 0.009270746234471303, "loss": 2.7801, "step": 2454 }, { "crossentropy": 2.84860098361969, "epoch": 0.13350009516300063, "grad_norm": 0.04208904504776001, "grad_norm_var": 0.00011021066295406612, "learning_rate": 0.00927009857292285, "loss": 2.8486, "step": 2455 }, { "crossentropy": 2.8466763496398926, "epoch": 0.13355447402050083, "grad_norm": 0.04412572830915451, "grad_norm_var": 0.00011033589165412809, "learning_rate": 0.009269450646546867, "loss": 2.8467, "step": 2456 }, { "crossentropy": 2.8508812189102173, "epoch": 0.13360885287800103, "grad_norm": 0.04208192229270935, "grad_norm_var": 2.8634230029495923e-05, "learning_rate": 0.00926880245538353, "loss": 2.8509, "step": 2457 }, { "crossentropy": 2.831805467605591, "epoch": 0.13366323173550124, "grad_norm": 0.0441519133746624, "grad_norm_var": 2.81151779010349e-05, "learning_rate": 0.00926815399947305, "loss": 2.8318, "step": 2458 }, { "crossentropy": 2.7529438734054565, "epoch": 0.13371761059300144, "grad_norm": 0.04751782864332199, "grad_norm_var": 2.8500507824335878e-05, "learning_rate": 0.009267505278855635, "loss": 2.7529, "step": 2459 }, { "crossentropy": 2.8409606218338013, "epoch": 0.13377198945050164, "grad_norm": 0.03983810916543007, "grad_norm_var": 6.245463876627886e-06, "learning_rate": 0.009266856293571522, "loss": 2.841, "step": 2460 }, { "crossentropy": 2.8340625762939453, "epoch": 0.13382636830800185, "grad_norm": 0.040358658879995346, "grad_norm_var": 6.112999930164321e-06, "learning_rate": 0.00926620704366096, "loss": 2.8341, "step": 2461 }, { "crossentropy": 2.8095897436141968, "epoch": 0.13388074716550205, "grad_norm": 0.04336782172322273, "grad_norm_var": 5.711606932854579e-06, "learning_rate": 0.009265557529164215, "loss": 2.8096, "step": 2462 }, { "crossentropy": 2.6992061138153076, "epoch": 0.13393512602300225, "grad_norm": 0.04537712410092354, "grad_norm_var": 5.774311388870995e-06, "learning_rate": 0.009264907750121567, "loss": 2.6992, "step": 2463 }, { "crossentropy": 2.771874785423279, "epoch": 0.13398950488050246, "grad_norm": 0.039990928024053574, "grad_norm_var": 6.425487961062709e-06, "learning_rate": 0.009264257706573319, "loss": 2.7719, "step": 2464 }, { "crossentropy": 2.730503559112549, "epoch": 0.13404388373800266, "grad_norm": 0.04215320572257042, "grad_norm_var": 6.492663136743968e-06, "learning_rate": 0.00926360739855978, "loss": 2.7305, "step": 2465 }, { "crossentropy": 2.829583168029785, "epoch": 0.13409826259550287, "grad_norm": 0.040311869233846664, "grad_norm_var": 7.0288161645105814e-06, "learning_rate": 0.009262956826121287, "loss": 2.8296, "step": 2466 }, { "crossentropy": 2.807092547416687, "epoch": 0.13415264145300307, "grad_norm": 0.04024329409003258, "grad_norm_var": 7.525834291793356e-06, "learning_rate": 0.009262305989298187, "loss": 2.8071, "step": 2467 }, { "crossentropy": 2.790026545524597, "epoch": 0.13420702031050327, "grad_norm": 0.04390588030219078, "grad_norm_var": 7.573347546579491e-06, "learning_rate": 0.00926165488813084, "loss": 2.79, "step": 2468 }, { "crossentropy": 2.7821351289749146, "epoch": 0.13426139916800348, "grad_norm": 0.040868453681468964, "grad_norm_var": 7.854375175929811e-06, "learning_rate": 0.009261003522659635, "loss": 2.7821, "step": 2469 }, { "crossentropy": 2.7606316804885864, "epoch": 0.13431577802550368, "grad_norm": 0.046534910798072815, "grad_norm_var": 5.8853100178629195e-06, "learning_rate": 0.009260351892924961, "loss": 2.7606, "step": 2470 }, { "crossentropy": 2.814087986946106, "epoch": 0.13437015688300388, "grad_norm": 0.04996317997574806, "grad_norm_var": 9.137592795339413e-06, "learning_rate": 0.009259699998967235, "loss": 2.8141, "step": 2471 }, { "crossentropy": 2.8063149452209473, "epoch": 0.1344245357405041, "grad_norm": 0.05431259423494339, "grad_norm_var": 1.6915461797016527e-05, "learning_rate": 0.009259047840826887, "loss": 2.8063, "step": 2472 }, { "crossentropy": 2.7994860410690308, "epoch": 0.1344789145980043, "grad_norm": 0.05047902464866638, "grad_norm_var": 1.9386402240827103e-05, "learning_rate": 0.009258395418544361, "loss": 2.7995, "step": 2473 }, { "crossentropy": 2.7398234605789185, "epoch": 0.1345332934555045, "grad_norm": 0.044086065143346786, "grad_norm_var": 1.9388288816984896e-05, "learning_rate": 0.009257742732160124, "loss": 2.7398, "step": 2474 }, { "crossentropy": 2.8545422554016113, "epoch": 0.1345876723130047, "grad_norm": 0.044423121958971024, "grad_norm_var": 1.867222501612508e-05, "learning_rate": 0.00925708978171465, "loss": 2.8545, "step": 2475 }, { "crossentropy": 2.776648163795471, "epoch": 0.1346420511705049, "grad_norm": 0.04251584783196449, "grad_norm_var": 1.7585030649091885e-05, "learning_rate": 0.009256436567248436, "loss": 2.7766, "step": 2476 }, { "crossentropy": 2.8553038835525513, "epoch": 0.1346964300280051, "grad_norm": 0.039328306913375854, "grad_norm_var": 1.8193634466873106e-05, "learning_rate": 0.009255783088801997, "loss": 2.8553, "step": 2477 }, { "crossentropy": 2.6985548734664917, "epoch": 0.1347508088855053, "grad_norm": 0.04014531895518303, "grad_norm_var": 1.9217994290029497e-05, "learning_rate": 0.009255129346415855, "loss": 2.6986, "step": 2478 }, { "crossentropy": 2.797871708869934, "epoch": 0.1348051877430055, "grad_norm": 0.042459454387426376, "grad_norm_var": 1.9229851282805537e-05, "learning_rate": 0.00925447534013056, "loss": 2.7979, "step": 2479 }, { "crossentropy": 2.8414586782455444, "epoch": 0.13485956660050571, "grad_norm": 0.05002044886350632, "grad_norm_var": 2.0346036554968797e-05, "learning_rate": 0.009253821069986672, "loss": 2.8415, "step": 2480 }, { "crossentropy": 2.8042551279067993, "epoch": 0.13491394545800592, "grad_norm": 0.04549126699566841, "grad_norm_var": 2.0004880416414164e-05, "learning_rate": 0.009253166536024766, "loss": 2.8043, "step": 2481 }, { "crossentropy": 2.8013335466384888, "epoch": 0.13496832431550612, "grad_norm": 0.041166290640830994, "grad_norm_var": 1.9551389350074404e-05, "learning_rate": 0.009252511738285436, "loss": 2.8013, "step": 2482 }, { "crossentropy": 2.7513023614883423, "epoch": 0.13502270317300633, "grad_norm": 0.046370480209589005, "grad_norm_var": 1.821888679492811e-05, "learning_rate": 0.009251856676809294, "loss": 2.7513, "step": 2483 }, { "crossentropy": 2.761615514755249, "epoch": 0.13507708203050653, "grad_norm": 0.06036815792322159, "grad_norm_var": 3.247117518104225e-05, "learning_rate": 0.009251201351636965, "loss": 2.7616, "step": 2484 }, { "crossentropy": 2.8250434398651123, "epoch": 0.13513146088800673, "grad_norm": 0.04581237956881523, "grad_norm_var": 3.0511805884998687e-05, "learning_rate": 0.009250545762809089, "loss": 2.825, "step": 2485 }, { "crossentropy": 2.700349450111389, "epoch": 0.13518583974550694, "grad_norm": 0.041607294231653214, "grad_norm_var": 3.198497436808408e-05, "learning_rate": 0.00924988991036633, "loss": 2.7003, "step": 2486 }, { "crossentropy": 2.650489926338196, "epoch": 0.13524021860300714, "grad_norm": 0.04127088561654091, "grad_norm_var": 3.229866192784499e-05, "learning_rate": 0.009249233794349358, "loss": 2.6505, "step": 2487 }, { "crossentropy": 2.8029253482818604, "epoch": 0.13529459746050734, "grad_norm": 0.043015509843826294, "grad_norm_var": 2.7175769613790638e-05, "learning_rate": 0.009248577414798871, "loss": 2.8029, "step": 2488 }, { "crossentropy": 2.803690552711487, "epoch": 0.13534897631800755, "grad_norm": 0.04820706322789192, "grad_norm_var": 2.581136533435265e-05, "learning_rate": 0.009247920771755572, "loss": 2.8037, "step": 2489 }, { "crossentropy": 2.8003093004226685, "epoch": 0.13540335517550775, "grad_norm": 0.05336516350507736, "grad_norm_var": 3.0349029063689996e-05, "learning_rate": 0.009247263865260189, "loss": 2.8003, "step": 2490 }, { "crossentropy": 2.8428179025650024, "epoch": 0.13545773403300795, "grad_norm": 0.05268128588795662, "grad_norm_var": 3.359305539397464e-05, "learning_rate": 0.009246606695353458, "loss": 2.8428, "step": 2491 }, { "crossentropy": 2.738071084022522, "epoch": 0.13551211289050816, "grad_norm": 0.04319823905825615, "grad_norm_var": 3.331751915132593e-05, "learning_rate": 0.00924594926207614, "loss": 2.7381, "step": 2492 }, { "crossentropy": 2.7821627855300903, "epoch": 0.13556649174800836, "grad_norm": 0.04334744065999985, "grad_norm_var": 3.080183843606731e-05, "learning_rate": 0.009245291565469008, "loss": 2.7822, "step": 2493 }, { "crossentropy": 2.7099921703338623, "epoch": 0.13562087060550856, "grad_norm": 0.04523920267820358, "grad_norm_var": 2.8339902950834326e-05, "learning_rate": 0.009244633605572851, "loss": 2.71, "step": 2494 }, { "crossentropy": 2.761281132698059, "epoch": 0.1356752494630088, "grad_norm": 0.04286786541342735, "grad_norm_var": 2.813159219092609e-05, "learning_rate": 0.009243975382428474, "loss": 2.7613, "step": 2495 }, { "crossentropy": 2.860003113746643, "epoch": 0.135729628320509, "grad_norm": 0.044601865112781525, "grad_norm_var": 2.7424519871492985e-05, "learning_rate": 0.0092433168960767, "loss": 2.86, "step": 2496 }, { "crossentropy": 2.802366018295288, "epoch": 0.1357840071780092, "grad_norm": 0.04480784758925438, "grad_norm_var": 2.751493491599203e-05, "learning_rate": 0.009242658146558369, "loss": 2.8024, "step": 2497 }, { "crossentropy": 2.723585367202759, "epoch": 0.1358383860355094, "grad_norm": 0.05414422228932381, "grad_norm_var": 2.9468996825970004e-05, "learning_rate": 0.009241999133914337, "loss": 2.7236, "step": 2498 }, { "crossentropy": 2.805682420730591, "epoch": 0.1358927648930096, "grad_norm": 0.0517343208193779, "grad_norm_var": 3.0865901170459785e-05, "learning_rate": 0.009241339858185472, "loss": 2.8057, "step": 2499 }, { "crossentropy": 2.787753939628601, "epoch": 0.1359471437505098, "grad_norm": 0.044400814920663834, "grad_norm_var": 1.890812953477261e-05, "learning_rate": 0.009240680319412662, "loss": 2.7878, "step": 2500 }, { "crossentropy": 2.783272624015808, "epoch": 0.13600152260801002, "grad_norm": 0.044055111706256866, "grad_norm_var": 1.920807811767727e-05, "learning_rate": 0.009240020517636813, "loss": 2.7833, "step": 2501 }, { "crossentropy": 2.7376396656036377, "epoch": 0.13605590146551022, "grad_norm": 0.05214426666498184, "grad_norm_var": 1.9752476804161627e-05, "learning_rate": 0.009239360452898844, "loss": 2.7376, "step": 2502 }, { "crossentropy": 2.825363516807556, "epoch": 0.13611028032301042, "grad_norm": 0.0430767759680748, "grad_norm_var": 1.862074401945546e-05, "learning_rate": 0.009238700125239693, "loss": 2.8254, "step": 2503 }, { "crossentropy": 2.714327335357666, "epoch": 0.13616465918051063, "grad_norm": 0.042088575661182404, "grad_norm_var": 1.915829512196352e-05, "learning_rate": 0.009238039534700312, "loss": 2.7143, "step": 2504 }, { "crossentropy": 2.809211015701294, "epoch": 0.13621903803801083, "grad_norm": 0.04425183683633804, "grad_norm_var": 1.943223566590694e-05, "learning_rate": 0.009237378681321668, "loss": 2.8092, "step": 2505 }, { "crossentropy": 2.7758134603500366, "epoch": 0.13627341689551103, "grad_norm": 0.041435495018959045, "grad_norm_var": 1.7606472708468073e-05, "learning_rate": 0.009236717565144752, "loss": 2.7758, "step": 2506 }, { "crossentropy": 2.7025707960128784, "epoch": 0.13632779575301124, "grad_norm": 0.04322168603539467, "grad_norm_var": 1.4620517771325957e-05, "learning_rate": 0.00923605618621056, "loss": 2.7026, "step": 2507 }, { "crossentropy": 2.719564437866211, "epoch": 0.13638217461051144, "grad_norm": 0.04356632009148598, "grad_norm_var": 1.4526402108210508e-05, "learning_rate": 0.009235394544560113, "loss": 2.7196, "step": 2508 }, { "crossentropy": 2.7559380531311035, "epoch": 0.13643655346801165, "grad_norm": 0.04956990107893944, "grad_norm_var": 1.5316854542065697e-05, "learning_rate": 0.009234732640234446, "loss": 2.7559, "step": 2509 }, { "crossentropy": 2.6989617347717285, "epoch": 0.13649093232551185, "grad_norm": 0.043798141181468964, "grad_norm_var": 1.5535257334311527e-05, "learning_rate": 0.00923407047327461, "loss": 2.699, "step": 2510 }, { "crossentropy": 2.755065083503723, "epoch": 0.13654531118301205, "grad_norm": 0.049558673053979874, "grad_norm_var": 1.588662830529954e-05, "learning_rate": 0.009233408043721667, "loss": 2.7551, "step": 2511 }, { "crossentropy": 2.757072329521179, "epoch": 0.13659969004051226, "grad_norm": 0.04218447208404541, "grad_norm_var": 1.6711693811461276e-05, "learning_rate": 0.009232745351616707, "loss": 2.7571, "step": 2512 }, { "crossentropy": 2.765035390853882, "epoch": 0.13665406889801246, "grad_norm": 0.04534514993429184, "grad_norm_var": 1.6653113832333182e-05, "learning_rate": 0.009232082397000825, "loss": 2.765, "step": 2513 }, { "crossentropy": 2.723930597305298, "epoch": 0.13670844775551266, "grad_norm": 0.045220810920000076, "grad_norm_var": 1.1834010054006158e-05, "learning_rate": 0.00923141917991514, "loss": 2.7239, "step": 2514 }, { "crossentropy": 2.7695748805999756, "epoch": 0.13676282661301287, "grad_norm": 0.06807637214660645, "grad_norm_var": 4.242934879992143e-05, "learning_rate": 0.009230755700400781, "loss": 2.7696, "step": 2515 }, { "crossentropy": 2.659196972846985, "epoch": 0.13681720547051307, "grad_norm": 0.04254959523677826, "grad_norm_var": 4.313073755929462e-05, "learning_rate": 0.009230091958498896, "loss": 2.6592, "step": 2516 }, { "crossentropy": 2.773462414741516, "epoch": 0.13687158432801327, "grad_norm": 0.04127369448542595, "grad_norm_var": 4.443156062699308e-05, "learning_rate": 0.009229427954250654, "loss": 2.7735, "step": 2517 }, { "crossentropy": 2.751695990562439, "epoch": 0.13692596318551348, "grad_norm": 0.042416609823703766, "grad_norm_var": 4.248691496739135e-05, "learning_rate": 0.009228763687697232, "loss": 2.7517, "step": 2518 }, { "crossentropy": 2.731890559196472, "epoch": 0.13698034204301368, "grad_norm": 0.05931805074214935, "grad_norm_var": 5.3775124275003604e-05, "learning_rate": 0.00922809915887983, "loss": 2.7319, "step": 2519 }, { "crossentropy": 2.77052104473114, "epoch": 0.13703472090051388, "grad_norm": 0.050753992050886154, "grad_norm_var": 5.338030257029015e-05, "learning_rate": 0.00922743436783966, "loss": 2.7705, "step": 2520 }, { "crossentropy": 2.8585426807403564, "epoch": 0.1370890997580141, "grad_norm": 0.0497419610619545, "grad_norm_var": 5.322770092351005e-05, "learning_rate": 0.00922676931461795, "loss": 2.8585, "step": 2521 }, { "crossentropy": 2.732657313346863, "epoch": 0.1371434786155143, "grad_norm": 0.04617210850119591, "grad_norm_var": 5.0877614216411166e-05, "learning_rate": 0.009226103999255948, "loss": 2.7327, "step": 2522 }, { "crossentropy": 2.7157652378082275, "epoch": 0.1371978574730145, "grad_norm": 0.04720417410135269, "grad_norm_var": 4.950525210224473e-05, "learning_rate": 0.009225438421794917, "loss": 2.7158, "step": 2523 }, { "crossentropy": 2.6892162561416626, "epoch": 0.1372522363305147, "grad_norm": 0.043339360505342484, "grad_norm_var": 4.9640276226807906e-05, "learning_rate": 0.009224772582276134, "loss": 2.6892, "step": 2524 }, { "crossentropy": 2.6949976682662964, "epoch": 0.1373066151880149, "grad_norm": 0.04879210516810417, "grad_norm_var": 4.950570534212924e-05, "learning_rate": 0.009224106480740894, "loss": 2.695, "step": 2525 }, { "crossentropy": 2.7835049629211426, "epoch": 0.1373609940455151, "grad_norm": 0.059303514659404755, "grad_norm_var": 5.613622801654231e-05, "learning_rate": 0.00922344011723051, "loss": 2.7835, "step": 2526 }, { "crossentropy": 2.780834436416626, "epoch": 0.1374153729030153, "grad_norm": 0.05068579316139221, "grad_norm_var": 5.632541066687702e-05, "learning_rate": 0.009222773491786304, "loss": 2.7808, "step": 2527 }, { "crossentropy": 2.6672455072402954, "epoch": 0.1374697517605155, "grad_norm": 0.046111732721328735, "grad_norm_var": 5.3773615583889894e-05, "learning_rate": 0.009222106604449627, "loss": 2.6672, "step": 2528 }, { "crossentropy": 2.738916039466858, "epoch": 0.13752413061801572, "grad_norm": 0.044568076729774475, "grad_norm_var": 5.4204960341510075e-05, "learning_rate": 0.009221439455261831, "loss": 2.7389, "step": 2529 }, { "crossentropy": 2.617315649986267, "epoch": 0.13757850947551592, "grad_norm": 0.04407746344804764, "grad_norm_var": 5.4877344727981685e-05, "learning_rate": 0.0092207720442643, "loss": 2.6173, "step": 2530 }, { "crossentropy": 2.7298004627227783, "epoch": 0.13763288833301612, "grad_norm": 0.05206548422574997, "grad_norm_var": 3.0226489297413965e-05, "learning_rate": 0.009220104371498417, "loss": 2.7298, "step": 2531 }, { "crossentropy": 2.7771650552749634, "epoch": 0.13768726719051633, "grad_norm": 0.04518638923764229, "grad_norm_var": 2.8736607567937896e-05, "learning_rate": 0.009219436437005598, "loss": 2.7772, "step": 2532 }, { "crossentropy": 2.834040641784668, "epoch": 0.13774164604801653, "grad_norm": 0.04511938616633415, "grad_norm_var": 2.6115489514223996e-05, "learning_rate": 0.009218768240827263, "loss": 2.834, "step": 2533 }, { "crossentropy": 2.7398892641067505, "epoch": 0.13779602490551673, "grad_norm": 0.042827021330595016, "grad_norm_var": 2.5797036318433013e-05, "learning_rate": 0.009218099783004856, "loss": 2.7399, "step": 2534 }, { "crossentropy": 2.76274573802948, "epoch": 0.13785040376301694, "grad_norm": 0.041019588708877563, "grad_norm_var": 2.021848593572502e-05, "learning_rate": 0.009217431063579832, "loss": 2.7627, "step": 2535 }, { "crossentropy": 2.6525890827178955, "epoch": 0.13790478262051714, "grad_norm": 0.0480666421353817, "grad_norm_var": 1.9436005856486985e-05, "learning_rate": 0.009216762082593668, "loss": 2.6526, "step": 2536 }, { "crossentropy": 2.74043071269989, "epoch": 0.13795916147801734, "grad_norm": 0.04296493157744408, "grad_norm_var": 1.9957675946727372e-05, "learning_rate": 0.009216092840087849, "loss": 2.7404, "step": 2537 }, { "crossentropy": 2.6228243112564087, "epoch": 0.13801354033551755, "grad_norm": 0.045934129506349564, "grad_norm_var": 1.9978568278997828e-05, "learning_rate": 0.009215423336103883, "loss": 2.6228, "step": 2538 }, { "crossentropy": 2.773112416267395, "epoch": 0.13806791919301775, "grad_norm": 0.04250727593898773, "grad_norm_var": 2.1044206169539466e-05, "learning_rate": 0.009214753570683291, "loss": 2.7731, "step": 2539 }, { "crossentropy": 2.8117339611053467, "epoch": 0.13812229805051796, "grad_norm": 0.04110363498330116, "grad_norm_var": 2.2272123796551745e-05, "learning_rate": 0.009214083543867614, "loss": 2.8117, "step": 2540 }, { "crossentropy": 2.7234487533569336, "epoch": 0.13817667690801816, "grad_norm": 0.04023491218686104, "grad_norm_var": 2.3972041157058895e-05, "learning_rate": 0.009213413255698404, "loss": 2.7234, "step": 2541 }, { "crossentropy": 2.722645878791809, "epoch": 0.13823105576551836, "grad_norm": 0.0401315912604332, "grad_norm_var": 1.2262655961366395e-05, "learning_rate": 0.00921274270621723, "loss": 2.7226, "step": 2542 }, { "crossentropy": 2.6736449003219604, "epoch": 0.13828543462301857, "grad_norm": 0.048771198838949203, "grad_norm_var": 1.0922293471656424e-05, "learning_rate": 0.009212071895465683, "loss": 2.6736, "step": 2543 }, { "crossentropy": 2.7946659326553345, "epoch": 0.13833981348051877, "grad_norm": 0.04401809349656105, "grad_norm_var": 1.0723468050416232e-05, "learning_rate": 0.009211400823485363, "loss": 2.7947, "step": 2544 }, { "crossentropy": 2.75459361076355, "epoch": 0.13839419233801897, "grad_norm": 0.0404285192489624, "grad_norm_var": 1.1639458056409563e-05, "learning_rate": 0.00921072949031789, "loss": 2.7546, "step": 2545 }, { "crossentropy": 2.7493256330490112, "epoch": 0.13844857119551918, "grad_norm": 0.04831160232424736, "grad_norm_var": 1.2787586958243277e-05, "learning_rate": 0.0092100578960049, "loss": 2.7493, "step": 2546 }, { "crossentropy": 2.710970163345337, "epoch": 0.13850295005301938, "grad_norm": 0.041024718433618546, "grad_norm_var": 8.964574316089094e-06, "learning_rate": 0.009209386040588043, "loss": 2.711, "step": 2547 }, { "crossentropy": 2.7949814796447754, "epoch": 0.13855732891051958, "grad_norm": 0.040937818586826324, "grad_norm_var": 9.19582711287886e-06, "learning_rate": 0.00920871392410899, "loss": 2.795, "step": 2548 }, { "crossentropy": 2.795302629470825, "epoch": 0.1386117077680198, "grad_norm": 0.041967280209064484, "grad_norm_var": 9.067948295084459e-06, "learning_rate": 0.009208041546609423, "loss": 2.7953, "step": 2549 }, { "crossentropy": 2.720414638519287, "epoch": 0.13866608662552, "grad_norm": 0.04025975987315178, "grad_norm_var": 9.587199996141428e-06, "learning_rate": 0.009207368908131041, "loss": 2.7204, "step": 2550 }, { "crossentropy": 2.850578188896179, "epoch": 0.1387204654830202, "grad_norm": 0.04196768254041672, "grad_norm_var": 9.395546192008294e-06, "learning_rate": 0.009206696008715563, "loss": 2.8506, "step": 2551 }, { "crossentropy": 2.6604844331741333, "epoch": 0.1387748443405204, "grad_norm": 0.10994996875524521, "grad_norm_var": 0.0002902228212151631, "learning_rate": 0.009206022848404721, "loss": 2.6605, "step": 2552 }, { "crossentropy": 2.8620697259902954, "epoch": 0.1388292231980206, "grad_norm": 0.051485851407051086, "grad_norm_var": 0.0002902819483181607, "learning_rate": 0.009205349427240262, "loss": 2.8621, "step": 2553 }, { "crossentropy": 2.647633671760559, "epoch": 0.1388836020555208, "grad_norm": 0.07141785323619843, "grad_norm_var": 0.00032575528378335305, "learning_rate": 0.009204675745263954, "loss": 2.6476, "step": 2554 }, { "crossentropy": 2.7999300956726074, "epoch": 0.138937980913021, "grad_norm": 0.04499531164765358, "grad_norm_var": 0.00032397755990868266, "learning_rate": 0.009204001802517575, "loss": 2.7999, "step": 2555 }, { "crossentropy": 2.65217661857605, "epoch": 0.1389923597705212, "grad_norm": 0.045676860958337784, "grad_norm_var": 0.00032035524319160106, "learning_rate": 0.009203327599042924, "loss": 2.6522, "step": 2556 }, { "crossentropy": 2.812811017036438, "epoch": 0.13904673862802142, "grad_norm": 0.05266677960753441, "grad_norm_var": 0.00031470066813830294, "learning_rate": 0.009202653134881814, "loss": 2.8128, "step": 2557 }, { "crossentropy": 2.7922202348709106, "epoch": 0.13910111748552162, "grad_norm": 0.05491061508655548, "grad_norm_var": 0.0003084118546220251, "learning_rate": 0.009201978410076073, "loss": 2.7922, "step": 2558 }, { "crossentropy": 2.7565144300460815, "epoch": 0.13915549634302182, "grad_norm": 0.06749913841485977, "grad_norm_var": 0.0003243319788274847, "learning_rate": 0.009201303424667549, "loss": 2.7565, "step": 2559 }, { "crossentropy": 2.8072034120559692, "epoch": 0.13920987520052203, "grad_norm": 0.0538744293153286, "grad_norm_var": 0.0003194608265977454, "learning_rate": 0.009200628178698106, "loss": 2.8072, "step": 2560 }, { "crossentropy": 2.6955626010894775, "epoch": 0.13926425405802223, "grad_norm": 0.2440837174654007, "grad_norm_var": 0.0025713715705025907, "learning_rate": 0.009199952672209617, "loss": 2.6956, "step": 2561 }, { "crossentropy": 2.8042303323745728, "epoch": 0.13931863291552243, "grad_norm": 0.045645471662282944, "grad_norm_var": 0.002577993344372076, "learning_rate": 0.00919927690524398, "loss": 2.8042, "step": 2562 }, { "crossentropy": 2.727171301841736, "epoch": 0.13937301177302264, "grad_norm": 0.04727954417467117, "grad_norm_var": 0.002560007771778953, "learning_rate": 0.009198600877843103, "loss": 2.7272, "step": 2563 }, { "crossentropy": 2.780558943748474, "epoch": 0.13942739063052284, "grad_norm": 0.051159631460905075, "grad_norm_var": 0.002532498369928013, "learning_rate": 0.009197924590048916, "loss": 2.7806, "step": 2564 }, { "crossentropy": 2.7354376316070557, "epoch": 0.13948176948802304, "grad_norm": 0.04214831814169884, "grad_norm_var": 0.0025319069708574326, "learning_rate": 0.00919724804190336, "loss": 2.7354, "step": 2565 }, { "crossentropy": 2.6494994163513184, "epoch": 0.13953614834552325, "grad_norm": 0.0721382424235344, "grad_norm_var": 0.002483617403875678, "learning_rate": 0.009196571233448391, "loss": 2.6495, "step": 2566 }, { "crossentropy": 2.810205101966858, "epoch": 0.13959052720302345, "grad_norm": 0.0511036142706871, "grad_norm_var": 0.0024564458480686177, "learning_rate": 0.009195894164725988, "loss": 2.8102, "step": 2567 }, { "crossentropy": 2.7081931829452515, "epoch": 0.13964490606052365, "grad_norm": 0.04891693964600563, "grad_norm_var": 0.0023570553739963285, "learning_rate": 0.00919521683577814, "loss": 2.7082, "step": 2568 }, { "crossentropy": 2.7759952545166016, "epoch": 0.13969928491802389, "grad_norm": 0.04904000461101532, "grad_norm_var": 0.0023619383552248127, "learning_rate": 0.009194539246646856, "loss": 2.776, "step": 2569 }, { "crossentropy": 2.818212389945984, "epoch": 0.1397536637755241, "grad_norm": 0.10743090510368347, "grad_norm_var": 0.0024730467437789356, "learning_rate": 0.009193861397374157, "loss": 2.8182, "step": 2570 }, { "crossentropy": 2.8905831575393677, "epoch": 0.1398080426330243, "grad_norm": 0.04328735172748566, "grad_norm_var": 0.0024783336516874524, "learning_rate": 0.009193183288002086, "loss": 2.8906, "step": 2571 }, { "crossentropy": 2.818553328514099, "epoch": 0.1398624214905245, "grad_norm": 0.05152733251452446, "grad_norm_var": 0.0024636024932973137, "learning_rate": 0.009192504918572694, "loss": 2.8186, "step": 2572 }, { "crossentropy": 2.714559316635132, "epoch": 0.1399168003480247, "grad_norm": 0.05045551061630249, "grad_norm_var": 0.0024683314409150105, "learning_rate": 0.009191826289128058, "loss": 2.7146, "step": 2573 }, { "crossentropy": 2.7614864110946655, "epoch": 0.1399711792055249, "grad_norm": 0.050584279000759125, "grad_norm_var": 0.002476781440316694, "learning_rate": 0.009191147399710265, "loss": 2.7615, "step": 2574 }, { "crossentropy": 2.799337387084961, "epoch": 0.1400255580630251, "grad_norm": 0.05204338952898979, "grad_norm_var": 0.00249122050135413, "learning_rate": 0.009190468250361414, "loss": 2.7993, "step": 2575 }, { "crossentropy": 2.7115237712860107, "epoch": 0.1400799369205253, "grad_norm": 0.05180521681904793, "grad_norm_var": 0.0024949148544865336, "learning_rate": 0.009189788841123632, "loss": 2.7115, "step": 2576 }, { "crossentropy": 2.758355975151062, "epoch": 0.14013431577802551, "grad_norm": 0.04993666335940361, "grad_norm_var": 0.0002450949393745155, "learning_rate": 0.009189109172039049, "loss": 2.7584, "step": 2577 }, { "crossentropy": 2.7061315774917603, "epoch": 0.14018869463552572, "grad_norm": 0.04440387338399887, "grad_norm_var": 0.0002465795479812594, "learning_rate": 0.009188429243149825, "loss": 2.7061, "step": 2578 }, { "crossentropy": 2.780548095703125, "epoch": 0.14024307349302592, "grad_norm": 0.03990189731121063, "grad_norm_var": 0.0002565467772647771, "learning_rate": 0.00918774905449812, "loss": 2.7805, "step": 2579 }, { "crossentropy": 2.763217806816101, "epoch": 0.14029745235052613, "grad_norm": 0.062064915895462036, "grad_norm_var": 0.0002605872366492954, "learning_rate": 0.009187068606126126, "loss": 2.7632, "step": 2580 }, { "crossentropy": 2.6898319721221924, "epoch": 0.14035183120802633, "grad_norm": 0.06206095591187477, "grad_norm_var": 0.0002534401626974057, "learning_rate": 0.009186387898076038, "loss": 2.6898, "step": 2581 }, { "crossentropy": 2.8344262838363647, "epoch": 0.14040621006552653, "grad_norm": 0.04115178436040878, "grad_norm_var": 0.0002443734343864921, "learning_rate": 0.009185706930390078, "loss": 2.8344, "step": 2582 }, { "crossentropy": 2.74008572101593, "epoch": 0.14046058892302674, "grad_norm": 0.12446867674589157, "grad_norm_var": 0.0005575084594694219, "learning_rate": 0.009185025703110474, "loss": 2.7401, "step": 2583 }, { "crossentropy": 2.69505774974823, "epoch": 0.14051496778052694, "grad_norm": 0.04660974070429802, "grad_norm_var": 0.0005606561066582488, "learning_rate": 0.00918434421627948, "loss": 2.6951, "step": 2584 }, { "crossentropy": 2.7121118307113647, "epoch": 0.14056934663802714, "grad_norm": 0.079414501786232, "grad_norm_var": 0.0005823425623589523, "learning_rate": 0.009183662469939359, "loss": 2.7121, "step": 2585 }, { "crossentropy": 2.8395661115646362, "epoch": 0.14062372549552735, "grad_norm": 0.045827750116586685, "grad_norm_var": 0.0004284764522494118, "learning_rate": 0.009182980464132389, "loss": 2.8396, "step": 2586 }, { "crossentropy": 2.729229211807251, "epoch": 0.14067810435302755, "grad_norm": 0.05015897378325462, "grad_norm_var": 0.00041980623770845676, "learning_rate": 0.009182298198900872, "loss": 2.7292, "step": 2587 }, { "crossentropy": 2.737873077392578, "epoch": 0.14073248321052775, "grad_norm": 0.046223677694797516, "grad_norm_var": 0.0004250106944160595, "learning_rate": 0.00918161567428712, "loss": 2.7379, "step": 2588 }, { "crossentropy": 2.8672144412994385, "epoch": 0.14078686206802796, "grad_norm": 0.0447721928358078, "grad_norm_var": 0.0004312835863462478, "learning_rate": 0.009180932890333462, "loss": 2.8672, "step": 2589 }, { "crossentropy": 2.740343689918518, "epoch": 0.14084124092552816, "grad_norm": 0.043550074100494385, "grad_norm_var": 0.0004391874863879106, "learning_rate": 0.009180249847082244, "loss": 2.7403, "step": 2590 }, { "crossentropy": 2.7659188508987427, "epoch": 0.14089561978302836, "grad_norm": 0.10349921882152557, "grad_norm_var": 0.0005824999757023237, "learning_rate": 0.009179566544575827, "loss": 2.7659, "step": 2591 }, { "crossentropy": 2.7728191614151, "epoch": 0.14094999864052857, "grad_norm": 0.055053357034921646, "grad_norm_var": 0.000580264021132527, "learning_rate": 0.009178882982856592, "loss": 2.7728, "step": 2592 }, { "crossentropy": 2.8181211948394775, "epoch": 0.14100437749802877, "grad_norm": 0.04973611980676651, "grad_norm_var": 0.0005805006887903969, "learning_rate": 0.00917819916196693, "loss": 2.8181, "step": 2593 }, { "crossentropy": 2.7097636461257935, "epoch": 0.14105875635552897, "grad_norm": 0.050155531615018845, "grad_norm_var": 0.0005716192513817701, "learning_rate": 0.00917751508194925, "loss": 2.7098, "step": 2594 }, { "crossentropy": 2.824933409690857, "epoch": 0.14111313521302918, "grad_norm": 0.05443526431918144, "grad_norm_var": 0.0005477338130051755, "learning_rate": 0.009176830742845978, "loss": 2.8249, "step": 2595 }, { "crossentropy": 2.7027106285095215, "epoch": 0.14116751407052938, "grad_norm": 0.04649285227060318, "grad_norm_var": 0.0005584959981716181, "learning_rate": 0.00917614614469956, "loss": 2.7027, "step": 2596 }, { "crossentropy": 2.7232606410980225, "epoch": 0.14122189292802959, "grad_norm": 0.06604645401239395, "grad_norm_var": 0.0005611282821153897, "learning_rate": 0.00917546128755245, "loss": 2.7233, "step": 2597 }, { "crossentropy": 2.749642848968506, "epoch": 0.1412762717855298, "grad_norm": 0.04583284258842468, "grad_norm_var": 0.0005512177140153702, "learning_rate": 0.009174776171447125, "loss": 2.7496, "step": 2598 }, { "crossentropy": 2.7205281257629395, "epoch": 0.14133065064303, "grad_norm": 0.04478095844388008, "grad_norm_var": 0.00025799100794245545, "learning_rate": 0.009174090796426074, "loss": 2.7205, "step": 2599 }, { "crossentropy": 2.754762649536133, "epoch": 0.1413850295005302, "grad_norm": 0.04271985590457916, "grad_norm_var": 0.00026304811085160935, "learning_rate": 0.009173405162531801, "loss": 2.7548, "step": 2600 }, { "crossentropy": 2.7140512466430664, "epoch": 0.1414394083580304, "grad_norm": 0.04308999329805374, "grad_norm_var": 0.000223848338986504, "learning_rate": 0.009172719269806832, "loss": 2.7141, "step": 2601 }, { "crossentropy": 2.8349924087524414, "epoch": 0.1414937872155306, "grad_norm": 0.04350573942065239, "grad_norm_var": 0.00022610351824784897, "learning_rate": 0.009172033118293706, "loss": 2.835, "step": 2602 }, { "crossentropy": 2.663567543029785, "epoch": 0.1415481660730308, "grad_norm": 0.041427165269851685, "grad_norm_var": 0.0002328705308112452, "learning_rate": 0.009171346708034973, "loss": 2.6636, "step": 2603 }, { "crossentropy": 2.7458831071853638, "epoch": 0.141602544930531, "grad_norm": 0.043296895921230316, "grad_norm_var": 0.00023539959507838375, "learning_rate": 0.009170660039073207, "loss": 2.7459, "step": 2604 }, { "crossentropy": 2.7412092685699463, "epoch": 0.1416569237880312, "grad_norm": 0.04727786034345627, "grad_norm_var": 0.00023366135247391602, "learning_rate": 0.009169973111450994, "loss": 2.7412, "step": 2605 }, { "crossentropy": 2.85252046585083, "epoch": 0.14171130264553142, "grad_norm": 0.048243448138237, "grad_norm_var": 0.00023018439568777872, "learning_rate": 0.009169285925210935, "loss": 2.8525, "step": 2606 }, { "crossentropy": 2.793068051338196, "epoch": 0.14176568150303162, "grad_norm": 0.04341822490096092, "grad_norm_var": 4.003481428237453e-05, "learning_rate": 0.009168598480395652, "loss": 2.7931, "step": 2607 }, { "crossentropy": 2.7564053535461426, "epoch": 0.14182006036053182, "grad_norm": 0.0408262237906456, "grad_norm_var": 3.9010739310775015e-05, "learning_rate": 0.009167910777047776, "loss": 2.7564, "step": 2608 }, { "crossentropy": 2.902486801147461, "epoch": 0.14187443921803203, "grad_norm": 0.04688310623168945, "grad_norm_var": 3.846165580605783e-05, "learning_rate": 0.00916722281520996, "loss": 2.9025, "step": 2609 }, { "crossentropy": 2.7359567880630493, "epoch": 0.14192881807553223, "grad_norm": 0.04449763521552086, "grad_norm_var": 3.7913694840060036e-05, "learning_rate": 0.009166534594924869, "loss": 2.736, "step": 2610 }, { "crossentropy": 2.7569451332092285, "epoch": 0.14198319693303244, "grad_norm": 0.04162730649113655, "grad_norm_var": 3.448436142835283e-05, "learning_rate": 0.009165846116235188, "loss": 2.7569, "step": 2611 }, { "crossentropy": 2.6882985830307007, "epoch": 0.14203757579053264, "grad_norm": 0.04496346041560173, "grad_norm_var": 3.445315373923804e-05, "learning_rate": 0.009165157379183615, "loss": 2.6883, "step": 2612 }, { "crossentropy": 2.811200261116028, "epoch": 0.14209195464803284, "grad_norm": 0.04751044511795044, "grad_norm_var": 5.214758056010007e-06, "learning_rate": 0.009164468383812864, "loss": 2.8112, "step": 2613 }, { "crossentropy": 2.7171385288238525, "epoch": 0.14214633350553305, "grad_norm": 0.04335377365350723, "grad_norm_var": 5.1149486146604785e-06, "learning_rate": 0.009163779130165665, "loss": 2.7171, "step": 2614 }, { "crossentropy": 2.771382212638855, "epoch": 0.14220071236303325, "grad_norm": 0.044618915766477585, "grad_norm_var": 5.104337624702258e-06, "learning_rate": 0.00916308961828477, "loss": 2.7714, "step": 2615 }, { "crossentropy": 2.727513551712036, "epoch": 0.14225509122053345, "grad_norm": 0.044681042432785034, "grad_norm_var": 4.956701849014265e-06, "learning_rate": 0.009162399848212934, "loss": 2.7275, "step": 2616 }, { "crossentropy": 2.847384810447693, "epoch": 0.14230947007803366, "grad_norm": 0.04201665148139, "grad_norm_var": 5.205640292621899e-06, "learning_rate": 0.009161709819992942, "loss": 2.8474, "step": 2617 }, { "crossentropy": 2.782588839530945, "epoch": 0.14236384893553386, "grad_norm": 0.04277493804693222, "grad_norm_var": 5.31244127503656e-06, "learning_rate": 0.009161019533667587, "loss": 2.7826, "step": 2618 }, { "crossentropy": 2.773630380630493, "epoch": 0.14241822779303406, "grad_norm": 0.04259316250681877, "grad_norm_var": 4.964221353705572e-06, "learning_rate": 0.00916032898927968, "loss": 2.7736, "step": 2619 }, { "crossentropy": 2.8245437145233154, "epoch": 0.14247260665053427, "grad_norm": 0.05030105262994766, "grad_norm_var": 7.1062337024201e-06, "learning_rate": 0.009159638186872049, "loss": 2.8245, "step": 2620 }, { "crossentropy": 2.632408380508423, "epoch": 0.14252698550803447, "grad_norm": 0.044208381325006485, "grad_norm_var": 6.649970336499352e-06, "learning_rate": 0.009158947126487533, "loss": 2.6324, "step": 2621 }, { "crossentropy": 2.7006213665008545, "epoch": 0.14258136436553467, "grad_norm": 0.05135609582066536, "grad_norm_var": 8.795680752290921e-06, "learning_rate": 0.009158255808168994, "loss": 2.7006, "step": 2622 }, { "crossentropy": 2.713031053543091, "epoch": 0.14263574322303488, "grad_norm": 0.045016247779130936, "grad_norm_var": 8.676446328357602e-06, "learning_rate": 0.009157564231959306, "loss": 2.713, "step": 2623 }, { "crossentropy": 2.8037211894989014, "epoch": 0.14269012208053508, "grad_norm": 0.04326149821281433, "grad_norm_var": 7.748113632701353e-06, "learning_rate": 0.009156872397901363, "loss": 2.8037, "step": 2624 }, { "crossentropy": 2.7213228940963745, "epoch": 0.14274450093803528, "grad_norm": 0.04033192619681358, "grad_norm_var": 8.767251431424095e-06, "learning_rate": 0.009156180306038066, "loss": 2.7213, "step": 2625 }, { "crossentropy": 2.6575303077697754, "epoch": 0.1427988797955355, "grad_norm": 0.040669217705726624, "grad_norm_var": 9.720001014825765e-06, "learning_rate": 0.009155487956412346, "loss": 2.6575, "step": 2626 }, { "crossentropy": 2.665833830833435, "epoch": 0.1428532586530357, "grad_norm": 0.04106822609901428, "grad_norm_var": 9.941025595872066e-06, "learning_rate": 0.009154795349067133, "loss": 2.6658, "step": 2627 }, { "crossentropy": 2.9348338842391968, "epoch": 0.1429076375105359, "grad_norm": 0.04324992373585701, "grad_norm_var": 9.971886300643117e-06, "learning_rate": 0.009154102484045386, "loss": 2.9348, "step": 2628 }, { "crossentropy": 2.858519196510315, "epoch": 0.1429620163680361, "grad_norm": 0.041986264288425446, "grad_norm_var": 9.432161535163908e-06, "learning_rate": 0.009153409361390077, "loss": 2.8585, "step": 2629 }, { "crossentropy": 2.8987555503845215, "epoch": 0.1430163952255363, "grad_norm": 0.04178966209292412, "grad_norm_var": 9.687082730420032e-06, "learning_rate": 0.009152715981144192, "loss": 2.8988, "step": 2630 }, { "crossentropy": 2.7923747301101685, "epoch": 0.1430707740830365, "grad_norm": 0.04384298622608185, "grad_norm_var": 9.634319685976565e-06, "learning_rate": 0.009152022343350733, "loss": 2.7924, "step": 2631 }, { "crossentropy": 2.816634774208069, "epoch": 0.1431251529405367, "grad_norm": 0.04549788311123848, "grad_norm_var": 9.78322775015087e-06, "learning_rate": 0.009151328448052718, "loss": 2.8166, "step": 2632 }, { "crossentropy": 2.7008392810821533, "epoch": 0.1431795317980369, "grad_norm": 0.052190184593200684, "grad_norm_var": 1.3903831227734161e-05, "learning_rate": 0.009150634295293185, "loss": 2.7008, "step": 2633 }, { "crossentropy": 2.8830312490463257, "epoch": 0.14323391065553712, "grad_norm": 0.044952768832445145, "grad_norm_var": 1.3733145360074414e-05, "learning_rate": 0.009149939885115182, "loss": 2.883, "step": 2634 }, { "crossentropy": 2.7665884494781494, "epoch": 0.14328828951303732, "grad_norm": 0.04441102594137192, "grad_norm_var": 1.3472722708993828e-05, "learning_rate": 0.009149245217561776, "loss": 2.7666, "step": 2635 }, { "crossentropy": 2.7843557596206665, "epoch": 0.14334266837053752, "grad_norm": 0.04085492342710495, "grad_norm_var": 1.1911157234276853e-05, "learning_rate": 0.00914855029267605, "loss": 2.7844, "step": 2636 }, { "crossentropy": 2.686606764793396, "epoch": 0.14339704722803773, "grad_norm": 0.044687800109386444, "grad_norm_var": 1.1936097105607184e-05, "learning_rate": 0.009147855110501103, "loss": 2.6866, "step": 2637 }, { "crossentropy": 2.771600127220154, "epoch": 0.14345142608553793, "grad_norm": 0.03962972015142441, "grad_norm_var": 9.142964182775233e-06, "learning_rate": 0.00914715967108005, "loss": 2.7716, "step": 2638 }, { "crossentropy": 2.655178427696228, "epoch": 0.14350580494303813, "grad_norm": 0.06336114555597305, "grad_norm_var": 3.4276458537938334e-05, "learning_rate": 0.009146463974456018, "loss": 2.6552, "step": 2639 }, { "crossentropy": 2.7712334394454956, "epoch": 0.14356018380053834, "grad_norm": 0.050406694412231445, "grad_norm_var": 3.6300203569521125e-05, "learning_rate": 0.009145768020672157, "loss": 2.7712, "step": 2640 }, { "crossentropy": 2.765931487083435, "epoch": 0.14361456265803854, "grad_norm": 0.05199658125638962, "grad_norm_var": 3.764799415700385e-05, "learning_rate": 0.00914507180977163, "loss": 2.7659, "step": 2641 }, { "crossentropy": 2.741411805152893, "epoch": 0.14366894151553875, "grad_norm": 0.058608412742614746, "grad_norm_var": 4.58187650803558e-05, "learning_rate": 0.00914437534179761, "loss": 2.7414, "step": 2642 }, { "crossentropy": 2.6393022537231445, "epoch": 0.14372332037303895, "grad_norm": 0.04131844639778137, "grad_norm_var": 4.5632004959920515e-05, "learning_rate": 0.009143678616793298, "loss": 2.6393, "step": 2643 }, { "crossentropy": 2.75165855884552, "epoch": 0.14377769923053918, "grad_norm": 0.04341216757893562, "grad_norm_var": 4.555687414338946e-05, "learning_rate": 0.009142981634801902, "loss": 2.7517, "step": 2644 }, { "crossentropy": 2.7176594734191895, "epoch": 0.14383207808803938, "grad_norm": 0.04778437316417694, "grad_norm_var": 4.392950822121854e-05, "learning_rate": 0.009142284395866647, "loss": 2.7177, "step": 2645 }, { "crossentropy": 2.6825376749038696, "epoch": 0.1438864569455396, "grad_norm": 0.17951618134975433, "grad_norm_var": 0.001130636176099684, "learning_rate": 0.009141586900030774, "loss": 2.6825, "step": 2646 }, { "crossentropy": 2.6931731700897217, "epoch": 0.1439408358030398, "grad_norm": 0.04662073776125908, "grad_norm_var": 0.0011266975471355373, "learning_rate": 0.009140889147337544, "loss": 2.6932, "step": 2647 }, { "crossentropy": 2.8724653720855713, "epoch": 0.14399521466054, "grad_norm": 0.0446174256503582, "grad_norm_var": 0.0011279733765693393, "learning_rate": 0.009140191137830229, "loss": 2.8725, "step": 2648 }, { "crossentropy": 2.836632251739502, "epoch": 0.1440495935180402, "grad_norm": 0.04377656802535057, "grad_norm_var": 0.0011365572117969225, "learning_rate": 0.00913949287155212, "loss": 2.8366, "step": 2649 }, { "crossentropy": 2.639215588569641, "epoch": 0.1441039723755404, "grad_norm": 0.04035341367125511, "grad_norm_var": 0.0011442690209177745, "learning_rate": 0.009138794348546522, "loss": 2.6392, "step": 2650 }, { "crossentropy": 2.7877272367477417, "epoch": 0.1441583512330406, "grad_norm": 0.041055694222450256, "grad_norm_var": 0.0011497478355505279, "learning_rate": 0.009138095568856757, "loss": 2.7877, "step": 2651 }, { "crossentropy": 2.6787333488464355, "epoch": 0.1442127300905408, "grad_norm": 0.043856870383024216, "grad_norm_var": 0.0011446993885957351, "learning_rate": 0.009137396532526165, "loss": 2.6787, "step": 2652 }, { "crossentropy": 2.7522599697113037, "epoch": 0.144267108948041, "grad_norm": 0.04653775691986084, "grad_norm_var": 0.0011423542175465094, "learning_rate": 0.009136697239598096, "loss": 2.7523, "step": 2653 }, { "crossentropy": 2.6699399948120117, "epoch": 0.14432148780554122, "grad_norm": 0.04510132223367691, "grad_norm_var": 0.0011328819783124294, "learning_rate": 0.00913599769011592, "loss": 2.6699, "step": 2654 }, { "crossentropy": 2.714115023612976, "epoch": 0.14437586666304142, "grad_norm": 0.042080316692590714, "grad_norm_var": 0.0011389384452638641, "learning_rate": 0.009135297884123024, "loss": 2.7141, "step": 2655 }, { "crossentropy": 2.7819257974624634, "epoch": 0.14443024552054162, "grad_norm": 0.04260430112481117, "grad_norm_var": 0.0011466793167791514, "learning_rate": 0.009134597821662807, "loss": 2.7819, "step": 2656 }, { "crossentropy": 2.6711100339889526, "epoch": 0.14448462437804183, "grad_norm": 0.039362289011478424, "grad_norm_var": 0.0011595297039485972, "learning_rate": 0.009133897502778689, "loss": 2.6711, "step": 2657 }, { "crossentropy": 2.7037020921707153, "epoch": 0.14453900323554203, "grad_norm": 0.040576860308647156, "grad_norm_var": 0.0011661575491000884, "learning_rate": 0.009133196927514102, "loss": 2.7037, "step": 2658 }, { "crossentropy": 2.6844260692596436, "epoch": 0.14459338209304223, "grad_norm": 0.040332626551389694, "grad_norm_var": 0.0011675941616142683, "learning_rate": 0.009132496095912495, "loss": 2.6844, "step": 2659 }, { "crossentropy": 2.742575168609619, "epoch": 0.14464776095054244, "grad_norm": 0.044686537235975266, "grad_norm_var": 0.001166283297716986, "learning_rate": 0.009131795008017331, "loss": 2.7426, "step": 2660 }, { "crossentropy": 2.783678650856018, "epoch": 0.14470213980804264, "grad_norm": 0.04240662232041359, "grad_norm_var": 0.0011709729848349707, "learning_rate": 0.009131093663872094, "loss": 2.7837, "step": 2661 }, { "crossentropy": 2.69681453704834, "epoch": 0.14475651866554284, "grad_norm": 0.04764324799180031, "grad_norm_var": 6.396101360386409e-06, "learning_rate": 0.00913039206352028, "loss": 2.6968, "step": 2662 }, { "crossentropy": 2.782546281814575, "epoch": 0.14481089752304305, "grad_norm": 0.0506497286260128, "grad_norm_var": 9.234412799586678e-06, "learning_rate": 0.009129690207005402, "loss": 2.7825, "step": 2663 }, { "crossentropy": 2.68765127658844, "epoch": 0.14486527638054325, "grad_norm": 0.06411605328321457, "grad_norm_var": 3.5960033987718816e-05, "learning_rate": 0.009128988094370984, "loss": 2.6877, "step": 2664 }, { "crossentropy": 2.8535943031311035, "epoch": 0.14491965523804345, "grad_norm": 0.04798252135515213, "grad_norm_var": 3.654990246109615e-05, "learning_rate": 0.009128285725660577, "loss": 2.8536, "step": 2665 }, { "crossentropy": 2.821890950202942, "epoch": 0.14497403409554366, "grad_norm": 0.05465186759829521, "grad_norm_var": 4.054713787892168e-05, "learning_rate": 0.009127583100917738, "loss": 2.8219, "step": 2666 }, { "crossentropy": 2.77485191822052, "epoch": 0.14502841295304386, "grad_norm": 0.07000131905078888, "grad_norm_var": 7.4398727210366e-05, "learning_rate": 0.009126880220186042, "loss": 2.7749, "step": 2667 }, { "crossentropy": 2.857543706893921, "epoch": 0.14508279181054407, "grad_norm": 0.0451011136174202, "grad_norm_var": 7.386423670786545e-05, "learning_rate": 0.009126177083509083, "loss": 2.8575, "step": 2668 }, { "crossentropy": 2.843836545944214, "epoch": 0.14513717066804427, "grad_norm": 0.04393293336033821, "grad_norm_var": 7.470573689537698e-05, "learning_rate": 0.009125473690930466, "loss": 2.8438, "step": 2669 }, { "crossentropy": 2.8403488397598267, "epoch": 0.14519154952554447, "grad_norm": 0.04101330786943436, "grad_norm_var": 7.70995627008987e-05, "learning_rate": 0.00912477004249382, "loss": 2.8403, "step": 2670 }, { "crossentropy": 2.67095148563385, "epoch": 0.14524592838304468, "grad_norm": 0.04179608076810837, "grad_norm_var": 7.730323751496496e-05, "learning_rate": 0.009124066138242778, "loss": 2.671, "step": 2671 }, { "crossentropy": 2.829958438873291, "epoch": 0.14530030724054488, "grad_norm": 0.04151780530810356, "grad_norm_var": 7.80577845104063e-05, "learning_rate": 0.009123361978221, "loss": 2.83, "step": 2672 }, { "crossentropy": 2.747593641281128, "epoch": 0.14535468609804508, "grad_norm": 0.03847433626651764, "grad_norm_var": 7.90392234041093e-05, "learning_rate": 0.009122657562472158, "loss": 2.7476, "step": 2673 }, { "crossentropy": 2.865001678466797, "epoch": 0.1454090649555453, "grad_norm": 0.04199792444705963, "grad_norm_var": 7.791427095312775e-05, "learning_rate": 0.009121952891039937, "loss": 2.865, "step": 2674 }, { "crossentropy": 2.679887890815735, "epoch": 0.1454634438130455, "grad_norm": 0.04130001738667488, "grad_norm_var": 7.707806984670306e-05, "learning_rate": 0.00912124796396804, "loss": 2.6799, "step": 2675 }, { "crossentropy": 2.729501247406006, "epoch": 0.1455178226705457, "grad_norm": 0.04248283430933952, "grad_norm_var": 7.815815226314115e-05, "learning_rate": 0.009120542781300187, "loss": 2.7295, "step": 2676 }, { "crossentropy": 2.7620128393173218, "epoch": 0.1455722015280459, "grad_norm": 0.05982892960309982, "grad_norm_var": 8.601351464840029e-05, "learning_rate": 0.009119837343080113, "loss": 2.762, "step": 2677 }, { "crossentropy": 2.7082639932632446, "epoch": 0.1456265803855461, "grad_norm": 0.044398512691259384, "grad_norm_var": 8.694728370053495e-05, "learning_rate": 0.009119131649351566, "loss": 2.7083, "step": 2678 }, { "crossentropy": 2.708625316619873, "epoch": 0.1456809592430463, "grad_norm": 0.04405989497900009, "grad_norm_var": 8.740161867627957e-05, "learning_rate": 0.009118425700158316, "loss": 2.7086, "step": 2679 }, { "crossentropy": 2.788426995277405, "epoch": 0.1457353381005465, "grad_norm": 0.04951423779129982, "grad_norm_var": 6.870061307732335e-05, "learning_rate": 0.009117719495544144, "loss": 2.7884, "step": 2680 }, { "crossentropy": 2.7426837682724, "epoch": 0.1457897169580467, "grad_norm": 0.058289628475904465, "grad_norm_var": 7.702961530256328e-05, "learning_rate": 0.009117013035552846, "loss": 2.7427, "step": 2681 }, { "crossentropy": 2.8179203271865845, "epoch": 0.14584409581554691, "grad_norm": 0.05262281373143196, "grad_norm_var": 7.532434374902784e-05, "learning_rate": 0.009116306320228239, "loss": 2.8179, "step": 2682 }, { "crossentropy": 2.868131160736084, "epoch": 0.14589847467304712, "grad_norm": 0.04283849149942398, "grad_norm_var": 3.911443656758514e-05, "learning_rate": 0.009115599349614152, "loss": 2.8681, "step": 2683 }, { "crossentropy": 2.698603868484497, "epoch": 0.14595285353054732, "grad_norm": 0.04288945347070694, "grad_norm_var": 3.9559321135203734e-05, "learning_rate": 0.00911489212375443, "loss": 2.6986, "step": 2684 }, { "crossentropy": 2.849348306655884, "epoch": 0.14600723238804753, "grad_norm": 0.04806680977344513, "grad_norm_var": 3.979956153195846e-05, "learning_rate": 0.009114184642692933, "loss": 2.8493, "step": 2685 }, { "crossentropy": 2.8753254413604736, "epoch": 0.14606161124554773, "grad_norm": 0.05086292698979378, "grad_norm_var": 3.971698814825263e-05, "learning_rate": 0.009113476906473541, "loss": 2.8753, "step": 2686 }, { "crossentropy": 2.7436243295669556, "epoch": 0.14611599010304793, "grad_norm": 0.045995090156793594, "grad_norm_var": 3.829244500251205e-05, "learning_rate": 0.009112768915140148, "loss": 2.7436, "step": 2687 }, { "crossentropy": 2.868555188179016, "epoch": 0.14617036896054814, "grad_norm": 0.0440615750849247, "grad_norm_var": 3.698290069162746e-05, "learning_rate": 0.00911206066873666, "loss": 2.8686, "step": 2688 }, { "crossentropy": 2.6555731296539307, "epoch": 0.14622474781804834, "grad_norm": 0.04282539710402489, "grad_norm_var": 3.3376555095034297e-05, "learning_rate": 0.009111352167307003, "loss": 2.6556, "step": 2689 }, { "crossentropy": 2.748712420463562, "epoch": 0.14627912667554854, "grad_norm": 0.045227911323308945, "grad_norm_var": 3.187345796388883e-05, "learning_rate": 0.009110643410895118, "loss": 2.7487, "step": 2690 }, { "crossentropy": 2.677029848098755, "epoch": 0.14633350553304875, "grad_norm": 0.04450041800737381, "grad_norm_var": 2.9994256287866476e-05, "learning_rate": 0.009109934399544961, "loss": 2.677, "step": 2691 }, { "crossentropy": 2.7731921672821045, "epoch": 0.14638788439054895, "grad_norm": 0.057977765798568726, "grad_norm_var": 3.483285985171839e-05, "learning_rate": 0.009109225133300503, "loss": 2.7732, "step": 2692 }, { "crossentropy": 2.720455050468445, "epoch": 0.14644226324804915, "grad_norm": 0.041935089975595474, "grad_norm_var": 2.7511413452946157e-05, "learning_rate": 0.009108515612205735, "loss": 2.7205, "step": 2693 }, { "crossentropy": 2.6467570066452026, "epoch": 0.14649664210554936, "grad_norm": 0.039395369589328766, "grad_norm_var": 3.098081748164148e-05, "learning_rate": 0.009107805836304658, "loss": 2.6468, "step": 2694 }, { "crossentropy": 2.655234932899475, "epoch": 0.14655102096304956, "grad_norm": 0.0388309620320797, "grad_norm_var": 3.4698656399088226e-05, "learning_rate": 0.009107095805641292, "loss": 2.6552, "step": 2695 }, { "crossentropy": 2.803689956665039, "epoch": 0.14660539982054976, "grad_norm": 0.04289506375789642, "grad_norm_var": 3.4877922463117236e-05, "learning_rate": 0.009106385520259675, "loss": 2.8037, "step": 2696 }, { "crossentropy": 2.7351930141448975, "epoch": 0.14665977867804997, "grad_norm": 0.04875509440898895, "grad_norm_var": 2.5191607882853415e-05, "learning_rate": 0.009105674980203855, "loss": 2.7352, "step": 2697 }, { "crossentropy": 2.767076849937439, "epoch": 0.14671415753555017, "grad_norm": 0.04173712804913521, "grad_norm_var": 2.2411934946403357e-05, "learning_rate": 0.009104964185517902, "loss": 2.7671, "step": 2698 }, { "crossentropy": 2.808807611465454, "epoch": 0.14676853639305038, "grad_norm": 0.04160887375473976, "grad_norm_var": 2.2848457608352855e-05, "learning_rate": 0.009104253136245894, "loss": 2.8088, "step": 2699 }, { "crossentropy": 2.66647207736969, "epoch": 0.14682291525055058, "grad_norm": 0.04705733805894852, "grad_norm_var": 2.2845868680522404e-05, "learning_rate": 0.009103541832431936, "loss": 2.6665, "step": 2700 }, { "crossentropy": 2.8693222999572754, "epoch": 0.14687729410805078, "grad_norm": 0.05065351724624634, "grad_norm_var": 2.4284432598759877e-05, "learning_rate": 0.009102830274120139, "loss": 2.8693, "step": 2701 }, { "crossentropy": 2.8520067930221558, "epoch": 0.14693167296555099, "grad_norm": 0.0465378500521183, "grad_norm_var": 2.2228246565613763e-05, "learning_rate": 0.009102118461354633, "loss": 2.852, "step": 2702 }, { "crossentropy": 2.7667168378829956, "epoch": 0.1469860518230512, "grad_norm": 0.04535014182329178, "grad_norm_var": 2.2168643205270484e-05, "learning_rate": 0.009101406394179565, "loss": 2.7667, "step": 2703 }, { "crossentropy": 2.9285298585891724, "epoch": 0.1470404306805514, "grad_norm": 0.04982070252299309, "grad_norm_var": 2.3552233310097927e-05, "learning_rate": 0.009100694072639096, "loss": 2.9285, "step": 2704 }, { "crossentropy": 2.6960668563842773, "epoch": 0.1470948095380516, "grad_norm": 0.04473773390054703, "grad_norm_var": 2.3144909654220815e-05, "learning_rate": 0.009099981496777402, "loss": 2.6961, "step": 2705 }, { "crossentropy": 2.7672336101531982, "epoch": 0.1471491883955518, "grad_norm": 0.10641782730817795, "grad_norm_var": 0.0002554371234992893, "learning_rate": 0.009099268666638679, "loss": 2.7672, "step": 2706 }, { "crossentropy": 2.752339005470276, "epoch": 0.147203567253052, "grad_norm": 0.06369279325008392, "grad_norm_var": 0.0002662709995954573, "learning_rate": 0.009098555582267135, "loss": 2.7523, "step": 2707 }, { "crossentropy": 2.628996968269348, "epoch": 0.1472579461105522, "grad_norm": 0.0436333566904068, "grad_norm_var": 0.00026475791152450655, "learning_rate": 0.009097842243706996, "loss": 2.629, "step": 2708 }, { "crossentropy": 2.7159438133239746, "epoch": 0.1473123249680524, "grad_norm": 0.042930543422698975, "grad_norm_var": 0.0002638069921597522, "learning_rate": 0.0090971286510025, "loss": 2.7159, "step": 2709 }, { "crossentropy": 2.7171117067337036, "epoch": 0.14736670382555261, "grad_norm": 0.042772501707077026, "grad_norm_var": 0.0002599120361692163, "learning_rate": 0.009096414804197905, "loss": 2.7171, "step": 2710 }, { "crossentropy": 2.7487391233444214, "epoch": 0.14742108268305282, "grad_norm": 0.043397456407547, "grad_norm_var": 0.0002545126388051172, "learning_rate": 0.009095700703337484, "loss": 2.7487, "step": 2711 }, { "crossentropy": 2.7171601057052612, "epoch": 0.14747546154055302, "grad_norm": 0.04443284869194031, "grad_norm_var": 0.0002531780527326349, "learning_rate": 0.009094986348465523, "loss": 2.7172, "step": 2712 }, { "crossentropy": 2.7599211931228638, "epoch": 0.14752984039805322, "grad_norm": 0.041152797639369965, "grad_norm_var": 0.00025827611636176174, "learning_rate": 0.009094271739626325, "loss": 2.7599, "step": 2713 }, { "crossentropy": 2.7703827619552612, "epoch": 0.14758421925555343, "grad_norm": 0.08191954344511032, "grad_norm_var": 0.00031628236420286375, "learning_rate": 0.009093556876864212, "loss": 2.7704, "step": 2714 }, { "crossentropy": 2.6633776426315308, "epoch": 0.14763859811305363, "grad_norm": 0.04027277231216431, "grad_norm_var": 0.00031829090998784413, "learning_rate": 0.00909284176022352, "loss": 2.6634, "step": 2715 }, { "crossentropy": 2.6843968629837036, "epoch": 0.14769297697055384, "grad_norm": 0.044465068727731705, "grad_norm_var": 0.0003204793110052353, "learning_rate": 0.009092126389748597, "loss": 2.6844, "step": 2716 }, { "crossentropy": 2.812389850616455, "epoch": 0.14774735582805404, "grad_norm": 0.04173552244901657, "grad_norm_var": 0.00032706496331470487, "learning_rate": 0.009091410765483808, "loss": 2.8124, "step": 2717 }, { "crossentropy": 2.7343353033065796, "epoch": 0.14780173468555427, "grad_norm": 0.040179293602705, "grad_norm_var": 0.0003337601540827586, "learning_rate": 0.009090694887473539, "loss": 2.7343, "step": 2718 }, { "crossentropy": 2.792637348175049, "epoch": 0.14785611354305447, "grad_norm": 0.042176246643066406, "grad_norm_var": 0.00033680478837683946, "learning_rate": 0.009089978755762188, "loss": 2.7926, "step": 2719 }, { "crossentropy": 2.7083334922790527, "epoch": 0.14791049240055468, "grad_norm": 0.04513462260365486, "grad_norm_var": 0.00033882571375351594, "learning_rate": 0.009089262370394166, "loss": 2.7083, "step": 2720 }, { "crossentropy": 2.6673206090927124, "epoch": 0.14796487125805488, "grad_norm": 0.042949460446834564, "grad_norm_var": 0.00034041517944225397, "learning_rate": 0.009088545731413903, "loss": 2.6673, "step": 2721 }, { "crossentropy": 2.7589410543441772, "epoch": 0.14801925011555508, "grad_norm": 0.0414574109017849, "grad_norm_var": 0.00011943096267853503, "learning_rate": 0.009087828838865847, "loss": 2.7589, "step": 2722 }, { "crossentropy": 2.8009074926376343, "epoch": 0.1480736289730553, "grad_norm": 0.04098251834511757, "grad_norm_var": 9.928403600417361e-05, "learning_rate": 0.009087111692794459, "loss": 2.8009, "step": 2723 }, { "crossentropy": 2.77269446849823, "epoch": 0.1481280078305555, "grad_norm": 0.04506535828113556, "grad_norm_var": 9.915613147105136e-05, "learning_rate": 0.009086394293244213, "loss": 2.7727, "step": 2724 }, { "crossentropy": 2.5899710655212402, "epoch": 0.1481823866880557, "grad_norm": 0.05175529047846794, "grad_norm_var": 0.00010151309874967077, "learning_rate": 0.009085676640259603, "loss": 2.59, "step": 2725 }, { "crossentropy": 2.7748912572860718, "epoch": 0.1482367655455559, "grad_norm": 0.0487489178776741, "grad_norm_var": 0.00010147995134975307, "learning_rate": 0.009084958733885136, "loss": 2.7749, "step": 2726 }, { "crossentropy": 2.704935908317566, "epoch": 0.1482911444030561, "grad_norm": 0.0411534309387207, "grad_norm_var": 0.00010257009924693448, "learning_rate": 0.009084240574165337, "loss": 2.7049, "step": 2727 }, { "crossentropy": 2.6433286666870117, "epoch": 0.1483455232605563, "grad_norm": 0.040946509689092636, "grad_norm_var": 0.00010398796620830758, "learning_rate": 0.009083522161144745, "loss": 2.6433, "step": 2728 }, { "crossentropy": 2.7879068851470947, "epoch": 0.1483999021180565, "grad_norm": 0.04076870530843735, "grad_norm_var": 0.00010422652177687336, "learning_rate": 0.009082803494867918, "loss": 2.7879, "step": 2729 }, { "crossentropy": 2.855169177055359, "epoch": 0.1484542809755567, "grad_norm": 0.042474158108234406, "grad_norm_var": 1.0490600954103588e-05, "learning_rate": 0.009082084575379422, "loss": 2.8552, "step": 2730 }, { "crossentropy": 2.7125771045684814, "epoch": 0.14850865983305692, "grad_norm": 0.050174228847026825, "grad_norm_var": 1.283064471088487e-05, "learning_rate": 0.009081365402723848, "loss": 2.7126, "step": 2731 }, { "crossentropy": 2.8210527896881104, "epoch": 0.14856303869055712, "grad_norm": 0.05915188416838646, "grad_norm_var": 2.769192387750598e-05, "learning_rate": 0.009080645976945797, "loss": 2.8211, "step": 2732 }, { "crossentropy": 2.7320127487182617, "epoch": 0.14861741754805732, "grad_norm": 0.03991451859474182, "grad_norm_var": 2.8613696732153412e-05, "learning_rate": 0.009079926298089885, "loss": 2.732, "step": 2733 }, { "crossentropy": 2.811127781867981, "epoch": 0.14867179640555753, "grad_norm": 0.0402812696993351, "grad_norm_var": 2.8554721375080064e-05, "learning_rate": 0.009079206366200748, "loss": 2.8111, "step": 2734 }, { "crossentropy": 2.8205024003982544, "epoch": 0.14872617526305773, "grad_norm": 0.04698733985424042, "grad_norm_var": 2.8465259675799526e-05, "learning_rate": 0.009078486181323035, "loss": 2.8205, "step": 2735 }, { "crossentropy": 2.654941201210022, "epoch": 0.14878055412055793, "grad_norm": 0.04543142765760422, "grad_norm_var": 2.848117430199725e-05, "learning_rate": 0.009077765743501413, "loss": 2.6549, "step": 2736 }, { "crossentropy": 2.7461997270584106, "epoch": 0.14883493297805814, "grad_norm": 0.04421349987387657, "grad_norm_var": 2.825395514257084e-05, "learning_rate": 0.00907704505278056, "loss": 2.7462, "step": 2737 }, { "crossentropy": 2.7692583799362183, "epoch": 0.14888931183555834, "grad_norm": 0.04544518142938614, "grad_norm_var": 2.7380646183203932e-05, "learning_rate": 0.009076324109205175, "loss": 2.7693, "step": 2738 }, { "crossentropy": 2.7099913358688354, "epoch": 0.14894369069305854, "grad_norm": 0.04308931902050972, "grad_norm_var": 2.646817429751494e-05, "learning_rate": 0.009075602912819968, "loss": 2.71, "step": 2739 }, { "crossentropy": 2.7279164791107178, "epoch": 0.14899806955055875, "grad_norm": 0.04290616884827614, "grad_norm_var": 2.6841520247513664e-05, "learning_rate": 0.009074881463669669, "loss": 2.7279, "step": 2740 }, { "crossentropy": 2.628283381462097, "epoch": 0.14905244840805895, "grad_norm": 0.04314889758825302, "grad_norm_var": 2.396591993911526e-05, "learning_rate": 0.00907415976179902, "loss": 2.6283, "step": 2741 }, { "crossentropy": 2.683390259742737, "epoch": 0.14910682726555916, "grad_norm": 0.05352712422609329, "grad_norm_var": 2.7986930941594167e-05, "learning_rate": 0.00907343780725278, "loss": 2.6834, "step": 2742 }, { "crossentropy": 2.680178642272949, "epoch": 0.14916120612305936, "grad_norm": 0.04389801248908043, "grad_norm_var": 2.7058932862997614e-05, "learning_rate": 0.009072715600075727, "loss": 2.6802, "step": 2743 }, { "crossentropy": 2.7435628175735474, "epoch": 0.14921558498055956, "grad_norm": 0.06132996082305908, "grad_norm_var": 4.160962431521027e-05, "learning_rate": 0.009071993140312647, "loss": 2.7436, "step": 2744 }, { "crossentropy": 2.7800108194351196, "epoch": 0.14926996383805977, "grad_norm": 0.06322053074836731, "grad_norm_var": 5.619325976724358e-05, "learning_rate": 0.009071270428008352, "loss": 2.78, "step": 2745 }, { "crossentropy": 2.747239351272583, "epoch": 0.14932434269555997, "grad_norm": 0.046252910047769547, "grad_norm_var": 5.438996525617314e-05, "learning_rate": 0.009070547463207656, "loss": 2.7472, "step": 2746 }, { "crossentropy": 2.7395522594451904, "epoch": 0.14937872155306017, "grad_norm": 0.050277307629585266, "grad_norm_var": 5.441967640829736e-05, "learning_rate": 0.009069824245955406, "loss": 2.7396, "step": 2747 }, { "crossentropy": 2.7373191118240356, "epoch": 0.14943310041056038, "grad_norm": 0.04128420725464821, "grad_norm_var": 4.7965392483186446e-05, "learning_rate": 0.009069100776296449, "loss": 2.7373, "step": 2748 }, { "crossentropy": 2.774502158164978, "epoch": 0.14948747926806058, "grad_norm": 0.03990419954061508, "grad_norm_var": 4.7975079733547124e-05, "learning_rate": 0.009068377054275654, "loss": 2.7745, "step": 2749 }, { "crossentropy": 2.7159576416015625, "epoch": 0.14954185812556078, "grad_norm": 0.04152807220816612, "grad_norm_var": 4.696365252222726e-05, "learning_rate": 0.009067653079937909, "loss": 2.716, "step": 2750 }, { "crossentropy": 2.6785370111465454, "epoch": 0.149596236983061, "grad_norm": 0.04465019330382347, "grad_norm_var": 4.731763908952964e-05, "learning_rate": 0.009066928853328111, "loss": 2.6785, "step": 2751 }, { "crossentropy": 2.7926470041275024, "epoch": 0.1496506158405612, "grad_norm": 0.04631073772907257, "grad_norm_var": 4.719593272273874e-05, "learning_rate": 0.009066204374491179, "loss": 2.7926, "step": 2752 }, { "crossentropy": 2.775046706199646, "epoch": 0.1497049946980614, "grad_norm": 0.043671414256095886, "grad_norm_var": 4.7411122494005604e-05, "learning_rate": 0.009065479643472039, "loss": 2.775, "step": 2753 }, { "crossentropy": 2.80887770652771, "epoch": 0.1497593735555616, "grad_norm": 0.04203035682439804, "grad_norm_var": 4.880358890414349e-05, "learning_rate": 0.009064754660315644, "loss": 2.8089, "step": 2754 }, { "crossentropy": 2.8297680616378784, "epoch": 0.1498137524130618, "grad_norm": 0.042495276778936386, "grad_norm_var": 4.911078609239333e-05, "learning_rate": 0.009064029425066955, "loss": 2.8298, "step": 2755 }, { "crossentropy": 2.6341731548309326, "epoch": 0.149868131270562, "grad_norm": 0.04860452562570572, "grad_norm_var": 4.829406944474868e-05, "learning_rate": 0.009063303937770951, "loss": 2.6342, "step": 2756 }, { "crossentropy": 2.5828659534454346, "epoch": 0.1499225101280622, "grad_norm": 0.04824794828891754, "grad_norm_var": 4.7295144786546386e-05, "learning_rate": 0.009062578198472622, "loss": 2.5829, "step": 2757 }, { "crossentropy": 2.718815326690674, "epoch": 0.1499768889855624, "grad_norm": 0.04308159649372101, "grad_norm_var": 4.547938548285132e-05, "learning_rate": 0.009061852207216982, "loss": 2.7188, "step": 2758 }, { "crossentropy": 2.70085608959198, "epoch": 0.15003126784306262, "grad_norm": 0.041041597723960876, "grad_norm_var": 4.7046656488846686e-05, "learning_rate": 0.009061125964049056, "loss": 2.7009, "step": 2759 }, { "crossentropy": 2.8779126405715942, "epoch": 0.15008564670056282, "grad_norm": 0.04232272133231163, "grad_norm_var": 3.2031849367561424e-05, "learning_rate": 0.009060399469013882, "loss": 2.8779, "step": 2760 }, { "crossentropy": 2.689252734184265, "epoch": 0.15014002555806302, "grad_norm": 0.04939322918653488, "grad_norm_var": 1.0956721741959414e-05, "learning_rate": 0.009059672722156518, "loss": 2.6893, "step": 2761 }, { "crossentropy": 2.762911558151245, "epoch": 0.15019440441556323, "grad_norm": 0.05171648785471916, "grad_norm_var": 1.4140489676692236e-05, "learning_rate": 0.009058945723522038, "loss": 2.7629, "step": 2762 }, { "crossentropy": 2.7222737073898315, "epoch": 0.15024878327306343, "grad_norm": 0.04097383841872215, "grad_norm_var": 1.2737136474795923e-05, "learning_rate": 0.009058218473155528, "loss": 2.7223, "step": 2763 }, { "crossentropy": 2.7823588848114014, "epoch": 0.15030316213056363, "grad_norm": 0.04418083652853966, "grad_norm_var": 1.2134049396484286e-05, "learning_rate": 0.009057490971102094, "loss": 2.7824, "step": 2764 }, { "crossentropy": 2.7897168397903442, "epoch": 0.15035754098806384, "grad_norm": 0.04385476931929588, "grad_norm_var": 1.0749487737761696e-05, "learning_rate": 0.00905676321740685, "loss": 2.7897, "step": 2765 }, { "crossentropy": 2.6332647800445557, "epoch": 0.15041191984556404, "grad_norm": 0.044368673115968704, "grad_norm_var": 1.0078397199896523e-05, "learning_rate": 0.009056035212114935, "loss": 2.6333, "step": 2766 }, { "crossentropy": 2.810358762741089, "epoch": 0.15046629870306424, "grad_norm": 0.04473228380084038, "grad_norm_var": 1.0077080036533885e-05, "learning_rate": 0.009055306955271496, "loss": 2.8104, "step": 2767 }, { "crossentropy": 2.75481116771698, "epoch": 0.15052067756056445, "grad_norm": 0.05074342340230942, "grad_norm_var": 1.2189648269163532e-05, "learning_rate": 0.009054578446921702, "loss": 2.7548, "step": 2768 }, { "crossentropy": 2.79829478263855, "epoch": 0.15057505641806465, "grad_norm": 0.04635418951511383, "grad_norm_var": 1.213162126194022e-05, "learning_rate": 0.00905384968711073, "loss": 2.7983, "step": 2769 }, { "crossentropy": 2.7828176021575928, "epoch": 0.15062943527556485, "grad_norm": 0.04332096129655838, "grad_norm_var": 1.168016231660902e-05, "learning_rate": 0.009053120675883784, "loss": 2.7828, "step": 2770 }, { "crossentropy": 2.729050397872925, "epoch": 0.15068381413306506, "grad_norm": 0.07143034040927887, "grad_norm_var": 5.3034407717548966e-05, "learning_rate": 0.00905239141328607, "loss": 2.7291, "step": 2771 }, { "crossentropy": 2.7562286853790283, "epoch": 0.15073819299056526, "grad_norm": 0.04113636910915375, "grad_norm_var": 5.5069865362690425e-05, "learning_rate": 0.009051661899362818, "loss": 2.7562, "step": 2772 }, { "crossentropy": 2.809638023376465, "epoch": 0.15079257184806547, "grad_norm": 0.04313041269779205, "grad_norm_var": 5.5637639309085755e-05, "learning_rate": 0.009050932134159273, "loss": 2.8096, "step": 2773 }, { "crossentropy": 2.670302152633667, "epoch": 0.15084695070556567, "grad_norm": 0.04634697362780571, "grad_norm_var": 5.4876102807946416e-05, "learning_rate": 0.009050202117720695, "loss": 2.6703, "step": 2774 }, { "crossentropy": 2.758177876472473, "epoch": 0.15090132956306587, "grad_norm": 0.04069516062736511, "grad_norm_var": 5.5138759352742665e-05, "learning_rate": 0.009049471850092356, "loss": 2.7582, "step": 2775 }, { "crossentropy": 2.699456572532654, "epoch": 0.15095570842056608, "grad_norm": 0.03941338509321213, "grad_norm_var": 5.730517599281109e-05, "learning_rate": 0.009048741331319547, "loss": 2.6995, "step": 2776 }, { "crossentropy": 2.84930419921875, "epoch": 0.15101008727806628, "grad_norm": 0.04347401112318039, "grad_norm_var": 5.710263033905828e-05, "learning_rate": 0.009048010561447578, "loss": 2.8493, "step": 2777 }, { "crossentropy": 2.7495235204696655, "epoch": 0.15106446613556648, "grad_norm": 0.048941466957330704, "grad_norm_var": 5.546585287500427e-05, "learning_rate": 0.009047279540521766, "loss": 2.7495, "step": 2778 }, { "crossentropy": 2.8166381120681763, "epoch": 0.1511188449930667, "grad_norm": 0.044718336313962936, "grad_norm_var": 5.3923371056369074e-05, "learning_rate": 0.009046548268587452, "loss": 2.8166, "step": 2779 }, { "crossentropy": 2.8557870388031006, "epoch": 0.1511732238505669, "grad_norm": 0.04396873340010643, "grad_norm_var": 5.397911702918565e-05, "learning_rate": 0.009045816745689985, "loss": 2.8558, "step": 2780 }, { "crossentropy": 2.6190766096115112, "epoch": 0.1512276027080671, "grad_norm": 0.045262280851602554, "grad_norm_var": 5.369296003272742e-05, "learning_rate": 0.009045084971874737, "loss": 2.6191, "step": 2781 }, { "crossentropy": 2.6262826919555664, "epoch": 0.1512819815655673, "grad_norm": 0.04007969796657562, "grad_norm_var": 5.584836823549835e-05, "learning_rate": 0.00904435294718709, "loss": 2.6263, "step": 2782 }, { "crossentropy": 2.746650457382202, "epoch": 0.1513363604230675, "grad_norm": 0.04556692764163017, "grad_norm_var": 5.576649206337839e-05, "learning_rate": 0.009043620671672444, "loss": 2.7467, "step": 2783 }, { "crossentropy": 2.732364058494568, "epoch": 0.1513907392805677, "grad_norm": 0.04240599274635315, "grad_norm_var": 5.473950302369024e-05, "learning_rate": 0.009042888145376212, "loss": 2.7324, "step": 2784 }, { "crossentropy": 2.811184525489807, "epoch": 0.1514451181380679, "grad_norm": 0.045968037098646164, "grad_norm_var": 5.469919627439501e-05, "learning_rate": 0.009042155368343829, "loss": 2.8112, "step": 2785 }, { "crossentropy": 2.7998422384262085, "epoch": 0.1514994969955681, "grad_norm": 0.04585951194167137, "grad_norm_var": 5.440970468947315e-05, "learning_rate": 0.009041422340620737, "loss": 2.7998, "step": 2786 }, { "crossentropy": 2.815150737762451, "epoch": 0.15155387585306831, "grad_norm": 0.04315321147441864, "grad_norm_var": 6.713412527699821e-06, "learning_rate": 0.009040689062252399, "loss": 2.8152, "step": 2787 }, { "crossentropy": 2.8495233058929443, "epoch": 0.15160825471056852, "grad_norm": 0.04505152627825737, "grad_norm_var": 6.303139157195064e-06, "learning_rate": 0.009039955533284292, "loss": 2.8495, "step": 2788 }, { "crossentropy": 2.771491050720215, "epoch": 0.15166263356806872, "grad_norm": 0.044216591864824295, "grad_norm_var": 6.250615887147074e-06, "learning_rate": 0.00903922175376191, "loss": 2.7715, "step": 2789 }, { "crossentropy": 2.7575563192367554, "epoch": 0.15171701242556893, "grad_norm": 0.04202908277511597, "grad_norm_var": 6.105047133543537e-06, "learning_rate": 0.009038487723730762, "loss": 2.7576, "step": 2790 }, { "crossentropy": 2.6951733827590942, "epoch": 0.15177139128306913, "grad_norm": 0.04552765563130379, "grad_norm_var": 5.563901460142508e-06, "learning_rate": 0.00903775344323637, "loss": 2.6952, "step": 2791 }, { "crossentropy": 2.6701239347457886, "epoch": 0.15182577014056936, "grad_norm": 0.05735842511057854, "grad_norm_var": 1.4471447333572137e-05, "learning_rate": 0.009037018912324273, "loss": 2.6701, "step": 2792 }, { "crossentropy": 2.803795337677002, "epoch": 0.15188014899806956, "grad_norm": 0.03795333206653595, "grad_norm_var": 1.766435065822658e-05, "learning_rate": 0.009036284131040026, "loss": 2.8038, "step": 2793 }, { "crossentropy": 2.8125542402267456, "epoch": 0.15193452785556977, "grad_norm": 0.04053223878145218, "grad_norm_var": 1.7528860481546885e-05, "learning_rate": 0.009035549099429201, "loss": 2.8126, "step": 2794 }, { "crossentropy": 2.7989108562469482, "epoch": 0.15198890671306997, "grad_norm": 0.05476440116763115, "grad_norm_var": 2.4325633275332908e-05, "learning_rate": 0.009034813817537384, "loss": 2.7989, "step": 2795 }, { "crossentropy": 2.6224600076675415, "epoch": 0.15204328557057017, "grad_norm": 0.041810404509305954, "grad_norm_var": 2.4908119102826783e-05, "learning_rate": 0.009034078285410174, "loss": 2.6225, "step": 2796 }, { "crossentropy": 2.7623372077941895, "epoch": 0.15209766442807038, "grad_norm": 0.04311336949467659, "grad_norm_var": 2.507751884982881e-05, "learning_rate": 0.009033342503093192, "loss": 2.7623, "step": 2797 }, { "crossentropy": 2.7826406955718994, "epoch": 0.15215204328557058, "grad_norm": 0.04460803419351578, "grad_norm_var": 2.35623104770812e-05, "learning_rate": 0.009032606470632065, "loss": 2.7826, "step": 2798 }, { "crossentropy": 2.730056047439575, "epoch": 0.15220642214307079, "grad_norm": 0.04588497802615166, "grad_norm_var": 2.3592889636374812e-05, "learning_rate": 0.009031870188072446, "loss": 2.7301, "step": 2799 }, { "crossentropy": 2.8307617902755737, "epoch": 0.152260801000571, "grad_norm": 0.04627057537436485, "grad_norm_var": 2.3182067128217575e-05, "learning_rate": 0.009031133655459997, "loss": 2.8308, "step": 2800 }, { "crossentropy": 2.772267460823059, "epoch": 0.1523151798580712, "grad_norm": 0.04510544613003731, "grad_norm_var": 2.314671681672009e-05, "learning_rate": 0.009030396872840398, "loss": 2.7723, "step": 2801 }, { "crossentropy": 2.6612740755081177, "epoch": 0.1523695587155714, "grad_norm": 0.042494695633649826, "grad_norm_var": 2.3559543775953684e-05, "learning_rate": 0.009029659840259342, "loss": 2.6613, "step": 2802 }, { "crossentropy": 2.737613320350647, "epoch": 0.1524239375730716, "grad_norm": 0.04900229349732399, "grad_norm_var": 2.426365287012469e-05, "learning_rate": 0.009028922557762538, "loss": 2.7376, "step": 2803 }, { "crossentropy": 2.7367576360702515, "epoch": 0.1524783164305718, "grad_norm": 0.057318780571222305, "grad_norm_var": 3.316822552222043e-05, "learning_rate": 0.009028185025395714, "loss": 2.7368, "step": 2804 }, { "crossentropy": 2.584765315055847, "epoch": 0.152532695288072, "grad_norm": 0.045282892882823944, "grad_norm_var": 3.296804903332676e-05, "learning_rate": 0.009027447243204612, "loss": 2.5848, "step": 2805 }, { "crossentropy": 2.711709141731262, "epoch": 0.1525870741455722, "grad_norm": 0.04352710023522377, "grad_norm_var": 3.2277011700302534e-05, "learning_rate": 0.009026709211234985, "loss": 2.7117, "step": 2806 }, { "crossentropy": 2.86260449886322, "epoch": 0.1526414530030724, "grad_norm": 0.06164155900478363, "grad_norm_var": 4.687918145332559e-05, "learning_rate": 0.009025970929532605, "loss": 2.8626, "step": 2807 }, { "crossentropy": 2.6517406702041626, "epoch": 0.15269583186057262, "grad_norm": 0.05552732199430466, "grad_norm_var": 4.463099879852828e-05, "learning_rate": 0.009025232398143264, "loss": 2.6517, "step": 2808 }, { "crossentropy": 2.673583984375, "epoch": 0.15275021071807282, "grad_norm": 0.04736006632447243, "grad_norm_var": 3.859237681805427e-05, "learning_rate": 0.009024493617112762, "loss": 2.6736, "step": 2809 }, { "crossentropy": 2.71708083152771, "epoch": 0.15280458957557302, "grad_norm": 0.041538055986166, "grad_norm_var": 3.768559314407648e-05, "learning_rate": 0.009023754586486916, "loss": 2.7171, "step": 2810 }, { "crossentropy": 2.697894811630249, "epoch": 0.15285896843307323, "grad_norm": 0.04379419982433319, "grad_norm_var": 3.506152705460974e-05, "learning_rate": 0.009023015306311563, "loss": 2.6979, "step": 2811 }, { "crossentropy": 2.701596260070801, "epoch": 0.15291334729057343, "grad_norm": 0.06879153847694397, "grad_norm_var": 6.137829629170495e-05, "learning_rate": 0.009022275776632553, "loss": 2.7016, "step": 2812 }, { "crossentropy": 2.7771610021591187, "epoch": 0.15296772614807363, "grad_norm": 0.04090607166290283, "grad_norm_var": 6.336489615249337e-05, "learning_rate": 0.009021535997495748, "loss": 2.7772, "step": 2813 }, { "crossentropy": 2.7063276767730713, "epoch": 0.15302210500557384, "grad_norm": 0.044167403131723404, "grad_norm_var": 6.361689965035551e-05, "learning_rate": 0.00902079596894703, "loss": 2.7063, "step": 2814 }, { "crossentropy": 2.634642720222473, "epoch": 0.15307648386307404, "grad_norm": 0.1034880205988884, "grad_norm_var": 0.0002496600810863063, "learning_rate": 0.009020055691032295, "loss": 2.6346, "step": 2815 }, { "crossentropy": 2.6665127277374268, "epoch": 0.15313086272057425, "grad_norm": 0.04477326571941376, "grad_norm_var": 0.0002509966375522326, "learning_rate": 0.009019315163797453, "loss": 2.6665, "step": 2816 }, { "crossentropy": 2.835039973258972, "epoch": 0.15318524157807445, "grad_norm": 0.04273541271686554, "grad_norm_var": 0.00025358010875229, "learning_rate": 0.009018574387288432, "loss": 2.835, "step": 2817 }, { "crossentropy": 2.786589741706848, "epoch": 0.15323962043557465, "grad_norm": 0.04057757183909416, "grad_norm_var": 0.0002562451021649721, "learning_rate": 0.009017833361551174, "loss": 2.7866, "step": 2818 }, { "crossentropy": 2.8200745582580566, "epoch": 0.15329399929307486, "grad_norm": 0.041474148631095886, "grad_norm_var": 0.0002626977226179559, "learning_rate": 0.009017092086631637, "loss": 2.8201, "step": 2819 }, { "crossentropy": 2.720735192298889, "epoch": 0.15334837815057506, "grad_norm": 0.03868626058101654, "grad_norm_var": 0.00026976982296258626, "learning_rate": 0.009016350562575794, "loss": 2.7207, "step": 2820 }, { "crossentropy": 2.724596381187439, "epoch": 0.15340275700807526, "grad_norm": 0.04048559069633484, "grad_norm_var": 0.0002743961960681893, "learning_rate": 0.009015608789429632, "loss": 2.7246, "step": 2821 }, { "crossentropy": 2.7454899549484253, "epoch": 0.15345713586557547, "grad_norm": 0.04590652510523796, "grad_norm_var": 0.0002727069174899718, "learning_rate": 0.009014866767239157, "loss": 2.7455, "step": 2822 }, { "crossentropy": 2.6963305473327637, "epoch": 0.15351151472307567, "grad_norm": 0.0459253191947937, "grad_norm_var": 0.00026399224506925363, "learning_rate": 0.009014124496050392, "loss": 2.6963, "step": 2823 }, { "crossentropy": 2.8642022609710693, "epoch": 0.15356589358057587, "grad_norm": 0.045200128108263016, "grad_norm_var": 0.00026185396297557573, "learning_rate": 0.009013381975909365, "loss": 2.8642, "step": 2824 }, { "crossentropy": 2.7582300901412964, "epoch": 0.15362027243807608, "grad_norm": 0.041115667670965195, "grad_norm_var": 0.0002652301796161355, "learning_rate": 0.00901263920686213, "loss": 2.7582, "step": 2825 }, { "crossentropy": 2.7958439588546753, "epoch": 0.15367465129557628, "grad_norm": 0.042084019631147385, "grad_norm_var": 0.0002647712900894328, "learning_rate": 0.009011896188954751, "loss": 2.7958, "step": 2826 }, { "crossentropy": 2.7649987936019897, "epoch": 0.15372903015307648, "grad_norm": 0.043588656932115555, "grad_norm_var": 0.00026489280964758587, "learning_rate": 0.009011152922233312, "loss": 2.765, "step": 2827 }, { "crossentropy": 2.6830285787582397, "epoch": 0.1537834090105767, "grad_norm": 0.048321835696697235, "grad_norm_var": 0.00023465969932879392, "learning_rate": 0.009010409406743908, "loss": 2.683, "step": 2828 }, { "crossentropy": 2.693724036216736, "epoch": 0.1538377878680769, "grad_norm": 0.04571292921900749, "grad_norm_var": 0.00023230084135073524, "learning_rate": 0.009009665642532652, "loss": 2.6937, "step": 2829 }, { "crossentropy": 2.671627402305603, "epoch": 0.1538921667255771, "grad_norm": 0.0410626083612442, "grad_norm_var": 0.0002341339708158182, "learning_rate": 0.00900892162964567, "loss": 2.6716, "step": 2830 }, { "crossentropy": 2.8391475677490234, "epoch": 0.1539465455830773, "grad_norm": 0.03928941860795021, "grad_norm_var": 7.737144042544811e-06, "learning_rate": 0.009008177368129106, "loss": 2.8391, "step": 2831 }, { "crossentropy": 2.6503093242645264, "epoch": 0.1540009244405775, "grad_norm": 0.040153857320547104, "grad_norm_var": 7.937806146428248e-06, "learning_rate": 0.00900743285802912, "loss": 2.6503, "step": 2832 }, { "crossentropy": 2.802140951156616, "epoch": 0.1540553032980777, "grad_norm": 0.0385868139564991, "grad_norm_var": 8.963472430287279e-06, "learning_rate": 0.009006688099391882, "loss": 2.8021, "step": 2833 }, { "crossentropy": 2.7427656650543213, "epoch": 0.1541096821555779, "grad_norm": 0.03886101767420769, "grad_norm_var": 9.561467809109566e-06, "learning_rate": 0.009005943092263586, "loss": 2.7428, "step": 2834 }, { "crossentropy": 2.746352791786194, "epoch": 0.1541640610130781, "grad_norm": 0.045809607952833176, "grad_norm_var": 1.027130987671972e-05, "learning_rate": 0.009005197836690433, "loss": 2.7464, "step": 2835 }, { "crossentropy": 2.734773635864258, "epoch": 0.15421843987057832, "grad_norm": 0.03846896067261696, "grad_norm_var": 1.0386188799297683e-05, "learning_rate": 0.009004452332718644, "loss": 2.7348, "step": 2836 }, { "crossentropy": 2.763750433921814, "epoch": 0.15427281872807852, "grad_norm": 0.03895498812198639, "grad_norm_var": 1.0951019732747628e-05, "learning_rate": 0.009003706580394455, "loss": 2.7638, "step": 2837 }, { "crossentropy": 2.7706552743911743, "epoch": 0.15432719758557872, "grad_norm": 0.04054071754217148, "grad_norm_var": 1.0270523960391869e-05, "learning_rate": 0.009002960579764116, "loss": 2.7707, "step": 2838 }, { "crossentropy": 2.705652356147766, "epoch": 0.15438157644307893, "grad_norm": 0.03820626810193062, "grad_norm_var": 1.006238764315696e-05, "learning_rate": 0.009002214330873894, "loss": 2.7057, "step": 2839 }, { "crossentropy": 2.655367612838745, "epoch": 0.15443595530057913, "grad_norm": 0.04194638133049011, "grad_norm_var": 9.171906201942542e-06, "learning_rate": 0.009001467833770071, "loss": 2.6554, "step": 2840 }, { "crossentropy": 2.670365333557129, "epoch": 0.15449033415807933, "grad_norm": 0.03873998299241066, "grad_norm_var": 9.62072651331704e-06, "learning_rate": 0.009000721088498944, "loss": 2.6704, "step": 2841 }, { "crossentropy": 2.8408873081207275, "epoch": 0.15454471301557954, "grad_norm": 0.04132452234625816, "grad_norm_var": 9.574397050154428e-06, "learning_rate": 0.008999974095106824, "loss": 2.8409, "step": 2842 }, { "crossentropy": 2.8420687913894653, "epoch": 0.15459909187307974, "grad_norm": 0.04059360548853874, "grad_norm_var": 9.19035512475282e-06, "learning_rate": 0.00899922685364004, "loss": 2.8421, "step": 2843 }, { "crossentropy": 2.7552032470703125, "epoch": 0.15465347073057994, "grad_norm": 0.038779184222221375, "grad_norm_var": 5.611386098515874e-06, "learning_rate": 0.008998479364144935, "loss": 2.7552, "step": 2844 }, { "crossentropy": 2.7525523900985718, "epoch": 0.15470784958808015, "grad_norm": 0.041321199387311935, "grad_norm_var": 3.7288700135158667e-06, "learning_rate": 0.008997731626667867, "loss": 2.7526, "step": 2845 }, { "crossentropy": 2.7092074155807495, "epoch": 0.15476222844558035, "grad_norm": 0.04181351885199547, "grad_norm_var": 3.853986897083457e-06, "learning_rate": 0.008996983641255212, "loss": 2.7092, "step": 2846 }, { "crossentropy": 2.8111287355422974, "epoch": 0.15481660730308056, "grad_norm": 0.04336295649409294, "grad_norm_var": 4.390070841125739e-06, "learning_rate": 0.008996235407953357, "loss": 2.8111, "step": 2847 }, { "crossentropy": 2.718505382537842, "epoch": 0.15487098616058076, "grad_norm": 0.04402017965912819, "grad_norm_var": 5.1631920039911055e-06, "learning_rate": 0.008995486926808707, "loss": 2.7185, "step": 2848 }, { "crossentropy": 2.7516579627990723, "epoch": 0.15492536501808096, "grad_norm": 0.04382140561938286, "grad_norm_var": 5.39519591062701e-06, "learning_rate": 0.008994738197867684, "loss": 2.7517, "step": 2849 }, { "crossentropy": 2.7224758863449097, "epoch": 0.15497974387558117, "grad_norm": 0.04593843221664429, "grad_norm_var": 6.474053120676613e-06, "learning_rate": 0.008993989221176722, "loss": 2.7225, "step": 2850 }, { "crossentropy": 2.8293325901031494, "epoch": 0.15503412273308137, "grad_norm": 0.044025570154190063, "grad_norm_var": 5.642520049244555e-06, "learning_rate": 0.008993239996782271, "loss": 2.8293, "step": 2851 }, { "crossentropy": 2.7362630367279053, "epoch": 0.15508850159058157, "grad_norm": 0.04116471856832504, "grad_norm_var": 5.055376830820762e-06, "learning_rate": 0.008992490524730801, "loss": 2.7363, "step": 2852 }, { "crossentropy": 2.8006352186203003, "epoch": 0.15514288044808178, "grad_norm": 0.04255753010511398, "grad_norm_var": 4.627432181416163e-06, "learning_rate": 0.00899174080506879, "loss": 2.8006, "step": 2853 }, { "crossentropy": 2.8265156745910645, "epoch": 0.15519725930558198, "grad_norm": 0.04583996161818504, "grad_norm_var": 5.521222036788028e-06, "learning_rate": 0.008990990837842733, "loss": 2.8265, "step": 2854 }, { "crossentropy": 2.7126227617263794, "epoch": 0.15525163816308218, "grad_norm": 0.04670536518096924, "grad_norm_var": 5.633700544239023e-06, "learning_rate": 0.008990240623099149, "loss": 2.7126, "step": 2855 }, { "crossentropy": 2.7018256187438965, "epoch": 0.1553060170205824, "grad_norm": 0.04628700762987137, "grad_norm_var": 6.420159969507989e-06, "learning_rate": 0.008989490160884558, "loss": 2.7018, "step": 2856 }, { "crossentropy": 2.7634758949279785, "epoch": 0.1553603958780826, "grad_norm": 0.04605741426348686, "grad_norm_var": 5.714352372425806e-06, "learning_rate": 0.008988739451245509, "loss": 2.7635, "step": 2857 }, { "crossentropy": 2.8198834657669067, "epoch": 0.1554147747355828, "grad_norm": 0.05135086551308632, "grad_norm_var": 9.288523088872974e-06, "learning_rate": 0.008987988494228556, "loss": 2.8199, "step": 2858 }, { "crossentropy": 2.7590579986572266, "epoch": 0.155469153593083, "grad_norm": 0.05344787612557411, "grad_norm_var": 1.3815990302145215e-05, "learning_rate": 0.008987237289880276, "loss": 2.7591, "step": 2859 }, { "crossentropy": 2.7394320964813232, "epoch": 0.1555235324505832, "grad_norm": 0.045954927802085876, "grad_norm_var": 1.1292032453936487e-05, "learning_rate": 0.008986485838247255, "loss": 2.7394, "step": 2860 }, { "crossentropy": 2.7850524187088013, "epoch": 0.1555779113080834, "grad_norm": 0.04097963497042656, "grad_norm_var": 1.1477306876591225e-05, "learning_rate": 0.008985734139376103, "loss": 2.7851, "step": 2861 }, { "crossentropy": 2.710474967956543, "epoch": 0.1556322901655836, "grad_norm": 0.04246283695101738, "grad_norm_var": 1.1209781448912453e-05, "learning_rate": 0.008984982193313432, "loss": 2.7105, "step": 2862 }, { "crossentropy": 2.7262340784072876, "epoch": 0.1556866690230838, "grad_norm": 0.039905112236738205, "grad_norm_var": 1.2826416162385554e-05, "learning_rate": 0.008984230000105882, "loss": 2.7262, "step": 2863 }, { "crossentropy": 2.7931454181671143, "epoch": 0.15574104788058402, "grad_norm": 0.04101911187171936, "grad_norm_var": 1.3794359871016395e-05, "learning_rate": 0.0089834775598001, "loss": 2.7931, "step": 2864 }, { "crossentropy": 2.70094633102417, "epoch": 0.15579542673808422, "grad_norm": 0.04055977985262871, "grad_norm_var": 1.490433118762786e-05, "learning_rate": 0.008982724872442756, "loss": 2.7009, "step": 2865 }, { "crossentropy": 2.759639263153076, "epoch": 0.15584980559558442, "grad_norm": 0.038690365850925446, "grad_norm_var": 1.6933894102117802e-05, "learning_rate": 0.008981971938080527, "loss": 2.7596, "step": 2866 }, { "crossentropy": 2.7279646396636963, "epoch": 0.15590418445308465, "grad_norm": 0.06268325448036194, "grad_norm_var": 3.82866305875057e-05, "learning_rate": 0.00898121875676011, "loss": 2.728, "step": 2867 }, { "crossentropy": 2.7831095457077026, "epoch": 0.15595856331058486, "grad_norm": 0.04179522022604942, "grad_norm_var": 3.795928724390629e-05, "learning_rate": 0.008980465328528218, "loss": 2.7831, "step": 2868 }, { "crossentropy": 2.839314103126526, "epoch": 0.15601294216808506, "grad_norm": 0.045160382986068726, "grad_norm_var": 3.7398494209751366e-05, "learning_rate": 0.00897971165343158, "loss": 2.8393, "step": 2869 }, { "crossentropy": 2.7567838430404663, "epoch": 0.15606732102558526, "grad_norm": 0.04602398723363876, "grad_norm_var": 3.7407573512898464e-05, "learning_rate": 0.008978957731516933, "loss": 2.7568, "step": 2870 }, { "crossentropy": 2.672402024269104, "epoch": 0.15612169988308547, "grad_norm": 0.04761377349495888, "grad_norm_var": 3.7596944564985355e-05, "learning_rate": 0.008978203562831036, "loss": 2.6724, "step": 2871 }, { "crossentropy": 2.7249890565872192, "epoch": 0.15617607874058567, "grad_norm": 0.04585382714867592, "grad_norm_var": 3.757040606249239e-05, "learning_rate": 0.008977449147420664, "loss": 2.725, "step": 2872 }, { "crossentropy": 2.801623225212097, "epoch": 0.15623045759808588, "grad_norm": 0.04558194428682327, "grad_norm_var": 3.7555372365969245e-05, "learning_rate": 0.008976694485332604, "loss": 2.8016, "step": 2873 }, { "crossentropy": 2.7815250158309937, "epoch": 0.15628483645558608, "grad_norm": 0.04928591102361679, "grad_norm_var": 3.6229606391265334e-05, "learning_rate": 0.00897593957661366, "loss": 2.7815, "step": 2874 }, { "crossentropy": 2.8865805864334106, "epoch": 0.15633921531308628, "grad_norm": 0.0481383241713047, "grad_norm_var": 3.232149156930722e-05, "learning_rate": 0.008975184421310652, "loss": 2.8866, "step": 2875 }, { "crossentropy": 2.8636956214904785, "epoch": 0.1563935941705865, "grad_norm": 0.04703933000564575, "grad_norm_var": 3.2517618962586935e-05, "learning_rate": 0.008974429019470409, "loss": 2.8637, "step": 2876 }, { "crossentropy": 2.6231619119644165, "epoch": 0.1564479730280867, "grad_norm": 0.05203085392713547, "grad_norm_var": 3.396951935209127e-05, "learning_rate": 0.008973673371139785, "loss": 2.6232, "step": 2877 }, { "crossentropy": 2.78518807888031, "epoch": 0.1565023518855869, "grad_norm": 0.04627120867371559, "grad_norm_var": 3.3148312648343706e-05, "learning_rate": 0.008972917476365644, "loss": 2.7852, "step": 2878 }, { "crossentropy": 2.8037664890289307, "epoch": 0.1565567307430871, "grad_norm": 0.04765019938349724, "grad_norm_var": 3.0496753788822272e-05, "learning_rate": 0.008972161335194864, "loss": 2.8038, "step": 2879 }, { "crossentropy": 2.7929571866989136, "epoch": 0.1566111096005873, "grad_norm": 0.044937025755643845, "grad_norm_var": 2.8547352294705276e-05, "learning_rate": 0.00897140494767434, "loss": 2.793, "step": 2880 }, { "crossentropy": 2.80379855632782, "epoch": 0.1566654884580875, "grad_norm": 0.04797228425741196, "grad_norm_var": 2.578217140164803e-05, "learning_rate": 0.008970648313850986, "loss": 2.8038, "step": 2881 }, { "crossentropy": 2.7804067134857178, "epoch": 0.1567198673155877, "grad_norm": 0.04371047019958496, "grad_norm_var": 2.1597443512984202e-05, "learning_rate": 0.008969891433771724, "loss": 2.7804, "step": 2882 }, { "crossentropy": 2.8570481538772583, "epoch": 0.1567742461730879, "grad_norm": 0.042567916214466095, "grad_norm_var": 6.457460263782575e-06, "learning_rate": 0.008969134307483497, "loss": 2.857, "step": 2883 }, { "crossentropy": 2.8714537620544434, "epoch": 0.15682862503058811, "grad_norm": 0.04152849689126015, "grad_norm_var": 6.623961328389103e-06, "learning_rate": 0.00896837693503326, "loss": 2.8715, "step": 2884 }, { "crossentropy": 2.8205370903015137, "epoch": 0.15688300388808832, "grad_norm": 0.046042993664741516, "grad_norm_var": 6.534374669115289e-06, "learning_rate": 0.008967619316467985, "loss": 2.8205, "step": 2885 }, { "crossentropy": 2.8050143718719482, "epoch": 0.15693738274558852, "grad_norm": 0.044754743576049805, "grad_norm_var": 6.697092531299647e-06, "learning_rate": 0.00896686145183466, "loss": 2.805, "step": 2886 }, { "crossentropy": 2.8066855669021606, "epoch": 0.15699176160308873, "grad_norm": 0.0429622083902359, "grad_norm_var": 7.241545242762474e-06, "learning_rate": 0.008966103341180287, "loss": 2.8067, "step": 2887 }, { "crossentropy": 2.753365159034729, "epoch": 0.15704614046058893, "grad_norm": 0.0390508733689785, "grad_norm_var": 1.0285223983820766e-05, "learning_rate": 0.008965344984551881, "loss": 2.7534, "step": 2888 }, { "crossentropy": 2.7075846195220947, "epoch": 0.15710051931808913, "grad_norm": 0.04175979271531105, "grad_norm_var": 1.1205082473588792e-05, "learning_rate": 0.008964586381996476, "loss": 2.7076, "step": 2889 }, { "crossentropy": 2.74777889251709, "epoch": 0.15715489817558934, "grad_norm": 0.046532612293958664, "grad_norm_var": 1.0236329642136082e-05, "learning_rate": 0.00896382753356112, "loss": 2.7478, "step": 2890 }, { "crossentropy": 2.804794430732727, "epoch": 0.15720927703308954, "grad_norm": 0.04686293005943298, "grad_norm_var": 9.835660358074351e-06, "learning_rate": 0.008963068439292876, "loss": 2.8048, "step": 2891 }, { "crossentropy": 2.852848172187805, "epoch": 0.15726365589058974, "grad_norm": 0.04794928804039955, "grad_norm_var": 1.0122145676309119e-05, "learning_rate": 0.008962309099238822, "loss": 2.8528, "step": 2892 }, { "crossentropy": 2.8162994384765625, "epoch": 0.15731803474808995, "grad_norm": 0.04721652343869209, "grad_norm_var": 7.16124068246924e-06, "learning_rate": 0.00896154951344605, "loss": 2.8163, "step": 2893 }, { "crossentropy": 2.6894123554229736, "epoch": 0.15737241360559015, "grad_norm": 0.042806681245565414, "grad_norm_var": 7.259811795890837e-06, "learning_rate": 0.008960789681961673, "loss": 2.6894, "step": 2894 }, { "crossentropy": 2.696621894836426, "epoch": 0.15742679246309035, "grad_norm": 0.04104138910770416, "grad_norm_var": 7.340655523054481e-06, "learning_rate": 0.008960029604832812, "loss": 2.6966, "step": 2895 }, { "crossentropy": 2.7506226301193237, "epoch": 0.15748117132059056, "grad_norm": 0.03819644823670387, "grad_norm_var": 9.545843801030361e-06, "learning_rate": 0.008959269282106606, "loss": 2.7506, "step": 2896 }, { "crossentropy": 2.790038824081421, "epoch": 0.15753555017809076, "grad_norm": 0.04560549184679985, "grad_norm_var": 8.582362977527641e-06, "learning_rate": 0.00895850871383021, "loss": 2.79, "step": 2897 }, { "crossentropy": 2.763395309448242, "epoch": 0.15758992903559096, "grad_norm": 0.044572461396455765, "grad_norm_var": 8.634395624246283e-06, "learning_rate": 0.008957747900050796, "loss": 2.7634, "step": 2898 }, { "crossentropy": 2.8305217027664185, "epoch": 0.15764430789309117, "grad_norm": 0.042878519743680954, "grad_norm_var": 8.592892095827567e-06, "learning_rate": 0.008956986840815546, "loss": 2.8305, "step": 2899 }, { "crossentropy": 2.7126413583755493, "epoch": 0.15769868675059137, "grad_norm": 0.043103281408548355, "grad_norm_var": 8.28456739898984e-06, "learning_rate": 0.00895622553617166, "loss": 2.7126, "step": 2900 }, { "crossentropy": 2.832995295524597, "epoch": 0.15775306560809157, "grad_norm": 0.04044877365231514, "grad_norm_var": 8.592482125550222e-06, "learning_rate": 0.008955463986166357, "loss": 2.833, "step": 2901 }, { "crossentropy": 2.7945200204849243, "epoch": 0.15780744446559178, "grad_norm": 0.043634336441755295, "grad_norm_var": 8.481087278387697e-06, "learning_rate": 0.008954702190846865, "loss": 2.7945, "step": 2902 }, { "crossentropy": 2.696248173713684, "epoch": 0.15786182332309198, "grad_norm": 0.0420110784471035, "grad_norm_var": 8.594903860331683e-06, "learning_rate": 0.00895394015026043, "loss": 2.6962, "step": 2903 }, { "crossentropy": 2.7547783851623535, "epoch": 0.15791620218059219, "grad_norm": 0.04298575967550278, "grad_norm_var": 7.30475759538416e-06, "learning_rate": 0.00895317786445431, "loss": 2.7548, "step": 2904 }, { "crossentropy": 2.7587915658950806, "epoch": 0.1579705810380924, "grad_norm": 0.07020056992769241, "grad_norm_var": 5.0880091710529356e-05, "learning_rate": 0.008952415333475787, "loss": 2.7588, "step": 2905 }, { "crossentropy": 2.6835784912109375, "epoch": 0.1580249598955926, "grad_norm": 0.039118047803640366, "grad_norm_var": 5.3174502860726484e-05, "learning_rate": 0.00895165255737215, "loss": 2.6836, "step": 2906 }, { "crossentropy": 2.759055495262146, "epoch": 0.1580793387530928, "grad_norm": 0.041149817407131195, "grad_norm_var": 5.373024800846536e-05, "learning_rate": 0.008950889536190705, "loss": 2.7591, "step": 2907 }, { "crossentropy": 2.701099991798401, "epoch": 0.158133717610593, "grad_norm": 0.043036069720983505, "grad_norm_var": 5.301697139846369e-05, "learning_rate": 0.008950126269978773, "loss": 2.7011, "step": 2908 }, { "crossentropy": 2.6315886974334717, "epoch": 0.1581880964680932, "grad_norm": 0.04001469537615776, "grad_norm_var": 5.341034634706934e-05, "learning_rate": 0.008949362758783694, "loss": 2.6316, "step": 2909 }, { "crossentropy": 2.6959325075149536, "epoch": 0.1582424753255934, "grad_norm": 0.04154514893889427, "grad_norm_var": 5.367692929417536e-05, "learning_rate": 0.00894859900265282, "loss": 2.6959, "step": 2910 }, { "crossentropy": 2.7559814453125, "epoch": 0.1582968541830936, "grad_norm": 0.040908053517341614, "grad_norm_var": 5.37256853198625e-05, "learning_rate": 0.008947835001633516, "loss": 2.756, "step": 2911 }, { "crossentropy": 2.7939045429229736, "epoch": 0.15835123304059381, "grad_norm": 0.043953366577625275, "grad_norm_var": 5.156259549315787e-05, "learning_rate": 0.008947070755773166, "loss": 2.7939, "step": 2912 }, { "crossentropy": 2.6361318826675415, "epoch": 0.15840561189809402, "grad_norm": 0.04326054826378822, "grad_norm_var": 5.1427071026546346e-05, "learning_rate": 0.008946306265119166, "loss": 2.6361, "step": 2913 }, { "crossentropy": 2.671873092651367, "epoch": 0.15845999075559422, "grad_norm": 0.04284249246120453, "grad_norm_var": 5.1465071422826525e-05, "learning_rate": 0.008945541529718933, "loss": 2.6719, "step": 2914 }, { "crossentropy": 2.751077890396118, "epoch": 0.15851436961309442, "grad_norm": 0.04423486441373825, "grad_norm_var": 5.14101207185646e-05, "learning_rate": 0.008944776549619891, "loss": 2.7511, "step": 2915 }, { "crossentropy": 2.774796485900879, "epoch": 0.15856874847059463, "grad_norm": 0.0586320236325264, "grad_norm_var": 6.482581072014086e-05, "learning_rate": 0.008944011324869485, "loss": 2.7748, "step": 2916 }, { "crossentropy": 2.72002112865448, "epoch": 0.15862312732809483, "grad_norm": 0.054023195058107376, "grad_norm_var": 6.833399467710554e-05, "learning_rate": 0.008943245855515173, "loss": 2.72, "step": 2917 }, { "crossentropy": 2.8527756929397583, "epoch": 0.15867750618559504, "grad_norm": 0.05461626127362251, "grad_norm_var": 7.281496334410122e-05, "learning_rate": 0.00894248014160443, "loss": 2.8528, "step": 2918 }, { "crossentropy": 2.6769272089004517, "epoch": 0.15873188504309524, "grad_norm": 0.05069928616285324, "grad_norm_var": 7.243896816102596e-05, "learning_rate": 0.008941714183184747, "loss": 2.6769, "step": 2919 }, { "crossentropy": 2.6818816661834717, "epoch": 0.15878626390059544, "grad_norm": 0.04125214368104935, "grad_norm_var": 7.354342840643202e-05, "learning_rate": 0.00894094798030362, "loss": 2.6819, "step": 2920 }, { "crossentropy": 2.7432457208633423, "epoch": 0.15884064275809565, "grad_norm": 0.041111838072538376, "grad_norm_var": 3.58354518153708e-05, "learning_rate": 0.008940181533008578, "loss": 2.7432, "step": 2921 }, { "crossentropy": 2.7754441499710083, "epoch": 0.15889502161559585, "grad_norm": 0.042905379086732864, "grad_norm_var": 3.374913400889338e-05, "learning_rate": 0.00893941484134715, "loss": 2.7754, "step": 2922 }, { "crossentropy": 2.6462146043777466, "epoch": 0.15894940047309605, "grad_norm": 0.04046555608510971, "grad_norm_var": 3.415353282110826e-05, "learning_rate": 0.008938647905366883, "loss": 2.6462, "step": 2923 }, { "crossentropy": 2.7351735830307007, "epoch": 0.15900377933059626, "grad_norm": 0.045074887573719025, "grad_norm_var": 3.381997078788218e-05, "learning_rate": 0.008937880725115348, "loss": 2.7352, "step": 2924 }, { "crossentropy": 2.556356430053711, "epoch": 0.15905815818809646, "grad_norm": 0.13498257100582123, "grad_norm_var": 0.0005299910751565051, "learning_rate": 0.00893711330064012, "loss": 2.5564, "step": 2925 }, { "crossentropy": 2.7808101177215576, "epoch": 0.15911253704559666, "grad_norm": 0.04147748276591301, "grad_norm_var": 0.000530079206246852, "learning_rate": 0.008936345631988798, "loss": 2.7808, "step": 2926 }, { "crossentropy": 2.703028917312622, "epoch": 0.15916691590309687, "grad_norm": 0.041143644601106644, "grad_norm_var": 0.0005297569487418408, "learning_rate": 0.008935577719208988, "loss": 2.703, "step": 2927 }, { "crossentropy": 2.6880024671554565, "epoch": 0.15922129476059707, "grad_norm": 0.04097365587949753, "grad_norm_var": 0.0005332275545037937, "learning_rate": 0.008934809562348317, "loss": 2.688, "step": 2928 }, { "crossentropy": 2.769566297531128, "epoch": 0.15927567361809727, "grad_norm": 0.054306358098983765, "grad_norm_var": 0.0005292986074393824, "learning_rate": 0.008934041161454425, "loss": 2.7696, "step": 2929 }, { "crossentropy": 2.7054330110549927, "epoch": 0.15933005247559748, "grad_norm": 0.040538616478443146, "grad_norm_var": 0.0005323808254667326, "learning_rate": 0.00893327251657497, "loss": 2.7054, "step": 2930 }, { "crossentropy": 2.6455676555633545, "epoch": 0.15938443133309768, "grad_norm": 0.04233863949775696, "grad_norm_var": 0.0005344809201621115, "learning_rate": 0.008932503627757618, "loss": 2.6456, "step": 2931 }, { "crossentropy": 2.6598546504974365, "epoch": 0.15943881019059788, "grad_norm": 0.04941396042704582, "grad_norm_var": 0.0005310675201263964, "learning_rate": 0.008931734495050058, "loss": 2.6599, "step": 2932 }, { "crossentropy": 2.719261407852173, "epoch": 0.1594931890480981, "grad_norm": 0.03966563567519188, "grad_norm_var": 0.0005380828689386482, "learning_rate": 0.00893096511849999, "loss": 2.7193, "step": 2933 }, { "crossentropy": 2.73096239566803, "epoch": 0.1595475679055983, "grad_norm": 0.041465017944574356, "grad_norm_var": 0.000540903817313044, "learning_rate": 0.008930195498155132, "loss": 2.731, "step": 2934 }, { "crossentropy": 2.6225483417510986, "epoch": 0.1596019467630985, "grad_norm": 0.04581374675035477, "grad_norm_var": 0.0005414439804651287, "learning_rate": 0.008929425634063211, "loss": 2.6225, "step": 2935 }, { "crossentropy": 2.7722342014312744, "epoch": 0.1596563256205987, "grad_norm": 0.04415890574455261, "grad_norm_var": 0.0005389951758761169, "learning_rate": 0.008928655526271976, "loss": 2.7722, "step": 2936 }, { "crossentropy": 2.7372548580169678, "epoch": 0.1597107044780989, "grad_norm": 0.041501544415950775, "grad_norm_var": 0.0005385888300819884, "learning_rate": 0.00892788517482919, "loss": 2.7373, "step": 2937 }, { "crossentropy": 2.736565947532654, "epoch": 0.1597650833355991, "grad_norm": 0.04095447435975075, "grad_norm_var": 0.0005404482263562142, "learning_rate": 0.008927114579782625, "loss": 2.7366, "step": 2938 }, { "crossentropy": 2.735694169998169, "epoch": 0.1598194621930993, "grad_norm": 0.036464277654886246, "grad_norm_var": 0.0005460111833568588, "learning_rate": 0.008926343741180077, "loss": 2.7357, "step": 2939 }, { "crossentropy": 2.8163464069366455, "epoch": 0.1598738410505995, "grad_norm": 0.042913056910037994, "grad_norm_var": 0.0005473675329877992, "learning_rate": 0.008925572659069349, "loss": 2.8163, "step": 2940 }, { "crossentropy": 2.711140990257263, "epoch": 0.15992821990809974, "grad_norm": 0.0406692735850811, "grad_norm_var": 1.7436977312366293e-05, "learning_rate": 0.008924801333498265, "loss": 2.7111, "step": 2941 }, { "crossentropy": 2.7112808227539062, "epoch": 0.15998259876559995, "grad_norm": 0.04452100396156311, "grad_norm_var": 1.750464093729174e-05, "learning_rate": 0.008924029764514662, "loss": 2.7113, "step": 2942 }, { "crossentropy": 2.7865606546401978, "epoch": 0.16003697762310015, "grad_norm": 0.04572945833206177, "grad_norm_var": 1.7728203354553523e-05, "learning_rate": 0.00892325795216639, "loss": 2.7866, "step": 2943 }, { "crossentropy": 2.713441252708435, "epoch": 0.16009135648060036, "grad_norm": 0.04781722649931908, "grad_norm_var": 1.86108898308052e-05, "learning_rate": 0.008922485896501318, "loss": 2.7134, "step": 2944 }, { "crossentropy": 2.6869486570358276, "epoch": 0.16014573533810056, "grad_norm": 0.044075652956962585, "grad_norm_var": 1.060534219650775e-05, "learning_rate": 0.008921713597567328, "loss": 2.6869, "step": 2945 }, { "crossentropy": 2.77097225189209, "epoch": 0.16020011419560076, "grad_norm": 0.047019798308610916, "grad_norm_var": 1.1101489422075256e-05, "learning_rate": 0.008920941055412317, "loss": 2.771, "step": 2946 }, { "crossentropy": 2.7776403427124023, "epoch": 0.16025449305310097, "grad_norm": 0.04731135070323944, "grad_norm_var": 1.1938226526151535e-05, "learning_rate": 0.008920168270084197, "loss": 2.7776, "step": 2947 }, { "crossentropy": 2.7793785333633423, "epoch": 0.16030887191060117, "grad_norm": 0.04891035705804825, "grad_norm_var": 1.1571637026126734e-05, "learning_rate": 0.008919395241630896, "loss": 2.7794, "step": 2948 }, { "crossentropy": 2.692612051963806, "epoch": 0.16036325076810137, "grad_norm": 0.047997891902923584, "grad_norm_var": 1.1443272484375443e-05, "learning_rate": 0.008918621970100356, "loss": 2.6926, "step": 2949 }, { "crossentropy": 2.7141727209091187, "epoch": 0.16041762962560158, "grad_norm": 0.04966313764452934, "grad_norm_var": 1.2645878422333038e-05, "learning_rate": 0.008917848455540533, "loss": 2.7142, "step": 2950 }, { "crossentropy": 2.6635594367980957, "epoch": 0.16047200848310178, "grad_norm": 0.04687141999602318, "grad_norm_var": 1.2870028826600782e-05, "learning_rate": 0.008917074697999405, "loss": 2.6636, "step": 2951 }, { "crossentropy": 2.7693734169006348, "epoch": 0.16052638734060198, "grad_norm": 0.04856611788272858, "grad_norm_var": 1.3715396493627942e-05, "learning_rate": 0.008916300697524955, "loss": 2.7694, "step": 2952 }, { "crossentropy": 2.774706721305847, "epoch": 0.1605807661981022, "grad_norm": 0.053230464458465576, "grad_norm_var": 1.6745911804924055e-05, "learning_rate": 0.008915526454165185, "loss": 2.7747, "step": 2953 }, { "crossentropy": 2.6552352905273438, "epoch": 0.1606351450556024, "grad_norm": 0.0484880767762661, "grad_norm_var": 1.543121315810408e-05, "learning_rate": 0.008914751967968116, "loss": 2.6552, "step": 2954 }, { "crossentropy": 2.778749465942383, "epoch": 0.1606895239131026, "grad_norm": 0.04056698456406593, "grad_norm_var": 1.1121667584904013e-05, "learning_rate": 0.008913977238981779, "loss": 2.7787, "step": 2955 }, { "crossentropy": 2.8245447874069214, "epoch": 0.1607439027706028, "grad_norm": 0.04029465839266777, "grad_norm_var": 1.281010586278696e-05, "learning_rate": 0.008913202267254224, "loss": 2.8245, "step": 2956 }, { "crossentropy": 2.7956215143203735, "epoch": 0.160798281628103, "grad_norm": 0.041750505566596985, "grad_norm_var": 1.2063017310066867e-05, "learning_rate": 0.00891242705283351, "loss": 2.7956, "step": 2957 }, { "crossentropy": 2.873242974281311, "epoch": 0.1608526604856032, "grad_norm": 0.1261918991804123, "grad_norm_var": 0.00040820338814603056, "learning_rate": 0.00891165159576772, "loss": 2.8732, "step": 2958 }, { "crossentropy": 2.6790632009506226, "epoch": 0.1609070393431034, "grad_norm": 0.049963489174842834, "grad_norm_var": 0.00040604902731689637, "learning_rate": 0.008910875896104944, "loss": 2.6791, "step": 2959 }, { "crossentropy": 2.723686099052429, "epoch": 0.1609614182006036, "grad_norm": 0.044015850871801376, "grad_norm_var": 0.0004089682849958489, "learning_rate": 0.008910099953893292, "loss": 2.7237, "step": 2960 }, { "crossentropy": 2.6500723361968994, "epoch": 0.16101579705810382, "grad_norm": 0.04072321951389313, "grad_norm_var": 0.00041301496435274296, "learning_rate": 0.008909323769180886, "loss": 2.6501, "step": 2961 }, { "crossentropy": 2.710270643234253, "epoch": 0.16107017591560402, "grad_norm": 0.0415518581867218, "grad_norm_var": 0.0004180389988677718, "learning_rate": 0.008908547342015863, "loss": 2.7103, "step": 2962 }, { "crossentropy": 2.7180261611938477, "epoch": 0.16112455477310422, "grad_norm": 0.04368441924452782, "grad_norm_var": 0.0004206479007346552, "learning_rate": 0.00890777067244638, "loss": 2.718, "step": 2963 }, { "crossentropy": 2.6234792470932007, "epoch": 0.16117893363060443, "grad_norm": 0.04803953319787979, "grad_norm_var": 0.0004209123105539724, "learning_rate": 0.008906993760520602, "loss": 2.6235, "step": 2964 }, { "crossentropy": 2.6945717334747314, "epoch": 0.16123331248810463, "grad_norm": 0.03725258633494377, "grad_norm_var": 0.0004320357656199823, "learning_rate": 0.008906216606286713, "loss": 2.6946, "step": 2965 }, { "crossentropy": 2.7114827632904053, "epoch": 0.16128769134560483, "grad_norm": 0.04181456193327904, "grad_norm_var": 0.0004362941631909719, "learning_rate": 0.008905439209792913, "loss": 2.7115, "step": 2966 }, { "crossentropy": 2.701432228088379, "epoch": 0.16134207020310504, "grad_norm": 0.043351657688617706, "grad_norm_var": 0.00043833155245521084, "learning_rate": 0.008904661571087413, "loss": 2.7014, "step": 2967 }, { "crossentropy": 2.684532642364502, "epoch": 0.16139644906060524, "grad_norm": 0.04030734300613403, "grad_norm_var": 0.0004434498461766198, "learning_rate": 0.008903883690218444, "loss": 2.6845, "step": 2968 }, { "crossentropy": 2.7990171909332275, "epoch": 0.16145082791810544, "grad_norm": 0.04549960419535637, "grad_norm_var": 0.000442645909426307, "learning_rate": 0.008903105567234247, "loss": 2.799, "step": 2969 }, { "crossentropy": 2.682486057281494, "epoch": 0.16150520677560565, "grad_norm": 0.04322211071848869, "grad_norm_var": 0.00044427755868981856, "learning_rate": 0.00890232720218308, "loss": 2.6825, "step": 2970 }, { "crossentropy": 2.7458837032318115, "epoch": 0.16155958563310585, "grad_norm": 0.04884801432490349, "grad_norm_var": 0.0004403405635352575, "learning_rate": 0.008901548595113221, "loss": 2.7459, "step": 2971 }, { "crossentropy": 2.6401171684265137, "epoch": 0.16161396449060605, "grad_norm": 0.037087276577949524, "grad_norm_var": 0.0004445062078883852, "learning_rate": 0.008900769746072954, "loss": 2.6401, "step": 2972 }, { "crossentropy": 2.839337468147278, "epoch": 0.16166834334810626, "grad_norm": 0.043649278581142426, "grad_norm_var": 0.000443065434067003, "learning_rate": 0.008899990655110586, "loss": 2.8393, "step": 2973 }, { "crossentropy": 2.70032274723053, "epoch": 0.16172272220560646, "grad_norm": 0.0422310046851635, "grad_norm_var": 1.3352893549260086e-05, "learning_rate": 0.008899211322274432, "loss": 2.7003, "step": 2974 }, { "crossentropy": 2.7512099742889404, "epoch": 0.16177710106310667, "grad_norm": 0.04201722890138626, "grad_norm_var": 1.0136176767453857e-05, "learning_rate": 0.008898431747612828, "loss": 2.7512, "step": 2975 }, { "crossentropy": 2.7716434001922607, "epoch": 0.16183147992060687, "grad_norm": 0.04589058831334114, "grad_norm_var": 1.0683265722184986e-05, "learning_rate": 0.008897651931174118, "loss": 2.7716, "step": 2976 }, { "crossentropy": 2.737837791442871, "epoch": 0.16188585877810707, "grad_norm": 0.04810481145977974, "grad_norm_var": 1.202198917142714e-05, "learning_rate": 0.008896871873006673, "loss": 2.7378, "step": 2977 }, { "crossentropy": 2.598825454711914, "epoch": 0.16194023763560728, "grad_norm": 0.04246030002832413, "grad_norm_var": 1.1863702018531131e-05, "learning_rate": 0.008896091573158866, "loss": 2.5988, "step": 2978 }, { "crossentropy": 2.737033486366272, "epoch": 0.16199461649310748, "grad_norm": 0.03914423659443855, "grad_norm_var": 1.2944302611088826e-05, "learning_rate": 0.008895311031679091, "loss": 2.737, "step": 2979 }, { "crossentropy": 2.683209180831909, "epoch": 0.16204899535060768, "grad_norm": 0.03975430130958557, "grad_norm_var": 1.1730988710939988e-05, "learning_rate": 0.00889453024861576, "loss": 2.6832, "step": 2980 }, { "crossentropy": 2.787900924682617, "epoch": 0.1621033742081079, "grad_norm": 0.04462040215730667, "grad_norm_var": 9.929870590514682e-06, "learning_rate": 0.008893749224017292, "loss": 2.7879, "step": 2981 }, { "crossentropy": 2.7865070104599, "epoch": 0.1621577530656081, "grad_norm": 0.0410870723426342, "grad_norm_var": 1.0077950499376432e-05, "learning_rate": 0.008892967957932128, "loss": 2.7865, "step": 2982 }, { "crossentropy": 2.7409404516220093, "epoch": 0.1622121319231083, "grad_norm": 0.039450764656066895, "grad_norm_var": 1.082254664233409e-05, "learning_rate": 0.008892186450408719, "loss": 2.7409, "step": 2983 }, { "crossentropy": 2.809882879257202, "epoch": 0.1622665107806085, "grad_norm": 0.03857994079589844, "grad_norm_var": 1.156262860260877e-05, "learning_rate": 0.008891404701495537, "loss": 2.8099, "step": 2984 }, { "crossentropy": 2.6860543489456177, "epoch": 0.1623208896381087, "grad_norm": 0.03933867812156677, "grad_norm_var": 1.1555451846074979e-05, "learning_rate": 0.008890622711241063, "loss": 2.6861, "step": 2985 }, { "crossentropy": 2.7680323123931885, "epoch": 0.1623752684956089, "grad_norm": 0.04000749811530113, "grad_norm_var": 1.1770879982875235e-05, "learning_rate": 0.008889840479693795, "loss": 2.768, "step": 2986 }, { "crossentropy": 2.858722686767578, "epoch": 0.1624296473531091, "grad_norm": 0.040003128349781036, "grad_norm_var": 8.6043969895253e-06, "learning_rate": 0.008889058006902246, "loss": 2.8587, "step": 2987 }, { "crossentropy": 2.7549028396606445, "epoch": 0.1624840262106093, "grad_norm": 0.042317502200603485, "grad_norm_var": 7.261824503459099e-06, "learning_rate": 0.008888275292914949, "loss": 2.7549, "step": 2988 }, { "crossentropy": 2.802478313446045, "epoch": 0.16253840506810951, "grad_norm": 0.04247716814279556, "grad_norm_var": 7.057282515388988e-06, "learning_rate": 0.00888749233778044, "loss": 2.8025, "step": 2989 }, { "crossentropy": 2.6398050785064697, "epoch": 0.16259278392560972, "grad_norm": 0.04230296611785889, "grad_norm_var": 7.062530398537342e-06, "learning_rate": 0.00888670914154728, "loss": 2.6398, "step": 2990 }, { "crossentropy": 2.679387927055359, "epoch": 0.16264716278310992, "grad_norm": 0.043385569006204605, "grad_norm_var": 7.233363391421591e-06, "learning_rate": 0.008885925704264044, "loss": 2.6794, "step": 2991 }, { "crossentropy": 2.723318934440613, "epoch": 0.16270154164061013, "grad_norm": 0.04879356920719147, "grad_norm_var": 9.340367293568923e-06, "learning_rate": 0.00888514202597932, "loss": 2.7233, "step": 2992 }, { "crossentropy": 2.63360059261322, "epoch": 0.16275592049811033, "grad_norm": 0.04125060513615608, "grad_norm_var": 6.687645089660926e-06, "learning_rate": 0.008884358106741708, "loss": 2.6336, "step": 2993 }, { "crossentropy": 2.721818208694458, "epoch": 0.16281029935561053, "grad_norm": 0.049036916345357895, "grad_norm_var": 1.0179593818124278e-05, "learning_rate": 0.008883573946599829, "loss": 2.7218, "step": 2994 }, { "crossentropy": 2.8139408826828003, "epoch": 0.16286467821311074, "grad_norm": 0.041420985013246536, "grad_norm_var": 9.645185564363668e-06, "learning_rate": 0.008882789545602312, "loss": 2.8139, "step": 2995 }, { "crossentropy": 2.659801483154297, "epoch": 0.16291905707061094, "grad_norm": 0.041699934750795364, "grad_norm_var": 9.26958110546988e-06, "learning_rate": 0.00888200490379781, "loss": 2.6598, "step": 2996 }, { "crossentropy": 2.6046621799468994, "epoch": 0.16297343592811114, "grad_norm": 0.042549461126327515, "grad_norm_var": 8.879179791364672e-06, "learning_rate": 0.00888122002123498, "loss": 2.6047, "step": 2997 }, { "crossentropy": 2.6847037076950073, "epoch": 0.16302781478561135, "grad_norm": 0.04077102988958359, "grad_norm_var": 8.928374223078859e-06, "learning_rate": 0.008880434897962504, "loss": 2.6847, "step": 2998 }, { "crossentropy": 2.78983473777771, "epoch": 0.16308219364311155, "grad_norm": 0.04425631836056709, "grad_norm_var": 8.682817248904275e-06, "learning_rate": 0.008879649534029074, "loss": 2.7898, "step": 2999 }, { "crossentropy": 2.665000796318054, "epoch": 0.16313657250061175, "grad_norm": 0.05281537026166916, "grad_norm_var": 1.4122353152273315e-05, "learning_rate": 0.008878863929483397, "loss": 2.665, "step": 3000 }, { "crossentropy": 2.744784355163574, "epoch": 0.16319095135811196, "grad_norm": 0.04537929221987724, "grad_norm_var": 1.3231199251551028e-05, "learning_rate": 0.008878078084374194, "loss": 2.7448, "step": 3001 }, { "crossentropy": 2.7409238815307617, "epoch": 0.16324533021561216, "grad_norm": 0.04137235879898071, "grad_norm_var": 1.2683993741361733e-05, "learning_rate": 0.008877291998750205, "loss": 2.7409, "step": 3002 }, { "crossentropy": 2.755748987197876, "epoch": 0.16329970907311236, "grad_norm": 0.042407095432281494, "grad_norm_var": 1.1847566084716097e-05, "learning_rate": 0.008876505672660183, "loss": 2.7557, "step": 3003 }, { "crossentropy": 2.768223285675049, "epoch": 0.16335408793061257, "grad_norm": 0.0413280613720417, "grad_norm_var": 1.2116173816421189e-05, "learning_rate": 0.008875719106152891, "loss": 2.7682, "step": 3004 }, { "crossentropy": 2.633805751800537, "epoch": 0.16340846678811277, "grad_norm": 0.04599035903811455, "grad_norm_var": 1.2254854708854991e-05, "learning_rate": 0.008874932299277112, "loss": 2.6338, "step": 3005 }, { "crossentropy": 2.7641767263412476, "epoch": 0.16346284564561298, "grad_norm": 0.046439122408628464, "grad_norm_var": 1.2362006593811398e-05, "learning_rate": 0.008874145252081647, "loss": 2.7642, "step": 3006 }, { "crossentropy": 2.6399558782577515, "epoch": 0.16351722450311318, "grad_norm": 0.04495882987976074, "grad_norm_var": 1.2323625744798707e-05, "learning_rate": 0.008873357964615303, "loss": 2.64, "step": 3007 }, { "crossentropy": 2.7122875452041626, "epoch": 0.16357160336061338, "grad_norm": 0.04673507437109947, "grad_norm_var": 1.1383766975966991e-05, "learning_rate": 0.00887257043692691, "loss": 2.7123, "step": 3008 }, { "crossentropy": 2.750861644744873, "epoch": 0.16362598221811359, "grad_norm": 0.04110640287399292, "grad_norm_var": 1.1443229558894853e-05, "learning_rate": 0.00887178266906531, "loss": 2.7509, "step": 3009 }, { "crossentropy": 2.6649038791656494, "epoch": 0.1636803610756138, "grad_norm": 0.04082811251282692, "grad_norm_var": 1.0433682089016199e-05, "learning_rate": 0.008870994661079356, "loss": 2.6649, "step": 3010 }, { "crossentropy": 2.6962445974349976, "epoch": 0.163734739933114, "grad_norm": 0.04125289246439934, "grad_norm_var": 1.0487727684685364e-05, "learning_rate": 0.008870206413017926, "loss": 2.6962, "step": 3011 }, { "crossentropy": 2.7462676763534546, "epoch": 0.1637891187906142, "grad_norm": 0.04260505735874176, "grad_norm_var": 1.0292354403823358e-05, "learning_rate": 0.0088694179249299, "loss": 2.7463, "step": 3012 }, { "crossentropy": 2.7039928436279297, "epoch": 0.1638434976481144, "grad_norm": 0.041658300906419754, "grad_norm_var": 1.0490542213439403e-05, "learning_rate": 0.008868629196864182, "loss": 2.704, "step": 3013 }, { "crossentropy": 2.8287272453308105, "epoch": 0.1638978765056146, "grad_norm": 0.04305768013000488, "grad_norm_var": 9.910927386704883e-06, "learning_rate": 0.008867840228869687, "loss": 2.8287, "step": 3014 }, { "crossentropy": 2.7190475463867188, "epoch": 0.1639522553631148, "grad_norm": 0.041009292006492615, "grad_norm_var": 1.0409939387703848e-05, "learning_rate": 0.008867051020995349, "loss": 2.719, "step": 3015 }, { "crossentropy": 2.7299169301986694, "epoch": 0.16400663422061504, "grad_norm": 0.043083880096673965, "grad_norm_var": 4.480506268448627e-06, "learning_rate": 0.00886626157329011, "loss": 2.7299, "step": 3016 }, { "crossentropy": 2.739431381225586, "epoch": 0.16406101307811524, "grad_norm": 0.042865172028541565, "grad_norm_var": 4.103368080428845e-06, "learning_rate": 0.008865471885802934, "loss": 2.7394, "step": 3017 }, { "crossentropy": 2.6953760385513306, "epoch": 0.16411539193561545, "grad_norm": 0.04258507862687111, "grad_norm_var": 3.945264258473948e-06, "learning_rate": 0.008864681958582794, "loss": 2.6954, "step": 3018 }, { "crossentropy": 2.7773338556289673, "epoch": 0.16416977079311565, "grad_norm": 0.04609732702374458, "grad_norm_var": 4.507404996915293e-06, "learning_rate": 0.008863891791678685, "loss": 2.7773, "step": 3019 }, { "crossentropy": 2.807327389717102, "epoch": 0.16422414965061585, "grad_norm": 0.03954296559095383, "grad_norm_var": 5.1580706166453586e-06, "learning_rate": 0.008863101385139606, "loss": 2.8073, "step": 3020 }, { "crossentropy": 2.7317713499069214, "epoch": 0.16427852850811606, "grad_norm": 0.03883444890379906, "grad_norm_var": 5.613611604875166e-06, "learning_rate": 0.008862310739014582, "loss": 2.7318, "step": 3021 }, { "crossentropy": 2.7486305236816406, "epoch": 0.16433290736561626, "grad_norm": 0.041262634098529816, "grad_norm_var": 4.6843172495922755e-06, "learning_rate": 0.008861519853352647, "loss": 2.7486, "step": 3022 }, { "crossentropy": 2.620904564857483, "epoch": 0.16438728622311646, "grad_norm": 0.03794281557202339, "grad_norm_var": 5.313535519277322e-06, "learning_rate": 0.00886072872820285, "loss": 2.6209, "step": 3023 }, { "crossentropy": 2.744908928871155, "epoch": 0.16444166508061667, "grad_norm": 0.043249331414699554, "grad_norm_var": 3.827709024273087e-06, "learning_rate": 0.008859937363614259, "loss": 2.7449, "step": 3024 }, { "crossentropy": 2.651060461997986, "epoch": 0.16449604393811687, "grad_norm": 0.040290143340826035, "grad_norm_var": 3.932468399399311e-06, "learning_rate": 0.008859145759635948, "loss": 2.6511, "step": 3025 }, { "crossentropy": 2.7202506065368652, "epoch": 0.16455042279561707, "grad_norm": 0.04052633419632912, "grad_norm_var": 3.970640009825534e-06, "learning_rate": 0.008858353916317016, "loss": 2.7203, "step": 3026 }, { "crossentropy": 2.687113642692566, "epoch": 0.16460480165311728, "grad_norm": 0.0417017862200737, "grad_norm_var": 3.961473707214669e-06, "learning_rate": 0.008857561833706572, "loss": 2.6871, "step": 3027 }, { "crossentropy": 2.6835397481918335, "epoch": 0.16465918051061748, "grad_norm": 0.047182369977235794, "grad_norm_var": 5.857187367270568e-06, "learning_rate": 0.008856769511853739, "loss": 2.6835, "step": 3028 }, { "crossentropy": 2.6734132766723633, "epoch": 0.16471355936811768, "grad_norm": 0.042544975876808167, "grad_norm_var": 5.874132587857517e-06, "learning_rate": 0.008855976950807656, "loss": 2.6734, "step": 3029 }, { "crossentropy": 2.5926432609558105, "epoch": 0.1647679382256179, "grad_norm": 0.039206985384225845, "grad_norm_var": 6.25065235259712e-06, "learning_rate": 0.008855184150617479, "loss": 2.5926, "step": 3030 }, { "crossentropy": 2.6873905658721924, "epoch": 0.1648223170831181, "grad_norm": 0.03976921737194061, "grad_norm_var": 6.468465551268988e-06, "learning_rate": 0.008854391111332374, "loss": 2.6874, "step": 3031 }, { "crossentropy": 2.7144464254379272, "epoch": 0.1648766959406183, "grad_norm": 0.04194760322570801, "grad_norm_var": 6.334625967282144e-06, "learning_rate": 0.008853597833001527, "loss": 2.7144, "step": 3032 }, { "crossentropy": 2.7112534046173096, "epoch": 0.1649310747981185, "grad_norm": 0.04539818689227104, "grad_norm_var": 7.164002054756552e-06, "learning_rate": 0.008852804315674133, "loss": 2.7113, "step": 3033 }, { "crossentropy": 2.724487066268921, "epoch": 0.1649854536556187, "grad_norm": 0.03985283896327019, "grad_norm_var": 7.328226579369988e-06, "learning_rate": 0.008852010559399407, "loss": 2.7245, "step": 3034 }, { "crossentropy": 2.6987743377685547, "epoch": 0.1650398325131189, "grad_norm": 0.04224395379424095, "grad_norm_var": 5.937577284330306e-06, "learning_rate": 0.00885121656422658, "loss": 2.6988, "step": 3035 }, { "crossentropy": 2.6695879697799683, "epoch": 0.1650942113706191, "grad_norm": 0.03838035464286804, "grad_norm_var": 6.301171490860104e-06, "learning_rate": 0.008850422330204891, "loss": 2.6696, "step": 3036 }, { "crossentropy": 2.817970037460327, "epoch": 0.1651485902281193, "grad_norm": 0.0398697666823864, "grad_norm_var": 6.031834312094213e-06, "learning_rate": 0.0088496278573836, "loss": 2.818, "step": 3037 }, { "crossentropy": 2.7491679191589355, "epoch": 0.16520296908561952, "grad_norm": 0.03858296945691109, "grad_norm_var": 6.506685108665736e-06, "learning_rate": 0.008848833145811976, "loss": 2.7492, "step": 3038 }, { "crossentropy": 2.736085891723633, "epoch": 0.16525734794311972, "grad_norm": 0.04035481438040733, "grad_norm_var": 5.833042126259189e-06, "learning_rate": 0.00884803819553931, "loss": 2.7361, "step": 3039 }, { "crossentropy": 2.6920745372772217, "epoch": 0.16531172680061992, "grad_norm": 0.04804622754454613, "grad_norm_var": 8.505888311229464e-06, "learning_rate": 0.008847243006614902, "loss": 2.6921, "step": 3040 }, { "crossentropy": 2.7652121782302856, "epoch": 0.16536610565812013, "grad_norm": 0.04622746631503105, "grad_norm_var": 9.657416655621474e-06, "learning_rate": 0.008846447579088071, "loss": 2.7652, "step": 3041 }, { "crossentropy": 2.739451766014099, "epoch": 0.16542048451562033, "grad_norm": 0.038539301604032516, "grad_norm_var": 1.0291896831209936e-05, "learning_rate": 0.008845651913008144, "loss": 2.7395, "step": 3042 }, { "crossentropy": 2.7977709770202637, "epoch": 0.16547486337312053, "grad_norm": 0.042677029967308044, "grad_norm_var": 1.0330045841480152e-05, "learning_rate": 0.008844856008424473, "loss": 2.7978, "step": 3043 }, { "crossentropy": 2.6851422786712646, "epoch": 0.16552924223062074, "grad_norm": 0.042858775705099106, "grad_norm_var": 8.46848988207919e-06, "learning_rate": 0.008844059865386417, "loss": 2.6851, "step": 3044 }, { "crossentropy": 2.700414299964905, "epoch": 0.16558362108812094, "grad_norm": 0.04267704114317894, "grad_norm_var": 8.485228753685181e-06, "learning_rate": 0.008843263483943351, "loss": 2.7004, "step": 3045 }, { "crossentropy": 2.7798635959625244, "epoch": 0.16563799994562114, "grad_norm": 0.04793623834848404, "grad_norm_var": 1.038737882544377e-05, "learning_rate": 0.008842466864144667, "loss": 2.7799, "step": 3046 }, { "crossentropy": 2.5787330865859985, "epoch": 0.16569237880312135, "grad_norm": 0.041813187301158905, "grad_norm_var": 9.983276881286342e-06, "learning_rate": 0.00884167000603977, "loss": 2.5787, "step": 3047 }, { "crossentropy": 2.6653175354003906, "epoch": 0.16574675766062155, "grad_norm": 0.04137415811419487, "grad_norm_var": 1.0033668096601916e-05, "learning_rate": 0.00884087290967808, "loss": 2.6653, "step": 3048 }, { "crossentropy": 2.671926259994507, "epoch": 0.16580113651812176, "grad_norm": 0.042158808559179306, "grad_norm_var": 9.352228639273458e-06, "learning_rate": 0.008840075575109036, "loss": 2.6719, "step": 3049 }, { "crossentropy": 2.71705162525177, "epoch": 0.16585551537562196, "grad_norm": 0.04104039818048477, "grad_norm_var": 9.084623897010792e-06, "learning_rate": 0.008839278002382085, "loss": 2.7171, "step": 3050 }, { "crossentropy": 2.746447443962097, "epoch": 0.16590989423312216, "grad_norm": 0.03776237741112709, "grad_norm_var": 1.029797536943085e-05, "learning_rate": 0.00883848019154669, "loss": 2.7464, "step": 3051 }, { "crossentropy": 2.7178295850753784, "epoch": 0.16596427309062237, "grad_norm": 0.039389677345752716, "grad_norm_var": 9.88883530254747e-06, "learning_rate": 0.008837682142652332, "loss": 2.7178, "step": 3052 }, { "crossentropy": 2.673491358757019, "epoch": 0.16601865194812257, "grad_norm": 0.03885667398571968, "grad_norm_var": 1.023489228738622e-05, "learning_rate": 0.008836883855748505, "loss": 2.6735, "step": 3053 }, { "crossentropy": 2.7069475650787354, "epoch": 0.16607303080562277, "grad_norm": 0.04172755032777786, "grad_norm_var": 9.464908183932456e-06, "learning_rate": 0.00883608533088472, "loss": 2.7069, "step": 3054 }, { "crossentropy": 2.6921510696411133, "epoch": 0.16612740966312298, "grad_norm": 0.04694090783596039, "grad_norm_var": 1.0652216443412093e-05, "learning_rate": 0.008835286568110497, "loss": 2.6922, "step": 3055 }, { "crossentropy": 2.78054416179657, "epoch": 0.16618178852062318, "grad_norm": 0.04296821355819702, "grad_norm_var": 8.509772038419137e-06, "learning_rate": 0.008834487567475378, "loss": 2.7805, "step": 3056 }, { "crossentropy": 2.785494804382324, "epoch": 0.16623616737812338, "grad_norm": 0.04249030351638794, "grad_norm_var": 7.367977418965867e-06, "learning_rate": 0.008833688329028914, "loss": 2.7855, "step": 3057 }, { "crossentropy": 2.8499059677124023, "epoch": 0.1662905462356236, "grad_norm": 0.05766088888049126, "grad_norm_var": 2.152274631672385e-05, "learning_rate": 0.008832888852820676, "loss": 2.8499, "step": 3058 }, { "crossentropy": 2.6883822679519653, "epoch": 0.1663449250931238, "grad_norm": 0.0397670604288578, "grad_norm_var": 2.223385853717119e-05, "learning_rate": 0.008832089138900242, "loss": 2.6884, "step": 3059 }, { "crossentropy": 2.7049676179885864, "epoch": 0.166399303950624, "grad_norm": 0.07808363437652588, "grad_norm_var": 9.928958442997019e-05, "learning_rate": 0.008831289187317215, "loss": 2.705, "step": 3060 }, { "crossentropy": 2.7855870723724365, "epoch": 0.1664536828081242, "grad_norm": 0.04160454124212265, "grad_norm_var": 9.971731714489193e-05, "learning_rate": 0.008830488998121203, "loss": 2.7856, "step": 3061 }, { "crossentropy": 2.716089367866516, "epoch": 0.1665080616656244, "grad_norm": 0.03959086537361145, "grad_norm_var": 0.00010091245127308834, "learning_rate": 0.008829688571361835, "loss": 2.7161, "step": 3062 }, { "crossentropy": 2.6653279066085815, "epoch": 0.1665624405231246, "grad_norm": 0.03956444561481476, "grad_norm_var": 0.00010205713223146081, "learning_rate": 0.008828887907088752, "loss": 2.6653, "step": 3063 }, { "crossentropy": 2.682817578315735, "epoch": 0.1666168193806248, "grad_norm": 0.04158001020550728, "grad_norm_var": 0.00010197573473918082, "learning_rate": 0.00882808700535161, "loss": 2.6828, "step": 3064 }, { "crossentropy": 2.683691382408142, "epoch": 0.166671198238125, "grad_norm": 0.4092918336391449, "grad_norm_var": 0.00841402733408197, "learning_rate": 0.008827285866200081, "loss": 2.6837, "step": 3065 }, { "crossentropy": 2.6653090715408325, "epoch": 0.16672557709562522, "grad_norm": 0.04306713491678238, "grad_norm_var": 0.00840716222809295, "learning_rate": 0.008826484489683852, "loss": 2.6653, "step": 3066 }, { "crossentropy": 2.6156442165374756, "epoch": 0.16677995595312542, "grad_norm": 0.040312521159648895, "grad_norm_var": 0.00839744996308251, "learning_rate": 0.008825682875852621, "loss": 2.6156, "step": 3067 }, { "crossentropy": 2.682439684867859, "epoch": 0.16683433481062562, "grad_norm": 0.0425909049808979, "grad_norm_var": 0.008386014852041806, "learning_rate": 0.008824881024756105, "loss": 2.6824, "step": 3068 }, { "crossentropy": 2.690931797027588, "epoch": 0.16688871366812583, "grad_norm": 0.04632848873734474, "grad_norm_var": 0.008360588758608362, "learning_rate": 0.008824078936444034, "loss": 2.6909, "step": 3069 }, { "crossentropy": 2.62508225440979, "epoch": 0.16694309252562603, "grad_norm": 0.04421421140432358, "grad_norm_var": 0.008352149061138365, "learning_rate": 0.008823276610966154, "loss": 2.6251, "step": 3070 }, { "crossentropy": 2.7555075883865356, "epoch": 0.16699747138312623, "grad_norm": 0.05426300689578056, "grad_norm_var": 0.00833444875936109, "learning_rate": 0.00882247404837222, "loss": 2.7555, "step": 3071 }, { "crossentropy": 2.753893733024597, "epoch": 0.16705185024062644, "grad_norm": 0.06278011202812195, "grad_norm_var": 0.00829031818321284, "learning_rate": 0.00882167124871201, "loss": 2.7539, "step": 3072 }, { "crossentropy": 2.7383071184158325, "epoch": 0.16710622909812664, "grad_norm": 0.039181675761938095, "grad_norm_var": 0.008303226237866243, "learning_rate": 0.008820868212035314, "loss": 2.7383, "step": 3073 }, { "crossentropy": 2.733229398727417, "epoch": 0.16716060795562684, "grad_norm": 0.04151986911892891, "grad_norm_var": 0.008346049003792167, "learning_rate": 0.008820064938391933, "loss": 2.7332, "step": 3074 }, { "crossentropy": 2.689277410507202, "epoch": 0.16721498681312705, "grad_norm": 0.042911700904369354, "grad_norm_var": 0.00833441691196871, "learning_rate": 0.008819261427831687, "loss": 2.6893, "step": 3075 }, { "crossentropy": 2.7429323196411133, "epoch": 0.16726936567062725, "grad_norm": 0.338774710893631, "grad_norm_var": 0.012891375718027585, "learning_rate": 0.008818457680404406, "loss": 2.7429, "step": 3076 }, { "crossentropy": 2.650004029273987, "epoch": 0.16732374452812745, "grad_norm": 0.043317005038261414, "grad_norm_var": 0.012881542467004608, "learning_rate": 0.00881765369615994, "loss": 2.65, "step": 3077 }, { "crossentropy": 2.7505338191986084, "epoch": 0.16737812338562766, "grad_norm": 0.04670874401926994, "grad_norm_var": 0.012841062464722839, "learning_rate": 0.00881684947514815, "loss": 2.7505, "step": 3078 }, { "crossentropy": 2.7228176593780518, "epoch": 0.16743250224312786, "grad_norm": 0.043843306601047516, "grad_norm_var": 0.012815700094298032, "learning_rate": 0.008816045017418915, "loss": 2.7228, "step": 3079 }, { "crossentropy": 2.780332088470459, "epoch": 0.16748688110062807, "grad_norm": 0.04419751837849617, "grad_norm_var": 0.0128005234817326, "learning_rate": 0.008815240323022127, "loss": 2.7803, "step": 3080 }, { "crossentropy": 2.811899185180664, "epoch": 0.16754125995812827, "grad_norm": 0.04750346392393112, "grad_norm_var": 0.005408120352776816, "learning_rate": 0.00881443539200769, "loss": 2.8119, "step": 3081 }, { "crossentropy": 2.748543620109558, "epoch": 0.16759563881562847, "grad_norm": 0.04437917098402977, "grad_norm_var": 0.005404593163077599, "learning_rate": 0.008813630224425524, "loss": 2.7485, "step": 3082 }, { "crossentropy": 2.681835174560547, "epoch": 0.16765001767312868, "grad_norm": 0.04381638765335083, "grad_norm_var": 0.005394328379643842, "learning_rate": 0.008812824820325572, "loss": 2.6818, "step": 3083 }, { "crossentropy": 2.684537410736084, "epoch": 0.16770439653062888, "grad_norm": 0.041047386825084686, "grad_norm_var": 0.005398913299788604, "learning_rate": 0.008812019179757775, "loss": 2.6845, "step": 3084 }, { "crossentropy": 2.7441095113754272, "epoch": 0.16775877538812908, "grad_norm": 0.04008100926876068, "grad_norm_var": 0.00541611401829071, "learning_rate": 0.008811213302772103, "loss": 2.7441, "step": 3085 }, { "crossentropy": 2.767059087753296, "epoch": 0.1678131542456293, "grad_norm": 0.040338415652513504, "grad_norm_var": 0.00542710126508192, "learning_rate": 0.008810407189418537, "loss": 2.7671, "step": 3086 }, { "crossentropy": 2.8026658296585083, "epoch": 0.1678675331031295, "grad_norm": 0.04138027876615524, "grad_norm_var": 0.005453196930847494, "learning_rate": 0.008809600839747068, "loss": 2.8027, "step": 3087 }, { "crossentropy": 2.749069094657898, "epoch": 0.1679219119606297, "grad_norm": 0.044173065572977066, "grad_norm_var": 0.005474416997674907, "learning_rate": 0.008808794253807707, "loss": 2.7491, "step": 3088 }, { "crossentropy": 2.7376166582107544, "epoch": 0.1679762908181299, "grad_norm": 0.045102257281541824, "grad_norm_var": 0.005459030267958537, "learning_rate": 0.008807987431650477, "loss": 2.7376, "step": 3089 }, { "crossentropy": 2.749690890312195, "epoch": 0.16803066967563013, "grad_norm": 0.04499206319451332, "grad_norm_var": 0.005450386387524565, "learning_rate": 0.008807180373325417, "loss": 2.7497, "step": 3090 }, { "crossentropy": 2.7456239461898804, "epoch": 0.16808504853313033, "grad_norm": 0.04258806258440018, "grad_norm_var": 0.005451218155693417, "learning_rate": 0.00880637307888258, "loss": 2.7456, "step": 3091 }, { "crossentropy": 2.717083215713501, "epoch": 0.16813942739063054, "grad_norm": 0.04103797301650047, "grad_norm_var": 4.80190323480049e-06, "learning_rate": 0.008805565548372033, "loss": 2.7171, "step": 3092 }, { "crossentropy": 2.755706310272217, "epoch": 0.16819380624813074, "grad_norm": 0.04414563626050949, "grad_norm_var": 4.834915253726137e-06, "learning_rate": 0.00880475778184386, "loss": 2.7557, "step": 3093 }, { "crossentropy": 2.740170955657959, "epoch": 0.16824818510563094, "grad_norm": 0.05044981837272644, "grad_norm_var": 7.3309358069770585e-06, "learning_rate": 0.008803949779348154, "loss": 2.7402, "step": 3094 }, { "crossentropy": 2.816481590270996, "epoch": 0.16830256396313115, "grad_norm": 0.04135795682668686, "grad_norm_var": 7.666935048551646e-06, "learning_rate": 0.008803141540935032, "loss": 2.8165, "step": 3095 }, { "crossentropy": 2.726052403450012, "epoch": 0.16835694282063135, "grad_norm": 0.043610211461782455, "grad_norm_var": 7.636762020534795e-06, "learning_rate": 0.008802333066654615, "loss": 2.7261, "step": 3096 }, { "crossentropy": 2.626070499420166, "epoch": 0.16841132167813155, "grad_norm": 0.04684090614318848, "grad_norm_var": 7.310545724327893e-06, "learning_rate": 0.008801524356557046, "loss": 2.6261, "step": 3097 }, { "crossentropy": 2.7062524557113647, "epoch": 0.16846570053563176, "grad_norm": 0.05621453747153282, "grad_norm_var": 1.7517699593352862e-05, "learning_rate": 0.00880071541069248, "loss": 2.7063, "step": 3098 }, { "crossentropy": 2.759526014328003, "epoch": 0.16852007939313196, "grad_norm": 0.0523676872253418, "grad_norm_var": 2.1652322978061612e-05, "learning_rate": 0.00879990622911109, "loss": 2.7595, "step": 3099 }, { "crossentropy": 2.6949421167373657, "epoch": 0.16857445825063216, "grad_norm": 0.08780243247747421, "grad_norm_var": 0.00013530361739823763, "learning_rate": 0.008799096811863057, "loss": 2.6949, "step": 3100 }, { "crossentropy": 2.6568275690078735, "epoch": 0.16862883710813237, "grad_norm": 0.04052319377660751, "grad_norm_var": 0.00013486928247803855, "learning_rate": 0.008798287158998582, "loss": 2.6568, "step": 3101 }, { "crossentropy": 2.7316067218780518, "epoch": 0.16868321596563257, "grad_norm": 0.048618920147418976, "grad_norm_var": 0.0001310460312707334, "learning_rate": 0.00879747727056788, "loss": 2.7316, "step": 3102 }, { "crossentropy": 2.756551742553711, "epoch": 0.16873759482313277, "grad_norm": 0.04303708299994469, "grad_norm_var": 0.0001297109990109105, "learning_rate": 0.008796667146621176, "loss": 2.7566, "step": 3103 }, { "crossentropy": 2.6711361408233643, "epoch": 0.16879197368063298, "grad_norm": 0.040275562554597855, "grad_norm_var": 0.00013280704641056428, "learning_rate": 0.008795856787208714, "loss": 2.6711, "step": 3104 }, { "crossentropy": 2.8614046573638916, "epoch": 0.16884635253813318, "grad_norm": 0.041441481560468674, "grad_norm_var": 0.00013508844193750122, "learning_rate": 0.008795046192380756, "loss": 2.8614, "step": 3105 }, { "crossentropy": 2.7207943201065063, "epoch": 0.16890073139563339, "grad_norm": 0.04052348807454109, "grad_norm_var": 0.0001380281994030484, "learning_rate": 0.008794235362187569, "loss": 2.7208, "step": 3106 }, { "crossentropy": 2.7456448078155518, "epoch": 0.1689551102531336, "grad_norm": 0.08311542123556137, "grad_norm_var": 0.0002138580081406826, "learning_rate": 0.008793424296679444, "loss": 2.7456, "step": 3107 }, { "crossentropy": 2.724758505821228, "epoch": 0.1690094891106338, "grad_norm": 0.04127440229058266, "grad_norm_var": 0.0002135762996623255, "learning_rate": 0.00879261299590668, "loss": 2.7248, "step": 3108 }, { "crossentropy": 2.716172218322754, "epoch": 0.169063867968134, "grad_norm": 0.042283426970243454, "grad_norm_var": 0.00021527145525321867, "learning_rate": 0.008791801459919592, "loss": 2.7162, "step": 3109 }, { "crossentropy": 2.692919611930847, "epoch": 0.1691182468256342, "grad_norm": 0.04494775831699371, "grad_norm_var": 0.00021682142619643686, "learning_rate": 0.008790989688768515, "loss": 2.6929, "step": 3110 }, { "crossentropy": 2.7076233625411987, "epoch": 0.1691726256831344, "grad_norm": 0.045579198747873306, "grad_norm_var": 0.0002132738999067527, "learning_rate": 0.008790177682503791, "loss": 2.7076, "step": 3111 }, { "crossentropy": 2.6964681148529053, "epoch": 0.1692270045406346, "grad_norm": 0.04316575825214386, "grad_norm_var": 0.00021365918797601469, "learning_rate": 0.00878936544117578, "loss": 2.6965, "step": 3112 }, { "crossentropy": 2.6774102449417114, "epoch": 0.1692813833981348, "grad_norm": 0.040858957916498184, "grad_norm_var": 0.00021831620304683705, "learning_rate": 0.00878855296483486, "loss": 2.6774, "step": 3113 }, { "crossentropy": 2.736940622329712, "epoch": 0.16933576225563501, "grad_norm": 0.07445418834686279, "grad_norm_var": 0.0002554339945821158, "learning_rate": 0.008787740253531415, "loss": 2.7369, "step": 3114 }, { "crossentropy": 2.6786959171295166, "epoch": 0.16939014111313522, "grad_norm": 0.10303329676389694, "grad_norm_var": 0.0004275307597893509, "learning_rate": 0.008786927307315852, "loss": 2.6787, "step": 3115 }, { "crossentropy": 2.710302472114563, "epoch": 0.16944451997063542, "grad_norm": 0.0458260215818882, "grad_norm_var": 0.0003473973480169913, "learning_rate": 0.008786114126238585, "loss": 2.7103, "step": 3116 }, { "crossentropy": 2.743224024772644, "epoch": 0.16949889882813562, "grad_norm": 0.04147034510970116, "grad_norm_var": 0.00034610698515793645, "learning_rate": 0.00878530071035005, "loss": 2.7432, "step": 3117 }, { "crossentropy": 2.661962151527405, "epoch": 0.16955327768563583, "grad_norm": 0.05757491663098335, "grad_norm_var": 0.00034798531026294786, "learning_rate": 0.008784487059700696, "loss": 2.662, "step": 3118 }, { "crossentropy": 2.861728310585022, "epoch": 0.16960765654313603, "grad_norm": 0.041569847613573074, "grad_norm_var": 0.00034983491018357454, "learning_rate": 0.008783673174340983, "loss": 2.8617, "step": 3119 }, { "crossentropy": 2.773074746131897, "epoch": 0.16966203540063624, "grad_norm": 0.04285479709506035, "grad_norm_var": 0.00034631767656156384, "learning_rate": 0.008782859054321386, "loss": 2.7731, "step": 3120 }, { "crossentropy": 2.6909372806549072, "epoch": 0.16971641425813644, "grad_norm": 0.061976417899131775, "grad_norm_var": 0.00034411058519760084, "learning_rate": 0.008782044699692397, "loss": 2.6909, "step": 3121 }, { "crossentropy": 2.725612998008728, "epoch": 0.16977079311563664, "grad_norm": 0.05753270164132118, "grad_norm_var": 0.00033354172155949843, "learning_rate": 0.008781230110504523, "loss": 2.7256, "step": 3122 }, { "crossentropy": 2.728929877281189, "epoch": 0.16982517197313685, "grad_norm": 0.0422334223985672, "grad_norm_var": 0.00028049245899545137, "learning_rate": 0.008780415286808283, "loss": 2.7289, "step": 3123 }, { "crossentropy": 2.694536566734314, "epoch": 0.16987955083063705, "grad_norm": 0.04198552668094635, "grad_norm_var": 0.0002795388910378037, "learning_rate": 0.00877960022865421, "loss": 2.6945, "step": 3124 }, { "crossentropy": 2.7345434427261353, "epoch": 0.16993392968813725, "grad_norm": 0.047839146107435226, "grad_norm_var": 0.0002744857859164899, "learning_rate": 0.008778784936092856, "loss": 2.7345, "step": 3125 }, { "crossentropy": 2.799386739730835, "epoch": 0.16998830854563746, "grad_norm": 0.04711531475186348, "grad_norm_var": 0.0002727249805725878, "learning_rate": 0.008777969409174783, "loss": 2.7994, "step": 3126 }, { "crossentropy": 2.6315643787384033, "epoch": 0.17004268740313766, "grad_norm": 0.041231829673051834, "grad_norm_var": 0.00027773923409387275, "learning_rate": 0.00877715364795057, "loss": 2.6316, "step": 3127 }, { "crossentropy": 2.745857357978821, "epoch": 0.17009706626063786, "grad_norm": 0.17298470437526703, "grad_norm_var": 0.0011795176069208193, "learning_rate": 0.008776337652470807, "loss": 2.7459, "step": 3128 }, { "crossentropy": 2.6426061391830444, "epoch": 0.17015144511813807, "grad_norm": 0.0564396008849144, "grad_norm_var": 0.0011548556192699853, "learning_rate": 0.008775521422786104, "loss": 2.6426, "step": 3129 }, { "crossentropy": 2.7427053451538086, "epoch": 0.17020582397563827, "grad_norm": 0.05122574418783188, "grad_norm_var": 0.0011469324734658443, "learning_rate": 0.008774704958947081, "loss": 2.7427, "step": 3130 }, { "crossentropy": 2.7835663557052612, "epoch": 0.17026020283313847, "grad_norm": 0.04840105026960373, "grad_norm_var": 0.0010167723908433692, "learning_rate": 0.008773888261004378, "loss": 2.7836, "step": 3131 }, { "crossentropy": 2.7783035039901733, "epoch": 0.17031458169063868, "grad_norm": 0.05817462131381035, "grad_norm_var": 0.001009318925915864, "learning_rate": 0.008773071329008644, "loss": 2.7783, "step": 3132 }, { "crossentropy": 2.74269437789917, "epoch": 0.17036896054813888, "grad_norm": 0.05206064134836197, "grad_norm_var": 0.0009945227603253815, "learning_rate": 0.008772254163010543, "loss": 2.7427, "step": 3133 }, { "crossentropy": 2.687895178794861, "epoch": 0.17042333940563908, "grad_norm": 0.04541153460741043, "grad_norm_var": 0.0010037696655949594, "learning_rate": 0.008771436763060756, "loss": 2.6879, "step": 3134 }, { "crossentropy": 2.683008909225464, "epoch": 0.1704777182631393, "grad_norm": 0.04277295619249344, "grad_norm_var": 0.0010014146204078117, "learning_rate": 0.008770619129209976, "loss": 2.683, "step": 3135 }, { "crossentropy": 2.658653974533081, "epoch": 0.1705320971206395, "grad_norm": 0.039668407291173935, "grad_norm_var": 0.0010080120718145957, "learning_rate": 0.008769801261508912, "loss": 2.6587, "step": 3136 }, { "crossentropy": 2.7027828693389893, "epoch": 0.1705864759781397, "grad_norm": 0.0424463152885437, "grad_norm_var": 0.0010180874406336721, "learning_rate": 0.00876898316000829, "loss": 2.7028, "step": 3137 }, { "crossentropy": 2.7433406114578247, "epoch": 0.1706408548356399, "grad_norm": 0.044020362198352814, "grad_norm_var": 0.0010257830373526671, "learning_rate": 0.008768164824758846, "loss": 2.7433, "step": 3138 }, { "crossentropy": 2.6145710945129395, "epoch": 0.1706952336931401, "grad_norm": 0.0431448370218277, "grad_norm_var": 0.0010243290210753036, "learning_rate": 0.008767346255811333, "loss": 2.6146, "step": 3139 }, { "crossentropy": 2.6911357641220093, "epoch": 0.1707496125506403, "grad_norm": 0.0433465838432312, "grad_norm_var": 0.0010221405972650208, "learning_rate": 0.008766527453216518, "loss": 2.6911, "step": 3140 }, { "crossentropy": 2.761690616607666, "epoch": 0.1708039914081405, "grad_norm": 0.04131351038813591, "grad_norm_var": 0.0010308305459659892, "learning_rate": 0.00876570841702518, "loss": 2.7617, "step": 3141 }, { "crossentropy": 2.6728776693344116, "epoch": 0.1708583702656407, "grad_norm": 0.041714541614055634, "grad_norm_var": 0.0010378703986695195, "learning_rate": 0.008764889147288117, "loss": 2.6729, "step": 3142 }, { "crossentropy": 2.7752652168273926, "epoch": 0.17091274912314092, "grad_norm": 0.1316259801387787, "grad_norm_var": 0.0013944061426821859, "learning_rate": 0.008764069644056141, "loss": 2.7753, "step": 3143 }, { "crossentropy": 2.736570119857788, "epoch": 0.17096712798064112, "grad_norm": 0.03986218199133873, "grad_norm_var": 0.0004907427699674449, "learning_rate": 0.008763249907380073, "loss": 2.7366, "step": 3144 }, { "crossentropy": 2.68306040763855, "epoch": 0.17102150683814132, "grad_norm": 0.04165245592594147, "grad_norm_var": 0.0004943778003596205, "learning_rate": 0.008762429937310754, "loss": 2.6831, "step": 3145 }, { "crossentropy": 2.764085531234741, "epoch": 0.17107588569564153, "grad_norm": 0.042676106095314026, "grad_norm_var": 0.0004980364827765862, "learning_rate": 0.00876160973389904, "loss": 2.7641, "step": 3146 }, { "crossentropy": 2.7302048206329346, "epoch": 0.17113026455314173, "grad_norm": 0.040251825004816055, "grad_norm_var": 0.0005038084751765925, "learning_rate": 0.008760789297195795, "loss": 2.7302, "step": 3147 }, { "crossentropy": 2.828959107398987, "epoch": 0.17118464341064193, "grad_norm": 0.17134714126586914, "grad_norm_var": 0.0014369583233893987, "learning_rate": 0.008759968627251907, "loss": 2.829, "step": 3148 }, { "crossentropy": 2.842694878578186, "epoch": 0.17123902226814214, "grad_norm": 0.04222005233168602, "grad_norm_var": 0.0014487792929172923, "learning_rate": 0.008759147724118267, "loss": 2.8427, "step": 3149 }, { "crossentropy": 2.8339701890945435, "epoch": 0.17129340112564234, "grad_norm": 0.05557500198483467, "grad_norm_var": 0.0014411004348693515, "learning_rate": 0.008758326587845791, "loss": 2.834, "step": 3150 }, { "crossentropy": 2.8473968505859375, "epoch": 0.17134777998314255, "grad_norm": 0.04453242942690849, "grad_norm_var": 0.0014380789074622772, "learning_rate": 0.008757505218485404, "loss": 2.8474, "step": 3151 }, { "crossentropy": 2.7303625345230103, "epoch": 0.17140215884064275, "grad_norm": 0.04247388616204262, "grad_norm_var": 0.001432242059023331, "learning_rate": 0.008756683616088046, "loss": 2.7304, "step": 3152 }, { "crossentropy": 2.7902896404266357, "epoch": 0.17145653769814295, "grad_norm": 0.04153361916542053, "grad_norm_var": 0.0014340363236693345, "learning_rate": 0.008755861780704673, "loss": 2.7903, "step": 3153 }, { "crossentropy": 2.8292704820632935, "epoch": 0.17151091655564316, "grad_norm": 0.03988690674304962, "grad_norm_var": 0.0014420953780991924, "learning_rate": 0.008755039712386255, "loss": 2.8293, "step": 3154 }, { "crossentropy": 2.732968330383301, "epoch": 0.17156529541314336, "grad_norm": 0.05913969874382019, "grad_norm_var": 0.0014297155945375852, "learning_rate": 0.008754217411183774, "loss": 2.733, "step": 3155 }, { "crossentropy": 2.708853602409363, "epoch": 0.17161967427064356, "grad_norm": 0.040396325290203094, "grad_norm_var": 0.0014358062439690108, "learning_rate": 0.008753394877148228, "loss": 2.7089, "step": 3156 }, { "crossentropy": 2.6554760932922363, "epoch": 0.17167405312814377, "grad_norm": 0.044099725782871246, "grad_norm_var": 0.0014303664165861167, "learning_rate": 0.008752572110330632, "loss": 2.6555, "step": 3157 }, { "crossentropy": 2.818962812423706, "epoch": 0.17172843198564397, "grad_norm": 0.04479685425758362, "grad_norm_var": 0.001424498775538094, "learning_rate": 0.008751749110782012, "loss": 2.819, "step": 3158 }, { "crossentropy": 2.744170069694519, "epoch": 0.17178281084314417, "grad_norm": 0.04147255793213844, "grad_norm_var": 0.001043003431496066, "learning_rate": 0.00875092587855341, "loss": 2.7442, "step": 3159 }, { "crossentropy": 2.730104088783264, "epoch": 0.17183718970064438, "grad_norm": 0.06500201672315598, "grad_norm_var": 0.0010418358756111089, "learning_rate": 0.008750102413695882, "loss": 2.7301, "step": 3160 }, { "crossentropy": 2.7133110761642456, "epoch": 0.17189156855814458, "grad_norm": 0.03934973478317261, "grad_norm_var": 0.001045825104280416, "learning_rate": 0.008749278716260498, "loss": 2.7133, "step": 3161 }, { "crossentropy": 2.7244986295700073, "epoch": 0.17194594741564478, "grad_norm": 0.03972063586115837, "grad_norm_var": 0.0010506056318368307, "learning_rate": 0.008748454786298345, "loss": 2.7245, "step": 3162 }, { "crossentropy": 2.7116185426712036, "epoch": 0.172000326273145, "grad_norm": 0.04234929382801056, "grad_norm_var": 0.0010472490141092414, "learning_rate": 0.008747630623860522, "loss": 2.7116, "step": 3163 }, { "crossentropy": 2.7704862356185913, "epoch": 0.1720547051306452, "grad_norm": 0.04760297015309334, "grad_norm_var": 5.773174376917095e-05, "learning_rate": 0.008746806228998137, "loss": 2.7705, "step": 3164 }, { "crossentropy": 2.8355191946029663, "epoch": 0.17210908398814542, "grad_norm": 0.0651005432009697, "grad_norm_var": 8.003503796351966e-05, "learning_rate": 0.008745981601762326, "loss": 2.8355, "step": 3165 }, { "crossentropy": 2.7978758811950684, "epoch": 0.17216346284564563, "grad_norm": 0.04211466759443283, "grad_norm_var": 7.608495434775388e-05, "learning_rate": 0.008745156742204227, "loss": 2.7979, "step": 3166 }, { "crossentropy": 2.7845232486724854, "epoch": 0.17221784170314583, "grad_norm": 0.04272174835205078, "grad_norm_var": 7.669806760105072e-05, "learning_rate": 0.008744331650375, "loss": 2.7845, "step": 3167 }, { "crossentropy": 2.767199397087097, "epoch": 0.17227222056064603, "grad_norm": 0.05212010443210602, "grad_norm_var": 7.783693477963107e-05, "learning_rate": 0.008743506326325813, "loss": 2.7672, "step": 3168 }, { "crossentropy": 2.8247156143188477, "epoch": 0.17232659941814624, "grad_norm": 0.04976334050297737, "grad_norm_var": 7.638668051815437e-05, "learning_rate": 0.008742680770107855, "loss": 2.8247, "step": 3169 }, { "crossentropy": 2.6691983938217163, "epoch": 0.17238097827564644, "grad_norm": 0.0522993728518486, "grad_norm_var": 7.386766149688819e-05, "learning_rate": 0.008741854981772325, "loss": 2.6692, "step": 3170 }, { "crossentropy": 2.692439913749695, "epoch": 0.17243535713314664, "grad_norm": 0.04706539958715439, "grad_norm_var": 6.505057096820865e-05, "learning_rate": 0.008741028961370438, "loss": 2.6924, "step": 3171 }, { "crossentropy": 2.7566475868225098, "epoch": 0.17248973599064685, "grad_norm": 0.044287458062171936, "grad_norm_var": 6.244187155156969e-05, "learning_rate": 0.008740202708953423, "loss": 2.7566, "step": 3172 }, { "crossentropy": 2.708035111427307, "epoch": 0.17254411484814705, "grad_norm": 0.045155368745326996, "grad_norm_var": 6.203409876308977e-05, "learning_rate": 0.00873937622457252, "loss": 2.708, "step": 3173 }, { "crossentropy": 2.74112069606781, "epoch": 0.17259849370564725, "grad_norm": 0.04672384634613991, "grad_norm_var": 6.1556847728957e-05, "learning_rate": 0.008738549508278992, "loss": 2.7411, "step": 3174 }, { "crossentropy": 2.7908798456192017, "epoch": 0.17265287256314746, "grad_norm": 0.048099584877491, "grad_norm_var": 5.881848122574196e-05, "learning_rate": 0.008737722560124108, "loss": 2.7909, "step": 3175 }, { "crossentropy": 2.6875858306884766, "epoch": 0.17270725142064766, "grad_norm": 0.047398339956998825, "grad_norm_var": 3.849670787960987e-05, "learning_rate": 0.008736895380159157, "loss": 2.6876, "step": 3176 }, { "crossentropy": 2.7808741331100464, "epoch": 0.17276163027814787, "grad_norm": 0.051915548741817474, "grad_norm_var": 3.556122400638114e-05, "learning_rate": 0.008736067968435438, "loss": 2.7809, "step": 3177 }, { "crossentropy": 2.7224714756011963, "epoch": 0.17281600913564807, "grad_norm": 0.0414651595056057, "grad_norm_var": 3.3877408011425265e-05, "learning_rate": 0.008735240325004265, "loss": 2.7225, "step": 3178 }, { "crossentropy": 2.6366524696350098, "epoch": 0.17287038799314827, "grad_norm": 0.05053909122943878, "grad_norm_var": 3.2023062824234186e-05, "learning_rate": 0.00873441244991697, "loss": 2.6367, "step": 3179 }, { "crossentropy": 2.7333176136016846, "epoch": 0.17292476685064848, "grad_norm": 0.0467069149017334, "grad_norm_var": 3.216826436681988e-05, "learning_rate": 0.008733584343224899, "loss": 2.7333, "step": 3180 }, { "crossentropy": 2.8504226207733154, "epoch": 0.17297914570814868, "grad_norm": 0.05225811526179314, "grad_norm_var": 1.3780690856291835e-05, "learning_rate": 0.008732756004979405, "loss": 2.8504, "step": 3181 }, { "crossentropy": 2.8634033203125, "epoch": 0.17303352456564888, "grad_norm": 0.05591043084859848, "grad_norm_var": 1.5697019513792077e-05, "learning_rate": 0.008731927435231865, "loss": 2.8634, "step": 3182 }, { "crossentropy": 2.693675994873047, "epoch": 0.1730879034231491, "grad_norm": 0.05160434916615486, "grad_norm_var": 1.390108017467658e-05, "learning_rate": 0.008731098634033664, "loss": 2.6937, "step": 3183 }, { "crossentropy": 2.7693159580230713, "epoch": 0.1731422822806493, "grad_norm": 0.04934987425804138, "grad_norm_var": 1.321238895851858e-05, "learning_rate": 0.008730269601436205, "loss": 2.7693, "step": 3184 }, { "crossentropy": 2.689110517501831, "epoch": 0.1731966611381495, "grad_norm": 0.04565440118312836, "grad_norm_var": 1.373099845038411e-05, "learning_rate": 0.008729440337490901, "loss": 2.6891, "step": 3185 }, { "crossentropy": 2.7325830459594727, "epoch": 0.1732510399956497, "grad_norm": 0.0485374741256237, "grad_norm_var": 1.2723359153137455e-05, "learning_rate": 0.008728610842249187, "loss": 2.7326, "step": 3186 }, { "crossentropy": 2.701395630836487, "epoch": 0.1733054188531499, "grad_norm": 0.04238930344581604, "grad_norm_var": 1.4854711478673223e-05, "learning_rate": 0.008727781115762502, "loss": 2.7014, "step": 3187 }, { "crossentropy": 2.773187518119812, "epoch": 0.1733597977106501, "grad_norm": 0.03833348676562309, "grad_norm_var": 2.001733630153235e-05, "learning_rate": 0.00872695115808231, "loss": 2.7732, "step": 3188 }, { "crossentropy": 2.6783828735351562, "epoch": 0.1734141765681503, "grad_norm": 0.03982454165816307, "grad_norm_var": 2.3550634819765003e-05, "learning_rate": 0.00872612096926008, "loss": 2.6784, "step": 3189 }, { "crossentropy": 2.682796597480774, "epoch": 0.1734685554256505, "grad_norm": 0.04012032970786095, "grad_norm_var": 2.6778394985566925e-05, "learning_rate": 0.008725290549347304, "loss": 2.6828, "step": 3190 }, { "crossentropy": 2.899787425994873, "epoch": 0.17352293428315071, "grad_norm": 0.04299648106098175, "grad_norm_var": 2.7577322724704986e-05, "learning_rate": 0.00872445989839548, "loss": 2.8998, "step": 3191 }, { "crossentropy": 2.718534231185913, "epoch": 0.17357731314065092, "grad_norm": 0.0440533384680748, "grad_norm_var": 2.790395971371349e-05, "learning_rate": 0.008723629016456125, "loss": 2.7185, "step": 3192 }, { "crossentropy": 2.750275492668152, "epoch": 0.17363169199815112, "grad_norm": 0.04346306249499321, "grad_norm_var": 2.6101023780986803e-05, "learning_rate": 0.008722797903580768, "loss": 2.7503, "step": 3193 }, { "crossentropy": 2.7440438270568848, "epoch": 0.17368607085565133, "grad_norm": 0.04305146634578705, "grad_norm_var": 2.5336073565463264e-05, "learning_rate": 0.008721966559820958, "loss": 2.744, "step": 3194 }, { "crossentropy": 2.698755979537964, "epoch": 0.17374044971315153, "grad_norm": 0.04389655590057373, "grad_norm_var": 2.4006803505826196e-05, "learning_rate": 0.008721134985228252, "loss": 2.6988, "step": 3195 }, { "crossentropy": 2.912325620651245, "epoch": 0.17379482857065173, "grad_norm": 0.04313698038458824, "grad_norm_var": 2.4233315733244305e-05, "learning_rate": 0.008720303179854224, "loss": 2.9123, "step": 3196 }, { "crossentropy": 2.8946956396102905, "epoch": 0.17384920742815194, "grad_norm": 0.04392244666814804, "grad_norm_var": 2.0827351913977643e-05, "learning_rate": 0.008719471143750459, "loss": 2.8947, "step": 3197 }, { "crossentropy": 2.6719603538513184, "epoch": 0.17390358628565214, "grad_norm": 0.03836440294981003, "grad_norm_var": 1.3995051201224664e-05, "learning_rate": 0.008718638876968563, "loss": 2.672, "step": 3198 }, { "crossentropy": 2.692308187484741, "epoch": 0.17395796514315234, "grad_norm": 0.0408523865044117, "grad_norm_var": 9.843774676091855e-06, "learning_rate": 0.00871780637956015, "loss": 2.6923, "step": 3199 }, { "crossentropy": 2.7841399908065796, "epoch": 0.17401234400065255, "grad_norm": 0.0452822744846344, "grad_norm_var": 7.432214835595361e-06, "learning_rate": 0.008716973651576851, "loss": 2.7841, "step": 3200 }, { "crossentropy": 2.680017828941345, "epoch": 0.17406672285815275, "grad_norm": 0.045721471309661865, "grad_norm_var": 7.458536793327787e-06, "learning_rate": 0.008716140693070314, "loss": 2.68, "step": 3201 }, { "crossentropy": 2.8678518533706665, "epoch": 0.17412110171565295, "grad_norm": 0.05141183361411095, "grad_norm_var": 1.0194239093907985e-05, "learning_rate": 0.008715307504092194, "loss": 2.8679, "step": 3202 }, { "crossentropy": 2.837776303291321, "epoch": 0.17417548057315316, "grad_norm": 0.05385710671544075, "grad_norm_var": 1.7592598518128878e-05, "learning_rate": 0.008714474084694166, "loss": 2.8378, "step": 3203 }, { "crossentropy": 2.7222801446914673, "epoch": 0.17422985943065336, "grad_norm": 0.05038640648126602, "grad_norm_var": 1.8139451647468326e-05, "learning_rate": 0.008713640434927918, "loss": 2.7223, "step": 3204 }, { "crossentropy": 2.7692506313323975, "epoch": 0.17428423828815356, "grad_norm": 0.050091393291950226, "grad_norm_var": 1.846910049277765e-05, "learning_rate": 0.008712806554845152, "loss": 2.7693, "step": 3205 }, { "crossentropy": 2.844081997871399, "epoch": 0.17433861714565377, "grad_norm": 0.04978378117084503, "grad_norm_var": 1.796927657805077e-05, "learning_rate": 0.008711972444497586, "loss": 2.8441, "step": 3206 }, { "crossentropy": 2.851728081703186, "epoch": 0.17439299600315397, "grad_norm": 0.04462132602930069, "grad_norm_var": 1.7561151355159563e-05, "learning_rate": 0.008711138103936951, "loss": 2.8517, "step": 3207 }, { "crossentropy": 2.8116642236709595, "epoch": 0.17444737486065418, "grad_norm": 0.04115358740091324, "grad_norm_var": 1.87401647741743e-05, "learning_rate": 0.008710303533214989, "loss": 2.8117, "step": 3208 }, { "crossentropy": 2.6653223037719727, "epoch": 0.17450175371815438, "grad_norm": 0.05716554820537567, "grad_norm_var": 2.6639780357910296e-05, "learning_rate": 0.008709468732383461, "loss": 2.6653, "step": 3209 }, { "crossentropy": 2.7425575256347656, "epoch": 0.17455613257565458, "grad_norm": 0.040598079562187195, "grad_norm_var": 2.811745331223377e-05, "learning_rate": 0.008708633701494141, "loss": 2.7426, "step": 3210 }, { "crossentropy": 2.698383927345276, "epoch": 0.17461051143315479, "grad_norm": 0.04168667644262314, "grad_norm_var": 2.9120642496876837e-05, "learning_rate": 0.008707798440598817, "loss": 2.6984, "step": 3211 }, { "crossentropy": 2.7249542474746704, "epoch": 0.174664890290655, "grad_norm": 0.04315703362226486, "grad_norm_var": 2.911267240176951e-05, "learning_rate": 0.008706962949749288, "loss": 2.725, "step": 3212 }, { "crossentropy": 2.598042130470276, "epoch": 0.1747192691481552, "grad_norm": 0.043981652706861496, "grad_norm_var": 2.9095476717077897e-05, "learning_rate": 0.008706127228997376, "loss": 2.598, "step": 3213 }, { "crossentropy": 2.696053981781006, "epoch": 0.1747736480056554, "grad_norm": 0.04201723635196686, "grad_norm_var": 2.614617102912516e-05, "learning_rate": 0.008705291278394907, "loss": 2.6961, "step": 3214 }, { "crossentropy": 2.7470935583114624, "epoch": 0.1748280268631556, "grad_norm": 0.04680412635207176, "grad_norm_var": 2.3989083892150588e-05, "learning_rate": 0.008704455097993726, "loss": 2.7471, "step": 3215 }, { "crossentropy": 2.6954089403152466, "epoch": 0.1748824057206558, "grad_norm": 0.043932102620601654, "grad_norm_var": 2.436408772495587e-05, "learning_rate": 0.008703618687845695, "loss": 2.6954, "step": 3216 }, { "crossentropy": 2.7196810245513916, "epoch": 0.174936784578156, "grad_norm": 0.04030017927289009, "grad_norm_var": 2.687078072510085e-05, "learning_rate": 0.008702782048002686, "loss": 2.7197, "step": 3217 }, { "crossentropy": 2.786706566810608, "epoch": 0.1749911634356562, "grad_norm": 0.04547487944364548, "grad_norm_var": 2.5034574222769965e-05, "learning_rate": 0.008701945178516586, "loss": 2.7867, "step": 3218 }, { "crossentropy": 2.7794690132141113, "epoch": 0.17504554229315641, "grad_norm": 0.043510496616363525, "grad_norm_var": 2.0800826209518784e-05, "learning_rate": 0.008701108079439298, "loss": 2.7795, "step": 3219 }, { "crossentropy": 2.65086829662323, "epoch": 0.17509992115065662, "grad_norm": 0.04144512116909027, "grad_norm_var": 1.972352365454394e-05, "learning_rate": 0.008700270750822737, "loss": 2.6509, "step": 3220 }, { "crossentropy": 2.74454665184021, "epoch": 0.17515430000815682, "grad_norm": 0.11417106539011002, "grad_norm_var": 0.00032214572755233644, "learning_rate": 0.008699433192718833, "loss": 2.7445, "step": 3221 }, { "crossentropy": 2.705790877342224, "epoch": 0.17520867886565702, "grad_norm": 0.03899236395955086, "grad_norm_var": 0.0003279189579455478, "learning_rate": 0.008698595405179532, "loss": 2.7058, "step": 3222 }, { "crossentropy": 2.7676457166671753, "epoch": 0.17526305772315723, "grad_norm": 0.04435420781373978, "grad_norm_var": 0.00032804600303126797, "learning_rate": 0.008697757388256793, "loss": 2.7676, "step": 3223 }, { "crossentropy": 2.743669867515564, "epoch": 0.17531743658065743, "grad_norm": 0.04932926222681999, "grad_norm_var": 0.0003247096874569352, "learning_rate": 0.008696919142002588, "loss": 2.7437, "step": 3224 }, { "crossentropy": 2.7626752853393555, "epoch": 0.17537181543815764, "grad_norm": 0.045455820858478546, "grad_norm_var": 0.00031983982805121596, "learning_rate": 0.008696080666468904, "loss": 2.7627, "step": 3225 }, { "crossentropy": 2.8721708059310913, "epoch": 0.17542619429565784, "grad_norm": 0.04308243840932846, "grad_norm_var": 0.0003178314653189133, "learning_rate": 0.008695241961707742, "loss": 2.8722, "step": 3226 }, { "crossentropy": 2.6746299266815186, "epoch": 0.17548057315315804, "grad_norm": 0.041588302701711655, "grad_norm_var": 0.00031791462854744524, "learning_rate": 0.00869440302777112, "loss": 2.6746, "step": 3227 }, { "crossentropy": 2.6331175565719604, "epoch": 0.17553495201065825, "grad_norm": 0.04252394661307335, "grad_norm_var": 0.00031834635117612, "learning_rate": 0.008693563864711066, "loss": 2.6331, "step": 3228 }, { "crossentropy": 2.568435549736023, "epoch": 0.17558933086815845, "grad_norm": 0.04069548100233078, "grad_norm_var": 0.00032075355539053126, "learning_rate": 0.008692724472579625, "loss": 2.5684, "step": 3229 }, { "crossentropy": 2.73647141456604, "epoch": 0.17564370972565865, "grad_norm": 0.04054943099617958, "grad_norm_var": 0.00032200620236783463, "learning_rate": 0.008691884851428855, "loss": 2.7365, "step": 3230 }, { "crossentropy": 2.6743791103363037, "epoch": 0.17569808858315886, "grad_norm": 0.04369087889790535, "grad_norm_var": 0.0003229581441856037, "learning_rate": 0.008691045001310828, "loss": 2.6744, "step": 3231 }, { "crossentropy": 2.769903063774109, "epoch": 0.17575246744065906, "grad_norm": 0.04359172657132149, "grad_norm_var": 0.0003231247445231046, "learning_rate": 0.008690204922277631, "loss": 2.7699, "step": 3232 }, { "crossentropy": 2.7764264345169067, "epoch": 0.17580684629815926, "grad_norm": 0.04145924746990204, "grad_norm_var": 0.00032210805122051355, "learning_rate": 0.008689364614381365, "loss": 2.7764, "step": 3233 }, { "crossentropy": 2.7496978044509888, "epoch": 0.17586122515565947, "grad_norm": 0.04034387320280075, "grad_norm_var": 0.00032513530838043273, "learning_rate": 0.008688524077674146, "loss": 2.7497, "step": 3234 }, { "crossentropy": 2.674251079559326, "epoch": 0.17591560401315967, "grad_norm": 0.03902030736207962, "grad_norm_var": 0.0003285887180788163, "learning_rate": 0.008687683312208103, "loss": 2.6743, "step": 3235 }, { "crossentropy": 2.7406927347183228, "epoch": 0.17596998287065987, "grad_norm": 0.042065463960170746, "grad_norm_var": 0.00032816213436990947, "learning_rate": 0.008686842318035378, "loss": 2.7407, "step": 3236 }, { "crossentropy": 2.615877866744995, "epoch": 0.17602436172816008, "grad_norm": 0.03890729323029518, "grad_norm_var": 7.447560376771152e-06, "learning_rate": 0.00868600109520813, "loss": 2.6159, "step": 3237 }, { "crossentropy": 2.6439679861068726, "epoch": 0.17607874058566028, "grad_norm": 0.0556090883910656, "grad_norm_var": 1.7535741512997072e-05, "learning_rate": 0.008685159643778528, "loss": 2.644, "step": 3238 }, { "crossentropy": 2.73510205745697, "epoch": 0.1761331194431605, "grad_norm": 0.039867039769887924, "grad_norm_var": 1.8143498697656354e-05, "learning_rate": 0.008684317963798765, "loss": 2.7351, "step": 3239 }, { "crossentropy": 2.786271333694458, "epoch": 0.17618749830066072, "grad_norm": 0.03900989145040512, "grad_norm_var": 1.6071600101395106e-05, "learning_rate": 0.008683476055321033, "loss": 2.7863, "step": 3240 }, { "crossentropy": 2.73294198513031, "epoch": 0.17624187715816092, "grad_norm": 0.0374281145632267, "grad_norm_var": 1.6765655141493843e-05, "learning_rate": 0.008682633918397553, "loss": 2.7329, "step": 3241 }, { "crossentropy": 2.7484281063079834, "epoch": 0.17629625601566112, "grad_norm": 0.0389423705637455, "grad_norm_var": 1.7150820104913643e-05, "learning_rate": 0.00868179155308055, "loss": 2.7484, "step": 3242 }, { "crossentropy": 2.732898473739624, "epoch": 0.17635063487316133, "grad_norm": 0.03838653489947319, "grad_norm_var": 1.7788315341465966e-05, "learning_rate": 0.008680948959422266, "loss": 2.7329, "step": 3243 }, { "crossentropy": 2.7053643465042114, "epoch": 0.17640501373066153, "grad_norm": 0.04125429317355156, "grad_norm_var": 1.769552424269368e-05, "learning_rate": 0.008680106137474958, "loss": 2.7054, "step": 3244 }, { "crossentropy": 2.7548282146453857, "epoch": 0.17645939258816173, "grad_norm": 0.04763982072472572, "grad_norm_var": 2.014856644308166e-05, "learning_rate": 0.008679263087290901, "loss": 2.7548, "step": 3245 }, { "crossentropy": 2.7777140140533447, "epoch": 0.17651377144566194, "grad_norm": 0.04637731984257698, "grad_norm_var": 2.1349824833388245e-05, "learning_rate": 0.008678419808922378, "loss": 2.7777, "step": 3246 }, { "crossentropy": 2.7579017877578735, "epoch": 0.17656815030316214, "grad_norm": 0.03843230381608009, "grad_norm_var": 2.1962383747681168e-05, "learning_rate": 0.008677576302421686, "loss": 2.7579, "step": 3247 }, { "crossentropy": 2.771591067314148, "epoch": 0.17662252916066234, "grad_norm": 0.03896394744515419, "grad_norm_var": 2.2177398360233246e-05, "learning_rate": 0.008676732567841142, "loss": 2.7716, "step": 3248 }, { "crossentropy": 2.7576522827148438, "epoch": 0.17667690801816255, "grad_norm": 0.040031496435403824, "grad_norm_var": 2.230907368817222e-05, "learning_rate": 0.008675888605233072, "loss": 2.7577, "step": 3249 }, { "crossentropy": 2.648131251335144, "epoch": 0.17673128687566275, "grad_norm": 0.038845743983983994, "grad_norm_var": 2.2658801426538436e-05, "learning_rate": 0.008675044414649816, "loss": 2.6481, "step": 3250 }, { "crossentropy": 2.7500752210617065, "epoch": 0.17678566573316296, "grad_norm": 0.03943705931305885, "grad_norm_var": 2.2543046932282927e-05, "learning_rate": 0.008674199996143732, "loss": 2.7501, "step": 3251 }, { "crossentropy": 2.6811031103134155, "epoch": 0.17684004459066316, "grad_norm": 0.0682893767952919, "grad_norm_var": 6.811343032035733e-05, "learning_rate": 0.00867335534976719, "loss": 2.6811, "step": 3252 }, { "crossentropy": 2.7224055528640747, "epoch": 0.17689442344816336, "grad_norm": 0.04105758294463158, "grad_norm_var": 6.723937643847132e-05, "learning_rate": 0.008672510475572574, "loss": 2.7224, "step": 3253 }, { "crossentropy": 2.759271264076233, "epoch": 0.17694880230566357, "grad_norm": 0.047910187393426895, "grad_norm_var": 5.810131521549929e-05, "learning_rate": 0.008671665373612282, "loss": 2.7593, "step": 3254 }, { "crossentropy": 2.72824490070343, "epoch": 0.17700318116316377, "grad_norm": 0.046954043209552765, "grad_norm_var": 5.864182201017146e-05, "learning_rate": 0.008670820043938727, "loss": 2.7282, "step": 3255 }, { "crossentropy": 2.715222716331482, "epoch": 0.17705756002066397, "grad_norm": 0.04411938413977623, "grad_norm_var": 5.7514300542813034e-05, "learning_rate": 0.008669974486604335, "loss": 2.7152, "step": 3256 }, { "crossentropy": 2.7656742334365845, "epoch": 0.17711193887816418, "grad_norm": 0.057969190180301666, "grad_norm_var": 6.758598719870523e-05, "learning_rate": 0.008669128701661546, "loss": 2.7657, "step": 3257 }, { "crossentropy": 2.6794513463974, "epoch": 0.17716631773566438, "grad_norm": 0.041860152035951614, "grad_norm_var": 6.589247364735349e-05, "learning_rate": 0.008668282689162816, "loss": 2.6795, "step": 3258 }, { "crossentropy": 2.565923810005188, "epoch": 0.17722069659316458, "grad_norm": 0.04268624633550644, "grad_norm_var": 6.334503643418791e-05, "learning_rate": 0.008667436449160616, "loss": 2.5659, "step": 3259 }, { "crossentropy": 2.7327373027801514, "epoch": 0.1772750754506648, "grad_norm": 0.042818646878004074, "grad_norm_var": 6.269287296347218e-05, "learning_rate": 0.008666589981707424, "loss": 2.7327, "step": 3260 }, { "crossentropy": 2.726635217666626, "epoch": 0.177329454308165, "grad_norm": 0.04297766089439392, "grad_norm_var": 6.254219073803009e-05, "learning_rate": 0.00866574328685574, "loss": 2.7266, "step": 3261 }, { "crossentropy": 2.711059331893921, "epoch": 0.1773838331656652, "grad_norm": 0.041161030530929565, "grad_norm_var": 6.322967132701025e-05, "learning_rate": 0.008664896364658075, "loss": 2.7111, "step": 3262 }, { "crossentropy": 2.6405892372131348, "epoch": 0.1774382120231654, "grad_norm": 0.043023355305194855, "grad_norm_var": 6.077482442281007e-05, "learning_rate": 0.008664049215166955, "loss": 2.6406, "step": 3263 }, { "crossentropy": 2.8223694562911987, "epoch": 0.1774925908806656, "grad_norm": 0.04262879490852356, "grad_norm_var": 5.872264456547293e-05, "learning_rate": 0.008663201838434918, "loss": 2.8224, "step": 3264 }, { "crossentropy": 2.730622172355652, "epoch": 0.1775469697381658, "grad_norm": 0.04319348186254501, "grad_norm_var": 5.7206179605382285e-05, "learning_rate": 0.008662354234514518, "loss": 2.7306, "step": 3265 }, { "crossentropy": 2.7502493858337402, "epoch": 0.177601348595666, "grad_norm": 0.04267933592200279, "grad_norm_var": 5.4821426862298034e-05, "learning_rate": 0.008661506403458323, "loss": 2.7502, "step": 3266 }, { "crossentropy": 2.7175391912460327, "epoch": 0.1776557274531662, "grad_norm": 0.04006906971335411, "grad_norm_var": 5.433144762395919e-05, "learning_rate": 0.008660658345318914, "loss": 2.7175, "step": 3267 }, { "crossentropy": 2.7654958963394165, "epoch": 0.17771010631066642, "grad_norm": 0.041161566972732544, "grad_norm_var": 1.821214422380307e-05, "learning_rate": 0.008659810060148888, "loss": 2.7655, "step": 3268 }, { "crossentropy": 2.7051130533218384, "epoch": 0.17776448516816662, "grad_norm": 0.05282469838857651, "grad_norm_var": 2.2419374733591522e-05, "learning_rate": 0.008658961548000855, "loss": 2.7051, "step": 3269 }, { "crossentropy": 2.7677654027938843, "epoch": 0.17781886402566682, "grad_norm": 0.04299526661634445, "grad_norm_var": 2.17778035923446e-05, "learning_rate": 0.008658112808927436, "loss": 2.7678, "step": 3270 }, { "crossentropy": 2.747841000556946, "epoch": 0.17787324288316703, "grad_norm": 0.03871549665927887, "grad_norm_var": 2.3126613742245558e-05, "learning_rate": 0.008657263842981272, "loss": 2.7478, "step": 3271 }, { "crossentropy": 2.6513813734054565, "epoch": 0.17792762174066723, "grad_norm": 0.04021986201405525, "grad_norm_var": 2.391365582181433e-05, "learning_rate": 0.008656414650215015, "loss": 2.6514, "step": 3272 }, { "crossentropy": 2.682365298271179, "epoch": 0.17798200059816743, "grad_norm": 0.04054149240255356, "grad_norm_var": 9.41737605779826e-06, "learning_rate": 0.008655565230681329, "loss": 2.6824, "step": 3273 }, { "crossentropy": 2.8159507513046265, "epoch": 0.17803637945566764, "grad_norm": 0.03906702250242233, "grad_norm_var": 1.013293386276463e-05, "learning_rate": 0.008654715584432895, "loss": 2.816, "step": 3274 }, { "crossentropy": 2.586790919303894, "epoch": 0.17809075831316784, "grad_norm": 0.041265882551670074, "grad_norm_var": 1.0185437786598233e-05, "learning_rate": 0.008653865711522409, "loss": 2.5868, "step": 3275 }, { "crossentropy": 2.7193191051483154, "epoch": 0.17814513717066804, "grad_norm": 0.041588425636291504, "grad_norm_var": 1.0180014288462397e-05, "learning_rate": 0.008653015612002577, "loss": 2.7193, "step": 3276 }, { "crossentropy": 2.7381895780563354, "epoch": 0.17819951602816825, "grad_norm": 0.04114305600523949, "grad_norm_var": 1.0183521521574719e-05, "learning_rate": 0.008652165285926123, "loss": 2.7382, "step": 3277 }, { "crossentropy": 2.788026452064514, "epoch": 0.17825389488566845, "grad_norm": 0.03977711871266365, "grad_norm_var": 1.0461234429077979e-05, "learning_rate": 0.008651314733345784, "loss": 2.788, "step": 3278 }, { "crossentropy": 2.735785126686096, "epoch": 0.17830827374316865, "grad_norm": 0.042844124138355255, "grad_norm_var": 1.0437134521029158e-05, "learning_rate": 0.008650463954314309, "loss": 2.7358, "step": 3279 }, { "crossentropy": 2.733507752418518, "epoch": 0.17836265260066886, "grad_norm": 0.040325380861759186, "grad_norm_var": 1.05509535381779e-05, "learning_rate": 0.008649612948884462, "loss": 2.7335, "step": 3280 }, { "crossentropy": 2.759605050086975, "epoch": 0.17841703145816906, "grad_norm": 0.04079141840338707, "grad_norm_var": 1.0457494195287298e-05, "learning_rate": 0.008648761717109021, "loss": 2.7596, "step": 3281 }, { "crossentropy": 2.6176745891571045, "epoch": 0.17847141031566927, "grad_norm": 0.03887065872550011, "grad_norm_var": 1.0828996432378964e-05, "learning_rate": 0.008647910259040784, "loss": 2.6177, "step": 3282 }, { "crossentropy": 2.721795916557312, "epoch": 0.17852578917316947, "grad_norm": 0.04046151787042618, "grad_norm_var": 1.0769631899453825e-05, "learning_rate": 0.00864705857473255, "loss": 2.7218, "step": 3283 }, { "crossentropy": 2.70996356010437, "epoch": 0.17858016803066967, "grad_norm": 0.03828706592321396, "grad_norm_var": 1.1382060511586946e-05, "learning_rate": 0.008646206664237144, "loss": 2.71, "step": 3284 }, { "crossentropy": 2.7023680210113525, "epoch": 0.17863454688816988, "grad_norm": 0.04455343261361122, "grad_norm_var": 2.8735339263591396e-06, "learning_rate": 0.008645354527607401, "loss": 2.7024, "step": 3285 }, { "crossentropy": 2.705723285675049, "epoch": 0.17868892574567008, "grad_norm": 0.0375235415995121, "grad_norm_var": 3.0815002140600786e-06, "learning_rate": 0.008644502164896168, "loss": 2.7057, "step": 3286 }, { "crossentropy": 2.7644894123077393, "epoch": 0.17874330460317028, "grad_norm": 0.03815450519323349, "grad_norm_var": 3.2251840881643175e-06, "learning_rate": 0.00864364957615631, "loss": 2.7645, "step": 3287 }, { "crossentropy": 2.725139856338501, "epoch": 0.1787976834606705, "grad_norm": 0.040460195392370224, "grad_norm_var": 3.2249954012440626e-06, "learning_rate": 0.008642796761440702, "loss": 2.7251, "step": 3288 }, { "crossentropy": 2.7099764347076416, "epoch": 0.1788520623181707, "grad_norm": 0.04145699366927147, "grad_norm_var": 3.30033581056229e-06, "learning_rate": 0.008641943720802235, "loss": 2.71, "step": 3289 }, { "crossentropy": 2.7573658227920532, "epoch": 0.1789064411756709, "grad_norm": 0.04432098940014839, "grad_norm_var": 4.084349037146382e-06, "learning_rate": 0.008641090454293811, "loss": 2.7574, "step": 3290 }, { "crossentropy": 2.8604401350021362, "epoch": 0.1789608200331711, "grad_norm": 0.045114245265722275, "grad_norm_var": 5.280309030699278e-06, "learning_rate": 0.008640236961968355, "loss": 2.8604, "step": 3291 }, { "crossentropy": 2.694216012954712, "epoch": 0.1790151988906713, "grad_norm": 0.04255514591932297, "grad_norm_var": 5.41720099632136e-06, "learning_rate": 0.008639383243878794, "loss": 2.6942, "step": 3292 }, { "crossentropy": 2.7029409408569336, "epoch": 0.1790695777481715, "grad_norm": 0.037593547254800797, "grad_norm_var": 6.15584812469853e-06, "learning_rate": 0.00863852930007808, "loss": 2.7029, "step": 3293 }, { "crossentropy": 2.6954739093780518, "epoch": 0.1791239566056717, "grad_norm": 0.038209736347198486, "grad_norm_var": 6.52694352339152e-06, "learning_rate": 0.00863767513061917, "loss": 2.6955, "step": 3294 }, { "crossentropy": 2.6834832429885864, "epoch": 0.1791783354631719, "grad_norm": 0.03902055695652962, "grad_norm_var": 6.357854747608924e-06, "learning_rate": 0.008636820735555037, "loss": 2.6835, "step": 3295 }, { "crossentropy": 2.8095433712005615, "epoch": 0.17923271432067212, "grad_norm": 0.04040829464793205, "grad_norm_var": 6.356561994401838e-06, "learning_rate": 0.008635966114938677, "loss": 2.8095, "step": 3296 }, { "crossentropy": 2.6524786949157715, "epoch": 0.17928709317817232, "grad_norm": 0.038215406239032745, "grad_norm_var": 6.666525852190806e-06, "learning_rate": 0.008635111268823084, "loss": 2.6525, "step": 3297 }, { "crossentropy": 2.652108907699585, "epoch": 0.17934147203567252, "grad_norm": 0.04065512865781784, "grad_norm_var": 6.519429483769062e-06, "learning_rate": 0.008634256197261282, "loss": 2.6521, "step": 3298 }, { "crossentropy": 2.7980971336364746, "epoch": 0.17939585089317273, "grad_norm": 0.041536130011081696, "grad_norm_var": 6.595132094488048e-06, "learning_rate": 0.0086334009003063, "loss": 2.7981, "step": 3299 }, { "crossentropy": 2.780650019645691, "epoch": 0.17945022975067293, "grad_norm": 0.041616469621658325, "grad_norm_var": 6.3037722983720505e-06, "learning_rate": 0.008632545378011178, "loss": 2.7807, "step": 3300 }, { "crossentropy": 2.6755166053771973, "epoch": 0.17950460860817313, "grad_norm": 0.17668792605400085, "grad_norm_var": 0.001165199591659449, "learning_rate": 0.00863168963042898, "loss": 2.6755, "step": 3301 }, { "crossentropy": 2.7097893953323364, "epoch": 0.17955898746567334, "grad_norm": 0.04468432813882828, "grad_norm_var": 0.0011574751170984002, "learning_rate": 0.008630833657612776, "loss": 2.7098, "step": 3302 }, { "crossentropy": 2.6269110441207886, "epoch": 0.17961336632317354, "grad_norm": 0.04347574710845947, "grad_norm_var": 0.001151253334749427, "learning_rate": 0.008629977459615655, "loss": 2.6269, "step": 3303 }, { "crossentropy": 2.729421615600586, "epoch": 0.17966774518067374, "grad_norm": 0.055214740335941315, "grad_norm_var": 0.0011465824605054501, "learning_rate": 0.008629121036490715, "loss": 2.7294, "step": 3304 }, { "crossentropy": 2.7698930501937866, "epoch": 0.17972212403817395, "grad_norm": 0.04263220354914665, "grad_norm_var": 0.0011452247071080622, "learning_rate": 0.008628264388291071, "loss": 2.7699, "step": 3305 }, { "crossentropy": 2.7643003463745117, "epoch": 0.17977650289567415, "grad_norm": 0.05802119895815849, "grad_norm_var": 0.0011452186340997, "learning_rate": 0.008627407515069855, "loss": 2.7643, "step": 3306 }, { "crossentropy": 2.760649085044861, "epoch": 0.17983088175317435, "grad_norm": 0.04218151792883873, "grad_norm_var": 0.0011482933136474321, "learning_rate": 0.008626550416880206, "loss": 2.7606, "step": 3307 }, { "crossentropy": 2.634426236152649, "epoch": 0.17988526061067456, "grad_norm": 0.03997151181101799, "grad_norm_var": 0.0011517640600319577, "learning_rate": 0.00862569309377528, "loss": 2.6344, "step": 3308 }, { "crossentropy": 2.7056161165237427, "epoch": 0.17993963946817476, "grad_norm": 0.04132092371582985, "grad_norm_var": 0.0011458414960181028, "learning_rate": 0.00862483554580825, "loss": 2.7056, "step": 3309 }, { "crossentropy": 2.72312331199646, "epoch": 0.17999401832567496, "grad_norm": 0.039781857281923294, "grad_norm_var": 0.0011432120565267462, "learning_rate": 0.008623977773032296, "loss": 2.7231, "step": 3310 }, { "crossentropy": 2.775909900665283, "epoch": 0.18004839718317517, "grad_norm": 0.04476796090602875, "grad_norm_var": 0.0011356451442200845, "learning_rate": 0.008623119775500622, "loss": 2.7759, "step": 3311 }, { "crossentropy": 2.7479344606399536, "epoch": 0.18010277604067537, "grad_norm": 0.050849799066782, "grad_norm_var": 0.001126393331984075, "learning_rate": 0.008622261553266437, "loss": 2.7479, "step": 3312 }, { "crossentropy": 2.6866058111190796, "epoch": 0.18015715489817558, "grad_norm": 0.0395498126745224, "grad_norm_var": 0.0011239451598668, "learning_rate": 0.008621403106382968, "loss": 2.6866, "step": 3313 }, { "crossentropy": 2.7008801698684692, "epoch": 0.1802115337556758, "grad_norm": 0.04584217071533203, "grad_norm_var": 0.0011173073720903124, "learning_rate": 0.008620544434903453, "loss": 2.7009, "step": 3314 }, { "crossentropy": 2.6897109746932983, "epoch": 0.180265912613176, "grad_norm": 0.044306445866823196, "grad_norm_var": 0.001113549465208974, "learning_rate": 0.008619685538881148, "loss": 2.6897, "step": 3315 }, { "crossentropy": 2.700515627861023, "epoch": 0.18032029147067621, "grad_norm": 0.04385253041982651, "grad_norm_var": 0.001110413936958267, "learning_rate": 0.00861882641836932, "loss": 2.7005, "step": 3316 }, { "crossentropy": 2.7214903831481934, "epoch": 0.18037467032817642, "grad_norm": 0.03659338504076004, "grad_norm_var": 3.267003638471877e-05, "learning_rate": 0.00861796707342125, "loss": 2.7215, "step": 3317 }, { "crossentropy": 2.81954562664032, "epoch": 0.18042904918567662, "grad_norm": 0.039429131895303726, "grad_norm_var": 3.431276059251785e-05, "learning_rate": 0.008617107504090239, "loss": 2.8195, "step": 3318 }, { "crossentropy": 2.709984540939331, "epoch": 0.18048342804317682, "grad_norm": 0.043549586087465286, "grad_norm_var": 3.430560732377852e-05, "learning_rate": 0.00861624771042959, "loss": 2.71, "step": 3319 }, { "crossentropy": 2.700837731361389, "epoch": 0.18053780690067703, "grad_norm": 0.04164779186248779, "grad_norm_var": 2.5959790935540376e-05, "learning_rate": 0.008615387692492631, "loss": 2.7008, "step": 3320 }, { "crossentropy": 2.754199743270874, "epoch": 0.18059218575817723, "grad_norm": 0.038656413555145264, "grad_norm_var": 2.7351350430805145e-05, "learning_rate": 0.008614527450332699, "loss": 2.7542, "step": 3321 }, { "crossentropy": 2.6859521865844727, "epoch": 0.18064656461567744, "grad_norm": 0.04415612295269966, "grad_norm_var": 1.186532277417749e-05, "learning_rate": 0.008613666984003142, "loss": 2.686, "step": 3322 }, { "crossentropy": 2.732832670211792, "epoch": 0.18070094347317764, "grad_norm": 0.040956851094961166, "grad_norm_var": 1.1974906739865322e-05, "learning_rate": 0.008612806293557328, "loss": 2.7328, "step": 3323 }, { "crossentropy": 2.6691800355911255, "epoch": 0.18075532233067784, "grad_norm": 0.04037418216466904, "grad_norm_var": 1.186528618445278e-05, "learning_rate": 0.008611945379048636, "loss": 2.6692, "step": 3324 }, { "crossentropy": 2.7928818464279175, "epoch": 0.18080970118817805, "grad_norm": 0.03896124288439751, "grad_norm_var": 1.2498423791059098e-05, "learning_rate": 0.00861108424053046, "loss": 2.7929, "step": 3325 }, { "crossentropy": 2.6971380710601807, "epoch": 0.18086408004567825, "grad_norm": 0.045372482389211655, "grad_norm_var": 1.273901263160182e-05, "learning_rate": 0.008610222878056205, "loss": 2.6971, "step": 3326 }, { "crossentropy": 2.796499252319336, "epoch": 0.18091845890317845, "grad_norm": 0.04502742365002632, "grad_norm_var": 1.2824132488593355e-05, "learning_rate": 0.008609361291679295, "loss": 2.7965, "step": 3327 }, { "crossentropy": 2.686182975769043, "epoch": 0.18097283776067866, "grad_norm": 0.04267954081296921, "grad_norm_var": 7.840651068576298e-06, "learning_rate": 0.008608499481453159, "loss": 2.6862, "step": 3328 }, { "crossentropy": 2.58586585521698, "epoch": 0.18102721661817886, "grad_norm": 0.044033318758010864, "grad_norm_var": 7.671330876681214e-06, "learning_rate": 0.008607637447431251, "loss": 2.5859, "step": 3329 }, { "crossentropy": 2.7334611415863037, "epoch": 0.18108159547567906, "grad_norm": 0.04130932688713074, "grad_norm_var": 6.763259375715489e-06, "learning_rate": 0.008606775189667033, "loss": 2.7335, "step": 3330 }, { "crossentropy": 2.544481039047241, "epoch": 0.18113597433317927, "grad_norm": 0.12724781036376953, "grad_norm_var": 0.0004629805699329224, "learning_rate": 0.008605912708213979, "loss": 2.5445, "step": 3331 }, { "crossentropy": 2.73301362991333, "epoch": 0.18119035319067947, "grad_norm": 0.0404362790286541, "grad_norm_var": 0.00046519625207009523, "learning_rate": 0.008605050003125581, "loss": 2.733, "step": 3332 }, { "crossentropy": 2.6585158109664917, "epoch": 0.18124473204817967, "grad_norm": 0.04478823393583298, "grad_norm_var": 0.0004581298774328888, "learning_rate": 0.008604187074455342, "loss": 2.6585, "step": 3333 }, { "crossentropy": 2.774744987487793, "epoch": 0.18129911090567988, "grad_norm": 0.04354650154709816, "grad_norm_var": 0.000454805809777292, "learning_rate": 0.008603323922256783, "loss": 2.7747, "step": 3334 }, { "crossentropy": 2.755731463432312, "epoch": 0.18135348976318008, "grad_norm": 0.042141884565353394, "grad_norm_var": 0.00045570330738503025, "learning_rate": 0.008602460546583432, "loss": 2.7557, "step": 3335 }, { "crossentropy": 2.7230221033096313, "epoch": 0.18140786862068028, "grad_norm": 0.04369397088885307, "grad_norm_var": 0.000454345592677551, "learning_rate": 0.008601596947488837, "loss": 2.723, "step": 3336 }, { "crossentropy": 2.7680519819259644, "epoch": 0.1814622474781805, "grad_norm": 0.04794640466570854, "grad_norm_var": 0.0004485235525182919, "learning_rate": 0.008600733125026556, "loss": 2.7681, "step": 3337 }, { "crossentropy": 2.780203342437744, "epoch": 0.1815166263356807, "grad_norm": 0.04441630095243454, "grad_norm_var": 0.0004483843089943059, "learning_rate": 0.008599869079250165, "loss": 2.7802, "step": 3338 }, { "crossentropy": 2.704013466835022, "epoch": 0.1815710051931809, "grad_norm": 0.044417548924684525, "grad_norm_var": 0.00044574071354872545, "learning_rate": 0.00859900481021325, "loss": 2.704, "step": 3339 }, { "crossentropy": 2.68363094329834, "epoch": 0.1816253840506811, "grad_norm": 0.04332165792584419, "grad_norm_var": 0.00044308062970271477, "learning_rate": 0.00859814031796941, "loss": 2.6836, "step": 3340 }, { "crossentropy": 2.7919331789016724, "epoch": 0.1816797629081813, "grad_norm": 0.04251812770962715, "grad_norm_var": 0.0004392485781661505, "learning_rate": 0.008597275602572264, "loss": 2.7919, "step": 3341 }, { "crossentropy": 2.7399120330810547, "epoch": 0.1817341417656815, "grad_norm": 0.042064934968948364, "grad_norm_var": 0.00044150167121181173, "learning_rate": 0.008596410664075438, "loss": 2.7399, "step": 3342 }, { "crossentropy": 2.76504385471344, "epoch": 0.1817885206231817, "grad_norm": 0.04236500337719917, "grad_norm_var": 0.000443257063453865, "learning_rate": 0.008595545502532576, "loss": 2.765, "step": 3343 }, { "crossentropy": 2.618406891822815, "epoch": 0.1818428994806819, "grad_norm": 0.04030189663171768, "grad_norm_var": 0.00044547394963678305, "learning_rate": 0.008594680117997333, "loss": 2.6184, "step": 3344 }, { "crossentropy": 2.777595043182373, "epoch": 0.18189727833818212, "grad_norm": 0.04196188971400261, "grad_norm_var": 0.00044695073732259886, "learning_rate": 0.008593814510523382, "loss": 2.7776, "step": 3345 }, { "crossentropy": 2.8155828714370728, "epoch": 0.18195165719568232, "grad_norm": 0.04516883194446564, "grad_norm_var": 0.00044429468209913745, "learning_rate": 0.008592948680164405, "loss": 2.8156, "step": 3346 }, { "crossentropy": 2.67386794090271, "epoch": 0.18200603605318252, "grad_norm": 0.039676081389188766, "grad_norm_var": 4.363709643787767e-06, "learning_rate": 0.0085920826269741, "loss": 2.6739, "step": 3347 }, { "crossentropy": 2.731380343437195, "epoch": 0.18206041491068273, "grad_norm": 0.04060404375195503, "grad_norm_var": 4.307051514712556e-06, "learning_rate": 0.008591216351006181, "loss": 2.7314, "step": 3348 }, { "crossentropy": 2.8280035257339478, "epoch": 0.18211479376818293, "grad_norm": 0.042098406702280045, "grad_norm_var": 4.13883138209118e-06, "learning_rate": 0.008590349852314369, "loss": 2.828, "step": 3349 }, { "crossentropy": 2.6145933866500854, "epoch": 0.18216917262568313, "grad_norm": 0.04231303930282593, "grad_norm_var": 4.125987213700564e-06, "learning_rate": 0.008589483130952409, "loss": 2.6146, "step": 3350 }, { "crossentropy": 2.7702860832214355, "epoch": 0.18222355148318334, "grad_norm": 0.04449602589011192, "grad_norm_var": 4.261667944265429e-06, "learning_rate": 0.008588616186974053, "loss": 2.7703, "step": 3351 }, { "crossentropy": 2.7204359769821167, "epoch": 0.18227793034068354, "grad_norm": 0.03831987455487251, "grad_norm_var": 5.540987386401924e-06, "learning_rate": 0.008587749020433064, "loss": 2.7204, "step": 3352 }, { "crossentropy": 2.6574435234069824, "epoch": 0.18233230919818375, "grad_norm": 0.04031279310584068, "grad_norm_var": 3.7661523712863185e-06, "learning_rate": 0.008586881631383226, "loss": 2.6574, "step": 3353 }, { "crossentropy": 2.6400954723358154, "epoch": 0.18238668805568395, "grad_norm": 0.0414699986577034, "grad_norm_var": 3.417332493349928e-06, "learning_rate": 0.008586014019878333, "loss": 2.6401, "step": 3354 }, { "crossentropy": 2.7722352743148804, "epoch": 0.18244106691318415, "grad_norm": 0.03996247053146362, "grad_norm_var": 3.199867483320157e-06, "learning_rate": 0.008585146185972194, "loss": 2.7722, "step": 3355 }, { "crossentropy": 2.6424704790115356, "epoch": 0.18249544577068436, "grad_norm": 0.0410926379263401, "grad_norm_var": 3.023890057148269e-06, "learning_rate": 0.00858427812971863, "loss": 2.6425, "step": 3356 }, { "crossentropy": 2.738465428352356, "epoch": 0.18254982462818456, "grad_norm": 0.04021532088518143, "grad_norm_var": 3.056648712156512e-06, "learning_rate": 0.008583409851171479, "loss": 2.7385, "step": 3357 }, { "crossentropy": 2.723001480102539, "epoch": 0.18260420348568476, "grad_norm": 0.043824370950460434, "grad_norm_var": 3.4057718349779497e-06, "learning_rate": 0.008582541350384592, "loss": 2.723, "step": 3358 }, { "crossentropy": 2.6949801445007324, "epoch": 0.18265858234318497, "grad_norm": 0.03798355534672737, "grad_norm_var": 4.106931002062266e-06, "learning_rate": 0.008581672627411828, "loss": 2.695, "step": 3359 }, { "crossentropy": 2.76446795463562, "epoch": 0.18271296120068517, "grad_norm": 0.040676604956388474, "grad_norm_var": 4.068958751331636e-06, "learning_rate": 0.008580803682307068, "loss": 2.7645, "step": 3360 }, { "crossentropy": 2.789855718612671, "epoch": 0.18276734005818537, "grad_norm": 0.043575916439294815, "grad_norm_var": 4.382611088748524e-06, "learning_rate": 0.008579934515124201, "loss": 2.7899, "step": 3361 }, { "crossentropy": 2.6514360904693604, "epoch": 0.18282171891568558, "grad_norm": 0.03722506761550903, "grad_norm_var": 4.294362069202397e-06, "learning_rate": 0.008579065125917133, "loss": 2.6514, "step": 3362 }, { "crossentropy": 2.687358260154724, "epoch": 0.18287609777318578, "grad_norm": 0.04725240170955658, "grad_norm_var": 6.680492372930733e-06, "learning_rate": 0.008578195514739784, "loss": 2.6874, "step": 3363 }, { "crossentropy": 2.771495819091797, "epoch": 0.18293047663068598, "grad_norm": 0.03842005506157875, "grad_norm_var": 7.192596652246206e-06, "learning_rate": 0.008577325681646084, "loss": 2.7715, "step": 3364 }, { "crossentropy": 2.6800029277801514, "epoch": 0.1829848554881862, "grad_norm": 0.03829610347747803, "grad_norm_var": 7.64194355026785e-06, "learning_rate": 0.00857645562668998, "loss": 2.68, "step": 3365 }, { "crossentropy": 2.658851981163025, "epoch": 0.1830392343456864, "grad_norm": 0.041605737060308456, "grad_norm_var": 7.5460591496184785e-06, "learning_rate": 0.008575585349925433, "loss": 2.6589, "step": 3366 }, { "crossentropy": 2.803844451904297, "epoch": 0.1830936132031866, "grad_norm": 0.04073032736778259, "grad_norm_var": 6.637121791544e-06, "learning_rate": 0.008574714851406415, "loss": 2.8038, "step": 3367 }, { "crossentropy": 2.728574514389038, "epoch": 0.1831479920606868, "grad_norm": 0.04189661890268326, "grad_norm_var": 6.308667564944997e-06, "learning_rate": 0.008573844131186916, "loss": 2.7286, "step": 3368 }, { "crossentropy": 2.653040647506714, "epoch": 0.183202370918187, "grad_norm": 0.058849036693573, "grad_norm_var": 2.6310284097294003e-05, "learning_rate": 0.008572973189320935, "loss": 2.653, "step": 3369 }, { "crossentropy": 2.6724050045013428, "epoch": 0.1832567497756872, "grad_norm": 0.040889814496040344, "grad_norm_var": 2.6377525634701562e-05, "learning_rate": 0.008572102025862489, "loss": 2.6724, "step": 3370 }, { "crossentropy": 2.765295386314392, "epoch": 0.1833111286331874, "grad_norm": 0.047138068825006485, "grad_norm_var": 2.7616540753612703e-05, "learning_rate": 0.008571230640865605, "loss": 2.7653, "step": 3371 }, { "crossentropy": 2.7876049280166626, "epoch": 0.1833655074906876, "grad_norm": 0.050711050629615784, "grad_norm_var": 3.162009798698308e-05, "learning_rate": 0.008570359034384325, "loss": 2.7876, "step": 3372 }, { "crossentropy": 2.733212947845459, "epoch": 0.18341988634818782, "grad_norm": 0.043791428208351135, "grad_norm_var": 3.1053162454934497e-05, "learning_rate": 0.008569487206472708, "loss": 2.7332, "step": 3373 }, { "crossentropy": 2.585910439491272, "epoch": 0.18347426520568802, "grad_norm": 0.043516855686903, "grad_norm_var": 3.1037742068754035e-05, "learning_rate": 0.00856861515718482, "loss": 2.5859, "step": 3374 }, { "crossentropy": 2.6434324979782104, "epoch": 0.18352864406318822, "grad_norm": 0.03941316530108452, "grad_norm_var": 3.0154961707170615e-05, "learning_rate": 0.008567742886574749, "loss": 2.6434, "step": 3375 }, { "crossentropy": 2.741678833961487, "epoch": 0.18358302292068843, "grad_norm": 0.039276547729969025, "grad_norm_var": 3.078105565883227e-05, "learning_rate": 0.00856687039469659, "loss": 2.7417, "step": 3376 }, { "crossentropy": 2.551367163658142, "epoch": 0.18363740177818863, "grad_norm": 0.0389200784265995, "grad_norm_var": 3.195635667753253e-05, "learning_rate": 0.008565997681604456, "loss": 2.5514, "step": 3377 }, { "crossentropy": 2.7550108432769775, "epoch": 0.18369178063568883, "grad_norm": 0.04028186574578285, "grad_norm_var": 3.0188373687801914e-05, "learning_rate": 0.008565124747352468, "loss": 2.755, "step": 3378 }, { "crossentropy": 2.7731211185455322, "epoch": 0.18374615949318904, "grad_norm": 0.04621802642941475, "grad_norm_var": 2.96945331267319e-05, "learning_rate": 0.008564251591994768, "loss": 2.7731, "step": 3379 }, { "crossentropy": 2.764640212059021, "epoch": 0.18380053835068924, "grad_norm": 0.05286400765180588, "grad_norm_var": 3.3678144602227766e-05, "learning_rate": 0.008563378215585506, "loss": 2.7646, "step": 3380 }, { "crossentropy": 2.791471481323242, "epoch": 0.18385491720818944, "grad_norm": 0.04571562632918358, "grad_norm_var": 3.145138160008364e-05, "learning_rate": 0.008562504618178852, "loss": 2.7915, "step": 3381 }, { "crossentropy": 2.668624758720398, "epoch": 0.18390929606568965, "grad_norm": 0.03881998732686043, "grad_norm_var": 3.300721320423546e-05, "learning_rate": 0.008561630799828981, "loss": 2.6686, "step": 3382 }, { "crossentropy": 2.694709897041321, "epoch": 0.18396367492318985, "grad_norm": 0.04199123755097389, "grad_norm_var": 3.250400032627678e-05, "learning_rate": 0.008560756760590088, "loss": 2.6947, "step": 3383 }, { "crossentropy": 2.7773300409317017, "epoch": 0.18401805378069006, "grad_norm": 0.04175671562552452, "grad_norm_var": 3.255179686797611e-05, "learning_rate": 0.008559882500516381, "loss": 2.7773, "step": 3384 }, { "crossentropy": 2.7760958671569824, "epoch": 0.18407243263819026, "grad_norm": 0.03944218531250954, "grad_norm_var": 1.866301051234211e-05, "learning_rate": 0.008559008019662081, "loss": 2.7761, "step": 3385 }, { "crossentropy": 2.7443853616714478, "epoch": 0.18412681149569046, "grad_norm": 0.039115577936172485, "grad_norm_var": 1.9399561215994025e-05, "learning_rate": 0.00855813331808142, "loss": 2.7444, "step": 3386 }, { "crossentropy": 2.6400688886642456, "epoch": 0.18418119035319067, "grad_norm": 0.039885811507701874, "grad_norm_var": 1.8744153775016963e-05, "learning_rate": 0.00855725839582865, "loss": 2.6401, "step": 3387 }, { "crossentropy": 2.8007304668426514, "epoch": 0.1842355692106909, "grad_norm": 0.040175266563892365, "grad_norm_var": 1.429820547653333e-05, "learning_rate": 0.008556383252958027, "loss": 2.8007, "step": 3388 }, { "crossentropy": 2.618551254272461, "epoch": 0.1842899480681911, "grad_norm": 0.04447770491242409, "grad_norm_var": 1.4496228007532657e-05, "learning_rate": 0.008555507889523832, "loss": 2.6186, "step": 3389 }, { "crossentropy": 2.5851434469223022, "epoch": 0.1843443269256913, "grad_norm": 0.048714905977249146, "grad_norm_var": 1.724185584994949e-05, "learning_rate": 0.008554632305580354, "loss": 2.5851, "step": 3390 }, { "crossentropy": 2.63366436958313, "epoch": 0.1843987057831915, "grad_norm": 0.039730120450258255, "grad_norm_var": 1.7125425276328052e-05, "learning_rate": 0.008553756501181894, "loss": 2.6337, "step": 3391 }, { "crossentropy": 2.7885942459106445, "epoch": 0.1844530846406917, "grad_norm": 0.04100055620074272, "grad_norm_var": 1.6607779689830662e-05, "learning_rate": 0.008552880476382767, "loss": 2.7886, "step": 3392 }, { "crossentropy": 2.6851009130477905, "epoch": 0.18450746349819191, "grad_norm": 0.04939395189285278, "grad_norm_var": 1.8542446552851042e-05, "learning_rate": 0.008552004231237307, "loss": 2.6851, "step": 3393 }, { "crossentropy": 2.614825487136841, "epoch": 0.18456184235569212, "grad_norm": 0.04840215668082237, "grad_norm_var": 1.9613545850359162e-05, "learning_rate": 0.008551127765799856, "loss": 2.6148, "step": 3394 }, { "crossentropy": 2.640076756477356, "epoch": 0.18461622121319232, "grad_norm": 0.04271066561341286, "grad_norm_var": 1.9161114473158676e-05, "learning_rate": 0.008550251080124771, "loss": 2.6401, "step": 3395 }, { "crossentropy": 2.6589677333831787, "epoch": 0.18467060007069253, "grad_norm": 0.04265572875738144, "grad_norm_var": 1.277536362056982e-05, "learning_rate": 0.008549374174266424, "loss": 2.659, "step": 3396 }, { "crossentropy": 2.6996618509292603, "epoch": 0.18472497892819273, "grad_norm": 0.04159598425030708, "grad_norm_var": 1.2206698285510455e-05, "learning_rate": 0.0085484970482792, "loss": 2.6997, "step": 3397 }, { "crossentropy": 2.699703812599182, "epoch": 0.18477935778569293, "grad_norm": 0.04174121469259262, "grad_norm_var": 1.1309892339357528e-05, "learning_rate": 0.008547619702217497, "loss": 2.6997, "step": 3398 }, { "crossentropy": 2.7946449518203735, "epoch": 0.18483373664319314, "grad_norm": 0.03853146731853485, "grad_norm_var": 1.2373144886540279e-05, "learning_rate": 0.008546742136135729, "loss": 2.7946, "step": 3399 }, { "crossentropy": 2.8402990102767944, "epoch": 0.18488811550069334, "grad_norm": 0.04190538823604584, "grad_norm_var": 1.2360622292779896e-05, "learning_rate": 0.00854586435008832, "loss": 2.8403, "step": 3400 }, { "crossentropy": 2.648194193840027, "epoch": 0.18494249435819354, "grad_norm": 0.04074639454483986, "grad_norm_var": 1.194086089870098e-05, "learning_rate": 0.008544986344129708, "loss": 2.6482, "step": 3401 }, { "crossentropy": 2.7585195302963257, "epoch": 0.18499687321569375, "grad_norm": 0.036959338933229446, "grad_norm_var": 1.3218530276826179e-05, "learning_rate": 0.00854410811831435, "loss": 2.7585, "step": 3402 }, { "crossentropy": 2.7724446058273315, "epoch": 0.18505125207319395, "grad_norm": 0.057596612721681595, "grad_norm_var": 2.685250233429298e-05, "learning_rate": 0.008543229672696709, "loss": 2.7724, "step": 3403 }, { "crossentropy": 2.7492371797561646, "epoch": 0.18510563093069415, "grad_norm": 0.06121143326163292, "grad_norm_var": 4.512557861770424e-05, "learning_rate": 0.008542351007331268, "loss": 2.7492, "step": 3404 }, { "crossentropy": 2.703964948654175, "epoch": 0.18516000978819436, "grad_norm": 0.04490198567509651, "grad_norm_var": 4.511656887383938e-05, "learning_rate": 0.008541472122272518, "loss": 2.704, "step": 3405 }, { "crossentropy": 2.695204734802246, "epoch": 0.18521438864569456, "grad_norm": 0.04305674508213997, "grad_norm_var": 4.421105643253546e-05, "learning_rate": 0.008540593017574972, "loss": 2.6952, "step": 3406 }, { "crossentropy": 2.697633385658264, "epoch": 0.18526876750319476, "grad_norm": 0.03978385403752327, "grad_norm_var": 4.417700061545519e-05, "learning_rate": 0.008539713693293146, "loss": 2.6976, "step": 3407 }, { "crossentropy": 2.7814595699310303, "epoch": 0.18532314636069497, "grad_norm": 0.05160871148109436, "grad_norm_var": 4.6243521059772394e-05, "learning_rate": 0.008538834149481577, "loss": 2.7815, "step": 3408 }, { "crossentropy": 2.722256898880005, "epoch": 0.18537752521819517, "grad_norm": 0.040746066719293594, "grad_norm_var": 4.60530904264765e-05, "learning_rate": 0.008537954386194813, "loss": 2.7223, "step": 3409 }, { "crossentropy": 2.687412142753601, "epoch": 0.18543190407569538, "grad_norm": 0.041081126779317856, "grad_norm_var": 4.572528920013848e-05, "learning_rate": 0.008537074403487417, "loss": 2.6874, "step": 3410 }, { "crossentropy": 2.7450097799301147, "epoch": 0.18548628293319558, "grad_norm": 0.054259784519672394, "grad_norm_var": 5.1803621615717496e-05, "learning_rate": 0.008536194201413964, "loss": 2.745, "step": 3411 }, { "crossentropy": 2.7316551208496094, "epoch": 0.18554066179069578, "grad_norm": 0.0385427363216877, "grad_norm_var": 5.4091049339835956e-05, "learning_rate": 0.008535313780029044, "loss": 2.7317, "step": 3412 }, { "crossentropy": 2.7279030084609985, "epoch": 0.18559504064819599, "grad_norm": 0.04197920113801956, "grad_norm_var": 5.394459990880803e-05, "learning_rate": 0.008534433139387258, "loss": 2.7279, "step": 3413 }, { "crossentropy": 2.6832820177078247, "epoch": 0.1856494195056962, "grad_norm": 0.04202931746840477, "grad_norm_var": 5.383744522983175e-05, "learning_rate": 0.008533552279543223, "loss": 2.6833, "step": 3414 }, { "crossentropy": 2.687725305557251, "epoch": 0.1857037983631964, "grad_norm": 0.03853577375411987, "grad_norm_var": 5.3833913795148644e-05, "learning_rate": 0.008532671200551574, "loss": 2.6877, "step": 3415 }, { "crossentropy": 2.6193536520004272, "epoch": 0.1857581772206966, "grad_norm": 0.0405593104660511, "grad_norm_var": 5.44458614136117e-05, "learning_rate": 0.008531789902466948, "loss": 2.6194, "step": 3416 }, { "crossentropy": 2.6650644540786743, "epoch": 0.1858125560781968, "grad_norm": 0.03931611031293869, "grad_norm_var": 5.530859946920822e-05, "learning_rate": 0.008530908385344005, "loss": 2.6651, "step": 3417 }, { "crossentropy": 2.774336099624634, "epoch": 0.185866934935697, "grad_norm": 0.036971256136894226, "grad_norm_var": 5.529660983807153e-05, "learning_rate": 0.008530026649237415, "loss": 2.7743, "step": 3418 }, { "crossentropy": 2.6438573598861694, "epoch": 0.1859213137931972, "grad_norm": 0.04082632064819336, "grad_norm_var": 4.361490133794188e-05, "learning_rate": 0.008529144694201865, "loss": 2.6439, "step": 3419 }, { "crossentropy": 2.643210291862488, "epoch": 0.1859756926506974, "grad_norm": 0.04669514670968056, "grad_norm_var": 2.2433092951447812e-05, "learning_rate": 0.008528262520292049, "loss": 2.6432, "step": 3420 }, { "crossentropy": 2.7374666929244995, "epoch": 0.18603007150819761, "grad_norm": 0.059661101549863815, "grad_norm_var": 4.06644990145805e-05, "learning_rate": 0.008527380127562682, "loss": 2.7375, "step": 3421 }, { "crossentropy": 2.731146812438965, "epoch": 0.18608445036569782, "grad_norm": 0.04559706151485443, "grad_norm_var": 4.092504511802227e-05, "learning_rate": 0.00852649751606849, "loss": 2.7311, "step": 3422 }, { "crossentropy": 2.6988816261291504, "epoch": 0.18613882922319802, "grad_norm": 0.04613913968205452, "grad_norm_var": 4.0184306249616e-05, "learning_rate": 0.00852561468586421, "loss": 2.6989, "step": 3423 }, { "crossentropy": 2.7969770431518555, "epoch": 0.18619320808069822, "grad_norm": 0.04237871244549751, "grad_norm_var": 3.618723780497706e-05, "learning_rate": 0.00852473163700459, "loss": 2.797, "step": 3424 }, { "crossentropy": 2.6986842155456543, "epoch": 0.18624758693819843, "grad_norm": 0.03744395822286606, "grad_norm_var": 3.806247529146688e-05, "learning_rate": 0.008523848369544403, "loss": 2.6987, "step": 3425 }, { "crossentropy": 2.6630899906158447, "epoch": 0.18630196579569863, "grad_norm": 0.03999669477343559, "grad_norm_var": 3.844971935698452e-05, "learning_rate": 0.008522964883538424, "loss": 2.6631, "step": 3426 }, { "crossentropy": 2.717584252357483, "epoch": 0.18635634465319884, "grad_norm": 0.03782511502504349, "grad_norm_var": 3.105892448007977e-05, "learning_rate": 0.008522081179041447, "loss": 2.7176, "step": 3427 }, { "crossentropy": 2.729630708694458, "epoch": 0.18641072351069904, "grad_norm": 0.04212363809347153, "grad_norm_var": 3.013515889985352e-05, "learning_rate": 0.008521197256108282, "loss": 2.7296, "step": 3428 }, { "crossentropy": 2.843446731567383, "epoch": 0.18646510236819924, "grad_norm": 0.04095732048153877, "grad_norm_var": 3.0255014810766597e-05, "learning_rate": 0.008520313114793744, "loss": 2.8434, "step": 3429 }, { "crossentropy": 2.7660131454467773, "epoch": 0.18651948122569945, "grad_norm": 0.03923342376947403, "grad_norm_var": 3.085044931449837e-05, "learning_rate": 0.008519428755152669, "loss": 2.766, "step": 3430 }, { "crossentropy": 2.7041356563568115, "epoch": 0.18657386008319965, "grad_norm": 0.047026194632053375, "grad_norm_var": 3.1274294879220176e-05, "learning_rate": 0.008518544177239903, "loss": 2.7041, "step": 3431 }, { "crossentropy": 2.7263911962509155, "epoch": 0.18662823894069985, "grad_norm": 0.04143930599093437, "grad_norm_var": 3.10748177094551e-05, "learning_rate": 0.008517659381110307, "loss": 2.7264, "step": 3432 }, { "crossentropy": 2.7831496000289917, "epoch": 0.18668261779820006, "grad_norm": 0.0523579977452755, "grad_norm_var": 3.5774398892588624e-05, "learning_rate": 0.008516774366818757, "loss": 2.7831, "step": 3433 }, { "crossentropy": 2.7289174795150757, "epoch": 0.18673699665570026, "grad_norm": 0.04241929203271866, "grad_norm_var": 3.285642995822791e-05, "learning_rate": 0.008515889134420139, "loss": 2.7289, "step": 3434 }, { "crossentropy": 2.665192723274231, "epoch": 0.18679137551320046, "grad_norm": 0.03944191336631775, "grad_norm_var": 3.3540354238633294e-05, "learning_rate": 0.008515003683969354, "loss": 2.6652, "step": 3435 }, { "crossentropy": 2.697622060775757, "epoch": 0.18684575437070067, "grad_norm": 0.039853453636169434, "grad_norm_var": 3.382122664817201e-05, "learning_rate": 0.00851411801552132, "loss": 2.6976, "step": 3436 }, { "crossentropy": 2.7540420293807983, "epoch": 0.18690013322820087, "grad_norm": 0.04086532071232796, "grad_norm_var": 1.5070094317969479e-05, "learning_rate": 0.00851323212913096, "loss": 2.754, "step": 3437 }, { "crossentropy": 2.7117072343826294, "epoch": 0.18695451208570107, "grad_norm": 0.04134150594472885, "grad_norm_var": 1.4270838023164716e-05, "learning_rate": 0.008512346024853219, "loss": 2.7117, "step": 3438 }, { "crossentropy": 2.551481604576111, "epoch": 0.18700889094320128, "grad_norm": 0.04497023671865463, "grad_norm_var": 1.3699863251027492e-05, "learning_rate": 0.008511459702743054, "loss": 2.5515, "step": 3439 }, { "crossentropy": 2.748076915740967, "epoch": 0.18706326980070148, "grad_norm": 0.03877010568976402, "grad_norm_var": 1.4261580035061089e-05, "learning_rate": 0.008510573162855431, "loss": 2.7481, "step": 3440 }, { "crossentropy": 2.7151957750320435, "epoch": 0.18711764865820169, "grad_norm": 0.1196187362074852, "grad_norm_var": 0.00039044996841267586, "learning_rate": 0.008509686405245333, "loss": 2.7152, "step": 3441 }, { "crossentropy": 2.7843849658966064, "epoch": 0.1871720275157019, "grad_norm": 0.03831934928894043, "grad_norm_var": 0.0003921395197945862, "learning_rate": 0.008508799429967757, "loss": 2.7844, "step": 3442 }, { "crossentropy": 2.6661723852157593, "epoch": 0.1872264063732021, "grad_norm": 0.03865921497344971, "grad_norm_var": 0.0003912004252956263, "learning_rate": 0.008507912237077711, "loss": 2.6662, "step": 3443 }, { "crossentropy": 2.7069973945617676, "epoch": 0.1872807852307023, "grad_norm": 0.03823711723089218, "grad_norm_var": 0.0003945223547139192, "learning_rate": 0.00850702482663022, "loss": 2.707, "step": 3444 }, { "crossentropy": 2.621298909187317, "epoch": 0.1873351640882025, "grad_norm": 0.041105933487415314, "grad_norm_var": 0.0003944145127410608, "learning_rate": 0.008506137198680316, "loss": 2.6213, "step": 3445 }, { "crossentropy": 2.697556495666504, "epoch": 0.1873895429457027, "grad_norm": 0.04242337495088577, "grad_norm_var": 0.00039196889181567103, "learning_rate": 0.008505249353283053, "loss": 2.6976, "step": 3446 }, { "crossentropy": 2.7540687322616577, "epoch": 0.1874439218032029, "grad_norm": 0.039848778396844864, "grad_norm_var": 0.0003948554428883777, "learning_rate": 0.008504361290493492, "loss": 2.7541, "step": 3447 }, { "crossentropy": 2.698484182357788, "epoch": 0.1874983006607031, "grad_norm": 0.039857134222984314, "grad_norm_var": 0.0003960224136095478, "learning_rate": 0.008503473010366712, "loss": 2.6985, "step": 3448 }, { "crossentropy": 2.7654426097869873, "epoch": 0.1875526795182033, "grad_norm": 0.03685460612177849, "grad_norm_var": 0.00039817182163461284, "learning_rate": 0.008502584512957804, "loss": 2.7654, "step": 3449 }, { "crossentropy": 2.6889541149139404, "epoch": 0.18760705837570352, "grad_norm": 0.041256021708250046, "grad_norm_var": 0.00039868174048232846, "learning_rate": 0.008501695798321867, "loss": 2.689, "step": 3450 }, { "crossentropy": 2.6441566944122314, "epoch": 0.18766143723320372, "grad_norm": 0.04312833771109581, "grad_norm_var": 0.0003967554606905989, "learning_rate": 0.008500806866514023, "loss": 2.6442, "step": 3451 }, { "crossentropy": 2.797572612762451, "epoch": 0.18771581609070392, "grad_norm": 0.03976037725806236, "grad_norm_var": 0.0003968238346302684, "learning_rate": 0.0084999177175894, "loss": 2.7976, "step": 3452 }, { "crossentropy": 2.705004334449768, "epoch": 0.18777019494820413, "grad_norm": 0.04427719861268997, "grad_norm_var": 0.0003955278346586671, "learning_rate": 0.008499028351603143, "loss": 2.705, "step": 3453 }, { "crossentropy": 2.7403476238250732, "epoch": 0.18782457380570433, "grad_norm": 0.0465632863342762, "grad_norm_var": 0.00039431809694197333, "learning_rate": 0.00849813876861041, "loss": 2.7403, "step": 3454 }, { "crossentropy": 2.6081005334854126, "epoch": 0.18787895266320453, "grad_norm": 0.04829863831400871, "grad_norm_var": 0.0003946186790806066, "learning_rate": 0.008497248968666372, "loss": 2.6081, "step": 3455 }, { "crossentropy": 2.6368733644485474, "epoch": 0.18793333152070474, "grad_norm": 0.04350011795759201, "grad_norm_var": 0.00039141876951540673, "learning_rate": 0.008496358951826214, "loss": 2.6369, "step": 3456 }, { "crossentropy": 2.718008518218994, "epoch": 0.18798771037820494, "grad_norm": 0.04257037118077278, "grad_norm_var": 9.818238014636105e-06, "learning_rate": 0.008495468718145134, "loss": 2.718, "step": 3457 }, { "crossentropy": 2.664548397064209, "epoch": 0.18804208923570515, "grad_norm": 0.04537782445549965, "grad_norm_var": 9.899898373930995e-06, "learning_rate": 0.00849457826767834, "loss": 2.6645, "step": 3458 }, { "crossentropy": 2.583868622779846, "epoch": 0.18809646809320535, "grad_norm": 0.04424731060862541, "grad_norm_var": 9.375540534015125e-06, "learning_rate": 0.008493687600481064, "loss": 2.5839, "step": 3459 }, { "crossentropy": 2.6901512145996094, "epoch": 0.18815084695070555, "grad_norm": 0.0440717488527298, "grad_norm_var": 8.317876648098604e-06, "learning_rate": 0.008492796716608537, "loss": 2.6902, "step": 3460 }, { "crossentropy": 2.6605825424194336, "epoch": 0.18820522580820576, "grad_norm": 0.04094778001308441, "grad_norm_var": 8.352976538839966e-06, "learning_rate": 0.008491905616116015, "loss": 2.6606, "step": 3461 }, { "crossentropy": 2.769295573234558, "epoch": 0.188259604665706, "grad_norm": 0.047567885369062424, "grad_norm_var": 9.82666101729575e-06, "learning_rate": 0.008491014299058762, "loss": 2.7693, "step": 3462 }, { "crossentropy": 2.7759445905685425, "epoch": 0.1883139835232062, "grad_norm": 0.04687853157520294, "grad_norm_var": 9.95414477513908e-06, "learning_rate": 0.008490122765492056, "loss": 2.7759, "step": 3463 }, { "crossentropy": 2.7759640216827393, "epoch": 0.1883683623807064, "grad_norm": 0.0472489558160305, "grad_norm_var": 9.830679131016205e-06, "learning_rate": 0.008489231015471192, "loss": 2.776, "step": 3464 }, { "crossentropy": 2.6983710527420044, "epoch": 0.1884227412382066, "grad_norm": 0.04160872846841812, "grad_norm_var": 6.771426108899646e-06, "learning_rate": 0.008488339049051472, "loss": 2.6984, "step": 3465 }, { "crossentropy": 2.7052706480026245, "epoch": 0.1884771200957068, "grad_norm": 0.04018076881766319, "grad_norm_var": 7.2666800956372e-06, "learning_rate": 0.008487446866288217, "loss": 2.7053, "step": 3466 }, { "crossentropy": 2.557266592979431, "epoch": 0.188531498953207, "grad_norm": 0.04092184081673622, "grad_norm_var": 7.868376806961323e-06, "learning_rate": 0.008486554467236758, "loss": 2.5573, "step": 3467 }, { "crossentropy": 2.6188706159591675, "epoch": 0.1885858778107072, "grad_norm": 0.04031665623188019, "grad_norm_var": 7.573163097116381e-06, "learning_rate": 0.008485661851952443, "loss": 2.6189, "step": 3468 }, { "crossentropy": 2.5906673669815063, "epoch": 0.1886402566682074, "grad_norm": 0.037667978554964066, "grad_norm_var": 1.0090814177919164e-05, "learning_rate": 0.008484769020490628, "loss": 2.5907, "step": 3469 }, { "crossentropy": 2.742706298828125, "epoch": 0.18869463552570762, "grad_norm": 0.03604021295905113, "grad_norm_var": 1.288634687070129e-05, "learning_rate": 0.008483875972906688, "loss": 2.7427, "step": 3470 }, { "crossentropy": 2.7112677097320557, "epoch": 0.18874901438320782, "grad_norm": 0.03726576268672943, "grad_norm_var": 1.264856120230014e-05, "learning_rate": 0.008482982709256008, "loss": 2.7113, "step": 3471 }, { "crossentropy": 2.7448760271072388, "epoch": 0.18880339324070802, "grad_norm": 0.03842085972428322, "grad_norm_var": 1.343182615609224e-05, "learning_rate": 0.008482089229593988, "loss": 2.7449, "step": 3472 }, { "crossentropy": 2.74935519695282, "epoch": 0.18885777209820823, "grad_norm": 0.04004676640033722, "grad_norm_var": 1.3623921124292631e-05, "learning_rate": 0.00848119553397604, "loss": 2.7494, "step": 3473 }, { "crossentropy": 2.8095884323120117, "epoch": 0.18891215095570843, "grad_norm": 0.04792243242263794, "grad_norm_var": 1.5242294753554467e-05, "learning_rate": 0.00848030162245759, "loss": 2.8096, "step": 3474 }, { "crossentropy": 2.679007053375244, "epoch": 0.18896652981320863, "grad_norm": 0.0421186201274395, "grad_norm_var": 1.4876203064635256e-05, "learning_rate": 0.008479407495094078, "loss": 2.679, "step": 3475 }, { "crossentropy": 2.8487507104873657, "epoch": 0.18902090867070884, "grad_norm": 0.03793935850262642, "grad_norm_var": 1.5390837086276936e-05, "learning_rate": 0.008478513151940956, "loss": 2.8488, "step": 3476 }, { "crossentropy": 2.8614823818206787, "epoch": 0.18907528752820904, "grad_norm": 0.03918621689081192, "grad_norm_var": 1.570117139279888e-05, "learning_rate": 0.008477618593053694, "loss": 2.8615, "step": 3477 }, { "crossentropy": 2.6729825735092163, "epoch": 0.18912966638570924, "grad_norm": 0.0459258109331131, "grad_norm_var": 1.4504659729489e-05, "learning_rate": 0.008476723818487766, "loss": 2.673, "step": 3478 }, { "crossentropy": 2.7242839336395264, "epoch": 0.18918404524320945, "grad_norm": 0.06008822098374367, "grad_norm_var": 3.5358320281848354e-05, "learning_rate": 0.00847582882829867, "loss": 2.7243, "step": 3479 }, { "crossentropy": 2.6836787462234497, "epoch": 0.18923842410070965, "grad_norm": 0.06135554611682892, "grad_norm_var": 5.756250838134964e-05, "learning_rate": 0.008474933622541909, "loss": 2.6837, "step": 3480 }, { "crossentropy": 2.577765703201294, "epoch": 0.18929280295820985, "grad_norm": 0.04647820070385933, "grad_norm_var": 5.818153665593974e-05, "learning_rate": 0.008474038201273007, "loss": 2.5778, "step": 3481 }, { "crossentropy": 2.724903106689453, "epoch": 0.18934718181571006, "grad_norm": 0.03935990110039711, "grad_norm_var": 5.855872167585859e-05, "learning_rate": 0.008473142564547493, "loss": 2.7249, "step": 3482 }, { "crossentropy": 2.7803518772125244, "epoch": 0.18940156067321026, "grad_norm": 0.0410073883831501, "grad_norm_var": 5.853329742008338e-05, "learning_rate": 0.008472246712420915, "loss": 2.7804, "step": 3483 }, { "crossentropy": 2.7862056493759155, "epoch": 0.18945593953071047, "grad_norm": 0.03909260779619217, "grad_norm_var": 5.9096908444263883e-05, "learning_rate": 0.008471350644948835, "loss": 2.7862, "step": 3484 }, { "crossentropy": 2.7532107830047607, "epoch": 0.18951031838821067, "grad_norm": 0.03898666426539421, "grad_norm_var": 5.8247036552549514e-05, "learning_rate": 0.008470454362186823, "loss": 2.7532, "step": 3485 }, { "crossentropy": 2.8687052726745605, "epoch": 0.18956469724571087, "grad_norm": 0.04578787460923195, "grad_norm_var": 5.487729426686353e-05, "learning_rate": 0.008469557864190467, "loss": 2.8687, "step": 3486 }, { "crossentropy": 2.6614235639572144, "epoch": 0.18961907610321108, "grad_norm": 0.04710790887475014, "grad_norm_var": 5.2341798929857417e-05, "learning_rate": 0.008468661151015367, "loss": 2.6614, "step": 3487 }, { "crossentropy": 2.6961846351623535, "epoch": 0.18967345496071128, "grad_norm": 0.04495891183614731, "grad_norm_var": 4.977805263755199e-05, "learning_rate": 0.008467764222717136, "loss": 2.6962, "step": 3488 }, { "crossentropy": 2.7978343963623047, "epoch": 0.18972783381821148, "grad_norm": 0.04903203248977661, "grad_norm_var": 4.908733487635249e-05, "learning_rate": 0.0084668670793514, "loss": 2.7978, "step": 3489 }, { "crossentropy": 2.7656004428863525, "epoch": 0.1897822126757117, "grad_norm": 0.04748835787177086, "grad_norm_var": 4.8952932143127525e-05, "learning_rate": 0.0084659697209738, "loss": 2.7656, "step": 3490 }, { "crossentropy": 2.72159743309021, "epoch": 0.1898365915332119, "grad_norm": 0.05659415200352669, "grad_norm_var": 5.577462179168306e-05, "learning_rate": 0.008465072147639991, "loss": 2.7216, "step": 3491 }, { "crossentropy": 2.705666661262512, "epoch": 0.1898909703907121, "grad_norm": 0.04123632237315178, "grad_norm_var": 5.2789985125271247e-05, "learning_rate": 0.008464174359405637, "loss": 2.7057, "step": 3492 }, { "crossentropy": 2.692087769508362, "epoch": 0.1899453492482123, "grad_norm": 0.04046621173620224, "grad_norm_var": 5.164751841257101e-05, "learning_rate": 0.008463276356326419, "loss": 2.6921, "step": 3493 }, { "crossentropy": 2.8256205320358276, "epoch": 0.1899997281057125, "grad_norm": 0.04195978865027428, "grad_norm_var": 5.296616477738116e-05, "learning_rate": 0.00846237813845803, "loss": 2.8256, "step": 3494 }, { "crossentropy": 2.6507370471954346, "epoch": 0.1900541069632127, "grad_norm": 0.04303642734885216, "grad_norm_var": 3.981880648755796e-05, "learning_rate": 0.008461479705856177, "loss": 2.6507, "step": 3495 }, { "crossentropy": 2.708101511001587, "epoch": 0.1901084858207129, "grad_norm": 0.043262362480163574, "grad_norm_var": 2.141780284902328e-05, "learning_rate": 0.00846058105857658, "loss": 2.7081, "step": 3496 }, { "crossentropy": 2.6561384201049805, "epoch": 0.1901628646782131, "grad_norm": 0.06070134788751602, "grad_norm_var": 3.8541251548116866e-05, "learning_rate": 0.008459682196674971, "loss": 2.6561, "step": 3497 }, { "crossentropy": 2.7175523042678833, "epoch": 0.19021724353571332, "grad_norm": 0.039778631180524826, "grad_norm_var": 3.82370463554528e-05, "learning_rate": 0.008458783120207099, "loss": 2.7176, "step": 3498 }, { "crossentropy": 2.7077507972717285, "epoch": 0.19027162239321352, "grad_norm": 0.03803867846727371, "grad_norm_var": 4.038055634626349e-05, "learning_rate": 0.008457883829228722, "loss": 2.7078, "step": 3499 }, { "crossentropy": 2.678486466407776, "epoch": 0.19032600125071372, "grad_norm": 0.03876965492963791, "grad_norm_var": 4.063479749619129e-05, "learning_rate": 0.008456984323795613, "loss": 2.6785, "step": 3500 }, { "crossentropy": 2.6058536767959595, "epoch": 0.19038038010821393, "grad_norm": 0.04532465711236, "grad_norm_var": 3.821137337503364e-05, "learning_rate": 0.008456084603963559, "loss": 2.6059, "step": 3501 }, { "crossentropy": 2.7083743810653687, "epoch": 0.19043475896571413, "grad_norm": 0.042290929704904556, "grad_norm_var": 3.871156509317643e-05, "learning_rate": 0.008455184669788361, "loss": 2.7084, "step": 3502 }, { "crossentropy": 2.5972591638565063, "epoch": 0.19048913782321433, "grad_norm": 0.03996710851788521, "grad_norm_var": 3.98943098776809e-05, "learning_rate": 0.008454284521325831, "loss": 2.5973, "step": 3503 }, { "crossentropy": 2.6832385063171387, "epoch": 0.19054351668071454, "grad_norm": 0.04037780687212944, "grad_norm_var": 4.096022887588567e-05, "learning_rate": 0.008453384158631795, "loss": 2.6832, "step": 3504 }, { "crossentropy": 2.842302441596985, "epoch": 0.19059789553821474, "grad_norm": 0.04112350940704346, "grad_norm_var": 3.9848150345951856e-05, "learning_rate": 0.008452483581762092, "loss": 2.8423, "step": 3505 }, { "crossentropy": 2.7161011695861816, "epoch": 0.19065227439571494, "grad_norm": 0.037696659564971924, "grad_norm_var": 4.099377561225659e-05, "learning_rate": 0.008451582790772579, "loss": 2.7161, "step": 3506 }, { "crossentropy": 2.7126277685165405, "epoch": 0.19070665325321515, "grad_norm": 0.037104297429323196, "grad_norm_var": 2.9834489959654614e-05, "learning_rate": 0.008450681785719116, "loss": 2.7126, "step": 3507 }, { "crossentropy": 2.69803786277771, "epoch": 0.19076103211071535, "grad_norm": 0.03846517577767372, "grad_norm_var": 3.057662230413107e-05, "learning_rate": 0.008449780566657588, "loss": 2.698, "step": 3508 }, { "crossentropy": 2.8084696531295776, "epoch": 0.19081541096821555, "grad_norm": 0.04100699722766876, "grad_norm_var": 3.050069616488121e-05, "learning_rate": 0.008448879133643884, "loss": 2.8085, "step": 3509 }, { "crossentropy": 2.7330570220947266, "epoch": 0.19086978982571576, "grad_norm": 0.04398954659700394, "grad_norm_var": 3.0799675627620156e-05, "learning_rate": 0.008447977486733912, "loss": 2.7331, "step": 3510 }, { "crossentropy": 2.7069895267486572, "epoch": 0.19092416868321596, "grad_norm": 0.039239559322595596, "grad_norm_var": 3.114226269867552e-05, "learning_rate": 0.008447075625983591, "loss": 2.707, "step": 3511 }, { "crossentropy": 2.698301076889038, "epoch": 0.19097854754071616, "grad_norm": 0.04341974854469299, "grad_norm_var": 3.1176679453980466e-05, "learning_rate": 0.008446173551448853, "loss": 2.6983, "step": 3512 }, { "crossentropy": 2.6312988996505737, "epoch": 0.19103292639821637, "grad_norm": 0.04050257056951523, "grad_norm_var": 5.518100622988372e-06, "learning_rate": 0.008445271263185646, "loss": 2.6313, "step": 3513 }, { "crossentropy": 2.8271994590759277, "epoch": 0.19108730525571657, "grad_norm": 0.040866561233997345, "grad_norm_var": 5.495635257808141e-06, "learning_rate": 0.008444368761249925, "loss": 2.8272, "step": 3514 }, { "crossentropy": 2.66845703125, "epoch": 0.19114168411321678, "grad_norm": 0.04486668109893799, "grad_norm_var": 6.158259566110304e-06, "learning_rate": 0.008443466045697665, "loss": 2.6685, "step": 3515 }, { "crossentropy": 2.726623773574829, "epoch": 0.19119606297071698, "grad_norm": 0.04194379597902298, "grad_norm_var": 5.870181741254808e-06, "learning_rate": 0.008442563116584851, "loss": 2.7266, "step": 3516 }, { "crossentropy": 2.8019864559173584, "epoch": 0.19125044182821718, "grad_norm": 0.04214989021420479, "grad_norm_var": 4.727314414352107e-06, "learning_rate": 0.008441659973967484, "loss": 2.802, "step": 3517 }, { "crossentropy": 2.6795157194137573, "epoch": 0.19130482068571739, "grad_norm": 0.043515224009752274, "grad_norm_var": 5.041817710895307e-06, "learning_rate": 0.008440756617901573, "loss": 2.6795, "step": 3518 }, { "crossentropy": 2.7483962774276733, "epoch": 0.1913591995432176, "grad_norm": 0.043949976563453674, "grad_norm_var": 5.476949837277904e-06, "learning_rate": 0.008439853048443142, "loss": 2.7484, "step": 3519 }, { "crossentropy": 2.6817376613616943, "epoch": 0.1914135784007178, "grad_norm": 0.04131563380360603, "grad_norm_var": 5.4211539140848475e-06, "learning_rate": 0.008438949265648235, "loss": 2.6817, "step": 3520 }, { "crossentropy": 2.831300377845764, "epoch": 0.191467957258218, "grad_norm": 0.04178599268198013, "grad_norm_var": 5.431030147015626e-06, "learning_rate": 0.0084380452695729, "loss": 2.8313, "step": 3521 }, { "crossentropy": 2.764355182647705, "epoch": 0.1915223361157182, "grad_norm": 0.038117002695798874, "grad_norm_var": 5.236554249810934e-06, "learning_rate": 0.008437141060273202, "loss": 2.7644, "step": 3522 }, { "crossentropy": 2.5779181718826294, "epoch": 0.1915767149732184, "grad_norm": 0.040196485817432404, "grad_norm_var": 4.067230791350111e-06, "learning_rate": 0.00843623663780522, "loss": 2.5779, "step": 3523 }, { "crossentropy": 2.7885334491729736, "epoch": 0.1916310938307186, "grad_norm": 0.045833490788936615, "grad_norm_var": 4.3972289389191194e-06, "learning_rate": 0.008435332002225045, "loss": 2.7885, "step": 3524 }, { "crossentropy": 2.6063300371170044, "epoch": 0.1916854726882188, "grad_norm": 0.0464266873896122, "grad_norm_var": 5.483898256171636e-06, "learning_rate": 0.008434427153588784, "loss": 2.6063, "step": 3525 }, { "crossentropy": 2.625186562538147, "epoch": 0.19173985154571901, "grad_norm": 0.041603248566389084, "grad_norm_var": 5.328457526577492e-06, "learning_rate": 0.00843352209195255, "loss": 2.6252, "step": 3526 }, { "crossentropy": 2.6596099138259888, "epoch": 0.19179423040321922, "grad_norm": 0.04005052149295807, "grad_norm_var": 5.0458548902581704e-06, "learning_rate": 0.008432616817372477, "loss": 2.6596, "step": 3527 }, { "crossentropy": 2.7313729524612427, "epoch": 0.19184860926071942, "grad_norm": 0.041508592665195465, "grad_norm_var": 4.984717074693612e-06, "learning_rate": 0.008431711329904713, "loss": 2.7314, "step": 3528 }, { "crossentropy": 2.667548656463623, "epoch": 0.19190298811821963, "grad_norm": 0.042853884398937225, "grad_norm_var": 4.809223382359895e-06, "learning_rate": 0.008430805629605407, "loss": 2.6675, "step": 3529 }, { "crossentropy": 2.8128186464309692, "epoch": 0.19195736697571983, "grad_norm": 0.03674871101975441, "grad_norm_var": 6.662344041160652e-06, "learning_rate": 0.008429899716530739, "loss": 2.8128, "step": 3530 }, { "crossentropy": 2.7795543670654297, "epoch": 0.19201174583322003, "grad_norm": 0.04007839784026146, "grad_norm_var": 6.2996733239023085e-06, "learning_rate": 0.008428993590736884, "loss": 2.7796, "step": 3531 }, { "crossentropy": 2.6868457794189453, "epoch": 0.19206612469072024, "grad_norm": 0.03922092542052269, "grad_norm_var": 6.694451649161213e-06, "learning_rate": 0.008428087252280045, "loss": 2.6868, "step": 3532 }, { "crossentropy": 2.8230957984924316, "epoch": 0.19212050354822044, "grad_norm": 0.0424940250813961, "grad_norm_var": 6.727788537371342e-06, "learning_rate": 0.00842718070121643, "loss": 2.8231, "step": 3533 }, { "crossentropy": 2.789981722831726, "epoch": 0.19217488240572064, "grad_norm": 0.04455083608627319, "grad_norm_var": 7.058423869872136e-06, "learning_rate": 0.008426273937602265, "loss": 2.79, "step": 3534 }, { "crossentropy": 2.7745944261550903, "epoch": 0.19222926126322085, "grad_norm": 0.041627831757068634, "grad_norm_var": 6.689800282140788e-06, "learning_rate": 0.008425366961493784, "loss": 2.7746, "step": 3535 }, { "crossentropy": 2.8499966859817505, "epoch": 0.19228364012072105, "grad_norm": 0.04019205644726753, "grad_norm_var": 6.800181989964858e-06, "learning_rate": 0.008424459772947238, "loss": 2.85, "step": 3536 }, { "crossentropy": 2.771500587463379, "epoch": 0.19233801897822128, "grad_norm": 0.03843717277050018, "grad_norm_var": 7.3535453616278374e-06, "learning_rate": 0.008423552372018887, "loss": 2.7715, "step": 3537 }, { "crossentropy": 2.6054881811141968, "epoch": 0.19239239783572148, "grad_norm": 0.0408744290471077, "grad_norm_var": 6.678271651695764e-06, "learning_rate": 0.00842264475876501, "loss": 2.6055, "step": 3538 }, { "crossentropy": 2.72999107837677, "epoch": 0.1924467766932217, "grad_norm": 0.04454362764954567, "grad_norm_var": 7.151024783322995e-06, "learning_rate": 0.0084217369332419, "loss": 2.73, "step": 3539 }, { "crossentropy": 2.690454602241516, "epoch": 0.1925011555507219, "grad_norm": 0.04371075704693794, "grad_norm_var": 6.259991177621528e-06, "learning_rate": 0.008420828895505852, "loss": 2.6905, "step": 3540 }, { "crossentropy": 2.7020591497421265, "epoch": 0.1925555344082221, "grad_norm": 0.03800417482852936, "grad_norm_var": 5.225675096604913e-06, "learning_rate": 0.008419920645613185, "loss": 2.7021, "step": 3541 }, { "crossentropy": 2.77986741065979, "epoch": 0.1926099132657223, "grad_norm": 0.038189999759197235, "grad_norm_var": 5.693477374582532e-06, "learning_rate": 0.00841901218362023, "loss": 2.7799, "step": 3542 }, { "crossentropy": 2.7487876415252686, "epoch": 0.1926642921232225, "grad_norm": 0.03904343023896217, "grad_norm_var": 5.859905782533888e-06, "learning_rate": 0.008418103509583323, "loss": 2.7488, "step": 3543 }, { "crossentropy": 2.6022565364837646, "epoch": 0.1927186709807227, "grad_norm": 0.03644617274403572, "grad_norm_var": 6.952946317845208e-06, "learning_rate": 0.008417194623558829, "loss": 2.6023, "step": 3544 }, { "crossentropy": 2.721113085746765, "epoch": 0.1927730498382229, "grad_norm": 0.04153107479214668, "grad_norm_var": 6.6363026683500154e-06, "learning_rate": 0.008416285525603105, "loss": 2.7211, "step": 3545 }, { "crossentropy": 2.713220953941345, "epoch": 0.1928274286957231, "grad_norm": 0.04657355323433876, "grad_norm_var": 7.94399493068675e-06, "learning_rate": 0.008415376215772541, "loss": 2.7132, "step": 3546 }, { "crossentropy": 2.6825660467147827, "epoch": 0.19288180755322332, "grad_norm": 0.055849600583314896, "grad_norm_var": 2.1614988828293678e-05, "learning_rate": 0.008414466694123527, "loss": 2.6826, "step": 3547 }, { "crossentropy": 2.7081611156463623, "epoch": 0.19293618641072352, "grad_norm": 0.04523026570677757, "grad_norm_var": 2.1680850766567227e-05, "learning_rate": 0.008413556960712472, "loss": 2.7082, "step": 3548 }, { "crossentropy": 2.7236316204071045, "epoch": 0.19299056526822372, "grad_norm": 0.036530885845422745, "grad_norm_var": 2.377382074347951e-05, "learning_rate": 0.008412647015595796, "loss": 2.7236, "step": 3549 }, { "crossentropy": 2.684242844581604, "epoch": 0.19304494412572393, "grad_norm": 0.06077450141310692, "grad_norm_var": 4.583192155045475e-05, "learning_rate": 0.008411736858829936, "loss": 2.6842, "step": 3550 }, { "crossentropy": 2.828571319580078, "epoch": 0.19309932298322413, "grad_norm": 0.03819974884390831, "grad_norm_var": 4.71810106599022e-05, "learning_rate": 0.008410826490471333, "loss": 2.8286, "step": 3551 }, { "crossentropy": 2.7373485565185547, "epoch": 0.19315370184072433, "grad_norm": 0.03927980363368988, "grad_norm_var": 4.7545154949205024e-05, "learning_rate": 0.008409915910576453, "loss": 2.7373, "step": 3552 }, { "crossentropy": 2.7597347497940063, "epoch": 0.19320808069822454, "grad_norm": 0.03959963843226433, "grad_norm_var": 4.696870817015493e-05, "learning_rate": 0.008409005119201769, "loss": 2.7597, "step": 3553 }, { "crossentropy": 2.6881250143051147, "epoch": 0.19326245955572474, "grad_norm": 0.04204292595386505, "grad_norm_var": 4.675811511165191e-05, "learning_rate": 0.008408094116403763, "loss": 2.6881, "step": 3554 }, { "crossentropy": 2.752155303955078, "epoch": 0.19331683841322495, "grad_norm": 0.040419843047857285, "grad_norm_var": 4.6888031606155216e-05, "learning_rate": 0.008407182902238939, "loss": 2.7522, "step": 3555 }, { "crossentropy": 2.630873680114746, "epoch": 0.19337121727072515, "grad_norm": 0.04029471427202225, "grad_norm_var": 4.7106504171055774e-05, "learning_rate": 0.008406271476763807, "loss": 2.6309, "step": 3556 }, { "crossentropy": 2.590308666229248, "epoch": 0.19342559612822535, "grad_norm": 0.03850722685456276, "grad_norm_var": 4.682911019211639e-05, "learning_rate": 0.008405359840034893, "loss": 2.5903, "step": 3557 }, { "crossentropy": 2.6504013538360596, "epoch": 0.19347997498572556, "grad_norm": 0.042026225477457047, "grad_norm_var": 4.5591873169047115e-05, "learning_rate": 0.008404447992108738, "loss": 2.6504, "step": 3558 }, { "crossentropy": 2.627390146255493, "epoch": 0.19353435384322576, "grad_norm": 0.04558071866631508, "grad_norm_var": 4.5122002074315416e-05, "learning_rate": 0.008403535933041892, "loss": 2.6274, "step": 3559 }, { "crossentropy": 2.731183171272278, "epoch": 0.19358873270072596, "grad_norm": 0.046572983264923096, "grad_norm_var": 4.260742584782928e-05, "learning_rate": 0.00840262366289092, "loss": 2.7312, "step": 3560 }, { "crossentropy": 2.6677193641662598, "epoch": 0.19364311155822617, "grad_norm": 0.044888902455568314, "grad_norm_var": 4.2346276233377424e-05, "learning_rate": 0.008401711181712401, "loss": 2.6677, "step": 3561 }, { "crossentropy": 2.791130781173706, "epoch": 0.19369749041572637, "grad_norm": 0.039360806345939636, "grad_norm_var": 4.3024892789615626e-05, "learning_rate": 0.008400798489562928, "loss": 2.7911, "step": 3562 }, { "crossentropy": 2.6978825330734253, "epoch": 0.19375186927322657, "grad_norm": 0.038019586354494095, "grad_norm_var": 3.341010043551995e-05, "learning_rate": 0.0083998855864991, "loss": 2.6979, "step": 3563 }, { "crossentropy": 2.70257568359375, "epoch": 0.19380624813072678, "grad_norm": 0.036496467888355255, "grad_norm_var": 3.4803724088350076e-05, "learning_rate": 0.008398972472577539, "loss": 2.7026, "step": 3564 }, { "crossentropy": 2.733267903327942, "epoch": 0.19386062698822698, "grad_norm": 0.037547122687101364, "grad_norm_var": 3.4156050698395483e-05, "learning_rate": 0.008398059147854875, "loss": 2.7333, "step": 3565 }, { "crossentropy": 2.6257052421569824, "epoch": 0.19391500584572718, "grad_norm": 0.05378951504826546, "grad_norm_var": 1.9581096117722804e-05, "learning_rate": 0.008397145612387748, "loss": 2.6257, "step": 3566 }, { "crossentropy": 2.6808688640594482, "epoch": 0.1939693847032274, "grad_norm": 0.038184430450201035, "grad_norm_var": 1.958767602340608e-05, "learning_rate": 0.00839623186623282, "loss": 2.6809, "step": 3567 }, { "crossentropy": 2.670114278793335, "epoch": 0.1940237635607276, "grad_norm": 0.037207264453172684, "grad_norm_var": 2.0445674369311676e-05, "learning_rate": 0.008395317909446759, "loss": 2.6701, "step": 3568 }, { "crossentropy": 2.723915457725525, "epoch": 0.1940781424182278, "grad_norm": 0.037305936217308044, "grad_norm_var": 2.128950683487433e-05, "learning_rate": 0.008394403742086243, "loss": 2.7239, "step": 3569 }, { "crossentropy": 2.5089019536972046, "epoch": 0.194132521275728, "grad_norm": 0.0398726612329483, "grad_norm_var": 2.1322690898326983e-05, "learning_rate": 0.008393489364207973, "loss": 2.5089, "step": 3570 }, { "crossentropy": 2.744763970375061, "epoch": 0.1941869001332282, "grad_norm": 0.03752577677369118, "grad_norm_var": 2.2071829913975632e-05, "learning_rate": 0.008392574775868657, "loss": 2.7448, "step": 3571 }, { "crossentropy": 2.6792478561401367, "epoch": 0.1942412789907284, "grad_norm": 0.039299409836530685, "grad_norm_var": 2.2203954034420658e-05, "learning_rate": 0.008391659977125017, "loss": 2.6792, "step": 3572 }, { "crossentropy": 2.7141497135162354, "epoch": 0.1942956578482286, "grad_norm": 0.0406392440199852, "grad_norm_var": 2.1847209365805088e-05, "learning_rate": 0.008390744968033785, "loss": 2.7141, "step": 3573 }, { "crossentropy": 2.7531697750091553, "epoch": 0.1943500367057288, "grad_norm": 0.04190785065293312, "grad_norm_var": 2.1830227762037467e-05, "learning_rate": 0.008389829748651713, "loss": 2.7532, "step": 3574 }, { "crossentropy": 2.6863856315612793, "epoch": 0.19440441556322902, "grad_norm": 0.039425838738679886, "grad_norm_var": 2.0346326071715715e-05, "learning_rate": 0.00838891431903556, "loss": 2.6864, "step": 3575 }, { "crossentropy": 2.8227380514144897, "epoch": 0.19445879442072922, "grad_norm": 0.0429668202996254, "grad_norm_var": 1.824039561003979e-05, "learning_rate": 0.008387998679242099, "loss": 2.8227, "step": 3576 }, { "crossentropy": 2.61402428150177, "epoch": 0.19451317327822942, "grad_norm": 0.04115847125649452, "grad_norm_var": 1.681641007071332e-05, "learning_rate": 0.00838708282932812, "loss": 2.614, "step": 3577 }, { "crossentropy": 2.6404043436050415, "epoch": 0.19456755213572963, "grad_norm": 0.03734440356492996, "grad_norm_var": 1.725426053484728e-05, "learning_rate": 0.008386166769350422, "loss": 2.6404, "step": 3578 }, { "crossentropy": 2.7578842639923096, "epoch": 0.19462193099322983, "grad_norm": 0.03780020773410797, "grad_norm_var": 1.7312803104443885e-05, "learning_rate": 0.008385250499365818, "loss": 2.7579, "step": 3579 }, { "crossentropy": 2.606223464012146, "epoch": 0.19467630985073003, "grad_norm": 0.03817247971892357, "grad_norm_var": 1.6726787720005207e-05, "learning_rate": 0.008384334019431132, "loss": 2.6062, "step": 3580 }, { "crossentropy": 2.7306859493255615, "epoch": 0.19473068870823024, "grad_norm": 0.08830221742391586, "grad_norm_var": 0.00016106993794400554, "learning_rate": 0.008383417329603208, "loss": 2.7307, "step": 3581 }, { "crossentropy": 2.6592084169387817, "epoch": 0.19478506756573044, "grad_norm": 0.04868515953421593, "grad_norm_var": 0.00015547866745121148, "learning_rate": 0.008382500429938893, "loss": 2.6592, "step": 3582 }, { "crossentropy": 2.808454394340515, "epoch": 0.19483944642323064, "grad_norm": 0.047259651124477386, "grad_norm_var": 0.00015496568079777692, "learning_rate": 0.008381583320495057, "loss": 2.8085, "step": 3583 }, { "crossentropy": 2.6865490674972534, "epoch": 0.19489382528073085, "grad_norm": 0.04389118030667305, "grad_norm_var": 0.0001522125874192278, "learning_rate": 0.008380666001328576, "loss": 2.6865, "step": 3584 }, { "crossentropy": 2.633310317993164, "epoch": 0.19494820413823105, "grad_norm": 0.041861455887556076, "grad_norm_var": 0.00014953637406664994, "learning_rate": 0.00837974847249634, "loss": 2.6333, "step": 3585 }, { "crossentropy": 2.7885290384292603, "epoch": 0.19500258299573126, "grad_norm": 0.03836695849895477, "grad_norm_var": 0.00015053318715606273, "learning_rate": 0.008378830734055254, "loss": 2.7885, "step": 3586 }, { "crossentropy": 2.8602911233901978, "epoch": 0.19505696185323146, "grad_norm": 0.25657469034194946, "grad_norm_var": 0.00295923705248895, "learning_rate": 0.008377912786062239, "loss": 2.8603, "step": 3587 }, { "crossentropy": 2.7423137426376343, "epoch": 0.19511134071073166, "grad_norm": 0.05500921607017517, "grad_norm_var": 0.002936059598397394, "learning_rate": 0.00837699462857422, "loss": 2.7423, "step": 3588 }, { "crossentropy": 2.600961685180664, "epoch": 0.19516571956823187, "grad_norm": 0.05716419219970703, "grad_norm_var": 0.0029133101388404575, "learning_rate": 0.008376076261648142, "loss": 2.601, "step": 3589 }, { "crossentropy": 2.757037043571472, "epoch": 0.19522009842573207, "grad_norm": 0.04693948104977608, "grad_norm_var": 0.002902927035964339, "learning_rate": 0.008375157685340964, "loss": 2.757, "step": 3590 }, { "crossentropy": 2.687715768814087, "epoch": 0.19527447728323227, "grad_norm": 0.04893030598759651, "grad_norm_var": 0.002882427048599386, "learning_rate": 0.008374238899709651, "loss": 2.6877, "step": 3591 }, { "crossentropy": 2.7412633895874023, "epoch": 0.19532885614073248, "grad_norm": 0.08339403569698334, "grad_norm_var": 0.002889247911292345, "learning_rate": 0.008373319904811187, "loss": 2.7413, "step": 3592 }, { "crossentropy": 2.7219531536102295, "epoch": 0.19538323499823268, "grad_norm": 0.05449848249554634, "grad_norm_var": 0.002861204041416231, "learning_rate": 0.00837240070070257, "loss": 2.722, "step": 3593 }, { "crossentropy": 2.646260142326355, "epoch": 0.19543761385573288, "grad_norm": 0.05469172075390816, "grad_norm_var": 0.0028183303231053055, "learning_rate": 0.008371481287440804, "loss": 2.6463, "step": 3594 }, { "crossentropy": 2.7281382083892822, "epoch": 0.1954919927132331, "grad_norm": 0.060582324862480164, "grad_norm_var": 0.0027678542202333196, "learning_rate": 0.008370561665082913, "loss": 2.7281, "step": 3595 }, { "crossentropy": 2.782291531562805, "epoch": 0.1955463715707333, "grad_norm": 0.07217919081449509, "grad_norm_var": 0.0027115976142892176, "learning_rate": 0.008369641833685928, "loss": 2.7823, "step": 3596 }, { "crossentropy": 2.75063693523407, "epoch": 0.1956007504282335, "grad_norm": 0.06297090649604797, "grad_norm_var": 0.002685312076303652, "learning_rate": 0.0083687217933069, "loss": 2.7506, "step": 3597 }, { "crossentropy": 2.635762572288513, "epoch": 0.1956551292857337, "grad_norm": 0.05113832280039787, "grad_norm_var": 0.0026796772078097014, "learning_rate": 0.008367801544002884, "loss": 2.6358, "step": 3598 }, { "crossentropy": 2.6527814865112305, "epoch": 0.1957095081432339, "grad_norm": 0.0451185405254364, "grad_norm_var": 0.0026856608273398844, "learning_rate": 0.008366881085830958, "loss": 2.6528, "step": 3599 }, { "crossentropy": 2.715909481048584, "epoch": 0.1957638870007341, "grad_norm": 0.04181912913918495, "grad_norm_var": 0.0026923361560774123, "learning_rate": 0.008365960418848207, "loss": 2.7159, "step": 3600 }, { "crossentropy": 2.7328522205352783, "epoch": 0.1958182658582343, "grad_norm": 0.04424254223704338, "grad_norm_var": 0.0026847246660854137, "learning_rate": 0.008365039543111727, "loss": 2.7329, "step": 3601 }, { "crossentropy": 2.6973495483398438, "epoch": 0.1958726447157345, "grad_norm": 0.04252462834119797, "grad_norm_var": 0.0026698760270629433, "learning_rate": 0.008364118458678634, "loss": 2.6973, "step": 3602 }, { "crossentropy": 2.7145326137542725, "epoch": 0.19592702357323472, "grad_norm": 0.04046023264527321, "grad_norm_var": 0.00013672840601248367, "learning_rate": 0.008363197165606047, "loss": 2.7145, "step": 3603 }, { "crossentropy": 2.73012912273407, "epoch": 0.19598140243073492, "grad_norm": 0.0431932769715786, "grad_norm_var": 0.0001436343633892158, "learning_rate": 0.00836227566395111, "loss": 2.7301, "step": 3604 }, { "crossentropy": 2.6685596704483032, "epoch": 0.19603578128823512, "grad_norm": 0.0457686111330986, "grad_norm_var": 0.00014559887570138083, "learning_rate": 0.00836135395377097, "loss": 2.6686, "step": 3605 }, { "crossentropy": 2.8099247217178345, "epoch": 0.19609016014573533, "grad_norm": 0.05731860548257828, "grad_norm_var": 0.00014477056951102662, "learning_rate": 0.008360432035122791, "loss": 2.8099, "step": 3606 }, { "crossentropy": 2.7580440044403076, "epoch": 0.19614453900323553, "grad_norm": 0.03899311646819115, "grad_norm_var": 0.00015640328221869318, "learning_rate": 0.008359509908063752, "loss": 2.758, "step": 3607 }, { "crossentropy": 2.729653000831604, "epoch": 0.19619891786073573, "grad_norm": 0.0374765619635582, "grad_norm_var": 9.861237197317958e-05, "learning_rate": 0.00835858757265104, "loss": 2.7297, "step": 3608 }, { "crossentropy": 2.667736768722534, "epoch": 0.19625329671823594, "grad_norm": 0.03977344185113907, "grad_norm_var": 0.00010247012132189356, "learning_rate": 0.00835766502894186, "loss": 2.6677, "step": 3609 }, { "crossentropy": 2.6653677225112915, "epoch": 0.19630767557573614, "grad_norm": 0.03717239946126938, "grad_norm_var": 0.00010751839114657805, "learning_rate": 0.008356742276993422, "loss": 2.6654, "step": 3610 }, { "crossentropy": 2.6037288904190063, "epoch": 0.19636205443323637, "grad_norm": 0.03685200959444046, "grad_norm_var": 0.00010146557831763857, "learning_rate": 0.008355819316862956, "loss": 2.6037, "step": 3611 }, { "crossentropy": 2.6342746019363403, "epoch": 0.19641643329073658, "grad_norm": 0.03679206222295761, "grad_norm_var": 5.650562022662282e-05, "learning_rate": 0.008354896148607708, "loss": 2.6343, "step": 3612 }, { "crossentropy": 2.63552188873291, "epoch": 0.19647081214823678, "grad_norm": 0.05163869634270668, "grad_norm_var": 3.56422148158901e-05, "learning_rate": 0.008353972772284926, "loss": 2.6355, "step": 3613 }, { "crossentropy": 2.8010753393173218, "epoch": 0.19652519100573698, "grad_norm": 0.04316366836428642, "grad_norm_var": 3.1115197585266226e-05, "learning_rate": 0.008353049187951883, "loss": 2.8011, "step": 3614 }, { "crossentropy": 2.630753755569458, "epoch": 0.19657956986323719, "grad_norm": 0.04213765636086464, "grad_norm_var": 3.06871303372767e-05, "learning_rate": 0.008352125395665853, "loss": 2.6308, "step": 3615 }, { "crossentropy": 2.7477078437805176, "epoch": 0.1966339487207374, "grad_norm": 0.042235977947711945, "grad_norm_var": 3.066248690862013e-05, "learning_rate": 0.008351201395484132, "loss": 2.7477, "step": 3616 }, { "crossentropy": 2.736894130706787, "epoch": 0.1966883275782376, "grad_norm": 0.04542370140552521, "grad_norm_var": 3.102663712044758e-05, "learning_rate": 0.008350277187464023, "loss": 2.7369, "step": 3617 }, { "crossentropy": 2.777536153793335, "epoch": 0.1967427064357378, "grad_norm": 0.04328979179263115, "grad_norm_var": 3.105984606094758e-05, "learning_rate": 0.008349352771662847, "loss": 2.7775, "step": 3618 }, { "crossentropy": 2.770626187324524, "epoch": 0.196797085293238, "grad_norm": 0.04086586460471153, "grad_norm_var": 3.0954098322964234e-05, "learning_rate": 0.008348428148137936, "loss": 2.7706, "step": 3619 }, { "crossentropy": 2.6767090559005737, "epoch": 0.1968514641507382, "grad_norm": 0.03976186737418175, "grad_norm_var": 3.143273942548628e-05, "learning_rate": 0.008347503316946635, "loss": 2.6767, "step": 3620 }, { "crossentropy": 2.5335887670516968, "epoch": 0.1969058430082384, "grad_norm": 0.0394747294485569, "grad_norm_var": 3.109551150742018e-05, "learning_rate": 0.008346578278146295, "loss": 2.5336, "step": 3621 }, { "crossentropy": 2.6421884298324585, "epoch": 0.1969602218657386, "grad_norm": 0.03878137841820717, "grad_norm_var": 1.4767561351843907e-05, "learning_rate": 0.008345653031794291, "loss": 2.6422, "step": 3622 }, { "crossentropy": 2.724219799041748, "epoch": 0.19701460072323881, "grad_norm": 0.04014023765921593, "grad_norm_var": 1.4563568301032409e-05, "learning_rate": 0.008344727577948005, "loss": 2.7242, "step": 3623 }, { "crossentropy": 2.739624261856079, "epoch": 0.19706897958073902, "grad_norm": 0.0378655381500721, "grad_norm_var": 1.4393593059188146e-05, "learning_rate": 0.008343801916664836, "loss": 2.7396, "step": 3624 }, { "crossentropy": 2.7265446186065674, "epoch": 0.19712335843823922, "grad_norm": 0.0378064326941967, "grad_norm_var": 1.494675734225222e-05, "learning_rate": 0.008342876048002186, "loss": 2.7265, "step": 3625 }, { "crossentropy": 2.7322689294815063, "epoch": 0.19717773729573942, "grad_norm": 0.04591785743832588, "grad_norm_var": 1.5453069390980225e-05, "learning_rate": 0.008341949972017482, "loss": 2.7323, "step": 3626 }, { "crossentropy": 2.7027363777160645, "epoch": 0.19723211615323963, "grad_norm": 0.03959522023797035, "grad_norm_var": 1.4265688118038761e-05, "learning_rate": 0.008341023688768155, "loss": 2.7027, "step": 3627 }, { "crossentropy": 2.6585750579833984, "epoch": 0.19728649501073983, "grad_norm": 0.06869740039110184, "grad_norm_var": 5.762300908650249e-05, "learning_rate": 0.008340097198311655, "loss": 2.6586, "step": 3628 }, { "crossentropy": 2.708632707595825, "epoch": 0.19734087386824004, "grad_norm": 0.04130527377128601, "grad_norm_var": 5.315187030602945e-05, "learning_rate": 0.00833917050070544, "loss": 2.7086, "step": 3629 }, { "crossentropy": 2.792261481285095, "epoch": 0.19739525272574024, "grad_norm": 0.03911798447370529, "grad_norm_var": 5.4034723874980294e-05, "learning_rate": 0.008338243596006985, "loss": 2.7923, "step": 3630 }, { "crossentropy": 2.8168399333953857, "epoch": 0.19744963158324044, "grad_norm": 0.03822799026966095, "grad_norm_var": 5.5257696948844e-05, "learning_rate": 0.008337316484273771, "loss": 2.8168, "step": 3631 }, { "crossentropy": 2.772316336631775, "epoch": 0.19750401044074065, "grad_norm": 0.040603842586278915, "grad_norm_var": 5.546134139022361e-05, "learning_rate": 0.008336389165563302, "loss": 2.7723, "step": 3632 }, { "crossentropy": 2.7664244174957275, "epoch": 0.19755838929824085, "grad_norm": 0.04194832593202591, "grad_norm_var": 5.477093500715387e-05, "learning_rate": 0.00833546163993309, "loss": 2.7664, "step": 3633 }, { "crossentropy": 2.739452600479126, "epoch": 0.19761276815574105, "grad_norm": 0.043207116425037384, "grad_norm_var": 5.475810870364991e-05, "learning_rate": 0.008334533907440654, "loss": 2.7395, "step": 3634 }, { "crossentropy": 2.7686835527420044, "epoch": 0.19766714701324126, "grad_norm": 0.04339882358908653, "grad_norm_var": 5.474827166579895e-05, "learning_rate": 0.008333605968143533, "loss": 2.7687, "step": 3635 }, { "crossentropy": 2.6589183807373047, "epoch": 0.19772152587074146, "grad_norm": 0.03905290737748146, "grad_norm_var": 5.5013997794287756e-05, "learning_rate": 0.008332677822099281, "loss": 2.6589, "step": 3636 }, { "crossentropy": 2.842031955718994, "epoch": 0.19777590472824166, "grad_norm": 0.03999943658709526, "grad_norm_var": 5.4840800345040686e-05, "learning_rate": 0.008331749469365456, "loss": 2.842, "step": 3637 }, { "crossentropy": 2.6322739124298096, "epoch": 0.19783028358574187, "grad_norm": 0.03845905512571335, "grad_norm_var": 5.499546485622456e-05, "learning_rate": 0.008330820909999634, "loss": 2.6323, "step": 3638 }, { "crossentropy": 2.616412401199341, "epoch": 0.19788466244324207, "grad_norm": 0.037499893456697464, "grad_norm_var": 5.615946547549381e-05, "learning_rate": 0.008329892144059405, "loss": 2.6164, "step": 3639 }, { "crossentropy": 2.7214232683181763, "epoch": 0.19793904130074227, "grad_norm": 0.04163922369480133, "grad_norm_var": 5.494711092606218e-05, "learning_rate": 0.00832896317160237, "loss": 2.7214, "step": 3640 }, { "crossentropy": 2.597372055053711, "epoch": 0.19799342015824248, "grad_norm": 0.0412616953253746, "grad_norm_var": 5.363240102432502e-05, "learning_rate": 0.008328033992686142, "loss": 2.5974, "step": 3641 }, { "crossentropy": 2.6762027740478516, "epoch": 0.19804779901574268, "grad_norm": 0.03988170623779297, "grad_norm_var": 5.315541739840761e-05, "learning_rate": 0.008327104607368348, "loss": 2.6762, "step": 3642 }, { "crossentropy": 2.7070600986480713, "epoch": 0.19810217787324289, "grad_norm": 0.04301566258072853, "grad_norm_var": 5.2735870099250105e-05, "learning_rate": 0.00832617501570663, "loss": 2.7071, "step": 3643 }, { "crossentropy": 2.7557440996170044, "epoch": 0.1981565567307431, "grad_norm": 0.04683621972799301, "grad_norm_var": 5.755607437234652e-06, "learning_rate": 0.00832524521775864, "loss": 2.7557, "step": 3644 }, { "crossentropy": 2.7121269702911377, "epoch": 0.1982109355882433, "grad_norm": 0.046767499297857285, "grad_norm_var": 7.867482014032163e-06, "learning_rate": 0.008324315213582036, "loss": 2.7121, "step": 3645 }, { "crossentropy": 2.7633326053619385, "epoch": 0.1982653144457435, "grad_norm": 0.04175379499793053, "grad_norm_var": 7.532271664555575e-06, "learning_rate": 0.008323385003234507, "loss": 2.7633, "step": 3646 }, { "crossentropy": 2.5959484577178955, "epoch": 0.1983196933032437, "grad_norm": 0.03944503888487816, "grad_norm_var": 7.098419347814575e-06, "learning_rate": 0.008322454586773737, "loss": 2.5959, "step": 3647 }, { "crossentropy": 2.6493643522262573, "epoch": 0.1983740721607439, "grad_norm": 0.03775469958782196, "grad_norm_var": 7.964495495592824e-06, "learning_rate": 0.00832152396425743, "loss": 2.6494, "step": 3648 }, { "crossentropy": 2.6898908615112305, "epoch": 0.1984284510182441, "grad_norm": 0.041962847113609314, "grad_norm_var": 7.965628271784884e-06, "learning_rate": 0.008320593135743303, "loss": 2.6899, "step": 3649 }, { "crossentropy": 2.7830801010131836, "epoch": 0.1984828298757443, "grad_norm": 0.037451744079589844, "grad_norm_var": 8.626874981327674e-06, "learning_rate": 0.008319662101289087, "loss": 2.7831, "step": 3650 }, { "crossentropy": 2.8511135578155518, "epoch": 0.1985372087332445, "grad_norm": 0.03642572462558746, "grad_norm_var": 9.44605793916081e-06, "learning_rate": 0.008318730860952523, "loss": 2.8511, "step": 3651 }, { "crossentropy": 2.689163565635681, "epoch": 0.19859158759074472, "grad_norm": 0.038691774010658264, "grad_norm_var": 9.527520992355654e-06, "learning_rate": 0.008317799414791363, "loss": 2.6892, "step": 3652 }, { "crossentropy": 2.5909926891326904, "epoch": 0.19864596644824492, "grad_norm": 0.042451534420251846, "grad_norm_var": 9.722374981067233e-06, "learning_rate": 0.008316867762863379, "loss": 2.591, "step": 3653 }, { "crossentropy": 2.639102816581726, "epoch": 0.19870034530574512, "grad_norm": 0.037777986377477646, "grad_norm_var": 9.955421080542517e-06, "learning_rate": 0.008315935905226345, "loss": 2.6391, "step": 3654 }, { "crossentropy": 2.716806173324585, "epoch": 0.19875472416324533, "grad_norm": 0.03844878077507019, "grad_norm_var": 9.611432867776688e-06, "learning_rate": 0.00831500384193806, "loss": 2.7168, "step": 3655 }, { "crossentropy": 2.715667486190796, "epoch": 0.19880910302074553, "grad_norm": 0.04505416378378868, "grad_norm_var": 1.0757535084375026e-05, "learning_rate": 0.008314071573056327, "loss": 2.7157, "step": 3656 }, { "crossentropy": 2.7573448419570923, "epoch": 0.19886348187824573, "grad_norm": 0.04646684229373932, "grad_norm_var": 1.2676709886144609e-05, "learning_rate": 0.008313139098638966, "loss": 2.7573, "step": 3657 }, { "crossentropy": 2.858298659324646, "epoch": 0.19891786073574594, "grad_norm": 0.04266064614057541, "grad_norm_var": 1.2648071295900432e-05, "learning_rate": 0.008312206418743806, "loss": 2.8583, "step": 3658 }, { "crossentropy": 2.844995617866516, "epoch": 0.19897223959324614, "grad_norm": 0.0432838574051857, "grad_norm_var": 1.2709079145532126e-05, "learning_rate": 0.008311273533428691, "loss": 2.845, "step": 3659 }, { "crossentropy": 2.6722798347473145, "epoch": 0.19902661845074635, "grad_norm": 0.03592663258314133, "grad_norm_var": 1.2315928668108628e-05, "learning_rate": 0.008310340442751479, "loss": 2.6723, "step": 3660 }, { "crossentropy": 2.707910180091858, "epoch": 0.19908099730824655, "grad_norm": 0.03644296154379845, "grad_norm_var": 1.0722302777003285e-05, "learning_rate": 0.00830940714677004, "loss": 2.7079, "step": 3661 }, { "crossentropy": 2.732632637023926, "epoch": 0.19913537616574675, "grad_norm": 0.037820979952812195, "grad_norm_var": 1.083486064804706e-05, "learning_rate": 0.008308473645542254, "loss": 2.7326, "step": 3662 }, { "crossentropy": 2.6518930196762085, "epoch": 0.19918975502324696, "grad_norm": 0.03740973025560379, "grad_norm_var": 1.1211569242008714e-05, "learning_rate": 0.008307539939126016, "loss": 2.6519, "step": 3663 }, { "crossentropy": 2.7166905403137207, "epoch": 0.19924413388074716, "grad_norm": 0.036347538232803345, "grad_norm_var": 1.1710049370751661e-05, "learning_rate": 0.008306606027579237, "loss": 2.7167, "step": 3664 }, { "crossentropy": 2.6892426013946533, "epoch": 0.19929851273824736, "grad_norm": 0.038072939962148666, "grad_norm_var": 1.1463445251736331e-05, "learning_rate": 0.008305671910959832, "loss": 2.6892, "step": 3665 }, { "crossentropy": 2.7061758041381836, "epoch": 0.19935289159574757, "grad_norm": 0.04051808640360832, "grad_norm_var": 1.1246032286403122e-05, "learning_rate": 0.008304737589325739, "loss": 2.7062, "step": 3666 }, { "crossentropy": 2.7244197130203247, "epoch": 0.19940727045324777, "grad_norm": 0.03997623175382614, "grad_norm_var": 1.052528581031604e-05, "learning_rate": 0.008303803062734903, "loss": 2.7244, "step": 3667 }, { "crossentropy": 2.7447965145111084, "epoch": 0.19946164931074797, "grad_norm": 0.04130859300494194, "grad_norm_var": 1.0554590709948147e-05, "learning_rate": 0.00830286833124528, "loss": 2.7448, "step": 3668 }, { "crossentropy": 2.710708498954773, "epoch": 0.19951602816824818, "grad_norm": 0.04764784127473831, "grad_norm_var": 1.3942121487494028e-05, "learning_rate": 0.008301933394914841, "loss": 2.7107, "step": 3669 }, { "crossentropy": 2.6882599592208862, "epoch": 0.19957040702574838, "grad_norm": 0.04698081687092781, "grad_norm_var": 1.611285439128696e-05, "learning_rate": 0.008300998253801572, "loss": 2.6883, "step": 3670 }, { "crossentropy": 2.6908693313598633, "epoch": 0.19962478588324858, "grad_norm": 0.03822982311248779, "grad_norm_var": 1.618735169302554e-05, "learning_rate": 0.008300062907963469, "loss": 2.6909, "step": 3671 }, { "crossentropy": 2.758992314338684, "epoch": 0.1996791647407488, "grad_norm": 0.04259122163057327, "grad_norm_var": 1.5197107947742943e-05, "learning_rate": 0.008299127357458542, "loss": 2.759, "step": 3672 }, { "crossentropy": 2.616289973258972, "epoch": 0.199733543598249, "grad_norm": 0.03954818844795227, "grad_norm_var": 1.2896953597578826e-05, "learning_rate": 0.00829819160234481, "loss": 2.6163, "step": 3673 }, { "crossentropy": 2.7814879417419434, "epoch": 0.1997879224557492, "grad_norm": 0.03994084894657135, "grad_norm_var": 1.2502452211235876e-05, "learning_rate": 0.008297255642680312, "loss": 2.7815, "step": 3674 }, { "crossentropy": 2.639193058013916, "epoch": 0.1998423013132494, "grad_norm": 0.03742802515625954, "grad_norm_var": 1.21815191654893e-05, "learning_rate": 0.00829631947852309, "loss": 2.6392, "step": 3675 }, { "crossentropy": 2.7340255975723267, "epoch": 0.1998966801707496, "grad_norm": 0.0407903678715229, "grad_norm_var": 1.1172848431742894e-05, "learning_rate": 0.00829538310993121, "loss": 2.734, "step": 3676 }, { "crossentropy": 2.6356360912323, "epoch": 0.1999510590282498, "grad_norm": 0.03602112457156181, "grad_norm_var": 1.1387741279542985e-05, "learning_rate": 0.00829444653696274, "loss": 2.6356, "step": 3677 }, { "crossentropy": 2.707829713821411, "epoch": 0.20000543788575, "grad_norm": 0.04102497547864914, "grad_norm_var": 1.1081580503095357e-05, "learning_rate": 0.00829350975967577, "loss": 2.7078, "step": 3678 }, { "crossentropy": 2.7806832790374756, "epoch": 0.2000598167432502, "grad_norm": 0.03793349489569664, "grad_norm_var": 1.09010893134714e-05, "learning_rate": 0.008292572778128393, "loss": 2.7807, "step": 3679 }, { "crossentropy": 2.7603986263275146, "epoch": 0.20011419560075042, "grad_norm": 0.03920740261673927, "grad_norm_var": 9.915615246695354e-06, "learning_rate": 0.008291635592378721, "loss": 2.7604, "step": 3680 }, { "crossentropy": 2.768267869949341, "epoch": 0.20016857445825062, "grad_norm": 0.03595856949687004, "grad_norm_var": 1.0865508883710294e-05, "learning_rate": 0.00829069820248488, "loss": 2.7683, "step": 3681 }, { "crossentropy": 2.7799253463745117, "epoch": 0.20022295331575082, "grad_norm": 0.03632726892828941, "grad_norm_var": 1.1852004971266093e-05, "learning_rate": 0.008289760608505004, "loss": 2.7799, "step": 3682 }, { "crossentropy": 2.6203233003616333, "epoch": 0.20027733217325103, "grad_norm": 0.03908213973045349, "grad_norm_var": 1.1911616889231866e-05, "learning_rate": 0.008288822810497243, "loss": 2.6203, "step": 3683 }, { "crossentropy": 2.664803147315979, "epoch": 0.20033171103075123, "grad_norm": 0.04412782937288284, "grad_norm_var": 1.2899784111360576e-05, "learning_rate": 0.008287884808519755, "loss": 2.6648, "step": 3684 }, { "crossentropy": 2.6643091440200806, "epoch": 0.20038608988825143, "grad_norm": 0.06076003611087799, "grad_norm_var": 3.670573688576498e-05, "learning_rate": 0.008286946602630718, "loss": 2.6643, "step": 3685 }, { "crossentropy": 2.737370252609253, "epoch": 0.20044046874575167, "grad_norm": 0.04415053874254227, "grad_norm_var": 3.494827920248831e-05, "learning_rate": 0.008286008192888319, "loss": 2.7374, "step": 3686 }, { "crossentropy": 2.7614651918411255, "epoch": 0.20049484760325187, "grad_norm": 0.037628669291734695, "grad_norm_var": 3.517848774661862e-05, "learning_rate": 0.008285069579350752, "loss": 2.7615, "step": 3687 }, { "crossentropy": 2.763929009437561, "epoch": 0.20054922646075207, "grad_norm": 0.03735420107841492, "grad_norm_var": 3.5629692388324206e-05, "learning_rate": 0.008284130762076234, "loss": 2.7639, "step": 3688 }, { "crossentropy": 2.651336193084717, "epoch": 0.20060360531825228, "grad_norm": 0.039544835686683655, "grad_norm_var": 3.5630098570079104e-05, "learning_rate": 0.008283191741122988, "loss": 2.6513, "step": 3689 }, { "crossentropy": 2.605222702026367, "epoch": 0.20065798417575248, "grad_norm": 0.038359854370355606, "grad_norm_var": 3.589470705201477e-05, "learning_rate": 0.008282252516549252, "loss": 2.6052, "step": 3690 }, { "crossentropy": 2.7647844552993774, "epoch": 0.20071236303325268, "grad_norm": 0.03809966519474983, "grad_norm_var": 3.566067614807483e-05, "learning_rate": 0.008281313088413273, "loss": 2.7648, "step": 3691 }, { "crossentropy": 2.6121851205825806, "epoch": 0.2007667418907529, "grad_norm": 0.03723495826125145, "grad_norm_var": 3.626481903971999e-05, "learning_rate": 0.008280373456773316, "loss": 2.6122, "step": 3692 }, { "crossentropy": 2.761350989341736, "epoch": 0.2008211207482531, "grad_norm": 0.08940522372722626, "grad_norm_var": 0.00018480748649811357, "learning_rate": 0.008279433621687657, "loss": 2.7614, "step": 3693 }, { "crossentropy": 2.8312976360321045, "epoch": 0.2008754996057533, "grad_norm": 0.04693777486681938, "grad_norm_var": 0.00018503148009611867, "learning_rate": 0.00827849358321458, "loss": 2.8313, "step": 3694 }, { "crossentropy": 2.714306950569153, "epoch": 0.2009298784632535, "grad_norm": 0.052409347146749496, "grad_norm_var": 0.0001866470271683027, "learning_rate": 0.00827755334141239, "loss": 2.7143, "step": 3695 }, { "crossentropy": 2.7587387561798096, "epoch": 0.2009842573207537, "grad_norm": 0.046063799411058426, "grad_norm_var": 0.00018448458265266313, "learning_rate": 0.008276612896339394, "loss": 2.7587, "step": 3696 }, { "crossentropy": 2.600058436393738, "epoch": 0.2010386361782539, "grad_norm": 0.04306452348828316, "grad_norm_var": 0.00017887011247240628, "learning_rate": 0.008275672248053924, "loss": 2.6001, "step": 3697 }, { "crossentropy": 2.779986023902893, "epoch": 0.2010930150357541, "grad_norm": 0.039628397673368454, "grad_norm_var": 0.00017544365370699786, "learning_rate": 0.008274731396614315, "loss": 2.78, "step": 3698 }, { "crossentropy": 2.5871083736419678, "epoch": 0.2011473938932543, "grad_norm": 0.04688439145684242, "grad_norm_var": 0.0001721913715161114, "learning_rate": 0.008273790342078917, "loss": 2.5871, "step": 3699 }, { "crossentropy": 2.6196571588516235, "epoch": 0.20120177275075452, "grad_norm": 0.03855983540415764, "grad_norm_var": 0.00017578127673102353, "learning_rate": 0.008272849084506094, "loss": 2.6197, "step": 3700 }, { "crossentropy": 2.721043586730957, "epoch": 0.20125615160825472, "grad_norm": 0.040524180978536606, "grad_norm_var": 0.00016156464207460545, "learning_rate": 0.00827190762395422, "loss": 2.721, "step": 3701 }, { "crossentropy": 2.563952326774597, "epoch": 0.20131053046575492, "grad_norm": 0.03796418756246567, "grad_norm_var": 0.0001644433168021854, "learning_rate": 0.008270965960481689, "loss": 2.564, "step": 3702 }, { "crossentropy": 2.6442930698394775, "epoch": 0.20136490932325513, "grad_norm": 0.03912276029586792, "grad_norm_var": 0.00016324307048866618, "learning_rate": 0.008270024094146898, "loss": 2.6443, "step": 3703 }, { "crossentropy": 2.7049193382263184, "epoch": 0.20141928818075533, "grad_norm": 0.04047286882996559, "grad_norm_var": 0.00016090145230160554, "learning_rate": 0.008269082025008258, "loss": 2.7049, "step": 3704 }, { "crossentropy": 2.7916375398635864, "epoch": 0.20147366703825553, "grad_norm": 0.041026800870895386, "grad_norm_var": 0.00016003148323287693, "learning_rate": 0.008268139753124203, "loss": 2.7916, "step": 3705 }, { "crossentropy": 2.7491354942321777, "epoch": 0.20152804589575574, "grad_norm": 0.037620823830366135, "grad_norm_var": 0.00016069380011590287, "learning_rate": 0.008267197278553165, "loss": 2.7491, "step": 3706 }, { "crossentropy": 2.6703296899795532, "epoch": 0.20158242475325594, "grad_norm": 0.037865445017814636, "grad_norm_var": 0.00016090300079049975, "learning_rate": 0.008266254601353598, "loss": 2.6703, "step": 3707 }, { "crossentropy": 2.748559594154358, "epoch": 0.20163680361075614, "grad_norm": 0.03770608827471733, "grad_norm_var": 0.00016044956755285512, "learning_rate": 0.008265311721583965, "loss": 2.7486, "step": 3708 }, { "crossentropy": 2.6964707374572754, "epoch": 0.20169118246825635, "grad_norm": 0.03571668267250061, "grad_norm_var": 2.0607439460786024e-05, "learning_rate": 0.008264368639302746, "loss": 2.6965, "step": 3709 }, { "crossentropy": 2.7299529314041138, "epoch": 0.20174556132575655, "grad_norm": 0.03826265037059784, "grad_norm_var": 1.884544530797357e-05, "learning_rate": 0.008263425354568426, "loss": 2.73, "step": 3710 }, { "crossentropy": 2.7183910608291626, "epoch": 0.20179994018325675, "grad_norm": 0.03862093389034271, "grad_norm_var": 9.395364205415774e-06, "learning_rate": 0.008262481867439506, "loss": 2.7184, "step": 3711 }, { "crossentropy": 2.6895540952682495, "epoch": 0.20185431904075696, "grad_norm": 0.03981286659836769, "grad_norm_var": 6.736924552114681e-06, "learning_rate": 0.008261538177974505, "loss": 2.6896, "step": 3712 }, { "crossentropy": 2.6697369813919067, "epoch": 0.20190869789825716, "grad_norm": 0.04045407846570015, "grad_norm_var": 5.940725747812826e-06, "learning_rate": 0.008260594286231946, "loss": 2.6697, "step": 3713 }, { "crossentropy": 2.626654267311096, "epoch": 0.20196307675575736, "grad_norm": 0.04252779856324196, "grad_norm_var": 6.558222686619576e-06, "learning_rate": 0.00825965019227037, "loss": 2.6267, "step": 3714 }, { "crossentropy": 2.703150153160095, "epoch": 0.20201745561325757, "grad_norm": 0.040450435131788254, "grad_norm_var": 2.8719304217572975e-06, "learning_rate": 0.008258705896148326, "loss": 2.7032, "step": 3715 }, { "crossentropy": 2.787423253059387, "epoch": 0.20207183447075777, "grad_norm": 0.038666605949401855, "grad_norm_var": 2.8639668594933334e-06, "learning_rate": 0.008257761397924382, "loss": 2.7874, "step": 3716 }, { "crossentropy": 2.656783699989319, "epoch": 0.20212621332825798, "grad_norm": 0.043667085468769073, "grad_norm_var": 4.046313700744632e-06, "learning_rate": 0.008256816697657113, "loss": 2.6568, "step": 3717 }, { "crossentropy": 2.715192675590515, "epoch": 0.20218059218575818, "grad_norm": 0.042542606592178345, "grad_norm_var": 4.496793197998096e-06, "learning_rate": 0.008255871795405108, "loss": 2.7152, "step": 3718 }, { "crossentropy": 2.795955777168274, "epoch": 0.20223497104325838, "grad_norm": 0.03798719868063927, "grad_norm_var": 4.658507375054679e-06, "learning_rate": 0.008254926691226969, "loss": 2.796, "step": 3719 }, { "crossentropy": 2.721607804298401, "epoch": 0.20228934990075859, "grad_norm": 0.05082190781831741, "grad_norm_var": 1.2574032326307696e-05, "learning_rate": 0.008253981385181313, "loss": 2.7216, "step": 3720 }, { "crossentropy": 2.723402500152588, "epoch": 0.2023437287582588, "grad_norm": 0.03752006217837334, "grad_norm_var": 1.2972097853586096e-05, "learning_rate": 0.008253035877326764, "loss": 2.7234, "step": 3721 }, { "crossentropy": 2.7473340034484863, "epoch": 0.202398107615759, "grad_norm": 0.04042504355311394, "grad_norm_var": 1.2568326661434546e-05, "learning_rate": 0.008252090167721963, "loss": 2.7473, "step": 3722 }, { "crossentropy": 2.6156378984451294, "epoch": 0.2024524864732592, "grad_norm": 0.04111620783805847, "grad_norm_var": 1.2221046374758655e-05, "learning_rate": 0.008251144256425562, "loss": 2.6156, "step": 3723 }, { "crossentropy": 2.7824018001556396, "epoch": 0.2025068653307594, "grad_norm": 0.039562251418828964, "grad_norm_var": 1.1771242111478093e-05, "learning_rate": 0.008250198143496226, "loss": 2.7824, "step": 3724 }, { "crossentropy": 2.6362292766571045, "epoch": 0.2025612441882596, "grad_norm": 0.03906845673918724, "grad_norm_var": 1.0331398545553444e-05, "learning_rate": 0.00824925182899263, "loss": 2.6362, "step": 3725 }, { "crossentropy": 2.681689143180847, "epoch": 0.2026156230457598, "grad_norm": 0.03928588703274727, "grad_norm_var": 1.0061694607545422e-05, "learning_rate": 0.008248305312973466, "loss": 2.6817, "step": 3726 }, { "crossentropy": 2.6223559379577637, "epoch": 0.20267000190326, "grad_norm": 0.057947780936956406, "grad_norm_var": 2.783544736678119e-05, "learning_rate": 0.008247358595497434, "loss": 2.6224, "step": 3727 }, { "crossentropy": 2.6865214109420776, "epoch": 0.20272438076076021, "grad_norm": 0.046343423426151276, "grad_norm_var": 2.860435343064252e-05, "learning_rate": 0.008246411676623249, "loss": 2.6865, "step": 3728 }, { "crossentropy": 2.6366668939590454, "epoch": 0.20277875961826042, "grad_norm": 0.03583037108182907, "grad_norm_var": 3.11396624129045e-05, "learning_rate": 0.00824546455640964, "loss": 2.6367, "step": 3729 }, { "crossentropy": 2.7162399291992188, "epoch": 0.20283313847576062, "grad_norm": 0.039559800177812576, "grad_norm_var": 3.152496631330237e-05, "learning_rate": 0.008244517234915344, "loss": 2.7162, "step": 3730 }, { "crossentropy": 2.7123560905456543, "epoch": 0.20288751733326083, "grad_norm": 0.03878067433834076, "grad_norm_var": 3.202744419881128e-05, "learning_rate": 0.008243569712199115, "loss": 2.7124, "step": 3731 }, { "crossentropy": 2.6989954710006714, "epoch": 0.20294189619076103, "grad_norm": 0.03921893984079361, "grad_norm_var": 3.181425637089318e-05, "learning_rate": 0.008242621988319715, "loss": 2.699, "step": 3732 }, { "crossentropy": 2.7666077613830566, "epoch": 0.20299627504826123, "grad_norm": 0.03762015327811241, "grad_norm_var": 3.263846940662936e-05, "learning_rate": 0.008241674063335922, "loss": 2.7666, "step": 3733 }, { "crossentropy": 2.734948754310608, "epoch": 0.20305065390576144, "grad_norm": 0.039739035069942474, "grad_norm_var": 3.2731357475523855e-05, "learning_rate": 0.008240725937306528, "loss": 2.7349, "step": 3734 }, { "crossentropy": 2.775290369987488, "epoch": 0.20310503276326164, "grad_norm": 0.038972318172454834, "grad_norm_var": 3.2356653991405294e-05, "learning_rate": 0.00823977761029033, "loss": 2.7753, "step": 3735 }, { "crossentropy": 2.9140576124191284, "epoch": 0.20315941162076184, "grad_norm": 0.048934586346149445, "grad_norm_var": 3.0199079075698734e-05, "learning_rate": 0.008238829082346147, "loss": 2.9141, "step": 3736 }, { "crossentropy": 2.5774322748184204, "epoch": 0.20321379047826205, "grad_norm": 0.04109129309654236, "grad_norm_var": 2.922235442250707e-05, "learning_rate": 0.008237880353532803, "loss": 2.5774, "step": 3737 }, { "crossentropy": 2.7081278562545776, "epoch": 0.20326816933576225, "grad_norm": 0.038899682462215424, "grad_norm_var": 2.9579997367654847e-05, "learning_rate": 0.008236931423909138, "loss": 2.7081, "step": 3738 }, { "crossentropy": 2.720534563064575, "epoch": 0.20332254819326245, "grad_norm": 0.03718499839305878, "grad_norm_var": 3.068059208689924e-05, "learning_rate": 0.008235982293534005, "loss": 2.7205, "step": 3739 }, { "crossentropy": 2.6243958473205566, "epoch": 0.20337692705076266, "grad_norm": 0.045262426137924194, "grad_norm_var": 3.1521732730109035e-05, "learning_rate": 0.008235032962466266, "loss": 2.6244, "step": 3740 }, { "crossentropy": 2.493009924888611, "epoch": 0.20343130590826286, "grad_norm": 0.04915066063404083, "grad_norm_var": 3.462806063299351e-05, "learning_rate": 0.008234083430764799, "loss": 2.493, "step": 3741 }, { "crossentropy": 2.6458921432495117, "epoch": 0.20348568476576306, "grad_norm": 0.04672369733452797, "grad_norm_var": 3.528108422270471e-05, "learning_rate": 0.008233133698488493, "loss": 2.6459, "step": 3742 }, { "crossentropy": 2.696790099143982, "epoch": 0.20354006362326327, "grad_norm": 0.04319534823298454, "grad_norm_var": 1.8652460610282805e-05, "learning_rate": 0.00823218376569625, "loss": 2.6968, "step": 3743 }, { "crossentropy": 2.7966866493225098, "epoch": 0.20359444248076347, "grad_norm": 0.03829389810562134, "grad_norm_var": 1.767203333031056e-05, "learning_rate": 0.008231233632446981, "loss": 2.7967, "step": 3744 }, { "crossentropy": 2.6984479427337646, "epoch": 0.20364882133826367, "grad_norm": 0.03733987361192703, "grad_norm_var": 1.674305180613192e-05, "learning_rate": 0.008230283298799617, "loss": 2.6984, "step": 3745 }, { "crossentropy": 2.7769839763641357, "epoch": 0.20370320019576388, "grad_norm": 0.03747744485735893, "grad_norm_var": 1.748277811902954e-05, "learning_rate": 0.008229332764813094, "loss": 2.777, "step": 3746 }, { "crossentropy": 2.7155630588531494, "epoch": 0.20375757905326408, "grad_norm": 0.03600161522626877, "grad_norm_var": 1.8831482922441328e-05, "learning_rate": 0.00822838203054636, "loss": 2.7156, "step": 3747 }, { "crossentropy": 2.6896896362304688, "epoch": 0.20381195791076429, "grad_norm": 0.0362650528550148, "grad_norm_var": 2.005628965593582e-05, "learning_rate": 0.008227431096058387, "loss": 2.6897, "step": 3748 }, { "crossentropy": 2.7204058170318604, "epoch": 0.2038663367682645, "grad_norm": 0.036011915653944016, "grad_norm_var": 2.0891117946991492e-05, "learning_rate": 0.008226479961408144, "loss": 2.7204, "step": 3749 }, { "crossentropy": 2.8234113454818726, "epoch": 0.2039207156257647, "grad_norm": 0.03851042315363884, "grad_norm_var": 2.113616330819314e-05, "learning_rate": 0.008225528626654623, "loss": 2.8234, "step": 3750 }, { "crossentropy": 2.54752254486084, "epoch": 0.2039750944832649, "grad_norm": 0.037696607410907745, "grad_norm_var": 2.1511711020545146e-05, "learning_rate": 0.008224577091856821, "loss": 2.5475, "step": 3751 }, { "crossentropy": 2.6651694774627686, "epoch": 0.2040294733407651, "grad_norm": 0.03814060986042023, "grad_norm_var": 1.665810645116213e-05, "learning_rate": 0.008223625357073758, "loss": 2.6652, "step": 3752 }, { "crossentropy": 2.6261677742004395, "epoch": 0.2040838521982653, "grad_norm": 0.038135550916194916, "grad_norm_var": 1.670620927909644e-05, "learning_rate": 0.008222673422364452, "loss": 2.6262, "step": 3753 }, { "crossentropy": 2.747623920440674, "epoch": 0.2041382310557655, "grad_norm": 0.0354338176548481, "grad_norm_var": 1.7800523498621526e-05, "learning_rate": 0.008221721287787948, "loss": 2.7476, "step": 3754 }, { "crossentropy": 2.7796562910079956, "epoch": 0.2041926099132657, "grad_norm": 0.03762161731719971, "grad_norm_var": 1.7681947536086576e-05, "learning_rate": 0.008220768953403292, "loss": 2.7797, "step": 3755 }, { "crossentropy": 2.91738498210907, "epoch": 0.2042469887707659, "grad_norm": 0.04337963089346886, "grad_norm_var": 1.6445307302433096e-05, "learning_rate": 0.008219816419269547, "loss": 2.9174, "step": 3756 }, { "crossentropy": 2.590019106864929, "epoch": 0.20430136762826612, "grad_norm": 0.040262218564748764, "grad_norm_var": 9.751607429477293e-06, "learning_rate": 0.008218863685445792, "loss": 2.59, "step": 3757 }, { "crossentropy": 2.7216076850891113, "epoch": 0.20435574648576632, "grad_norm": 0.04353499785065651, "grad_norm_var": 7.010001091774474e-06, "learning_rate": 0.008217910751991112, "loss": 2.7216, "step": 3758 }, { "crossentropy": 2.7279202938079834, "epoch": 0.20441012534326652, "grad_norm": 0.05012558400630951, "grad_norm_var": 1.427529747074398e-05, "learning_rate": 0.008216957618964604, "loss": 2.7279, "step": 3759 }, { "crossentropy": 2.643255352973938, "epoch": 0.20446450420076676, "grad_norm": 0.044924475252628326, "grad_norm_var": 1.6386077629472876e-05, "learning_rate": 0.008216004286425388, "loss": 2.6433, "step": 3760 }, { "crossentropy": 2.832130789756775, "epoch": 0.20451888305826696, "grad_norm": 0.040098272264003754, "grad_norm_var": 1.609333182846362e-05, "learning_rate": 0.00821505075443258, "loss": 2.8321, "step": 3761 }, { "crossentropy": 2.6926571130752563, "epoch": 0.20457326191576716, "grad_norm": 0.03790220618247986, "grad_norm_var": 1.5984327431878043e-05, "learning_rate": 0.008214097023045324, "loss": 2.6927, "step": 3762 }, { "crossentropy": 2.6895558834075928, "epoch": 0.20462764077326737, "grad_norm": 0.036686625331640244, "grad_norm_var": 1.5682459600997675e-05, "learning_rate": 0.008213143092322768, "loss": 2.6896, "step": 3763 }, { "crossentropy": 2.730016350746155, "epoch": 0.20468201963076757, "grad_norm": 0.03946153447031975, "grad_norm_var": 1.4869617037944574e-05, "learning_rate": 0.008212188962324072, "loss": 2.73, "step": 3764 }, { "crossentropy": 2.654040813446045, "epoch": 0.20473639848826777, "grad_norm": 0.039195384830236435, "grad_norm_var": 1.386524796596437e-05, "learning_rate": 0.00821123463310841, "loss": 2.654, "step": 3765 }, { "crossentropy": 2.7880223989486694, "epoch": 0.20479077734576798, "grad_norm": 0.03731885179877281, "grad_norm_var": 1.4201664015274043e-05, "learning_rate": 0.008210280104734973, "loss": 2.788, "step": 3766 }, { "crossentropy": 2.645078659057617, "epoch": 0.20484515620326818, "grad_norm": 0.035866495221853256, "grad_norm_var": 1.4971807364983804e-05, "learning_rate": 0.008209325377262955, "loss": 2.6451, "step": 3767 }, { "crossentropy": 2.637888550758362, "epoch": 0.20489953506076838, "grad_norm": 0.03922322764992714, "grad_norm_var": 1.4793910876990251e-05, "learning_rate": 0.008208370450751568, "loss": 2.6379, "step": 3768 }, { "crossentropy": 2.7282315492630005, "epoch": 0.2049539139182686, "grad_norm": 0.039287276566028595, "grad_norm_var": 1.459846559514824e-05, "learning_rate": 0.008207415325260039, "loss": 2.7282, "step": 3769 }, { "crossentropy": 2.7159085273742676, "epoch": 0.2050082927757688, "grad_norm": 0.038947541266679764, "grad_norm_var": 1.3221431271322424e-05, "learning_rate": 0.0082064600008476, "loss": 2.7159, "step": 3770 }, { "crossentropy": 2.698259949684143, "epoch": 0.205062671633269, "grad_norm": 0.040076080709695816, "grad_norm_var": 1.2741142141641828e-05, "learning_rate": 0.008205504477573502, "loss": 2.6983, "step": 3771 }, { "crossentropy": 2.711275100708008, "epoch": 0.2051170504907692, "grad_norm": 0.03839574009180069, "grad_norm_var": 1.2309017610010603e-05, "learning_rate": 0.008204548755497004, "loss": 2.7113, "step": 3772 }, { "crossentropy": 2.6422717571258545, "epoch": 0.2051714293482694, "grad_norm": 0.035303711891174316, "grad_norm_var": 1.3726316459551115e-05, "learning_rate": 0.008203592834677382, "loss": 2.6423, "step": 3773 }, { "crossentropy": 2.652527332305908, "epoch": 0.2052258082057696, "grad_norm": 0.041581638157367706, "grad_norm_var": 1.2984662205595133e-05, "learning_rate": 0.008202636715173916, "loss": 2.6525, "step": 3774 }, { "crossentropy": 2.775501251220703, "epoch": 0.2052801870632698, "grad_norm": 0.03973696753382683, "grad_norm_var": 5.2191649110938074e-06, "learning_rate": 0.00820168039704591, "loss": 2.7755, "step": 3775 }, { "crossentropy": 2.7105883359909058, "epoch": 0.20533456592077, "grad_norm": 0.041888996958732605, "grad_norm_var": 3.3973849827089706e-06, "learning_rate": 0.008200723880352668, "loss": 2.7106, "step": 3776 }, { "crossentropy": 2.724832534790039, "epoch": 0.20538894477827022, "grad_norm": 0.06785009801387787, "grad_norm_var": 5.629710543429602e-05, "learning_rate": 0.008199767165153516, "loss": 2.7248, "step": 3777 }, { "crossentropy": 2.7395949363708496, "epoch": 0.20544332363577042, "grad_norm": 0.03914259746670723, "grad_norm_var": 5.595616173723339e-05, "learning_rate": 0.008198810251507786, "loss": 2.7396, "step": 3778 }, { "crossentropy": 2.6065627336502075, "epoch": 0.20549770249327062, "grad_norm": 0.04069218784570694, "grad_norm_var": 5.485680026363343e-05, "learning_rate": 0.008197853139474829, "loss": 2.6066, "step": 3779 }, { "crossentropy": 2.7336130142211914, "epoch": 0.20555208135077083, "grad_norm": 0.037535909563302994, "grad_norm_var": 5.5450951279091495e-05, "learning_rate": 0.008196895829114, "loss": 2.7336, "step": 3780 }, { "crossentropy": 2.6233832836151123, "epoch": 0.20560646020827103, "grad_norm": 0.040649209171533585, "grad_norm_var": 5.5281182568439555e-05, "learning_rate": 0.008195938320484673, "loss": 2.6234, "step": 3781 }, { "crossentropy": 2.690175414085388, "epoch": 0.20566083906577123, "grad_norm": 0.03793869912624359, "grad_norm_var": 5.501389383155604e-05, "learning_rate": 0.00819498061364623, "loss": 2.6902, "step": 3782 }, { "crossentropy": 2.70819091796875, "epoch": 0.20571521792327144, "grad_norm": 0.04047754034399986, "grad_norm_var": 5.3259021692263674e-05, "learning_rate": 0.00819402270865807, "loss": 2.7082, "step": 3783 }, { "crossentropy": 2.6469486951828003, "epoch": 0.20576959678077164, "grad_norm": 0.037426456809043884, "grad_norm_var": 5.3927294091121675e-05, "learning_rate": 0.008193064605579599, "loss": 2.6469, "step": 3784 }, { "crossentropy": 2.7259551286697388, "epoch": 0.20582397563827184, "grad_norm": 0.03891361132264137, "grad_norm_var": 5.4024249995137984e-05, "learning_rate": 0.008192106304470237, "loss": 2.726, "step": 3785 }, { "crossentropy": 2.567463517189026, "epoch": 0.20587835449577205, "grad_norm": 0.07589343190193176, "grad_norm_var": 0.0001290545354369972, "learning_rate": 0.008191147805389418, "loss": 2.5675, "step": 3786 }, { "crossentropy": 2.7608083486557007, "epoch": 0.20593273335327225, "grad_norm": 0.037614986300468445, "grad_norm_var": 0.00013050542839507888, "learning_rate": 0.008190189108396586, "loss": 2.7608, "step": 3787 }, { "crossentropy": 2.7361936569213867, "epoch": 0.20598711221077246, "grad_norm": 0.03949398174881935, "grad_norm_var": 0.00012987876137660376, "learning_rate": 0.008189230213551201, "loss": 2.7362, "step": 3788 }, { "crossentropy": 2.7906328439712524, "epoch": 0.20604149106827266, "grad_norm": 0.039037615060806274, "grad_norm_var": 0.0001267896920787885, "learning_rate": 0.008188271120912732, "loss": 2.7906, "step": 3789 }, { "crossentropy": 2.703473448753357, "epoch": 0.20609586992577286, "grad_norm": 0.03803010284900665, "grad_norm_var": 0.0001284827158153344, "learning_rate": 0.00818731183054066, "loss": 2.7035, "step": 3790 }, { "crossentropy": 2.6073479652404785, "epoch": 0.20615024878327307, "grad_norm": 0.03790203854441643, "grad_norm_var": 0.0001295575694922959, "learning_rate": 0.008186352342494482, "loss": 2.6073, "step": 3791 }, { "crossentropy": 2.7470967769622803, "epoch": 0.20620462764077327, "grad_norm": 0.03871219605207443, "grad_norm_var": 0.00013072476634483812, "learning_rate": 0.008185392656833698, "loss": 2.7471, "step": 3792 }, { "crossentropy": 2.7045788764953613, "epoch": 0.20625900649827347, "grad_norm": 0.04395771399140358, "grad_norm_var": 8.710164822687919e-05, "learning_rate": 0.008184432773617833, "loss": 2.7046, "step": 3793 }, { "crossentropy": 2.6761668920516968, "epoch": 0.20631338535577368, "grad_norm": 0.043848276138305664, "grad_norm_var": 8.702933258861635e-05, "learning_rate": 0.008183472692906417, "loss": 2.6762, "step": 3794 }, { "crossentropy": 2.796906590461731, "epoch": 0.20636776421327388, "grad_norm": 0.04079660400748253, "grad_norm_var": 8.701517912578875e-05, "learning_rate": 0.008182512414758992, "loss": 2.7969, "step": 3795 }, { "crossentropy": 2.784270763397217, "epoch": 0.20642214307077408, "grad_norm": 0.041707057505846024, "grad_norm_var": 8.575096639187688e-05, "learning_rate": 0.008181551939235114, "loss": 2.7843, "step": 3796 }, { "crossentropy": 2.6904376745224, "epoch": 0.2064765219282743, "grad_norm": 0.04567554220557213, "grad_norm_var": 8.640796356095988e-05, "learning_rate": 0.008180591266394352, "loss": 2.6904, "step": 3797 }, { "crossentropy": 2.6625157594680786, "epoch": 0.2065309007857745, "grad_norm": 0.041806530207395554, "grad_norm_var": 8.50736284039953e-05, "learning_rate": 0.008179630396296284, "loss": 2.6625, "step": 3798 }, { "crossentropy": 2.796363353729248, "epoch": 0.2065852796432747, "grad_norm": 0.04209215193986893, "grad_norm_var": 8.478375916859774e-05, "learning_rate": 0.008178669329000503, "loss": 2.7964, "step": 3799 }, { "crossentropy": 2.6813548803329468, "epoch": 0.2066396585007749, "grad_norm": 0.03884120285511017, "grad_norm_var": 8.391752911871183e-05, "learning_rate": 0.008177708064566613, "loss": 2.6814, "step": 3800 }, { "crossentropy": 2.7611361742019653, "epoch": 0.2066940373582751, "grad_norm": 0.04270968213677406, "grad_norm_var": 8.286618420249396e-05, "learning_rate": 0.008176746603054232, "loss": 2.7611, "step": 3801 }, { "crossentropy": 2.68035888671875, "epoch": 0.2067484162157753, "grad_norm": 0.03986402973532677, "grad_norm_var": 6.01688631660374e-06, "learning_rate": 0.00817578494452299, "loss": 2.6804, "step": 3802 }, { "crossentropy": 2.630603551864624, "epoch": 0.2068027950732755, "grad_norm": 0.03798248618841171, "grad_norm_var": 5.871436961936832e-06, "learning_rate": 0.008174823089032524, "loss": 2.6306, "step": 3803 }, { "crossentropy": 2.793931484222412, "epoch": 0.2068571739307757, "grad_norm": 0.03933436796069145, "grad_norm_var": 5.900367767878017e-06, "learning_rate": 0.008173861036642492, "loss": 2.7939, "step": 3804 }, { "crossentropy": 2.7794690132141113, "epoch": 0.20691155278827592, "grad_norm": 0.038764216005802155, "grad_norm_var": 5.968139404990448e-06, "learning_rate": 0.008172898787412557, "loss": 2.7795, "step": 3805 }, { "crossentropy": 2.5497264862060547, "epoch": 0.20696593164577612, "grad_norm": 0.04336458817124367, "grad_norm_var": 5.811042687638389e-06, "learning_rate": 0.008171936341402397, "loss": 2.5497, "step": 3806 }, { "crossentropy": 2.635491967201233, "epoch": 0.20702031050327632, "grad_norm": 0.04964952915906906, "grad_norm_var": 9.450817893466058e-06, "learning_rate": 0.008170973698671702, "loss": 2.6355, "step": 3807 }, { "crossentropy": 2.6065253019332886, "epoch": 0.20707468936077653, "grad_norm": 0.03955373913049698, "grad_norm_var": 9.146463569951502e-06, "learning_rate": 0.008170010859280175, "loss": 2.6065, "step": 3808 }, { "crossentropy": 2.7875397205352783, "epoch": 0.20712906821827673, "grad_norm": 0.038456991314888, "grad_norm_var": 9.50766456170223e-06, "learning_rate": 0.00816904782328753, "loss": 2.7875, "step": 3809 }, { "crossentropy": 2.7536925077438354, "epoch": 0.20718344707577693, "grad_norm": 0.03440690040588379, "grad_norm_var": 1.2157929102861916e-05, "learning_rate": 0.008168084590753492, "loss": 2.7537, "step": 3810 }, { "crossentropy": 2.72527277469635, "epoch": 0.20723782593327714, "grad_norm": 0.03692128136754036, "grad_norm_var": 1.3169545734264867e-05, "learning_rate": 0.008167121161737803, "loss": 2.7253, "step": 3811 }, { "crossentropy": 2.7100311517715454, "epoch": 0.20729220479077734, "grad_norm": 0.04000048339366913, "grad_norm_var": 1.3121430043867323e-05, "learning_rate": 0.008166157536300211, "loss": 2.71, "step": 3812 }, { "crossentropy": 2.675601840019226, "epoch": 0.20734658364827754, "grad_norm": 0.04058261588215828, "grad_norm_var": 1.1288485618759075e-05, "learning_rate": 0.00816519371450048, "loss": 2.6756, "step": 3813 }, { "crossentropy": 2.5435155630111694, "epoch": 0.20740096250577775, "grad_norm": 0.036647602915763855, "grad_norm_var": 1.1895444902830047e-05, "learning_rate": 0.008164229696398387, "loss": 2.5435, "step": 3814 }, { "crossentropy": 2.6525237560272217, "epoch": 0.20745534136327795, "grad_norm": 0.0388241745531559, "grad_norm_var": 1.1628758011373621e-05, "learning_rate": 0.00816326548205372, "loss": 2.6525, "step": 3815 }, { "crossentropy": 2.736133337020874, "epoch": 0.20750972022077815, "grad_norm": 0.037094950675964355, "grad_norm_var": 1.2029545250707465e-05, "learning_rate": 0.008162301071526273, "loss": 2.7361, "step": 3816 }, { "crossentropy": 2.5950759649276733, "epoch": 0.20756409907827836, "grad_norm": 0.03638906031847, "grad_norm_var": 1.1935125086527203e-05, "learning_rate": 0.008161336464875863, "loss": 2.5951, "step": 3817 }, { "crossentropy": 2.6451215744018555, "epoch": 0.20761847793577856, "grad_norm": 0.036754146218299866, "grad_norm_var": 1.2280754114321555e-05, "learning_rate": 0.008160371662162314, "loss": 2.6451, "step": 3818 }, { "crossentropy": 2.749575614929199, "epoch": 0.20767285679327877, "grad_norm": 0.03866390883922577, "grad_norm_var": 1.221319852707971e-05, "learning_rate": 0.008159406663445461, "loss": 2.7496, "step": 3819 }, { "crossentropy": 2.8051685094833374, "epoch": 0.20772723565077897, "grad_norm": 0.04684395715594292, "grad_norm_var": 1.5984467287449788e-05, "learning_rate": 0.008158441468785152, "loss": 2.8052, "step": 3820 }, { "crossentropy": 2.6750636100769043, "epoch": 0.20778161450827917, "grad_norm": 0.05191820487380028, "grad_norm_var": 2.5407571339590892e-05, "learning_rate": 0.008157476078241245, "loss": 2.6751, "step": 3821 }, { "crossentropy": 2.664794683456421, "epoch": 0.20783599336577938, "grad_norm": 0.04391307011246681, "grad_norm_var": 2.5644675013544728e-05, "learning_rate": 0.008156510491873619, "loss": 2.6648, "step": 3822 }, { "crossentropy": 2.720085382461548, "epoch": 0.20789037222327958, "grad_norm": 0.04084652662277222, "grad_norm_var": 1.964767859731371e-05, "learning_rate": 0.008155544709742153, "loss": 2.7201, "step": 3823 }, { "crossentropy": 2.6598504781723022, "epoch": 0.20794475108077978, "grad_norm": 0.04126963019371033, "grad_norm_var": 1.9760804367823424e-05, "learning_rate": 0.008154578731906748, "loss": 2.6599, "step": 3824 }, { "crossentropy": 2.687751054763794, "epoch": 0.20799912993828, "grad_norm": 0.03971056640148163, "grad_norm_var": 1.9605989613673732e-05, "learning_rate": 0.00815361255842731, "loss": 2.6878, "step": 3825 }, { "crossentropy": 2.6678820848464966, "epoch": 0.2080535087957802, "grad_norm": 0.07938799262046814, "grad_norm_var": 0.0001122226275592488, "learning_rate": 0.00815264618936376, "loss": 2.6679, "step": 3826 }, { "crossentropy": 2.76524817943573, "epoch": 0.2081078876532804, "grad_norm": 0.03861650824546814, "grad_norm_var": 0.0001110597941882368, "learning_rate": 0.008151679624776033, "loss": 2.7652, "step": 3827 }, { "crossentropy": 2.775449275970459, "epoch": 0.2081622665107806, "grad_norm": 0.16107480227947235, "grad_norm_var": 0.0009793661935108527, "learning_rate": 0.008150712864724075, "loss": 2.7754, "step": 3828 }, { "crossentropy": 2.6902642250061035, "epoch": 0.2082166453682808, "grad_norm": 0.04848090186715126, "grad_norm_var": 0.0009727856892247038, "learning_rate": 0.008149745909267843, "loss": 2.6903, "step": 3829 }, { "crossentropy": 2.7194007635116577, "epoch": 0.208271024225781, "grad_norm": 0.04850250855088234, "grad_norm_var": 0.0009588401151661578, "learning_rate": 0.008148778758467305, "loss": 2.7194, "step": 3830 }, { "crossentropy": 2.8106685876846313, "epoch": 0.2083254030832812, "grad_norm": 0.05158296227455139, "grad_norm_var": 0.0009469943028325951, "learning_rate": 0.008147811412382444, "loss": 2.8107, "step": 3831 }, { "crossentropy": 2.6472885608673096, "epoch": 0.2083797819407814, "grad_norm": 0.05467062070965767, "grad_norm_var": 0.0009300465292962921, "learning_rate": 0.008146843871073254, "loss": 2.6473, "step": 3832 }, { "crossentropy": 2.7220500707626343, "epoch": 0.20843416079828161, "grad_norm": 0.09631174802780151, "grad_norm_var": 0.0010164449358318116, "learning_rate": 0.008145876134599743, "loss": 2.7221, "step": 3833 }, { "crossentropy": 2.769405961036682, "epoch": 0.20848853965578182, "grad_norm": 0.04531409218907356, "grad_norm_var": 0.0009974502644456249, "learning_rate": 0.008144908203021925, "loss": 2.7694, "step": 3834 }, { "crossentropy": 2.7645962238311768, "epoch": 0.20854291851328205, "grad_norm": 0.0431676022708416, "grad_norm_var": 0.0009871402683713094, "learning_rate": 0.008143940076399831, "loss": 2.7646, "step": 3835 }, { "crossentropy": 2.817046642303467, "epoch": 0.20859729737078225, "grad_norm": 0.039329223334789276, "grad_norm_var": 0.0010020738532427625, "learning_rate": 0.008142971754793506, "loss": 2.817, "step": 3836 }, { "crossentropy": 2.781318426132202, "epoch": 0.20865167622828246, "grad_norm": 0.04005272686481476, "grad_norm_var": 0.0010201090602060237, "learning_rate": 0.008142003238263003, "loss": 2.7813, "step": 3837 }, { "crossentropy": 2.776244878768921, "epoch": 0.20870605508578266, "grad_norm": 0.04098431393504143, "grad_norm_var": 0.001025761267536197, "learning_rate": 0.00814103452686839, "loss": 2.7762, "step": 3838 }, { "crossentropy": 2.6654449701309204, "epoch": 0.20876043394328286, "grad_norm": 0.037092648446559906, "grad_norm_var": 0.001034642705140857, "learning_rate": 0.008140065620669741, "loss": 2.6654, "step": 3839 }, { "crossentropy": 2.6787527799606323, "epoch": 0.20881481280078307, "grad_norm": 0.03949769213795662, "grad_norm_var": 0.0010384601138612502, "learning_rate": 0.008139096519727152, "loss": 2.6788, "step": 3840 }, { "crossentropy": 2.7751253843307495, "epoch": 0.20886919165828327, "grad_norm": 0.04049122333526611, "grad_norm_var": 0.001036752082544147, "learning_rate": 0.008138127224100723, "loss": 2.7751, "step": 3841 }, { "crossentropy": 2.685335874557495, "epoch": 0.20892357051578347, "grad_norm": 0.04007254168391228, "grad_norm_var": 0.0010135610667495463, "learning_rate": 0.008137157733850566, "loss": 2.6853, "step": 3842 }, { "crossentropy": 2.7256468534469604, "epoch": 0.20897794937328368, "grad_norm": 0.04201088100671768, "grad_norm_var": 0.0010072837347239517, "learning_rate": 0.008136188049036815, "loss": 2.7256, "step": 3843 }, { "crossentropy": 2.723689556121826, "epoch": 0.20903232823078388, "grad_norm": 0.041497860103845596, "grad_norm_var": 0.00019841220445019017, "learning_rate": 0.008135218169719604, "loss": 2.7237, "step": 3844 }, { "crossentropy": 2.6136332750320435, "epoch": 0.20908670708828409, "grad_norm": 0.038094114512205124, "grad_norm_var": 0.00020284961502788722, "learning_rate": 0.008134248095959085, "loss": 2.6136, "step": 3845 }, { "crossentropy": 2.679305672645569, "epoch": 0.2091410859457843, "grad_norm": 0.037599675357341766, "grad_norm_var": 0.00020688401506409326, "learning_rate": 0.008133277827815422, "loss": 2.6793, "step": 3846 }, { "crossentropy": 2.7199209928512573, "epoch": 0.2091954648032845, "grad_norm": 0.03941184654831886, "grad_norm_var": 0.00020624765819215687, "learning_rate": 0.008132307365348788, "loss": 2.7199, "step": 3847 }, { "crossentropy": 2.79558789730072, "epoch": 0.2092498436607847, "grad_norm": 0.04321306571364403, "grad_norm_var": 0.00019925860135441744, "learning_rate": 0.008131336708619373, "loss": 2.7956, "step": 3848 }, { "crossentropy": 2.6844996213912964, "epoch": 0.2093042225182849, "grad_norm": 0.043447185307741165, "grad_norm_var": 5.262179618093869e-06, "learning_rate": 0.008130365857687371, "loss": 2.6845, "step": 3849 }, { "crossentropy": 2.687562584877014, "epoch": 0.2093586013757851, "grad_norm": 0.044161148369312286, "grad_norm_var": 4.636691227636637e-06, "learning_rate": 0.008129394812613, "loss": 2.6876, "step": 3850 }, { "crossentropy": 2.609371304512024, "epoch": 0.2094129802332853, "grad_norm": 0.041992202401161194, "grad_norm_var": 4.325774567901647e-06, "learning_rate": 0.008128423573456481, "loss": 2.6094, "step": 3851 }, { "crossentropy": 2.704107165336609, "epoch": 0.2094673590907855, "grad_norm": 0.04037534445524216, "grad_norm_var": 4.222602029244765e-06, "learning_rate": 0.008127452140278046, "loss": 2.7041, "step": 3852 }, { "crossentropy": 2.7636550664901733, "epoch": 0.2095217379482857, "grad_norm": 0.04142262414097786, "grad_norm_var": 4.2354264279541195e-06, "learning_rate": 0.008126480513137947, "loss": 2.7637, "step": 3853 }, { "crossentropy": 2.7304344177246094, "epoch": 0.20957611680578592, "grad_norm": 0.04031316563487053, "grad_norm_var": 4.239055979761373e-06, "learning_rate": 0.008125508692096441, "loss": 2.7304, "step": 3854 }, { "crossentropy": 2.694019913673401, "epoch": 0.20963049566328612, "grad_norm": 0.04062989354133606, "grad_norm_var": 3.33465593068723e-06, "learning_rate": 0.008124536677213799, "loss": 2.694, "step": 3855 }, { "crossentropy": 2.8164795637130737, "epoch": 0.20968487452078632, "grad_norm": 0.0662415623664856, "grad_norm_var": 4.307418319216364e-05, "learning_rate": 0.008123564468550306, "loss": 2.8165, "step": 3856 }, { "crossentropy": 2.756739377975464, "epoch": 0.20973925337828653, "grad_norm": 0.04121607542037964, "grad_norm_var": 4.290699382089776e-05, "learning_rate": 0.008122592066166256, "loss": 2.7567, "step": 3857 }, { "crossentropy": 2.61056125164032, "epoch": 0.20979363223578673, "grad_norm": 0.04068887606263161, "grad_norm_var": 4.2722524880841e-05, "learning_rate": 0.008121619470121959, "loss": 2.6106, "step": 3858 }, { "crossentropy": 2.6767079830169678, "epoch": 0.20984801109328693, "grad_norm": 0.039333876222372055, "grad_norm_var": 4.339666073432617e-05, "learning_rate": 0.008120646680477732, "loss": 2.6767, "step": 3859 }, { "crossentropy": 2.7045453786849976, "epoch": 0.20990238995078714, "grad_norm": 0.04263824224472046, "grad_norm_var": 4.332899912081166e-05, "learning_rate": 0.008119673697293906, "loss": 2.7045, "step": 3860 }, { "crossentropy": 2.7905783653259277, "epoch": 0.20995676880828734, "grad_norm": 0.0658341571688652, "grad_norm_var": 7.494738958305347e-05, "learning_rate": 0.008118700520630826, "loss": 2.7906, "step": 3861 }, { "crossentropy": 2.653040647506714, "epoch": 0.21001114766578755, "grad_norm": 0.04488075524568558, "grad_norm_var": 7.177307926550241e-05, "learning_rate": 0.008117727150548848, "loss": 2.653, "step": 3862 }, { "crossentropy": 2.7267866134643555, "epoch": 0.21006552652328775, "grad_norm": 0.040609460324048996, "grad_norm_var": 7.101231135458504e-05, "learning_rate": 0.008116753587108338, "loss": 2.7268, "step": 3863 }, { "crossentropy": 2.7266368865966797, "epoch": 0.21011990538078795, "grad_norm": 0.0432114414870739, "grad_norm_var": 7.101265786660644e-05, "learning_rate": 0.008115779830369678, "loss": 2.7266, "step": 3864 }, { "crossentropy": 2.627975583076477, "epoch": 0.21017428423828816, "grad_norm": 0.04517320543527603, "grad_norm_var": 7.088470375843092e-05, "learning_rate": 0.008114805880393258, "loss": 2.628, "step": 3865 }, { "crossentropy": 2.749024748802185, "epoch": 0.21022866309578836, "grad_norm": 0.041164498776197433, "grad_norm_var": 7.174920042112235e-05, "learning_rate": 0.008113831737239478, "loss": 2.749, "step": 3866 }, { "crossentropy": 2.6943023204803467, "epoch": 0.21028304195328856, "grad_norm": 0.037774596363306046, "grad_norm_var": 7.440215151266263e-05, "learning_rate": 0.008112857400968758, "loss": 2.6943, "step": 3867 }, { "crossentropy": 2.5690665245056152, "epoch": 0.21033742081078877, "grad_norm": 0.04002634435892105, "grad_norm_var": 7.460026654859466e-05, "learning_rate": 0.008111882871641525, "loss": 2.5691, "step": 3868 }, { "crossentropy": 2.7110642194747925, "epoch": 0.21039179966828897, "grad_norm": 0.04148600250482559, "grad_norm_var": 7.457495668904916e-05, "learning_rate": 0.00811090814931822, "loss": 2.7111, "step": 3869 }, { "crossentropy": 2.801303505897522, "epoch": 0.21044617852578917, "grad_norm": 0.038089219480752945, "grad_norm_var": 7.611116786551773e-05, "learning_rate": 0.008109933234059287, "loss": 2.8013, "step": 3870 }, { "crossentropy": 2.723649263381958, "epoch": 0.21050055738328938, "grad_norm": 0.03904445096850395, "grad_norm_var": 7.704672074369256e-05, "learning_rate": 0.008108958125925197, "loss": 2.7236, "step": 3871 }, { "crossentropy": 2.671271562576294, "epoch": 0.21055493624078958, "grad_norm": 0.04362522438168526, "grad_norm_var": 4.2588901943572434e-05, "learning_rate": 0.00810798282497642, "loss": 2.6713, "step": 3872 }, { "crossentropy": 2.62840735912323, "epoch": 0.21060931509828978, "grad_norm": 0.03983410447835922, "grad_norm_var": 4.3000084385070615e-05, "learning_rate": 0.008107007331273449, "loss": 2.6284, "step": 3873 }, { "crossentropy": 2.689884305000305, "epoch": 0.21066369395579, "grad_norm": 0.03833986446261406, "grad_norm_var": 4.397903546856739e-05, "learning_rate": 0.008106031644876778, "loss": 2.6899, "step": 3874 }, { "crossentropy": 2.6683881282806396, "epoch": 0.2107180728132902, "grad_norm": 0.041725654155015945, "grad_norm_var": 4.330564853226206e-05, "learning_rate": 0.00810505576584692, "loss": 2.6684, "step": 3875 }, { "crossentropy": 2.7424228191375732, "epoch": 0.2107724516707904, "grad_norm": 0.04080595821142197, "grad_norm_var": 4.3534492809644126e-05, "learning_rate": 0.0081040796942444, "loss": 2.7424, "step": 3876 }, { "crossentropy": 2.6684811115264893, "epoch": 0.2108268305282906, "grad_norm": 0.03830942511558533, "grad_norm_var": 5.622365784317164e-06, "learning_rate": 0.00810310343012975, "loss": 2.6685, "step": 3877 }, { "crossentropy": 2.75822377204895, "epoch": 0.2108812093857908, "grad_norm": 0.0388457365334034, "grad_norm_var": 4.680438277411778e-06, "learning_rate": 0.00810212697356352, "loss": 2.7582, "step": 3878 }, { "crossentropy": 2.7358661890029907, "epoch": 0.210935588243291, "grad_norm": 0.03698180615901947, "grad_norm_var": 5.451956522220929e-06, "learning_rate": 0.008101150324606265, "loss": 2.7359, "step": 3879 }, { "crossentropy": 2.8713310956954956, "epoch": 0.2109899671007912, "grad_norm": 0.038162168115377426, "grad_norm_var": 5.0700639610980085e-06, "learning_rate": 0.008100173483318557, "loss": 2.8713, "step": 3880 }, { "crossentropy": 2.73195743560791, "epoch": 0.2110443459582914, "grad_norm": 0.03855440765619278, "grad_norm_var": 3.2089659053481542e-06, "learning_rate": 0.008099196449760981, "loss": 2.732, "step": 3881 }, { "crossentropy": 2.661875009536743, "epoch": 0.21109872481579162, "grad_norm": 0.04160914197564125, "grad_norm_var": 3.3171525709367484e-06, "learning_rate": 0.008098219223994133, "loss": 2.6619, "step": 3882 }, { "crossentropy": 2.7275068759918213, "epoch": 0.21115310367329182, "grad_norm": 0.044339925050735474, "grad_norm_var": 4.434319954652044e-06, "learning_rate": 0.008097241806078615, "loss": 2.7275, "step": 3883 }, { "crossentropy": 2.7640684843063354, "epoch": 0.21120748253079202, "grad_norm": 0.046388428658246994, "grad_norm_var": 6.998118431597068e-06, "learning_rate": 0.008096264196075047, "loss": 2.7641, "step": 3884 }, { "crossentropy": 2.7897948026657104, "epoch": 0.21126186138829223, "grad_norm": 0.05497700721025467, "grad_norm_var": 2.035613077564265e-05, "learning_rate": 0.008095286394044063, "loss": 2.7898, "step": 3885 }, { "crossentropy": 2.7601994276046753, "epoch": 0.21131624024579243, "grad_norm": 0.047564394772052765, "grad_norm_var": 2.200313762589755e-05, "learning_rate": 0.0080943084000463, "loss": 2.7602, "step": 3886 }, { "crossentropy": 2.709566116333008, "epoch": 0.21137061910329263, "grad_norm": 0.040884751826524734, "grad_norm_var": 2.1533949456660945e-05, "learning_rate": 0.008093330214142417, "loss": 2.7096, "step": 3887 }, { "crossentropy": 2.7218470573425293, "epoch": 0.21142499796079284, "grad_norm": 0.0501248836517334, "grad_norm_var": 2.5639731996590122e-05, "learning_rate": 0.008092351836393074, "loss": 2.7218, "step": 3888 }, { "crossentropy": 2.7281512022018433, "epoch": 0.21147937681829304, "grad_norm": 0.03607066720724106, "grad_norm_var": 2.778262583729655e-05, "learning_rate": 0.008091373266858957, "loss": 2.7282, "step": 3889 }, { "crossentropy": 2.671225428581238, "epoch": 0.21153375567579324, "grad_norm": 0.037153106182813644, "grad_norm_var": 2.8466466383214998e-05, "learning_rate": 0.008090394505600751, "loss": 2.6712, "step": 3890 }, { "crossentropy": 2.802462935447693, "epoch": 0.21158813453329345, "grad_norm": 0.04921358823776245, "grad_norm_var": 3.1665842818952524e-05, "learning_rate": 0.008089415552679158, "loss": 2.8025, "step": 3891 }, { "crossentropy": 2.721709966659546, "epoch": 0.21164251339079365, "grad_norm": 0.03946349769830704, "grad_norm_var": 3.208154152331162e-05, "learning_rate": 0.008088436408154896, "loss": 2.7217, "step": 3892 }, { "crossentropy": 2.812601327896118, "epoch": 0.21169689224829386, "grad_norm": 0.0468892939388752, "grad_norm_var": 3.198551005260669e-05, "learning_rate": 0.008087457072088685, "loss": 2.8126, "step": 3893 }, { "crossentropy": 2.6228139400482178, "epoch": 0.21175127110579406, "grad_norm": 0.04294935613870621, "grad_norm_var": 3.07915662611837e-05, "learning_rate": 0.008086477544541264, "loss": 2.6228, "step": 3894 }, { "crossentropy": 2.6199086904525757, "epoch": 0.21180564996329426, "grad_norm": 0.03717362508177757, "grad_norm_var": 3.0634628196439934e-05, "learning_rate": 0.008085497825573383, "loss": 2.6199, "step": 3895 }, { "crossentropy": 2.757573962211609, "epoch": 0.21186002882079447, "grad_norm": 0.04104914143681526, "grad_norm_var": 2.9208673796419076e-05, "learning_rate": 0.008084517915245806, "loss": 2.7576, "step": 3896 }, { "crossentropy": 2.6977500915527344, "epoch": 0.21191440767829467, "grad_norm": 0.049061402678489685, "grad_norm_var": 2.9319677435720298e-05, "learning_rate": 0.008083537813619302, "loss": 2.6978, "step": 3897 }, { "crossentropy": 2.8076916933059692, "epoch": 0.21196878653579487, "grad_norm": 0.03728734329342842, "grad_norm_var": 3.1897609830203584e-05, "learning_rate": 0.008082557520754657, "loss": 2.8077, "step": 3898 }, { "crossentropy": 2.6833566427230835, "epoch": 0.21202316539329508, "grad_norm": 0.0378536693751812, "grad_norm_var": 3.404880516282826e-05, "learning_rate": 0.008081577036712669, "loss": 2.6834, "step": 3899 }, { "crossentropy": 2.600961685180664, "epoch": 0.21207754425079528, "grad_norm": 0.03616027906537056, "grad_norm_var": 3.648655001608307e-05, "learning_rate": 0.008080596361554145, "loss": 2.601, "step": 3900 }, { "crossentropy": 2.759278178215027, "epoch": 0.21213192310829548, "grad_norm": 0.03705458343029022, "grad_norm_var": 2.7325513672345433e-05, "learning_rate": 0.00807961549533991, "loss": 2.7593, "step": 3901 }, { "crossentropy": 2.762635588645935, "epoch": 0.2121863019657957, "grad_norm": 0.03815566003322601, "grad_norm_var": 2.540368380563133e-05, "learning_rate": 0.00807863443813079, "loss": 2.7626, "step": 3902 }, { "crossentropy": 2.775293469429016, "epoch": 0.2122406808232959, "grad_norm": 0.040523629635572433, "grad_norm_var": 2.5419023182356403e-05, "learning_rate": 0.008077653189987632, "loss": 2.7753, "step": 3903 }, { "crossentropy": 2.7033344507217407, "epoch": 0.2122950596807961, "grad_norm": 0.04068403318524361, "grad_norm_var": 1.9517859666850062e-05, "learning_rate": 0.008076671750971294, "loss": 2.7033, "step": 3904 }, { "crossentropy": 2.7227195501327515, "epoch": 0.2123494385382963, "grad_norm": 0.039630480110645294, "grad_norm_var": 1.82448229741056e-05, "learning_rate": 0.008075690121142643, "loss": 2.7227, "step": 3905 }, { "crossentropy": 2.619662284851074, "epoch": 0.2124038173957965, "grad_norm": 0.0371008962392807, "grad_norm_var": 1.8269294020537007e-05, "learning_rate": 0.008074708300562557, "loss": 2.6197, "step": 3906 }, { "crossentropy": 2.6621984243392944, "epoch": 0.2124581962532967, "grad_norm": 0.06379955261945724, "grad_norm_var": 4.823879122773725e-05, "learning_rate": 0.008073726289291929, "loss": 2.6622, "step": 3907 }, { "crossentropy": 2.7151074409484863, "epoch": 0.2125125751107969, "grad_norm": 0.039391908794641495, "grad_norm_var": 4.8259049334929174e-05, "learning_rate": 0.008072744087391661, "loss": 2.7151, "step": 3908 }, { "crossentropy": 2.6761595010757446, "epoch": 0.21256695396829714, "grad_norm": 0.04035312682390213, "grad_norm_var": 4.6274091931375094e-05, "learning_rate": 0.008071761694922672, "loss": 2.6762, "step": 3909 }, { "crossentropy": 2.6326687335968018, "epoch": 0.21262133282579734, "grad_norm": 0.03708608075976372, "grad_norm_var": 4.7007663662723626e-05, "learning_rate": 0.008070779111945885, "loss": 2.6327, "step": 3910 }, { "crossentropy": 2.7302955389022827, "epoch": 0.21267571168329755, "grad_norm": 0.040953777730464935, "grad_norm_var": 4.608668408080732e-05, "learning_rate": 0.008069796338522239, "loss": 2.7303, "step": 3911 }, { "crossentropy": 2.707846522331238, "epoch": 0.21273009054079775, "grad_norm": 0.044525906443595886, "grad_norm_var": 4.686074047626354e-05, "learning_rate": 0.008068813374712687, "loss": 2.7078, "step": 3912 }, { "crossentropy": 2.5663715600967407, "epoch": 0.21278446939829795, "grad_norm": 0.046959392726421356, "grad_norm_var": 4.4940991638209243e-05, "learning_rate": 0.008067830220578191, "loss": 2.5664, "step": 3913 }, { "crossentropy": 2.687354803085327, "epoch": 0.21283884825579816, "grad_norm": 0.04193417727947235, "grad_norm_var": 4.3931406055027164e-05, "learning_rate": 0.008066846876179725, "loss": 2.6874, "step": 3914 }, { "crossentropy": 2.784201145172119, "epoch": 0.21289322711329836, "grad_norm": 0.0416923426091671, "grad_norm_var": 4.3044723880399324e-05, "learning_rate": 0.008065863341578277, "loss": 2.7842, "step": 3915 }, { "crossentropy": 2.711888551712036, "epoch": 0.21294760597079856, "grad_norm": 0.04371388629078865, "grad_norm_var": 4.110663770321224e-05, "learning_rate": 0.008064879616834839, "loss": 2.7119, "step": 3916 }, { "crossentropy": 2.5725523233413696, "epoch": 0.21300198482829877, "grad_norm": 0.041565895080566406, "grad_norm_var": 3.9345298985530436e-05, "learning_rate": 0.008063895702010427, "loss": 2.5726, "step": 3917 }, { "crossentropy": 2.7194631099700928, "epoch": 0.21305636368579897, "grad_norm": 0.03706800192594528, "grad_norm_var": 4.003177097051114e-05, "learning_rate": 0.008062911597166059, "loss": 2.7195, "step": 3918 }, { "crossentropy": 2.711351990699768, "epoch": 0.21311074254329918, "grad_norm": 0.03651511296629906, "grad_norm_var": 4.199156447006469e-05, "learning_rate": 0.008061927302362768, "loss": 2.7114, "step": 3919 }, { "crossentropy": 2.7257978916168213, "epoch": 0.21316512140079938, "grad_norm": 0.04277055338025093, "grad_norm_var": 4.188061133904112e-05, "learning_rate": 0.008060942817661603, "loss": 2.7258, "step": 3920 }, { "crossentropy": 2.7539256811141968, "epoch": 0.21321950025829958, "grad_norm": 0.03731324151158333, "grad_norm_var": 4.30074207361121e-05, "learning_rate": 0.008059958143123618, "loss": 2.7539, "step": 3921 }, { "crossentropy": 2.635540723800659, "epoch": 0.21327387911579979, "grad_norm": 0.03827868029475212, "grad_norm_var": 4.231747353165722e-05, "learning_rate": 0.00805897327880988, "loss": 2.6355, "step": 3922 }, { "crossentropy": 2.775520086288452, "epoch": 0.2133282579733, "grad_norm": 0.03965805843472481, "grad_norm_var": 8.959964286170812e-06, "learning_rate": 0.008057988224781472, "loss": 2.7755, "step": 3923 }, { "crossentropy": 2.7106086015701294, "epoch": 0.2133826368308002, "grad_norm": 0.037885941565036774, "grad_norm_var": 9.346550548742366e-06, "learning_rate": 0.008057002981099485, "loss": 2.7106, "step": 3924 }, { "crossentropy": 2.6797516345977783, "epoch": 0.2134370156883004, "grad_norm": 0.036402784287929535, "grad_norm_var": 1.0408261583993762e-05, "learning_rate": 0.008056017547825024, "loss": 2.6798, "step": 3925 }, { "crossentropy": 2.7218822240829468, "epoch": 0.2134913945458006, "grad_norm": 0.04315183684229851, "grad_norm_var": 1.0132604789405096e-05, "learning_rate": 0.008055031925019203, "loss": 2.7219, "step": 3926 }, { "crossentropy": 2.6960036754608154, "epoch": 0.2135457734033008, "grad_norm": 0.043506424874067307, "grad_norm_var": 1.0643468357406499e-05, "learning_rate": 0.00805404611274315, "loss": 2.696, "step": 3927 }, { "crossentropy": 2.7189981937408447, "epoch": 0.213600152260801, "grad_norm": 0.04182470962405205, "grad_norm_var": 9.760778228947118e-06, "learning_rate": 0.008053060111058006, "loss": 2.719, "step": 3928 }, { "crossentropy": 2.7044988870620728, "epoch": 0.2136545311183012, "grad_norm": 0.040853433310985565, "grad_norm_var": 6.946208336562852e-06, "learning_rate": 0.008052073920024919, "loss": 2.7045, "step": 3929 }, { "crossentropy": 2.587363123893738, "epoch": 0.21370890997580141, "grad_norm": 0.039517033845186234, "grad_norm_var": 6.77130423172295e-06, "learning_rate": 0.008051087539705052, "loss": 2.5874, "step": 3930 }, { "crossentropy": 2.730374574661255, "epoch": 0.21376328883330162, "grad_norm": 0.060751356184482574, "grad_norm_var": 3.350191172381718e-05, "learning_rate": 0.008050100970159582, "loss": 2.7304, "step": 3931 }, { "crossentropy": 2.7774354219436646, "epoch": 0.21381766769080182, "grad_norm": 0.03699745982885361, "grad_norm_var": 3.4158328691903604e-05, "learning_rate": 0.008049114211449692, "loss": 2.7774, "step": 3932 }, { "crossentropy": 2.716030478477478, "epoch": 0.21387204654830202, "grad_norm": 0.03830792009830475, "grad_norm_var": 3.452324944790321e-05, "learning_rate": 0.008048127263636582, "loss": 2.716, "step": 3933 }, { "crossentropy": 2.7286617755889893, "epoch": 0.21392642540580223, "grad_norm": 0.03852474316954613, "grad_norm_var": 3.395525439726469e-05, "learning_rate": 0.00804714012678146, "loss": 2.7287, "step": 3934 }, { "crossentropy": 2.672532558441162, "epoch": 0.21398080426330243, "grad_norm": 0.03971053659915924, "grad_norm_var": 3.278221956059483e-05, "learning_rate": 0.00804615280094555, "loss": 2.6725, "step": 3935 }, { "crossentropy": 2.7931004762649536, "epoch": 0.21403518312080264, "grad_norm": 0.04524204507470131, "grad_norm_var": 3.375867149174311e-05, "learning_rate": 0.008045165286190082, "loss": 2.7931, "step": 3936 }, { "crossentropy": 2.756627917289734, "epoch": 0.21408956197830284, "grad_norm": 0.045668672770261765, "grad_norm_var": 3.3880618529116774e-05, "learning_rate": 0.008044177582576303, "loss": 2.7566, "step": 3937 }, { "crossentropy": 2.6720070838928223, "epoch": 0.21414394083580304, "grad_norm": 0.04499801993370056, "grad_norm_var": 3.368868592321939e-05, "learning_rate": 0.008043189690165467, "loss": 2.672, "step": 3938 }, { "crossentropy": 2.6427754163742065, "epoch": 0.21419831969330325, "grad_norm": 0.04135116934776306, "grad_norm_var": 3.332503801437934e-05, "learning_rate": 0.008042201609018845, "loss": 2.6428, "step": 3939 }, { "crossentropy": 2.614005208015442, "epoch": 0.21425269855080345, "grad_norm": 0.03966172784566879, "grad_norm_var": 3.25081670732793e-05, "learning_rate": 0.008041213339197713, "loss": 2.614, "step": 3940 }, { "crossentropy": 2.775914430618286, "epoch": 0.21430707740830365, "grad_norm": 0.03879602253437042, "grad_norm_var": 3.099093303815982e-05, "learning_rate": 0.008040224880763367, "loss": 2.7759, "step": 3941 }, { "crossentropy": 2.686419725418091, "epoch": 0.21436145626580386, "grad_norm": 0.03802833333611488, "grad_norm_var": 3.2137743850917515e-05, "learning_rate": 0.008039236233777109, "loss": 2.6864, "step": 3942 }, { "crossentropy": 2.6809853315353394, "epoch": 0.21441583512330406, "grad_norm": 0.040607038885354996, "grad_norm_var": 3.212281697362266e-05, "learning_rate": 0.00803824739830025, "loss": 2.681, "step": 3943 }, { "crossentropy": 2.7055141925811768, "epoch": 0.21447021398080426, "grad_norm": 0.03834186866879463, "grad_norm_var": 3.292869341895149e-05, "learning_rate": 0.008037258374394124, "loss": 2.7055, "step": 3944 }, { "crossentropy": 2.718138813972473, "epoch": 0.21452459283830447, "grad_norm": 0.03754105791449547, "grad_norm_var": 3.399266328475796e-05, "learning_rate": 0.008036269162120063, "loss": 2.7181, "step": 3945 }, { "crossentropy": 2.640084981918335, "epoch": 0.21457897169580467, "grad_norm": 0.041540712118148804, "grad_norm_var": 3.371280750999502e-05, "learning_rate": 0.00803527976153942, "loss": 2.6401, "step": 3946 }, { "crossentropy": 2.548694610595703, "epoch": 0.21463335055330487, "grad_norm": 0.04000202938914299, "grad_norm_var": 7.71855862351419e-06, "learning_rate": 0.008034290172713555, "loss": 2.5487, "step": 3947 }, { "crossentropy": 2.76166832447052, "epoch": 0.21468772941080508, "grad_norm": 0.03712454438209534, "grad_norm_var": 7.663057763801264e-06, "learning_rate": 0.008033300395703845, "loss": 2.7617, "step": 3948 }, { "crossentropy": 2.7574864625930786, "epoch": 0.21474210826830528, "grad_norm": 0.04001612588763237, "grad_norm_var": 7.382510912332512e-06, "learning_rate": 0.00803231043057167, "loss": 2.7575, "step": 3949 }, { "crossentropy": 2.753840923309326, "epoch": 0.21479648712580549, "grad_norm": 0.041589170694351196, "grad_norm_var": 7.183947482243778e-06, "learning_rate": 0.008031320277378429, "loss": 2.7538, "step": 3950 }, { "crossentropy": 2.802356243133545, "epoch": 0.2148508659833057, "grad_norm": 0.03937700018286705, "grad_norm_var": 7.232176883943154e-06, "learning_rate": 0.00803032993618553, "loss": 2.8024, "step": 3951 }, { "crossentropy": 2.8140478134155273, "epoch": 0.2149052448408059, "grad_norm": 0.040860824286937714, "grad_norm_var": 5.730585518280012e-06, "learning_rate": 0.008029339407054395, "loss": 2.814, "step": 3952 }, { "crossentropy": 2.6365026235580444, "epoch": 0.2149596236983061, "grad_norm": 0.04994507133960724, "grad_norm_var": 9.909604626998948e-06, "learning_rate": 0.008028348690046453, "loss": 2.6365, "step": 3953 }, { "crossentropy": 2.6899490356445312, "epoch": 0.2150140025558063, "grad_norm": 0.03718077391386032, "grad_norm_var": 9.156656803094703e-06, "learning_rate": 0.008027357785223151, "loss": 2.6899, "step": 3954 }, { "crossentropy": 2.6837888956069946, "epoch": 0.2150683814133065, "grad_norm": 0.03505946695804596, "grad_norm_var": 1.0600210795132641e-05, "learning_rate": 0.008026366692645939, "loss": 2.6838, "step": 3955 }, { "crossentropy": 2.6685707569122314, "epoch": 0.2151227602708067, "grad_norm": 0.05065884068608284, "grad_norm_var": 1.8059389631356707e-05, "learning_rate": 0.008025375412376287, "loss": 2.6686, "step": 3956 }, { "crossentropy": 2.7817994356155396, "epoch": 0.2151771391283069, "grad_norm": 0.04061686620116234, "grad_norm_var": 1.7873114393792724e-05, "learning_rate": 0.008024383944475673, "loss": 2.7818, "step": 3957 }, { "crossentropy": 2.616194009780884, "epoch": 0.2152315179858071, "grad_norm": 0.039780959486961365, "grad_norm_var": 1.7480355323079335e-05, "learning_rate": 0.008023392289005586, "loss": 2.6162, "step": 3958 }, { "crossentropy": 2.6708998680114746, "epoch": 0.21528589684330732, "grad_norm": 0.037787631154060364, "grad_norm_var": 1.798961758327124e-05, "learning_rate": 0.008022400446027528, "loss": 2.6709, "step": 3959 }, { "crossentropy": 2.5665934085845947, "epoch": 0.21534027570080752, "grad_norm": 0.04052424430847168, "grad_norm_var": 1.7669804486576216e-05, "learning_rate": 0.008021408415603011, "loss": 2.5666, "step": 3960 }, { "crossentropy": 2.6664226055145264, "epoch": 0.21539465455830772, "grad_norm": 0.041182901710271835, "grad_norm_var": 1.7013223636886768e-05, "learning_rate": 0.008020416197793563, "loss": 2.6664, "step": 3961 }, { "crossentropy": 2.70856773853302, "epoch": 0.21544903341580793, "grad_norm": 0.03902731463313103, "grad_norm_var": 1.716918518596579e-05, "learning_rate": 0.008019423792660716, "loss": 2.7086, "step": 3962 }, { "crossentropy": 2.772583842277527, "epoch": 0.21550341227330813, "grad_norm": 0.038338713347911835, "grad_norm_var": 1.7490429248580528e-05, "learning_rate": 0.008018431200266024, "loss": 2.7726, "step": 3963 }, { "crossentropy": 2.7244796752929688, "epoch": 0.21555779113080833, "grad_norm": 0.04218915104866028, "grad_norm_var": 1.6769010254667065e-05, "learning_rate": 0.00801743842067104, "loss": 2.7245, "step": 3964 }, { "crossentropy": 2.769739031791687, "epoch": 0.21561216998830854, "grad_norm": 0.04894500970840454, "grad_norm_var": 2.0719266542796633e-05, "learning_rate": 0.008016445453937338, "loss": 2.7697, "step": 3965 }, { "crossentropy": 2.734490394592285, "epoch": 0.21566654884580874, "grad_norm": 0.04085030034184456, "grad_norm_var": 2.0738838825570816e-05, "learning_rate": 0.008015452300126503, "loss": 2.7345, "step": 3966 }, { "crossentropy": 2.787935256958008, "epoch": 0.21572092770330895, "grad_norm": 0.0367647185921669, "grad_norm_var": 2.1868327960040437e-05, "learning_rate": 0.008014458959300126, "loss": 2.7879, "step": 3967 }, { "crossentropy": 2.7436578273773193, "epoch": 0.21577530656080915, "grad_norm": 0.038063615560531616, "grad_norm_var": 2.2495803988801557e-05, "learning_rate": 0.008013465431519816, "loss": 2.7437, "step": 3968 }, { "crossentropy": 2.6050446033477783, "epoch": 0.21582968541830935, "grad_norm": 0.03850887715816498, "grad_norm_var": 1.7117542358745645e-05, "learning_rate": 0.00801247171684719, "loss": 2.605, "step": 3969 }, { "crossentropy": 2.7210012674331665, "epoch": 0.21588406427580956, "grad_norm": 0.039042290300130844, "grad_norm_var": 1.6549382283057718e-05, "learning_rate": 0.008011477815343875, "loss": 2.721, "step": 3970 }, { "crossentropy": 2.6154950857162476, "epoch": 0.21593844313330976, "grad_norm": 0.034314364194869995, "grad_norm_var": 1.7120489273565973e-05, "learning_rate": 0.008010483727071513, "loss": 2.6155, "step": 3971 }, { "crossentropy": 2.6715309619903564, "epoch": 0.21599282199080996, "grad_norm": 0.035912465304136276, "grad_norm_var": 1.0564762332984646e-05, "learning_rate": 0.008009489452091758, "loss": 2.6715, "step": 3972 }, { "crossentropy": 2.6463069915771484, "epoch": 0.21604720084831017, "grad_norm": 0.03886578604578972, "grad_norm_var": 1.0493444705189647e-05, "learning_rate": 0.008008494990466275, "loss": 2.6463, "step": 3973 }, { "crossentropy": 2.684525728225708, "epoch": 0.21610157970581037, "grad_norm": 0.04034767299890518, "grad_norm_var": 1.054372806278935e-05, "learning_rate": 0.008007500342256734, "loss": 2.6845, "step": 3974 }, { "crossentropy": 2.593840718269348, "epoch": 0.21615595856331057, "grad_norm": 0.037161555141210556, "grad_norm_var": 1.0704204534420167e-05, "learning_rate": 0.008006505507524828, "loss": 2.5938, "step": 3975 }, { "crossentropy": 2.7066584825515747, "epoch": 0.21621033742081078, "grad_norm": 0.038308195769786835, "grad_norm_var": 1.0672283010786503e-05, "learning_rate": 0.008005510486332251, "loss": 2.7067, "step": 3976 }, { "crossentropy": 2.7872445583343506, "epoch": 0.21626471627831098, "grad_norm": 0.03946889191865921, "grad_norm_var": 1.0411633226521898e-05, "learning_rate": 0.008004515278740718, "loss": 2.7872, "step": 3977 }, { "crossentropy": 2.6992130279541016, "epoch": 0.21631909513581118, "grad_norm": 0.036932144314050674, "grad_norm_var": 1.0715182642948944e-05, "learning_rate": 0.008003519884811947, "loss": 2.6992, "step": 3978 }, { "crossentropy": 2.803106427192688, "epoch": 0.2163734739933114, "grad_norm": 0.036405257880687714, "grad_norm_var": 1.1119520612958474e-05, "learning_rate": 0.008002524304607675, "loss": 2.8031, "step": 3979 }, { "crossentropy": 2.6712183952331543, "epoch": 0.2164278528508116, "grad_norm": 0.03755269944667816, "grad_norm_var": 1.0417378952273792e-05, "learning_rate": 0.008001528538189643, "loss": 2.6712, "step": 3980 }, { "crossentropy": 2.6805936098098755, "epoch": 0.2164822317083118, "grad_norm": 0.04539518058300018, "grad_norm_var": 6.3039377704558286e-06, "learning_rate": 0.00800053258561961, "loss": 2.6806, "step": 3981 }, { "crossentropy": 2.8104617595672607, "epoch": 0.216536610565812, "grad_norm": 0.04379343241453171, "grad_norm_var": 7.819265248969603e-06, "learning_rate": 0.007999536446959343, "loss": 2.8105, "step": 3982 }, { "crossentropy": 2.7617465257644653, "epoch": 0.2165909894233122, "grad_norm": 0.15021660923957825, "grad_norm_var": 0.0007852365560909985, "learning_rate": 0.007998540122270623, "loss": 2.7617, "step": 3983 }, { "crossentropy": 2.6549689769744873, "epoch": 0.21664536828081243, "grad_norm": 0.04735647141933441, "grad_norm_var": 0.0007812425818666872, "learning_rate": 0.007997543611615239, "loss": 2.655, "step": 3984 }, { "crossentropy": 2.727257251739502, "epoch": 0.21669974713831264, "grad_norm": 0.048569776117801666, "grad_norm_var": 0.0007772196377338049, "learning_rate": 0.007996546915054996, "loss": 2.7273, "step": 3985 }, { "crossentropy": 2.738140106201172, "epoch": 0.21675412599581284, "grad_norm": 0.042012568563222885, "grad_norm_var": 0.0007746778453448223, "learning_rate": 0.007995550032651707, "loss": 2.7381, "step": 3986 }, { "crossentropy": 2.7690643072128296, "epoch": 0.21680850485331304, "grad_norm": 0.043094947934150696, "grad_norm_var": 0.0007646000133206972, "learning_rate": 0.007994552964467196, "loss": 2.7691, "step": 3987 }, { "crossentropy": 2.6390483379364014, "epoch": 0.21686288371081325, "grad_norm": 0.042526498436927795, "grad_norm_var": 0.0007570385771679342, "learning_rate": 0.007993555710563302, "loss": 2.639, "step": 3988 }, { "crossentropy": 2.675992488861084, "epoch": 0.21691726256831345, "grad_norm": 0.04395955055952072, "grad_norm_var": 0.0007524562318053789, "learning_rate": 0.007992558271001876, "loss": 2.676, "step": 3989 }, { "crossentropy": 2.7033145427703857, "epoch": 0.21697164142581365, "grad_norm": 0.04332166537642479, "grad_norm_var": 0.0007498481955858762, "learning_rate": 0.007991560645844773, "loss": 2.7033, "step": 3990 }, { "crossentropy": 2.6760388612747192, "epoch": 0.21702602028331386, "grad_norm": 0.038159340620040894, "grad_norm_var": 0.0007484013470092143, "learning_rate": 0.007990562835153871, "loss": 2.676, "step": 3991 }, { "crossentropy": 2.7494723796844482, "epoch": 0.21708039914081406, "grad_norm": 0.04011879116296768, "grad_norm_var": 0.0007461296135976703, "learning_rate": 0.007989564838991048, "loss": 2.7495, "step": 3992 }, { "crossentropy": 2.721083402633667, "epoch": 0.21713477799831427, "grad_norm": 0.04091465845704079, "grad_norm_var": 0.00074448459264446, "learning_rate": 0.007988566657418202, "loss": 2.7211, "step": 3993 }, { "crossentropy": 2.769782781600952, "epoch": 0.21718915685581447, "grad_norm": 0.03895355015993118, "grad_norm_var": 0.0007415492630660005, "learning_rate": 0.007987568290497236, "loss": 2.7698, "step": 3994 }, { "crossentropy": 2.568224310874939, "epoch": 0.21724353571331467, "grad_norm": 0.036494333297014236, "grad_norm_var": 0.0007414013987607101, "learning_rate": 0.007986569738290072, "loss": 2.5682, "step": 3995 }, { "crossentropy": 2.6388182640075684, "epoch": 0.21729791457081488, "grad_norm": 0.03929628059267998, "grad_norm_var": 0.0007389528292886193, "learning_rate": 0.007985571000858638, "loss": 2.6388, "step": 3996 }, { "crossentropy": 2.709826707839966, "epoch": 0.21735229342831508, "grad_norm": 0.03758638724684715, "grad_norm_var": 0.0007465290972981255, "learning_rate": 0.00798457207826487, "loss": 2.7098, "step": 3997 }, { "crossentropy": 2.7211647033691406, "epoch": 0.21740667228581528, "grad_norm": 0.03907734900712967, "grad_norm_var": 0.0007508934618279968, "learning_rate": 0.00798357297057073, "loss": 2.7212, "step": 3998 }, { "crossentropy": 2.6338080167770386, "epoch": 0.2174610511433155, "grad_norm": 0.03607628867030144, "grad_norm_var": 1.3019505371651638e-05, "learning_rate": 0.007982573677838172, "loss": 2.6338, "step": 3999 }, { "crossentropy": 2.8719773292541504, "epoch": 0.2175154300008157, "grad_norm": 0.03887435048818588, "grad_norm_var": 1.0434637113033013e-05, "learning_rate": 0.007981574200129175, "loss": 2.872, "step": 4000 }, { "crossentropy": 2.7567298412323, "epoch": 0.2175698088583159, "grad_norm": 0.03743842616677284, "grad_norm_var": 6.2979532168522255e-06, "learning_rate": 0.007980574537505726, "loss": 2.7567, "step": 4001 }, { "crossentropy": 2.620676636695862, "epoch": 0.2176241877158161, "grad_norm": 0.04072767496109009, "grad_norm_var": 6.03391391767686e-06, "learning_rate": 0.007979574690029825, "loss": 2.6207, "step": 4002 }, { "crossentropy": 2.6947872638702393, "epoch": 0.2176785665733163, "grad_norm": 0.04048531875014305, "grad_norm_var": 5.309157807387166e-06, "learning_rate": 0.00797857465776348, "loss": 2.6948, "step": 4003 }, { "crossentropy": 2.5936368703842163, "epoch": 0.2177329454308165, "grad_norm": 0.04338490217924118, "grad_norm_var": 5.6872241320113835e-06, "learning_rate": 0.00797757444076871, "loss": 2.5936, "step": 4004 }, { "crossentropy": 2.683396577835083, "epoch": 0.2177873242883167, "grad_norm": 0.048403024673461914, "grad_norm_var": 9.457141521225046e-06, "learning_rate": 0.007976574039107547, "loss": 2.6834, "step": 4005 }, { "crossentropy": 2.660645365715027, "epoch": 0.2178417031458169, "grad_norm": 0.04218382388353348, "grad_norm_var": 9.027601682866775e-06, "learning_rate": 0.00797557345284204, "loss": 2.6606, "step": 4006 }, { "crossentropy": 2.7627655267715454, "epoch": 0.21789608200331712, "grad_norm": 0.07154780626296997, "grad_norm_var": 7.101563700700007e-05, "learning_rate": 0.00797457268203424, "loss": 2.7628, "step": 4007 }, { "crossentropy": 2.594394326210022, "epoch": 0.21795046086081732, "grad_norm": 0.03887570649385452, "grad_norm_var": 7.141948872140807e-05, "learning_rate": 0.007973571726746218, "loss": 2.5944, "step": 4008 }, { "crossentropy": 2.746080994606018, "epoch": 0.21800483971831752, "grad_norm": 0.03843538463115692, "grad_norm_var": 7.212773253815729e-05, "learning_rate": 0.007972570587040049, "loss": 2.7461, "step": 4009 }, { "crossentropy": 2.631129741668701, "epoch": 0.21805921857581773, "grad_norm": 0.03698447719216347, "grad_norm_var": 7.310163360217756e-05, "learning_rate": 0.007971569262977821, "loss": 2.6311, "step": 4010 }, { "crossentropy": 2.730591058731079, "epoch": 0.21811359743331793, "grad_norm": 0.040420204401016235, "grad_norm_var": 7.138347060210159e-05, "learning_rate": 0.007970567754621639, "loss": 2.7306, "step": 4011 }, { "crossentropy": 2.689277768135071, "epoch": 0.21816797629081813, "grad_norm": 0.04037998989224434, "grad_norm_var": 7.108609086907298e-05, "learning_rate": 0.007969566062033615, "loss": 2.6893, "step": 4012 }, { "crossentropy": 2.6617757081985474, "epoch": 0.21822235514831834, "grad_norm": 0.04014510288834572, "grad_norm_var": 7.001338021923413e-05, "learning_rate": 0.007968564185275872, "loss": 2.6618, "step": 4013 }, { "crossentropy": 2.7099140882492065, "epoch": 0.21827673400581854, "grad_norm": 0.03759488835930824, "grad_norm_var": 7.074621859638377e-05, "learning_rate": 0.007967562124410545, "loss": 2.7099, "step": 4014 }, { "crossentropy": 2.6385399103164673, "epoch": 0.21833111286331874, "grad_norm": 0.0510399267077446, "grad_norm_var": 7.292723544247865e-05, "learning_rate": 0.007966559879499784, "loss": 2.6385, "step": 4015 }, { "crossentropy": 2.6768521070480347, "epoch": 0.21838549172081895, "grad_norm": 0.05262105539441109, "grad_norm_var": 7.729970562364569e-05, "learning_rate": 0.007965557450605742, "loss": 2.6769, "step": 4016 }, { "crossentropy": 2.711631655693054, "epoch": 0.21843987057831915, "grad_norm": 0.03604651987552643, "grad_norm_var": 7.859988750984944e-05, "learning_rate": 0.007964554837790594, "loss": 2.7116, "step": 4017 }, { "crossentropy": 2.7479395866394043, "epoch": 0.21849424943581935, "grad_norm": 0.036162957549095154, "grad_norm_var": 8.171410463563474e-05, "learning_rate": 0.007963552041116517, "loss": 2.7479, "step": 4018 }, { "crossentropy": 2.6407811641693115, "epoch": 0.21854862829331956, "grad_norm": 0.039264287799596786, "grad_norm_var": 8.228497446980583e-05, "learning_rate": 0.007962549060645707, "loss": 2.6408, "step": 4019 }, { "crossentropy": 2.665430784225464, "epoch": 0.21860300715081976, "grad_norm": 0.045549988746643066, "grad_norm_var": 8.259000857158486e-05, "learning_rate": 0.007961545896440367, "loss": 2.6654, "step": 4020 }, { "crossentropy": 2.6572526693344116, "epoch": 0.21865738600831996, "grad_norm": 0.049524154514074326, "grad_norm_var": 8.340471236106348e-05, "learning_rate": 0.007960542548562708, "loss": 2.6573, "step": 4021 }, { "crossentropy": 2.7494016885757446, "epoch": 0.21871176486582017, "grad_norm": 0.03837728872895241, "grad_norm_var": 8.50029529540386e-05, "learning_rate": 0.007959539017074965, "loss": 2.7494, "step": 4022 }, { "crossentropy": 2.749351143836975, "epoch": 0.21876614372332037, "grad_norm": 0.044532373547554016, "grad_norm_var": 2.8905538529580554e-05, "learning_rate": 0.007958535302039367, "loss": 2.7494, "step": 4023 }, { "crossentropy": 2.8251839876174927, "epoch": 0.21882052258082058, "grad_norm": 0.04190421849489212, "grad_norm_var": 2.8369765410006984e-05, "learning_rate": 0.00795753140351817, "loss": 2.8252, "step": 4024 }, { "crossentropy": 2.7569881677627563, "epoch": 0.21887490143832078, "grad_norm": 0.04381490871310234, "grad_norm_var": 2.7756937467835324e-05, "learning_rate": 0.007956527321573632, "loss": 2.757, "step": 4025 }, { "crossentropy": 2.667617440223694, "epoch": 0.21892928029582098, "grad_norm": 0.04255871847271919, "grad_norm_var": 2.586151435666108e-05, "learning_rate": 0.007955523056268026, "loss": 2.6676, "step": 4026 }, { "crossentropy": 2.7344484329223633, "epoch": 0.2189836591533212, "grad_norm": 0.041162922978401184, "grad_norm_var": 2.5690423411331837e-05, "learning_rate": 0.007954518607663637, "loss": 2.7344, "step": 4027 }, { "crossentropy": 2.6577956676483154, "epoch": 0.2190380380108214, "grad_norm": 0.03788146749138832, "grad_norm_var": 2.6800982938287158e-05, "learning_rate": 0.007953513975822755, "loss": 2.6578, "step": 4028 }, { "crossentropy": 2.813645839691162, "epoch": 0.2190924168683216, "grad_norm": 0.03658716380596161, "grad_norm_var": 2.8655371216164635e-05, "learning_rate": 0.007952509160807694, "loss": 2.8136, "step": 4029 }, { "crossentropy": 2.6086227893829346, "epoch": 0.2191467957258218, "grad_norm": 0.035925302654504776, "grad_norm_var": 2.984671133371026e-05, "learning_rate": 0.007951504162680765, "loss": 2.6086, "step": 4030 }, { "crossentropy": 2.5672574043273926, "epoch": 0.219201174583322, "grad_norm": 0.03905511647462845, "grad_norm_var": 2.447357183852153e-05, "learning_rate": 0.0079504989815043, "loss": 2.5673, "step": 4031 }, { "crossentropy": 2.6798053979873657, "epoch": 0.2192555534408222, "grad_norm": 0.03563270345330238, "grad_norm_var": 1.689169846723973e-05, "learning_rate": 0.007949493617340637, "loss": 2.6798, "step": 4032 }, { "crossentropy": 2.6510089635849, "epoch": 0.2193099322983224, "grad_norm": 0.03875808045268059, "grad_norm_var": 1.583195120116356e-05, "learning_rate": 0.007948488070252132, "loss": 2.651, "step": 4033 }, { "crossentropy": 2.6388412714004517, "epoch": 0.2193643111558226, "grad_norm": 0.042521342635154724, "grad_norm_var": 1.4751214106524929e-05, "learning_rate": 0.007947482340301144, "loss": 2.6388, "step": 4034 }, { "crossentropy": 2.6446776390075684, "epoch": 0.21941869001332281, "grad_norm": 0.04732442647218704, "grad_norm_var": 1.7144377141504557e-05, "learning_rate": 0.007946476427550052, "loss": 2.6447, "step": 4035 }, { "crossentropy": 2.6986138820648193, "epoch": 0.21947306887082302, "grad_norm": 0.041862886399030685, "grad_norm_var": 1.5914225381858695e-05, "learning_rate": 0.007945470332061236, "loss": 2.6986, "step": 4036 }, { "crossentropy": 2.665176749229431, "epoch": 0.21952744772832322, "grad_norm": 0.04332999512553215, "grad_norm_var": 1.134566110465255e-05, "learning_rate": 0.007944464053897099, "loss": 2.6652, "step": 4037 }, { "crossentropy": 2.710359215736389, "epoch": 0.21958182658582343, "grad_norm": 0.041614510118961334, "grad_norm_var": 1.0997305442471867e-05, "learning_rate": 0.007943457593120045, "loss": 2.7104, "step": 4038 }, { "crossentropy": 2.6015591621398926, "epoch": 0.21963620544332363, "grad_norm": 0.05759957805275917, "grad_norm_var": 2.799075552685117e-05, "learning_rate": 0.007942450949792493, "loss": 2.6016, "step": 4039 }, { "crossentropy": 2.79144549369812, "epoch": 0.21969058430082383, "grad_norm": 0.0357060506939888, "grad_norm_var": 3.024028261895678e-05, "learning_rate": 0.00794144412397688, "loss": 2.7914, "step": 4040 }, { "crossentropy": 2.7030428647994995, "epoch": 0.21974496315832404, "grad_norm": 0.034923192113637924, "grad_norm_var": 3.223977101329678e-05, "learning_rate": 0.007940437115735643, "loss": 2.703, "step": 4041 }, { "crossentropy": 2.6066737174987793, "epoch": 0.21979934201582424, "grad_norm": 0.06611493974924088, "grad_norm_var": 7.251456840345315e-05, "learning_rate": 0.007939429925131237, "loss": 2.6067, "step": 4042 }, { "crossentropy": 2.7420610189437866, "epoch": 0.21985372087332444, "grad_norm": 0.04149302840232849, "grad_norm_var": 7.247353321734664e-05, "learning_rate": 0.007938422552226128, "loss": 2.7421, "step": 4043 }, { "crossentropy": 2.7722129821777344, "epoch": 0.21990809973082465, "grad_norm": 0.03619449958205223, "grad_norm_var": 7.363864552639617e-05, "learning_rate": 0.007937414997082792, "loss": 2.7722, "step": 4044 }, { "crossentropy": 2.823387622833252, "epoch": 0.21996247858832485, "grad_norm": 0.0394655205309391, "grad_norm_var": 7.201571959027092e-05, "learning_rate": 0.007936407259763715, "loss": 2.8234, "step": 4045 }, { "crossentropy": 2.6636877059936523, "epoch": 0.22001685744582505, "grad_norm": 0.04008009657263756, "grad_norm_var": 6.953823757998665e-05, "learning_rate": 0.0079353993403314, "loss": 2.6637, "step": 4046 }, { "crossentropy": 2.7629592418670654, "epoch": 0.22007123630332526, "grad_norm": 0.039348993450403214, "grad_norm_var": 6.940454798228778e-05, "learning_rate": 0.007934391238848353, "loss": 2.763, "step": 4047 }, { "crossentropy": 2.6532310247421265, "epoch": 0.22012561516082546, "grad_norm": 0.0511820912361145, "grad_norm_var": 7.002313111205734e-05, "learning_rate": 0.007933382955377098, "loss": 2.6532, "step": 4048 }, { "crossentropy": 2.6800951957702637, "epoch": 0.22017999401832566, "grad_norm": 0.03804980218410492, "grad_norm_var": 7.051126487521316e-05, "learning_rate": 0.007932374489980165, "loss": 2.6801, "step": 4049 }, { "crossentropy": 2.753242611885071, "epoch": 0.22023437287582587, "grad_norm": 0.04560639709234238, "grad_norm_var": 7.068270230962496e-05, "learning_rate": 0.007931365842720101, "loss": 2.7532, "step": 4050 }, { "crossentropy": 2.6723395586013794, "epoch": 0.22028875173332607, "grad_norm": 0.04644050449132919, "grad_norm_var": 7.030950013009e-05, "learning_rate": 0.007930357013659461, "loss": 2.6723, "step": 4051 }, { "crossentropy": 2.756945490837097, "epoch": 0.22034313059082627, "grad_norm": 0.04158502444624901, "grad_norm_var": 7.038195232876114e-05, "learning_rate": 0.007929348002860812, "loss": 2.7569, "step": 4052 }, { "crossentropy": 2.7071099281311035, "epoch": 0.22039750944832648, "grad_norm": 0.04269428178668022, "grad_norm_var": 7.043610531458633e-05, "learning_rate": 0.007928338810386728, "loss": 2.7071, "step": 4053 }, { "crossentropy": 2.6778690814971924, "epoch": 0.22045188830582668, "grad_norm": 0.04058757796883583, "grad_norm_var": 7.077814509002054e-05, "learning_rate": 0.007927329436299804, "loss": 2.6779, "step": 4054 }, { "crossentropy": 2.7531604766845703, "epoch": 0.22050626716332689, "grad_norm": 0.03727259114384651, "grad_norm_var": 5.857021941615085e-05, "learning_rate": 0.007926319880662636, "loss": 2.7532, "step": 4055 }, { "crossentropy": 2.6177529096603394, "epoch": 0.2205606460208271, "grad_norm": 0.036154646426439285, "grad_norm_var": 5.818860159720491e-05, "learning_rate": 0.007925310143537836, "loss": 2.6178, "step": 4056 }, { "crossentropy": 2.642090916633606, "epoch": 0.2206150248783273, "grad_norm": 0.03966584801673889, "grad_norm_var": 5.491410620249246e-05, "learning_rate": 0.00792430022498803, "loss": 2.6421, "step": 4057 }, { "crossentropy": 2.6042604446411133, "epoch": 0.22066940373582752, "grad_norm": 0.03680531680583954, "grad_norm_var": 1.67918072924183e-05, "learning_rate": 0.00792329012507585, "loss": 2.6043, "step": 4058 }, { "crossentropy": 2.649792432785034, "epoch": 0.22072378259332773, "grad_norm": 0.09721532464027405, "grad_norm_var": 0.00021608234721914212, "learning_rate": 0.007922279843863941, "loss": 2.6498, "step": 4059 }, { "crossentropy": 2.694441795349121, "epoch": 0.22077816145082793, "grad_norm": 0.04926695302128792, "grad_norm_var": 0.00021268425856523214, "learning_rate": 0.007921269381414962, "loss": 2.6944, "step": 4060 }, { "crossentropy": 2.7586158514022827, "epoch": 0.22083254030832813, "grad_norm": 0.03851304575800896, "grad_norm_var": 0.00021345509800321788, "learning_rate": 0.007920258737791577, "loss": 2.7586, "step": 4061 }, { "crossentropy": 2.5745344161987305, "epoch": 0.22088691916582834, "grad_norm": 0.038239892572164536, "grad_norm_var": 0.00021488107944219823, "learning_rate": 0.00791924791305647, "loss": 2.5745, "step": 4062 }, { "crossentropy": 2.675554633140564, "epoch": 0.22094129802332854, "grad_norm": 0.03724417835474014, "grad_norm_var": 0.0002167198197583484, "learning_rate": 0.007918236907272327, "loss": 2.6756, "step": 4063 }, { "crossentropy": 2.7560417652130127, "epoch": 0.22099567688082875, "grad_norm": 0.03761647269129753, "grad_norm_var": 0.00021664658296909745, "learning_rate": 0.007917225720501854, "loss": 2.756, "step": 4064 }, { "crossentropy": 2.771972417831421, "epoch": 0.22105005573832895, "grad_norm": 0.04450157284736633, "grad_norm_var": 0.0002141856226598496, "learning_rate": 0.00791621435280776, "loss": 2.772, "step": 4065 }, { "crossentropy": 2.6638431549072266, "epoch": 0.22110443459582915, "grad_norm": 0.04853430390357971, "grad_norm_var": 0.00021521653929462467, "learning_rate": 0.007915202804252772, "loss": 2.6638, "step": 4066 }, { "crossentropy": 2.659580707550049, "epoch": 0.22115881345332936, "grad_norm": 0.042918525636196136, "grad_norm_var": 0.00021509046138648702, "learning_rate": 0.007914191074899622, "loss": 2.6596, "step": 4067 }, { "crossentropy": 2.5739411115646362, "epoch": 0.22121319231082956, "grad_norm": 0.036088671535253525, "grad_norm_var": 0.00021896895419398806, "learning_rate": 0.00791317916481106, "loss": 2.5739, "step": 4068 }, { "crossentropy": 2.601760745048523, "epoch": 0.22126757116832976, "grad_norm": 0.03707633912563324, "grad_norm_var": 0.00022188772186926923, "learning_rate": 0.007912167074049842, "loss": 2.6018, "step": 4069 }, { "crossentropy": 2.569577217102051, "epoch": 0.22132195002582997, "grad_norm": 0.03966141864657402, "grad_norm_var": 0.00022231411178953603, "learning_rate": 0.007911154802678736, "loss": 2.5696, "step": 4070 }, { "crossentropy": 2.6513710021972656, "epoch": 0.22137632888333017, "grad_norm": 0.03636868670582771, "grad_norm_var": 0.00022312154646981935, "learning_rate": 0.007910142350760523, "loss": 2.6514, "step": 4071 }, { "crossentropy": 2.7288148403167725, "epoch": 0.22143070774083037, "grad_norm": 0.03751537576317787, "grad_norm_var": 0.00022190605929786218, "learning_rate": 0.007909129718357997, "loss": 2.7288, "step": 4072 }, { "crossentropy": 2.652135729789734, "epoch": 0.22148508659833058, "grad_norm": 0.03781479224562645, "grad_norm_var": 0.00022308551001685805, "learning_rate": 0.007908116905533957, "loss": 2.6521, "step": 4073 }, { "crossentropy": 2.7231383323669434, "epoch": 0.22153946545583078, "grad_norm": 0.04492316022515297, "grad_norm_var": 0.00021999991937605504, "learning_rate": 0.007907103912351218, "loss": 2.7231, "step": 4074 }, { "crossentropy": 2.67316734790802, "epoch": 0.22159384431333098, "grad_norm": 0.03790024667978287, "grad_norm_var": 1.8781724689688936e-05, "learning_rate": 0.007906090738872603, "loss": 2.6732, "step": 4075 }, { "crossentropy": 2.7747883796691895, "epoch": 0.2216482231708312, "grad_norm": 0.037883415818214417, "grad_norm_var": 1.3212226319185965e-05, "learning_rate": 0.00790507738516095, "loss": 2.7748, "step": 4076 }, { "crossentropy": 2.695882797241211, "epoch": 0.2217026020283314, "grad_norm": 0.038939256221055984, "grad_norm_var": 1.316465132954834e-05, "learning_rate": 0.007904063851279106, "loss": 2.6959, "step": 4077 }, { "crossentropy": 2.7975295782089233, "epoch": 0.2217569808858316, "grad_norm": 0.03795496001839638, "grad_norm_var": 1.3220510034443686e-05, "learning_rate": 0.00790305013728993, "loss": 2.7975, "step": 4078 }, { "crossentropy": 2.6770336627960205, "epoch": 0.2218113597433318, "grad_norm": 0.03677048906683922, "grad_norm_var": 1.3380724353330893e-05, "learning_rate": 0.007902036243256291, "loss": 2.677, "step": 4079 }, { "crossentropy": 2.6548198461532593, "epoch": 0.221865738600832, "grad_norm": 0.03994682803750038, "grad_norm_var": 1.3125813404633175e-05, "learning_rate": 0.00790102216924107, "loss": 2.6548, "step": 4080 }, { "crossentropy": 2.769950747489929, "epoch": 0.2219201174583322, "grad_norm": 0.03899448364973068, "grad_norm_var": 1.1477176574989367e-05, "learning_rate": 0.007900007915307159, "loss": 2.77, "step": 4081 }, { "crossentropy": 2.740941286087036, "epoch": 0.2219744963158324, "grad_norm": 0.04123164713382721, "grad_norm_var": 5.848776590509456e-06, "learning_rate": 0.00789899348151746, "loss": 2.7409, "step": 4082 }, { "crossentropy": 2.8196951150894165, "epoch": 0.2220288751733326, "grad_norm": 0.04232222959399223, "grad_norm_var": 5.549456383796963e-06, "learning_rate": 0.007897978867934887, "loss": 2.8197, "step": 4083 }, { "crossentropy": 2.712047576904297, "epoch": 0.22208325403083282, "grad_norm": 0.043035682290792465, "grad_norm_var": 6.020077271887541e-06, "learning_rate": 0.007896964074622368, "loss": 2.712, "step": 4084 }, { "crossentropy": 2.6896942853927612, "epoch": 0.22213763288833302, "grad_norm": 0.03662657365202904, "grad_norm_var": 6.164342644421339e-06, "learning_rate": 0.007895949101642838, "loss": 2.6897, "step": 4085 }, { "crossentropy": 2.61075222492218, "epoch": 0.22219201174583322, "grad_norm": 0.037412386387586594, "grad_norm_var": 6.355028519011708e-06, "learning_rate": 0.007894933949059244, "loss": 2.6108, "step": 4086 }, { "crossentropy": 2.6188794374465942, "epoch": 0.22224639060333343, "grad_norm": 0.036852508783340454, "grad_norm_var": 6.193300665586059e-06, "learning_rate": 0.007893918616934545, "loss": 2.6189, "step": 4087 }, { "crossentropy": 2.639326810836792, "epoch": 0.22230076946083363, "grad_norm": 0.04230561479926109, "grad_norm_var": 6.594434061532537e-06, "learning_rate": 0.007892903105331712, "loss": 2.6393, "step": 4088 }, { "crossentropy": 2.7628384828567505, "epoch": 0.22235514831833383, "grad_norm": 0.0423593670129776, "grad_norm_var": 6.905234200290054e-06, "learning_rate": 0.007891887414313726, "loss": 2.7628, "step": 4089 }, { "crossentropy": 2.6459157466888428, "epoch": 0.22240952717583404, "grad_norm": 0.039220619946718216, "grad_norm_var": 4.9785995013778655e-06, "learning_rate": 0.007890871543943577, "loss": 2.6459, "step": 4090 }, { "crossentropy": 2.639006733894348, "epoch": 0.22246390603333424, "grad_norm": 0.03921503946185112, "grad_norm_var": 4.830779361070641e-06, "learning_rate": 0.007889855494284273, "loss": 2.639, "step": 4091 }, { "crossentropy": 2.7473556995391846, "epoch": 0.22251828489083444, "grad_norm": 0.0406331792473793, "grad_norm_var": 4.7319431690450695e-06, "learning_rate": 0.007888839265398821, "loss": 2.7474, "step": 4092 }, { "crossentropy": 2.5893070697784424, "epoch": 0.22257266374833465, "grad_norm": 0.03928983956575394, "grad_norm_var": 4.70809359563613e-06, "learning_rate": 0.007887822857350256, "loss": 2.5893, "step": 4093 }, { "crossentropy": 2.7374744415283203, "epoch": 0.22262704260583485, "grad_norm": 0.03859153017401695, "grad_norm_var": 4.59076410275668e-06, "learning_rate": 0.007886806270201606, "loss": 2.7375, "step": 4094 }, { "crossentropy": 2.6778717041015625, "epoch": 0.22268142146333506, "grad_norm": 0.03995763137936592, "grad_norm_var": 3.991139759070237e-06, "learning_rate": 0.007885789504015923, "loss": 2.6779, "step": 4095 }, { "crossentropy": 2.67782461643219, "epoch": 0.22273580032083526, "grad_norm": 0.03840794041752815, "grad_norm_var": 4.124350612074122e-06, "learning_rate": 0.007884772558856265, "loss": 2.6778, "step": 4096 }, { "crossentropy": 2.704371213912964, "epoch": 0.22279017917833546, "grad_norm": 0.039447493851184845, "grad_norm_var": 4.089820065796233e-06, "learning_rate": 0.007883755434785705, "loss": 2.7044, "step": 4097 }, { "crossentropy": 2.7433595657348633, "epoch": 0.22284455803583567, "grad_norm": 0.035791948437690735, "grad_norm_var": 4.9058052317768145e-06, "learning_rate": 0.00788273813186732, "loss": 2.7434, "step": 4098 }, { "crossentropy": 2.654525637626648, "epoch": 0.22289893689333587, "grad_norm": 0.04097382724285126, "grad_norm_var": 4.506081795136301e-06, "learning_rate": 0.007881720650164204, "loss": 2.6545, "step": 4099 }, { "crossentropy": 2.667494535446167, "epoch": 0.22295331575083607, "grad_norm": 0.04238082095980644, "grad_norm_var": 4.213913916941275e-06, "learning_rate": 0.007880702989739459, "loss": 2.6675, "step": 4100 }, { "crossentropy": 2.761878728866577, "epoch": 0.22300769460833628, "grad_norm": 0.04042166471481323, "grad_norm_var": 3.7402245684751516e-06, "learning_rate": 0.0078796851506562, "loss": 2.7619, "step": 4101 }, { "crossentropy": 2.6696470975875854, "epoch": 0.22306207346583648, "grad_norm": 0.0423487052321434, "grad_norm_var": 3.8372709989672436e-06, "learning_rate": 0.007878667132977553, "loss": 2.6696, "step": 4102 }, { "crossentropy": 2.8567004203796387, "epoch": 0.22311645232333668, "grad_norm": 0.04124891012907028, "grad_norm_var": 3.2663037396036983e-06, "learning_rate": 0.007877648936766655, "loss": 2.8567, "step": 4103 }, { "crossentropy": 2.6625319719314575, "epoch": 0.2231708311808369, "grad_norm": 0.04301842302083969, "grad_norm_var": 3.5017785448991185e-06, "learning_rate": 0.007876630562086652, "loss": 2.6625, "step": 4104 }, { "crossentropy": 2.7573670148849487, "epoch": 0.2232252100383371, "grad_norm": 0.04293520748615265, "grad_norm_var": 3.6877833432480412e-06, "learning_rate": 0.007875612009000705, "loss": 2.7574, "step": 4105 }, { "crossentropy": 2.764150381088257, "epoch": 0.2232795888958373, "grad_norm": 0.03970597684383392, "grad_norm_var": 3.6363650971186002e-06, "learning_rate": 0.00787459327757198, "loss": 2.7642, "step": 4106 }, { "crossentropy": 2.6726698875427246, "epoch": 0.2233339677533375, "grad_norm": 0.038357000797986984, "grad_norm_var": 3.8034166213738244e-06, "learning_rate": 0.007873574367863661, "loss": 2.6727, "step": 4107 }, { "crossentropy": 2.6993409395217896, "epoch": 0.2233883466108377, "grad_norm": 0.03863171488046646, "grad_norm_var": 3.943355912545445e-06, "learning_rate": 0.007872555279938938, "loss": 2.6993, "step": 4108 }, { "crossentropy": 2.71451735496521, "epoch": 0.2234427254683379, "grad_norm": 0.03584746643900871, "grad_norm_var": 5.053205710312906e-06, "learning_rate": 0.007871536013861017, "loss": 2.7145, "step": 4109 }, { "crossentropy": 2.6097527742385864, "epoch": 0.2234971043258381, "grad_norm": 0.04119245707988739, "grad_norm_var": 5.029475992679849e-06, "learning_rate": 0.007870516569693106, "loss": 2.6098, "step": 4110 }, { "crossentropy": 2.828624963760376, "epoch": 0.2235514831833383, "grad_norm": 0.03990723937749863, "grad_norm_var": 5.030199549118051e-06, "learning_rate": 0.007869496947498436, "loss": 2.8286, "step": 4111 }, { "crossentropy": 2.7316789627075195, "epoch": 0.22360586204083852, "grad_norm": 0.039732012897729874, "grad_norm_var": 4.85189987579233e-06, "learning_rate": 0.00786847714734024, "loss": 2.7317, "step": 4112 }, { "crossentropy": 2.674302577972412, "epoch": 0.22366024089833872, "grad_norm": 0.03748210892081261, "grad_norm_var": 5.2698939260251504e-06, "learning_rate": 0.007867457169281764, "loss": 2.6743, "step": 4113 }, { "crossentropy": 2.6810039281845093, "epoch": 0.22371461975583892, "grad_norm": 0.03852275386452675, "grad_norm_var": 4.2043502891766135e-06, "learning_rate": 0.00786643701338627, "loss": 2.681, "step": 4114 }, { "crossentropy": 2.693224549293518, "epoch": 0.22376899861333913, "grad_norm": 0.042516957968473434, "grad_norm_var": 4.518742939453532e-06, "learning_rate": 0.007865416679717024, "loss": 2.6932, "step": 4115 }, { "crossentropy": 2.8263920545578003, "epoch": 0.22382337747083933, "grad_norm": 0.04475812986493111, "grad_norm_var": 5.542442495044762e-06, "learning_rate": 0.007864396168337307, "loss": 2.8264, "step": 4116 }, { "crossentropy": 2.702813148498535, "epoch": 0.22387775632833953, "grad_norm": 0.04786384105682373, "grad_norm_var": 9.011503130747165e-06, "learning_rate": 0.007863375479310408, "loss": 2.7028, "step": 4117 }, { "crossentropy": 2.7079707384109497, "epoch": 0.22393213518583974, "grad_norm": 0.04449847340583801, "grad_norm_var": 9.721529264639037e-06, "learning_rate": 0.007862354612699634, "loss": 2.708, "step": 4118 }, { "crossentropy": 2.6731501817703247, "epoch": 0.22398651404333994, "grad_norm": 0.03871413692831993, "grad_norm_var": 1.0043591442406481e-05, "learning_rate": 0.007861333568568296, "loss": 2.6732, "step": 4119 }, { "crossentropy": 2.7666313648223877, "epoch": 0.22404089290084014, "grad_norm": 0.036508720368146896, "grad_norm_var": 1.0814551931738411e-05, "learning_rate": 0.007860312346979715, "loss": 2.7666, "step": 4120 }, { "crossentropy": 2.704486608505249, "epoch": 0.22409527175834035, "grad_norm": 0.03935693949460983, "grad_norm_var": 1.0428334210230092e-05, "learning_rate": 0.00785929094799723, "loss": 2.7045, "step": 4121 }, { "crossentropy": 2.7086976766586304, "epoch": 0.22414965061584055, "grad_norm": 0.039968542754650116, "grad_norm_var": 1.0414481547074636e-05, "learning_rate": 0.007858269371684188, "loss": 2.7087, "step": 4122 }, { "crossentropy": 2.6289405822753906, "epoch": 0.22420402947334075, "grad_norm": 0.046499237418174744, "grad_norm_var": 1.2512484282811477e-05, "learning_rate": 0.007857247618103943, "loss": 2.6289, "step": 4123 }, { "crossentropy": 2.6676658391952515, "epoch": 0.22425840833084096, "grad_norm": 0.04365574195981026, "grad_norm_var": 1.2671030371672458e-05, "learning_rate": 0.007856225687319862, "loss": 2.6677, "step": 4124 }, { "crossentropy": 2.6238343715667725, "epoch": 0.22431278718834116, "grad_norm": 0.0382113941013813, "grad_norm_var": 1.1376073963773915e-05, "learning_rate": 0.00785520357939533, "loss": 2.6238, "step": 4125 }, { "crossentropy": 2.613938331604004, "epoch": 0.22436716604584137, "grad_norm": 0.035724978893995285, "grad_norm_var": 1.3258502125841851e-05, "learning_rate": 0.007854181294393733, "loss": 2.6139, "step": 4126 }, { "crossentropy": 2.6155225038528442, "epoch": 0.22442154490334157, "grad_norm": 0.03926745057106018, "grad_norm_var": 1.3366220143094335e-05, "learning_rate": 0.007853158832378472, "loss": 2.6155, "step": 4127 }, { "crossentropy": 2.6964985132217407, "epoch": 0.22447592376084177, "grad_norm": 0.03822425380349159, "grad_norm_var": 1.3729054922205034e-05, "learning_rate": 0.007852136193412962, "loss": 2.6965, "step": 4128 }, { "crossentropy": 2.7497142553329468, "epoch": 0.22453030261834198, "grad_norm": 0.041424937546253204, "grad_norm_var": 1.2990145708138313e-05, "learning_rate": 0.007851113377560623, "loss": 2.7497, "step": 4129 }, { "crossentropy": 2.5869038105010986, "epoch": 0.22458468147584218, "grad_norm": 0.03583033010363579, "grad_norm_var": 1.4326162420305078e-05, "learning_rate": 0.007850090384884892, "loss": 2.5869, "step": 4130 }, { "crossentropy": 2.7460010051727295, "epoch": 0.22463906033334238, "grad_norm": 0.03770636394619942, "grad_norm_var": 1.4680229913906622e-05, "learning_rate": 0.007849067215449212, "loss": 2.746, "step": 4131 }, { "crossentropy": 2.799539566040039, "epoch": 0.2246934391908426, "grad_norm": 0.0458107590675354, "grad_norm_var": 1.53452400322845e-05, "learning_rate": 0.00784804386931704, "loss": 2.7995, "step": 4132 }, { "crossentropy": 2.778722405433655, "epoch": 0.22474781804834282, "grad_norm": 0.04683888703584671, "grad_norm_var": 1.4415365868228336e-05, "learning_rate": 0.007847020346551839, "loss": 2.7787, "step": 4133 }, { "crossentropy": 2.7432278394699097, "epoch": 0.22480219690584302, "grad_norm": 0.042331404983997345, "grad_norm_var": 1.3557903595282778e-05, "learning_rate": 0.007845996647217093, "loss": 2.7432, "step": 4134 }, { "crossentropy": 2.7363643646240234, "epoch": 0.22485657576334322, "grad_norm": 0.0401388444006443, "grad_norm_var": 1.3368386845539575e-05, "learning_rate": 0.007844972771376288, "loss": 2.7364, "step": 4135 }, { "crossentropy": 2.6901803016662598, "epoch": 0.22491095462084343, "grad_norm": 0.03686999902129173, "grad_norm_var": 1.3185791584457059e-05, "learning_rate": 0.007843948719092924, "loss": 2.6902, "step": 4136 }, { "crossentropy": 2.600290536880493, "epoch": 0.22496533347834363, "grad_norm": 0.04217800498008728, "grad_norm_var": 1.3256528813010194e-05, "learning_rate": 0.007842924490430512, "loss": 2.6003, "step": 4137 }, { "crossentropy": 2.7329131364822388, "epoch": 0.22501971233584384, "grad_norm": 0.043853338807821274, "grad_norm_var": 1.3837678889154897e-05, "learning_rate": 0.007841900085452573, "loss": 2.7329, "step": 4138 }, { "crossentropy": 2.5539239645004272, "epoch": 0.22507409119334404, "grad_norm": 0.04272434115409851, "grad_norm_var": 1.1915308245883232e-05, "learning_rate": 0.007840875504222642, "loss": 2.5539, "step": 4139 }, { "crossentropy": 2.7134405374526978, "epoch": 0.22512847005084424, "grad_norm": 0.04269235581159592, "grad_norm_var": 1.1590362554809534e-05, "learning_rate": 0.00783985074680426, "loss": 2.7134, "step": 4140 }, { "crossentropy": 2.619081139564514, "epoch": 0.22518284890834445, "grad_norm": 0.03770830109715462, "grad_norm_var": 1.1767361306309435e-05, "learning_rate": 0.007838825813260982, "loss": 2.6191, "step": 4141 }, { "crossentropy": 2.654461145401001, "epoch": 0.22523722776584465, "grad_norm": 0.03750642016530037, "grad_norm_var": 1.0811854411244687e-05, "learning_rate": 0.007837800703656372, "loss": 2.6545, "step": 4142 }, { "crossentropy": 2.71551775932312, "epoch": 0.22529160662334485, "grad_norm": 0.05206410586833954, "grad_norm_var": 1.8612282934486052e-05, "learning_rate": 0.007836775418054009, "loss": 2.7155, "step": 4143 }, { "crossentropy": 2.6765748262405396, "epoch": 0.22534598548084506, "grad_norm": 0.04068915545940399, "grad_norm_var": 1.7917430762231547e-05, "learning_rate": 0.007835749956517481, "loss": 2.6766, "step": 4144 }, { "crossentropy": 2.7237632274627686, "epoch": 0.22540036433834526, "grad_norm": 0.04438913241028786, "grad_norm_var": 1.83784350137652e-05, "learning_rate": 0.007834724319110382, "loss": 2.7238, "step": 4145 }, { "crossentropy": 2.6907765865325928, "epoch": 0.22545474319584546, "grad_norm": 0.04291775822639465, "grad_norm_var": 1.584522568002783e-05, "learning_rate": 0.007833698505896325, "loss": 2.6908, "step": 4146 }, { "crossentropy": 2.6493210792541504, "epoch": 0.22550912205334567, "grad_norm": 0.0375734344124794, "grad_norm_var": 1.592732553091017e-05, "learning_rate": 0.00783267251693893, "loss": 2.6493, "step": 4147 }, { "crossentropy": 2.7011916637420654, "epoch": 0.22556350091084587, "grad_norm": 0.04167184606194496, "grad_norm_var": 1.5042837937156012e-05, "learning_rate": 0.007831646352301824, "loss": 2.7012, "step": 4148 }, { "crossentropy": 2.6522884368896484, "epoch": 0.22561787976834607, "grad_norm": 0.03634015470743179, "grad_norm_var": 1.517106506064286e-05, "learning_rate": 0.007830620012048652, "loss": 2.6523, "step": 4149 }, { "crossentropy": 2.767833948135376, "epoch": 0.22567225862584628, "grad_norm": 0.037514228373765945, "grad_norm_var": 1.5992993507561033e-05, "learning_rate": 0.007829593496243067, "loss": 2.7678, "step": 4150 }, { "crossentropy": 2.6298022270202637, "epoch": 0.22572663748334648, "grad_norm": 0.03587177023291588, "grad_norm_var": 1.7650501728709115e-05, "learning_rate": 0.007828566804948728, "loss": 2.6298, "step": 4151 }, { "crossentropy": 2.715449571609497, "epoch": 0.22578101634084669, "grad_norm": 0.03846760839223862, "grad_norm_var": 1.6976013789371468e-05, "learning_rate": 0.007827539938229315, "loss": 2.7154, "step": 4152 }, { "crossentropy": 2.777248978614807, "epoch": 0.2258353951983469, "grad_norm": 0.04136166349053383, "grad_norm_var": 1.687694012119841e-05, "learning_rate": 0.007826512896148512, "loss": 2.7772, "step": 4153 }, { "crossentropy": 2.813336491584778, "epoch": 0.2258897740558471, "grad_norm": 0.04261606186628342, "grad_norm_var": 1.6474534054130145e-05, "learning_rate": 0.007825485678770012, "loss": 2.8133, "step": 4154 }, { "crossentropy": 2.722074866294861, "epoch": 0.2259441529133473, "grad_norm": 0.046734388917684555, "grad_norm_var": 1.853157089410524e-05, "learning_rate": 0.007824458286157526, "loss": 2.7221, "step": 4155 }, { "crossentropy": 2.59792160987854, "epoch": 0.2259985317708475, "grad_norm": 0.042839664965867996, "grad_norm_var": 1.8566021749841638e-05, "learning_rate": 0.00782343071837477, "loss": 2.5979, "step": 4156 }, { "crossentropy": 2.7181758880615234, "epoch": 0.2260529106283477, "grad_norm": 0.038146618753671646, "grad_norm_var": 1.838468427583963e-05, "learning_rate": 0.007822402975485475, "loss": 2.7182, "step": 4157 }, { "crossentropy": 2.6309438943862915, "epoch": 0.2261072894858479, "grad_norm": 0.036796387284994125, "grad_norm_var": 1.8751099916752312e-05, "learning_rate": 0.007821375057553375, "loss": 2.6309, "step": 4158 }, { "crossentropy": 2.7821964025497437, "epoch": 0.2261616683433481, "grad_norm": 0.03934292495250702, "grad_norm_var": 1.0098273763397592e-05, "learning_rate": 0.007820346964642228, "loss": 2.7822, "step": 4159 }, { "crossentropy": 2.467634916305542, "epoch": 0.2262160472008483, "grad_norm": 0.03694053739309311, "grad_norm_var": 1.073431886739051e-05, "learning_rate": 0.00781931869681579, "loss": 2.4676, "step": 4160 }, { "crossentropy": 2.6349093914031982, "epoch": 0.22627042605834852, "grad_norm": 0.03662107139825821, "grad_norm_var": 9.928934035267207e-06, "learning_rate": 0.007818290254137834, "loss": 2.6349, "step": 4161 }, { "crossentropy": 2.58943510055542, "epoch": 0.22632480491584872, "grad_norm": 0.03783104941248894, "grad_norm_var": 9.217740247203544e-06, "learning_rate": 0.007817261636672145, "loss": 2.5894, "step": 4162 }, { "crossentropy": 2.663217306137085, "epoch": 0.22637918377334892, "grad_norm": 0.03696778416633606, "grad_norm_var": 9.369338727530038e-06, "learning_rate": 0.007816232844482516, "loss": 2.6632, "step": 4163 }, { "crossentropy": 2.7063528299331665, "epoch": 0.22643356263084913, "grad_norm": 0.036311905831098557, "grad_norm_var": 9.347620924631389e-06, "learning_rate": 0.007815203877632751, "loss": 2.7064, "step": 4164 }, { "crossentropy": 2.689982533454895, "epoch": 0.22648794148834933, "grad_norm": 0.03594536706805229, "grad_norm_var": 9.486527784981445e-06, "learning_rate": 0.007814174736186666, "loss": 2.69, "step": 4165 }, { "crossentropy": 2.705479621887207, "epoch": 0.22654232034584953, "grad_norm": 0.04285765811800957, "grad_norm_var": 1.037684758040484e-05, "learning_rate": 0.007813145420208088, "loss": 2.7055, "step": 4166 }, { "crossentropy": 2.7020689249038696, "epoch": 0.22659669920334974, "grad_norm": 0.03668961301445961, "grad_norm_var": 1.0066269619176069e-05, "learning_rate": 0.007812115929760854, "loss": 2.7021, "step": 4167 }, { "crossentropy": 2.724352240562439, "epoch": 0.22665107806084994, "grad_norm": 0.039213042706251144, "grad_norm_var": 1.0032738658134234e-05, "learning_rate": 0.007811086264908811, "loss": 2.7244, "step": 4168 }, { "crossentropy": 2.6435574293136597, "epoch": 0.22670545691835015, "grad_norm": 0.03632016107439995, "grad_norm_var": 1.0168875675979108e-05, "learning_rate": 0.0078100564257158195, "loss": 2.6436, "step": 4169 }, { "crossentropy": 2.7799112796783447, "epoch": 0.22675983577585035, "grad_norm": 0.11564213037490845, "grad_norm_var": 0.0003797892666631825, "learning_rate": 0.007809026412245748, "loss": 2.7799, "step": 4170 }, { "crossentropy": 2.604674816131592, "epoch": 0.22681421463335055, "grad_norm": 0.04054470360279083, "grad_norm_var": 0.0003794732173392157, "learning_rate": 0.007807996224562476, "loss": 2.6047, "step": 4171 }, { "crossentropy": 2.7494382858276367, "epoch": 0.22686859349085076, "grad_norm": 0.04529505595564842, "grad_norm_var": 0.00037977685614042333, "learning_rate": 0.007806965862729897, "loss": 2.7494, "step": 4172 }, { "crossentropy": 2.7007981538772583, "epoch": 0.22692297234835096, "grad_norm": 0.03717389330267906, "grad_norm_var": 0.0003804935566206443, "learning_rate": 0.0078059353268119126, "loss": 2.7008, "step": 4173 }, { "crossentropy": 2.7061314582824707, "epoch": 0.22697735120585116, "grad_norm": 0.060653138905763626, "grad_norm_var": 0.0003958363401581897, "learning_rate": 0.007804904616872435, "loss": 2.7061, "step": 4174 }, { "crossentropy": 2.679934859275818, "epoch": 0.22703173006335137, "grad_norm": 0.03695398196578026, "grad_norm_var": 0.0003978824760064586, "learning_rate": 0.007803873732975387, "loss": 2.6799, "step": 4175 }, { "crossentropy": 2.6918399333953857, "epoch": 0.22708610892085157, "grad_norm": 0.03858233615756035, "grad_norm_var": 0.0003963966617342105, "learning_rate": 0.007802842675184705, "loss": 2.6918, "step": 4176 }, { "crossentropy": 2.827678680419922, "epoch": 0.22714048777835177, "grad_norm": 0.04058733209967613, "grad_norm_var": 0.0003931602328157826, "learning_rate": 0.0078018114435643325, "loss": 2.8277, "step": 4177 }, { "crossentropy": 2.5778138637542725, "epoch": 0.22719486663585198, "grad_norm": 0.03818073868751526, "grad_norm_var": 0.0003928407051336199, "learning_rate": 0.007800780038178228, "loss": 2.5778, "step": 4178 }, { "crossentropy": 2.798241376876831, "epoch": 0.22724924549335218, "grad_norm": 0.039461009204387665, "grad_norm_var": 0.0003906023062152928, "learning_rate": 0.007799748459090356, "loss": 2.7982, "step": 4179 }, { "crossentropy": 2.73306405544281, "epoch": 0.22730362435085238, "grad_norm": 0.04565540701150894, "grad_norm_var": 0.00038520290587246934, "learning_rate": 0.007798716706364695, "loss": 2.7331, "step": 4180 }, { "crossentropy": 2.706952929496765, "epoch": 0.2273580032083526, "grad_norm": 0.050328705459833145, "grad_norm_var": 0.0003795988386174532, "learning_rate": 0.007797684780065233, "loss": 2.707, "step": 4181 }, { "crossentropy": 2.7235641479492188, "epoch": 0.2274123820658528, "grad_norm": 0.043483633548021317, "grad_norm_var": 0.00037931860213502534, "learning_rate": 0.007796652680255969, "loss": 2.7236, "step": 4182 }, { "crossentropy": 2.650778889656067, "epoch": 0.227466760923353, "grad_norm": 0.04077250882983208, "grad_norm_var": 0.0003749938166464933, "learning_rate": 0.007795620407000913, "loss": 2.6508, "step": 4183 }, { "crossentropy": 2.7756398916244507, "epoch": 0.2275211397808532, "grad_norm": 0.041701674461364746, "grad_norm_var": 0.0003728624205673705, "learning_rate": 0.0077945879603640855, "loss": 2.7756, "step": 4184 }, { "crossentropy": 2.5971916913986206, "epoch": 0.2275755186383534, "grad_norm": 0.04046343266963959, "grad_norm_var": 0.0003680583214430696, "learning_rate": 0.00779355534040952, "loss": 2.5972, "step": 4185 }, { "crossentropy": 2.837032914161682, "epoch": 0.2276298974958536, "grad_norm": 0.03896043077111244, "grad_norm_var": 3.597438234961258e-05, "learning_rate": 0.007792522547201253, "loss": 2.837, "step": 4186 }, { "crossentropy": 2.628721833229065, "epoch": 0.2276842763533538, "grad_norm": 0.03535021096467972, "grad_norm_var": 3.896300857246515e-05, "learning_rate": 0.007791489580803343, "loss": 2.6287, "step": 4187 }, { "crossentropy": 2.698420286178589, "epoch": 0.227738655210854, "grad_norm": 0.036484722048044205, "grad_norm_var": 4.006137080232799e-05, "learning_rate": 0.007790456441279853, "loss": 2.6984, "step": 4188 }, { "crossentropy": 2.615944504737854, "epoch": 0.22779303406835422, "grad_norm": 0.03854481503367424, "grad_norm_var": 3.937900650329632e-05, "learning_rate": 0.007789423128694854, "loss": 2.6159, "step": 4189 }, { "crossentropy": 2.639276385307312, "epoch": 0.22784741292585442, "grad_norm": 0.040354013442993164, "grad_norm_var": 1.3659557185238292e-05, "learning_rate": 0.0077883896431124346, "loss": 2.6393, "step": 4190 }, { "crossentropy": 2.72870934009552, "epoch": 0.22790179178335462, "grad_norm": 0.03807678446173668, "grad_norm_var": 1.3227463279143145e-05, "learning_rate": 0.007787355984596689, "loss": 2.7287, "step": 4191 }, { "crossentropy": 2.776379346847534, "epoch": 0.22795617064085483, "grad_norm": 0.037449661642313004, "grad_norm_var": 1.3587705077548693e-05, "learning_rate": 0.007786322153211725, "loss": 2.7764, "step": 4192 }, { "crossentropy": 2.7478206157684326, "epoch": 0.22801054949835503, "grad_norm": 0.03796330466866493, "grad_norm_var": 1.3940592431323171e-05, "learning_rate": 0.007785288149021656, "loss": 2.7478, "step": 4193 }, { "crossentropy": 2.6964868307113647, "epoch": 0.22806492835585523, "grad_norm": 0.04221370071172714, "grad_norm_var": 1.3870283830775116e-05, "learning_rate": 0.007784253972090613, "loss": 2.6965, "step": 4194 }, { "crossentropy": 2.6716936826705933, "epoch": 0.22811930721335544, "grad_norm": 0.04088679328560829, "grad_norm_var": 1.3808565356223055e-05, "learning_rate": 0.0077832196224827364, "loss": 2.6717, "step": 4195 }, { "crossentropy": 2.68262255191803, "epoch": 0.22817368607085564, "grad_norm": 0.03526638075709343, "grad_norm_var": 1.3472738674379497e-05, "learning_rate": 0.007782185100262173, "loss": 2.6826, "step": 4196 }, { "crossentropy": 2.6586930751800537, "epoch": 0.22822806492835584, "grad_norm": 0.05071897432208061, "grad_norm_var": 1.4025247282064129e-05, "learning_rate": 0.007781150405493082, "loss": 2.6587, "step": 4197 }, { "crossentropy": 2.6308255195617676, "epoch": 0.22828244378585605, "grad_norm": 0.04391777515411377, "grad_norm_var": 1.4243414861374777e-05, "learning_rate": 0.0077801155382396375, "loss": 2.6308, "step": 4198 }, { "crossentropy": 2.6292470693588257, "epoch": 0.22833682264335625, "grad_norm": 0.03857244551181793, "grad_norm_var": 1.4303284393020418e-05, "learning_rate": 0.007779080498566018, "loss": 2.6292, "step": 4199 }, { "crossentropy": 2.7310208082199097, "epoch": 0.22839120150085646, "grad_norm": 0.05637433007359505, "grad_norm_var": 3.146376072963285e-05, "learning_rate": 0.007778045286536417, "loss": 2.731, "step": 4200 }, { "crossentropy": 2.644298553466797, "epoch": 0.22844558035835666, "grad_norm": 0.040282152593135834, "grad_norm_var": 3.14721335327367e-05, "learning_rate": 0.0077770099022150384, "loss": 2.6443, "step": 4201 }, { "crossentropy": 2.7399250268936157, "epoch": 0.22849995921585686, "grad_norm": 0.042800091207027435, "grad_norm_var": 3.1496062621817235e-05, "learning_rate": 0.0077759743456660944, "loss": 2.7399, "step": 4202 }, { "crossentropy": 2.618045687675476, "epoch": 0.22855433807335707, "grad_norm": 0.04309786483645439, "grad_norm_var": 2.9459373808136263e-05, "learning_rate": 0.00777493861695381, "loss": 2.618, "step": 4203 }, { "crossentropy": 2.6682140827178955, "epoch": 0.22860871693085727, "grad_norm": 0.054336804896593094, "grad_norm_var": 3.758837394148076e-05, "learning_rate": 0.007773902716142418, "loss": 2.6682, "step": 4204 }, { "crossentropy": 2.758996367454529, "epoch": 0.22866309578835747, "grad_norm": 0.04621744528412819, "grad_norm_var": 3.7166754905294345e-05, "learning_rate": 0.007772866643296167, "loss": 2.759, "step": 4205 }, { "crossentropy": 2.6390048265457153, "epoch": 0.22871747464585768, "grad_norm": 0.04216687008738518, "grad_norm_var": 3.672460093834627e-05, "learning_rate": 0.007771830398479312, "loss": 2.639, "step": 4206 }, { "crossentropy": 2.676435708999634, "epoch": 0.2287718535033579, "grad_norm": 0.037690214812755585, "grad_norm_var": 3.699523867326318e-05, "learning_rate": 0.007770793981756118, "loss": 2.6764, "step": 4207 }, { "crossentropy": 2.7295784950256348, "epoch": 0.2288262323608581, "grad_norm": 0.044921327382326126, "grad_norm_var": 3.483326670654258e-05, "learning_rate": 0.007769757393190866, "loss": 2.7296, "step": 4208 }, { "crossentropy": 2.6950916051864624, "epoch": 0.22888061121835832, "grad_norm": 0.04560922458767891, "grad_norm_var": 3.275171588624573e-05, "learning_rate": 0.0077687206328478425, "loss": 2.6951, "step": 4209 }, { "crossentropy": 2.6434816122055054, "epoch": 0.22893499007585852, "grad_norm": 0.044177521020174026, "grad_norm_var": 3.25074733975134e-05, "learning_rate": 0.007767683700791347, "loss": 2.6435, "step": 4210 }, { "crossentropy": 2.778244733810425, "epoch": 0.22898936893335872, "grad_norm": 0.041008420288562775, "grad_norm_var": 3.245483391912454e-05, "learning_rate": 0.007766646597085687, "loss": 2.7782, "step": 4211 }, { "crossentropy": 2.7394535541534424, "epoch": 0.22904374779085893, "grad_norm": 0.03698315471410751, "grad_norm_var": 3.0594710108907255e-05, "learning_rate": 0.0077656093217951866, "loss": 2.7395, "step": 4212 }, { "crossentropy": 2.615863084793091, "epoch": 0.22909812664835913, "grad_norm": 0.042028509080410004, "grad_norm_var": 2.7882525763962367e-05, "learning_rate": 0.007764571874984174, "loss": 2.6159, "step": 4213 }, { "crossentropy": 2.683736562728882, "epoch": 0.22915250550585933, "grad_norm": 0.0386928990483284, "grad_norm_var": 2.947987139758241e-05, "learning_rate": 0.007763534256716991, "loss": 2.6837, "step": 4214 }, { "crossentropy": 2.7527374029159546, "epoch": 0.22920688436335954, "grad_norm": 0.03678707033395767, "grad_norm_var": 3.083661461133942e-05, "learning_rate": 0.007762496467057989, "loss": 2.7527, "step": 4215 }, { "crossentropy": 2.6814569234848022, "epoch": 0.22926126322085974, "grad_norm": 0.036575645208358765, "grad_norm_var": 2.0883610410354857e-05, "learning_rate": 0.007761458506071532, "loss": 2.6815, "step": 4216 }, { "crossentropy": 2.656117081642151, "epoch": 0.22931564207835994, "grad_norm": 0.03594589605927467, "grad_norm_var": 2.31017031171378e-05, "learning_rate": 0.007760420373821992, "loss": 2.6561, "step": 4217 }, { "crossentropy": 2.646457314491272, "epoch": 0.22937002093586015, "grad_norm": 0.03621697425842285, "grad_norm_var": 2.494557254821746e-05, "learning_rate": 0.007759382070373755, "loss": 2.6465, "step": 4218 }, { "crossentropy": 2.7867733240127563, "epoch": 0.22942439979336035, "grad_norm": 0.03651138022542, "grad_norm_var": 2.6168937368405373e-05, "learning_rate": 0.007758343595791215, "loss": 2.7868, "step": 4219 }, { "crossentropy": 2.7674145698547363, "epoch": 0.22947877865086055, "grad_norm": 0.036260321736335754, "grad_norm_var": 1.4427373193730218e-05, "learning_rate": 0.007757304950138777, "loss": 2.7674, "step": 4220 }, { "crossentropy": 2.7408313751220703, "epoch": 0.22953315750836076, "grad_norm": 0.035431910306215286, "grad_norm_var": 1.2558353402377226e-05, "learning_rate": 0.007756266133480856, "loss": 2.7408, "step": 4221 }, { "crossentropy": 2.6801578998565674, "epoch": 0.22958753636586096, "grad_norm": 0.03566141426563263, "grad_norm_var": 1.2619522924859488e-05, "learning_rate": 0.007755227145881878, "loss": 2.6802, "step": 4222 }, { "crossentropy": 2.735636591911316, "epoch": 0.22964191522336116, "grad_norm": 0.03460920229554176, "grad_norm_var": 1.3661060238519205e-05, "learning_rate": 0.007754187987406283, "loss": 2.7356, "step": 4223 }, { "crossentropy": 2.6767607927322388, "epoch": 0.22969629408086137, "grad_norm": 0.0419309176504612, "grad_norm_var": 1.1695057866051237e-05, "learning_rate": 0.007753148658118515, "loss": 2.6768, "step": 4224 }, { "crossentropy": 2.6273773908615112, "epoch": 0.22975067293836157, "grad_norm": 0.04569893330335617, "grad_norm_var": 1.1781768776375258e-05, "learning_rate": 0.007752109158083034, "loss": 2.6274, "step": 4225 }, { "crossentropy": 2.808850049972534, "epoch": 0.22980505179586178, "grad_norm": 0.03933626413345337, "grad_norm_var": 9.52208227501013e-06, "learning_rate": 0.0077510694873643106, "loss": 2.8089, "step": 4226 }, { "crossentropy": 2.691981077194214, "epoch": 0.22985943065336198, "grad_norm": 0.04226066172122955, "grad_norm_var": 1.0104871497952119e-05, "learning_rate": 0.007750029646026822, "loss": 2.692, "step": 4227 }, { "crossentropy": 2.694037914276123, "epoch": 0.22991380951086218, "grad_norm": 0.04192403703927994, "grad_norm_var": 1.0840072256724082e-05, "learning_rate": 0.007748989634135057, "loss": 2.694, "step": 4228 }, { "crossentropy": 2.7642319202423096, "epoch": 0.2299681883683624, "grad_norm": 0.036604367196559906, "grad_norm_var": 1.0121235842475556e-05, "learning_rate": 0.007747949451753519, "loss": 2.7642, "step": 4229 }, { "crossentropy": 2.7209272384643555, "epoch": 0.2300225672258626, "grad_norm": 0.03890310600399971, "grad_norm_var": 1.0139129783427968e-05, "learning_rate": 0.007746909098946717, "loss": 2.7209, "step": 4230 }, { "crossentropy": 2.617249608039856, "epoch": 0.2300769460833628, "grad_norm": 0.03660881146788597, "grad_norm_var": 1.0173893111318996e-05, "learning_rate": 0.007745868575779176, "loss": 2.6172, "step": 4231 }, { "crossentropy": 2.5552436113357544, "epoch": 0.230131324940863, "grad_norm": 0.040337610989809036, "grad_norm_var": 1.0266225038331929e-05, "learning_rate": 0.007744827882315424, "loss": 2.5552, "step": 4232 }, { "crossentropy": 2.697834014892578, "epoch": 0.2301857037983632, "grad_norm": 0.04314914718270302, "grad_norm_var": 1.1161643880948832e-05, "learning_rate": 0.007743787018620005, "loss": 2.6978, "step": 4233 }, { "crossentropy": 2.707613706588745, "epoch": 0.2302400826558634, "grad_norm": 0.03899422660470009, "grad_norm_var": 1.0672290188147987e-05, "learning_rate": 0.007742745984757474, "loss": 2.7076, "step": 4234 }, { "crossentropy": 2.711836338043213, "epoch": 0.2302944615133636, "grad_norm": 0.04009423777461052, "grad_norm_var": 1.02791074933485e-05, "learning_rate": 0.0077417047807923954, "loss": 2.7118, "step": 4235 }, { "crossentropy": 2.6020619869232178, "epoch": 0.2303488403708638, "grad_norm": 0.04100381210446358, "grad_norm_var": 9.802234756426757e-06, "learning_rate": 0.00774066340678934, "loss": 2.6021, "step": 4236 }, { "crossentropy": 2.7170461416244507, "epoch": 0.23040321922836401, "grad_norm": 0.0382990799844265, "grad_norm_var": 8.747729828136273e-06, "learning_rate": 0.007739621862812897, "loss": 2.717, "step": 4237 }, { "crossentropy": 2.7105480432510376, "epoch": 0.23045759808586422, "grad_norm": 0.03545872122049332, "grad_norm_var": 8.859807932809494e-06, "learning_rate": 0.007738580148927659, "loss": 2.7105, "step": 4238 }, { "crossentropy": 2.708407163619995, "epoch": 0.23051197694336442, "grad_norm": 0.036352548748254776, "grad_norm_var": 7.866234088821366e-06, "learning_rate": 0.0077375382651982344, "loss": 2.7084, "step": 4239 }, { "crossentropy": 2.659817337989807, "epoch": 0.23056635580086463, "grad_norm": 0.03916122019290924, "grad_norm_var": 7.562364387016183e-06, "learning_rate": 0.007736496211689239, "loss": 2.6598, "step": 4240 }, { "crossentropy": 2.7504208087921143, "epoch": 0.23062073465836483, "grad_norm": 0.039091501384973526, "grad_norm_var": 4.950203847819736e-06, "learning_rate": 0.0077354539884653, "loss": 2.7504, "step": 4241 }, { "crossentropy": 2.702417731285095, "epoch": 0.23067511351586503, "grad_norm": 0.04172071814537048, "grad_norm_var": 5.341339301125301e-06, "learning_rate": 0.007734411595591053, "loss": 2.7024, "step": 4242 }, { "crossentropy": 2.7626034021377563, "epoch": 0.23072949237336524, "grad_norm": 0.038317061960697174, "grad_norm_var": 4.794829279183843e-06, "learning_rate": 0.007733369033131149, "loss": 2.7626, "step": 4243 }, { "crossentropy": 2.633697271347046, "epoch": 0.23078387123086544, "grad_norm": 0.03705397993326187, "grad_norm_var": 4.460461047251877e-06, "learning_rate": 0.007732326301150246, "loss": 2.6337, "step": 4244 }, { "crossentropy": 2.684878349304199, "epoch": 0.23083825008836564, "grad_norm": 0.03626358136534691, "grad_norm_var": 4.568479277533971e-06, "learning_rate": 0.007731283399713014, "loss": 2.6849, "step": 4245 }, { "crossentropy": 2.711598753929138, "epoch": 0.23089262894586585, "grad_norm": 0.038364194333553314, "grad_norm_var": 4.579264276656734e-06, "learning_rate": 0.00773024032888413, "loss": 2.7116, "step": 4246 }, { "crossentropy": 2.6928282976150513, "epoch": 0.23094700780336605, "grad_norm": 0.037514280527830124, "grad_norm_var": 4.3699616931696755e-06, "learning_rate": 0.007729197088728288, "loss": 2.6928, "step": 4247 }, { "crossentropy": 2.5481845140457153, "epoch": 0.23100138666086625, "grad_norm": 0.04713873937726021, "grad_norm_var": 8.63394706172374e-06, "learning_rate": 0.007728153679310186, "loss": 2.5482, "step": 4248 }, { "crossentropy": 2.6150617599487305, "epoch": 0.23105576551836646, "grad_norm": 0.03916434943675995, "grad_norm_var": 7.553956593699986e-06, "learning_rate": 0.007727110100694537, "loss": 2.6151, "step": 4249 }, { "crossentropy": 2.7184847593307495, "epoch": 0.23111014437586666, "grad_norm": 0.04045593738555908, "grad_norm_var": 7.68646315403206e-06, "learning_rate": 0.007726066352946061, "loss": 2.7185, "step": 4250 }, { "crossentropy": 2.6381983757019043, "epoch": 0.23116452323336686, "grad_norm": 0.03719644621014595, "grad_norm_var": 7.82361553738874e-06, "learning_rate": 0.007725022436129492, "loss": 2.6382, "step": 4251 }, { "crossentropy": 2.662304997444153, "epoch": 0.23121890209086707, "grad_norm": 0.03538841754198074, "grad_norm_var": 8.226549945806957e-06, "learning_rate": 0.007723978350309573, "loss": 2.6623, "step": 4252 }, { "crossentropy": 2.737287759780884, "epoch": 0.23127328094836727, "grad_norm": 0.03812861815094948, "grad_norm_var": 8.234268971496888e-06, "learning_rate": 0.007722934095551054, "loss": 2.7373, "step": 4253 }, { "crossentropy": 2.6349215507507324, "epoch": 0.23132765980586747, "grad_norm": 0.0446963757276535, "grad_norm_var": 9.762456652033918e-06, "learning_rate": 0.007721889671918703, "loss": 2.6349, "step": 4254 }, { "crossentropy": 2.7264400720596313, "epoch": 0.23138203866336768, "grad_norm": 0.04057222232222557, "grad_norm_var": 9.315183959838063e-06, "learning_rate": 0.007720845079477291, "loss": 2.7264, "step": 4255 }, { "crossentropy": 2.635712742805481, "epoch": 0.23143641752086788, "grad_norm": 0.03882681578397751, "grad_norm_var": 9.332339335104261e-06, "learning_rate": 0.007719800318291605, "loss": 2.6357, "step": 4256 }, { "crossentropy": 2.6281596422195435, "epoch": 0.23149079637836809, "grad_norm": 0.03721515089273453, "grad_norm_var": 9.6216389023888e-06, "learning_rate": 0.007718755388426438, "loss": 2.6282, "step": 4257 }, { "crossentropy": 2.7474151849746704, "epoch": 0.2315451752358683, "grad_norm": 0.03612729534506798, "grad_norm_var": 9.735188637420972e-06, "learning_rate": 0.007717710289946595, "loss": 2.7474, "step": 4258 }, { "crossentropy": 2.716086506843567, "epoch": 0.2315995540933685, "grad_norm": 0.038470569998025894, "grad_norm_var": 9.724699990152492e-06, "learning_rate": 0.007716665022916896, "loss": 2.7161, "step": 4259 }, { "crossentropy": 2.7567919492721558, "epoch": 0.2316539329508687, "grad_norm": 0.04282084479928017, "grad_norm_var": 1.037530764494554e-05, "learning_rate": 0.007715619587402164, "loss": 2.7568, "step": 4260 }, { "crossentropy": 2.622824192047119, "epoch": 0.2317083118083689, "grad_norm": 0.04154606908559799, "grad_norm_var": 1.0000784572238482e-05, "learning_rate": 0.007714573983467237, "loss": 2.6228, "step": 4261 }, { "crossentropy": 2.627650022506714, "epoch": 0.2317626906658691, "grad_norm": 0.04119003936648369, "grad_norm_var": 1.0033626076127953e-05, "learning_rate": 0.007713528211176963, "loss": 2.6277, "step": 4262 }, { "crossentropy": 2.747534990310669, "epoch": 0.2318170695233693, "grad_norm": 0.0369245745241642, "grad_norm_var": 1.0233371679621515e-05, "learning_rate": 0.0077124822705962, "loss": 2.7475, "step": 4263 }, { "crossentropy": 2.636306643486023, "epoch": 0.2318714483808695, "grad_norm": 0.03703571483492851, "grad_norm_var": 6.648087784808371e-06, "learning_rate": 0.007711436161789814, "loss": 2.6363, "step": 4264 }, { "crossentropy": 2.6015900373458862, "epoch": 0.23192582723836971, "grad_norm": 0.16563645005226135, "grad_norm_var": 0.0010072646827956045, "learning_rate": 0.007710389884822684, "loss": 2.6016, "step": 4265 }, { "crossentropy": 2.617960214614868, "epoch": 0.23198020609586992, "grad_norm": 0.036426614969968796, "grad_norm_var": 0.001011802924144135, "learning_rate": 0.007709343439759702, "loss": 2.618, "step": 4266 }, { "crossentropy": 2.6075302362442017, "epoch": 0.23203458495337012, "grad_norm": 0.040279459208250046, "grad_norm_var": 0.001008464625266895, "learning_rate": 0.0077082968266657635, "loss": 2.6075, "step": 4267 }, { "crossentropy": 2.7331260442733765, "epoch": 0.23208896381087032, "grad_norm": 0.04087580367922783, "grad_norm_var": 0.0010018836416715688, "learning_rate": 0.007707250045605784, "loss": 2.7331, "step": 4268 }, { "crossentropy": 2.716190814971924, "epoch": 0.23214334266837053, "grad_norm": 0.03870926424860954, "grad_norm_var": 0.0010011948024191432, "learning_rate": 0.007706203096644679, "loss": 2.7162, "step": 4269 }, { "crossentropy": 2.65346360206604, "epoch": 0.23219772152587073, "grad_norm": 0.04043301194906235, "grad_norm_var": 0.0010038305020711373, "learning_rate": 0.00770515597984738, "loss": 2.6535, "step": 4270 }, { "crossentropy": 2.613230586051941, "epoch": 0.23225210038337094, "grad_norm": 0.04291049391031265, "grad_norm_var": 0.0010021469990025508, "learning_rate": 0.0077041086952788305, "loss": 2.6132, "step": 4271 }, { "crossentropy": 2.739366292953491, "epoch": 0.23230647924087114, "grad_norm": 0.038468629121780396, "grad_norm_var": 0.0010025555870653614, "learning_rate": 0.00770306124300398, "loss": 2.7394, "step": 4272 }, { "crossentropy": 2.5849428176879883, "epoch": 0.23236085809837134, "grad_norm": 0.0377589613199234, "grad_norm_var": 0.0010018506773603365, "learning_rate": 0.007702013623087792, "loss": 2.5849, "step": 4273 }, { "crossentropy": 2.6568942070007324, "epoch": 0.23241523695587155, "grad_norm": 0.04013190418481827, "grad_norm_var": 0.000996926927209212, "learning_rate": 0.007700965835595237, "loss": 2.6569, "step": 4274 }, { "crossentropy": 2.689648151397705, "epoch": 0.23246961581337175, "grad_norm": 0.04107445105910301, "grad_norm_var": 0.0009942240948200112, "learning_rate": 0.007699917880591302, "loss": 2.6896, "step": 4275 }, { "crossentropy": 2.8686188459396362, "epoch": 0.23252399467087195, "grad_norm": 0.04074171930551529, "grad_norm_var": 0.0009958299110462124, "learning_rate": 0.007698869758140974, "loss": 2.8686, "step": 4276 }, { "crossentropy": 2.5974512100219727, "epoch": 0.23257837352837216, "grad_norm": 0.038155850023031235, "grad_norm_var": 0.0009992436553592442, "learning_rate": 0.007697821468309261, "loss": 2.5975, "step": 4277 }, { "crossentropy": 2.7666090726852417, "epoch": 0.23263275238587236, "grad_norm": 0.037303850054740906, "grad_norm_var": 0.0010033519640872215, "learning_rate": 0.007696773011161176, "loss": 2.7666, "step": 4278 }, { "crossentropy": 2.648895025253296, "epoch": 0.23268713124337256, "grad_norm": 0.04103654995560646, "grad_norm_var": 0.0009988550479836098, "learning_rate": 0.0076957243867617426, "loss": 2.6489, "step": 4279 }, { "crossentropy": 2.703301787376404, "epoch": 0.23274151010087277, "grad_norm": 0.040162596851587296, "grad_norm_var": 0.0009951821165958311, "learning_rate": 0.007694675595175996, "loss": 2.7033, "step": 4280 }, { "crossentropy": 2.680587410926819, "epoch": 0.232795888958373, "grad_norm": 0.03603965416550636, "grad_norm_var": 3.6568792404951972e-06, "learning_rate": 0.00769362663646898, "loss": 2.6806, "step": 4281 }, { "crossentropy": 2.8083592653274536, "epoch": 0.2328502678158732, "grad_norm": 0.06524068862199783, "grad_norm_var": 4.4098050610097415e-05, "learning_rate": 0.007692577510705754, "loss": 2.8084, "step": 4282 }, { "crossentropy": 2.7246426343917847, "epoch": 0.2329046466733734, "grad_norm": 0.0394982248544693, "grad_norm_var": 4.4232883834068445e-05, "learning_rate": 0.007691528217951378, "loss": 2.7246, "step": 4283 }, { "crossentropy": 2.6602216958999634, "epoch": 0.2329590255308736, "grad_norm": 0.03754532337188721, "grad_norm_var": 4.505183219883889e-05, "learning_rate": 0.007690478758270934, "loss": 2.6602, "step": 4284 }, { "crossentropy": 2.60686194896698, "epoch": 0.2330134043883738, "grad_norm": 0.040756456553936005, "grad_norm_var": 4.470194992734483e-05, "learning_rate": 0.007689429131729504, "loss": 2.6069, "step": 4285 }, { "crossentropy": 2.597511887550354, "epoch": 0.23306778324587402, "grad_norm": 0.03975928947329521, "grad_norm_var": 4.478831605475922e-05, "learning_rate": 0.007688379338392187, "loss": 2.5975, "step": 4286 }, { "crossentropy": 2.7816150188446045, "epoch": 0.23312216210337422, "grad_norm": 0.036231234669685364, "grad_norm_var": 4.5907714516096876e-05, "learning_rate": 0.0076873293783240905, "loss": 2.7816, "step": 4287 }, { "crossentropy": 2.69527268409729, "epoch": 0.23317654096087442, "grad_norm": 0.03686339035630226, "grad_norm_var": 4.6529030318561275e-05, "learning_rate": 0.00768627925159033, "loss": 2.6953, "step": 4288 }, { "crossentropy": 2.620948553085327, "epoch": 0.23323091981837463, "grad_norm": 0.03829033300280571, "grad_norm_var": 4.635114711148599e-05, "learning_rate": 0.007685228958256035, "loss": 2.6209, "step": 4289 }, { "crossentropy": 2.738029360771179, "epoch": 0.23328529867587483, "grad_norm": 0.043577056378126144, "grad_norm_var": 4.6900005558769775e-05, "learning_rate": 0.007684178498386345, "loss": 2.738, "step": 4290 }, { "crossentropy": 2.7000060081481934, "epoch": 0.23333967753337503, "grad_norm": 0.03675176203250885, "grad_norm_var": 4.7890824220205254e-05, "learning_rate": 0.007683127872046406, "loss": 2.7, "step": 4291 }, { "crossentropy": 2.737765908241272, "epoch": 0.23339405639087524, "grad_norm": 0.03690727800130844, "grad_norm_var": 4.868470632678641e-05, "learning_rate": 0.007682077079301377, "loss": 2.7378, "step": 4292 }, { "crossentropy": 2.692458152770996, "epoch": 0.23344843524837544, "grad_norm": 0.03881242871284485, "grad_norm_var": 4.8527665847658956e-05, "learning_rate": 0.0076810261202164285, "loss": 2.6925, "step": 4293 }, { "crossentropy": 2.6946247816085815, "epoch": 0.23350281410587564, "grad_norm": 0.04096987843513489, "grad_norm_var": 4.7903851301762426e-05, "learning_rate": 0.0076799749948567395, "loss": 2.6946, "step": 4294 }, { "crossentropy": 2.567822575569153, "epoch": 0.23355719296337585, "grad_norm": 0.04108567163348198, "grad_norm_var": 4.790733528396146e-05, "learning_rate": 0.007678923703287502, "loss": 2.5678, "step": 4295 }, { "crossentropy": 2.8610031604766846, "epoch": 0.23361157182087605, "grad_norm": 0.03976789116859436, "grad_norm_var": 4.793644486134248e-05, "learning_rate": 0.0076778722455739115, "loss": 2.861, "step": 4296 }, { "crossentropy": 2.828710913658142, "epoch": 0.23366595067837626, "grad_norm": 0.042317118495702744, "grad_norm_var": 4.6661015171012056e-05, "learning_rate": 0.007676820621781182, "loss": 2.8287, "step": 4297 }, { "crossentropy": 2.5806456804275513, "epoch": 0.23372032953587646, "grad_norm": 0.037643227726221085, "grad_norm_var": 4.690788483006856e-06, "learning_rate": 0.007675768831974532, "loss": 2.5806, "step": 4298 }, { "crossentropy": 2.765469193458557, "epoch": 0.23377470839337666, "grad_norm": 0.038429923355579376, "grad_norm_var": 4.7158688943518245e-06, "learning_rate": 0.007674716876219194, "loss": 2.7655, "step": 4299 }, { "crossentropy": 2.7939341068267822, "epoch": 0.23382908725087687, "grad_norm": 0.03979940339922905, "grad_norm_var": 4.564141335150253e-06, "learning_rate": 0.00767366475458041, "loss": 2.7939, "step": 4300 }, { "crossentropy": 2.755646228790283, "epoch": 0.23388346610837707, "grad_norm": 0.03849854692816734, "grad_norm_var": 4.428541833580562e-06, "learning_rate": 0.00767261246712343, "loss": 2.7556, "step": 4301 }, { "crossentropy": 2.706436038017273, "epoch": 0.23393784496587727, "grad_norm": 0.04049719497561455, "grad_norm_var": 4.526796958616702e-06, "learning_rate": 0.007671560013913515, "loss": 2.7064, "step": 4302 }, { "crossentropy": 2.7905341386795044, "epoch": 0.23399222382337748, "grad_norm": 0.04355798661708832, "grad_norm_var": 5.027950152916514e-06, "learning_rate": 0.00767050739501594, "loss": 2.7905, "step": 4303 }, { "crossentropy": 2.6370506286621094, "epoch": 0.23404660268087768, "grad_norm": 0.04027051106095314, "grad_norm_var": 4.505484086139652e-06, "learning_rate": 0.007669454610495987, "loss": 2.6371, "step": 4304 }, { "crossentropy": 2.7169861793518066, "epoch": 0.23410098153837788, "grad_norm": 0.04275454953312874, "grad_norm_var": 4.838467768709422e-06, "learning_rate": 0.007668401660418947, "loss": 2.717, "step": 4305 }, { "crossentropy": 2.704771041870117, "epoch": 0.2341553603958781, "grad_norm": 0.04752836376428604, "grad_norm_var": 7.64479424358866e-06, "learning_rate": 0.007667348544850123, "loss": 2.7048, "step": 4306 }, { "crossentropy": 2.6907691955566406, "epoch": 0.2342097392533783, "grad_norm": 0.04138569161295891, "grad_norm_var": 6.763997318835237e-06, "learning_rate": 0.00766629526385483, "loss": 2.6908, "step": 4307 }, { "crossentropy": 2.735814094543457, "epoch": 0.2342641181108785, "grad_norm": 0.03705618157982826, "grad_norm_var": 6.691292118221383e-06, "learning_rate": 0.007665241817498391, "loss": 2.7358, "step": 4308 }, { "crossentropy": 2.6759804487228394, "epoch": 0.2343184969683787, "grad_norm": 0.0361885167658329, "grad_norm_var": 7.76392653194091e-06, "learning_rate": 0.007664188205846137, "loss": 2.676, "step": 4309 }, { "crossentropy": 2.7471734285354614, "epoch": 0.2343728758258789, "grad_norm": 0.040681563317775726, "grad_norm_var": 7.750459730371194e-06, "learning_rate": 0.007663134428963416, "loss": 2.7472, "step": 4310 }, { "crossentropy": 2.6993398666381836, "epoch": 0.2344272546833791, "grad_norm": 0.04634646698832512, "grad_norm_var": 9.914591772350264e-06, "learning_rate": 0.007662080486915582, "loss": 2.6993, "step": 4311 }, { "crossentropy": 2.5664154291152954, "epoch": 0.2344816335408793, "grad_norm": 0.04636926203966141, "grad_norm_var": 1.1734006762210922e-05, "learning_rate": 0.007661026379767997, "loss": 2.5664, "step": 4312 }, { "crossentropy": 2.5967540740966797, "epoch": 0.2345360123983795, "grad_norm": 0.034391336143016815, "grad_norm_var": 1.4487818511382579e-05, "learning_rate": 0.007659972107586034, "loss": 2.5968, "step": 4313 }, { "crossentropy": 2.6631442308425903, "epoch": 0.23459039125587972, "grad_norm": 0.038732778280973434, "grad_norm_var": 1.4116141465089771e-05, "learning_rate": 0.007658917670435084, "loss": 2.6631, "step": 4314 }, { "crossentropy": 2.7130767107009888, "epoch": 0.23464477011337992, "grad_norm": 0.04347006231546402, "grad_norm_var": 1.4124186346983758e-05, "learning_rate": 0.007657863068380537, "loss": 2.7131, "step": 4315 }, { "crossentropy": 2.6220128536224365, "epoch": 0.23469914897088012, "grad_norm": 0.039072297513484955, "grad_norm_var": 1.4282884811334049e-05, "learning_rate": 0.007656808301487801, "loss": 2.622, "step": 4316 }, { "crossentropy": 2.6840707063674927, "epoch": 0.23475352782838033, "grad_norm": 0.03931119665503502, "grad_norm_var": 1.4047692565125926e-05, "learning_rate": 0.007655753369822291, "loss": 2.6841, "step": 4317 }, { "crossentropy": 2.728845953941345, "epoch": 0.23480790668588053, "grad_norm": 0.039394035935401917, "grad_norm_var": 1.4212546191638698e-05, "learning_rate": 0.007654698273449434, "loss": 2.7288, "step": 4318 }, { "crossentropy": 2.674580454826355, "epoch": 0.23486228554338073, "grad_norm": 0.03661449998617172, "grad_norm_var": 1.4887173268592301e-05, "learning_rate": 0.007653643012434666, "loss": 2.6746, "step": 4319 }, { "crossentropy": 2.645634889602661, "epoch": 0.23491666440088094, "grad_norm": 0.03768462687730789, "grad_norm_var": 1.5417996417255568e-05, "learning_rate": 0.0076525875868434295, "loss": 2.6456, "step": 4320 }, { "crossentropy": 2.6649142503738403, "epoch": 0.23497104325838114, "grad_norm": 0.04719065502285957, "grad_norm_var": 1.8019112578740853e-05, "learning_rate": 0.007651531996741186, "loss": 2.6649, "step": 4321 }, { "crossentropy": 2.7595711946487427, "epoch": 0.23502542211588134, "grad_norm": 0.03608167916536331, "grad_norm_var": 1.5807407938295675e-05, "learning_rate": 0.007650476242193398, "loss": 2.7596, "step": 4322 }, { "crossentropy": 2.732453227043152, "epoch": 0.23507980097338155, "grad_norm": 0.05822639912366867, "grad_norm_var": 3.6648558427350665e-05, "learning_rate": 0.007649420323265546, "loss": 2.7325, "step": 4323 }, { "crossentropy": 2.684126853942871, "epoch": 0.23513417983088175, "grad_norm": 0.04202108830213547, "grad_norm_var": 3.554486578572069e-05, "learning_rate": 0.0076483642400231155, "loss": 2.6841, "step": 4324 }, { "crossentropy": 2.6754703521728516, "epoch": 0.23518855868838195, "grad_norm": 0.04900646582245827, "grad_norm_var": 3.697347125735517e-05, "learning_rate": 0.0076473079925316045, "loss": 2.6755, "step": 4325 }, { "crossentropy": 2.735003709793091, "epoch": 0.23524293754588216, "grad_norm": 0.04905734583735466, "grad_norm_var": 3.9704602343236065e-05, "learning_rate": 0.007646251580856519, "loss": 2.735, "step": 4326 }, { "crossentropy": 2.5874135494232178, "epoch": 0.23529731640338236, "grad_norm": 0.04077218100428581, "grad_norm_var": 3.8925775604757114e-05, "learning_rate": 0.007645195005063376, "loss": 2.5874, "step": 4327 }, { "crossentropy": 2.754842162132263, "epoch": 0.23535169526088257, "grad_norm": 0.04235026612877846, "grad_norm_var": 3.7774674446229044e-05, "learning_rate": 0.007644138265217708, "loss": 2.7548, "step": 4328 }, { "crossentropy": 2.585649609565735, "epoch": 0.23540607411838277, "grad_norm": 0.04129662737250328, "grad_norm_var": 3.367029296690305e-05, "learning_rate": 0.007643081361385048, "loss": 2.5856, "step": 4329 }, { "crossentropy": 2.640481114387512, "epoch": 0.23546045297588297, "grad_norm": 0.041212230920791626, "grad_norm_var": 3.280327263597508e-05, "learning_rate": 0.007642024293630947, "loss": 2.6405, "step": 4330 }, { "crossentropy": 2.6071404218673706, "epoch": 0.23551483183338318, "grad_norm": 0.04131932929158211, "grad_norm_var": 3.286369315398756e-05, "learning_rate": 0.0076409670620209615, "loss": 2.6071, "step": 4331 }, { "crossentropy": 2.7335333824157715, "epoch": 0.23556921069088338, "grad_norm": 0.044678423553705215, "grad_norm_var": 3.223729175260836e-05, "learning_rate": 0.007639909666620663, "loss": 2.7335, "step": 4332 }, { "crossentropy": 2.7453575134277344, "epoch": 0.23562358954838358, "grad_norm": 0.04016308858990669, "grad_norm_var": 3.187631166749743e-05, "learning_rate": 0.007638852107495626, "loss": 2.7454, "step": 4333 }, { "crossentropy": 2.638871431350708, "epoch": 0.2356779684058838, "grad_norm": 0.03918420523405075, "grad_norm_var": 3.197832103471968e-05, "learning_rate": 0.007637794384711443, "loss": 2.6389, "step": 4334 }, { "crossentropy": 2.735352873802185, "epoch": 0.235732347263384, "grad_norm": 0.036908648908138275, "grad_norm_var": 3.173608695717315e-05, "learning_rate": 0.007636736498333709, "loss": 2.7354, "step": 4335 }, { "crossentropy": 2.646784543991089, "epoch": 0.2357867261208842, "grad_norm": 0.037145309150218964, "grad_norm_var": 3.21326837490883e-05, "learning_rate": 0.0076356784484280385, "loss": 2.6468, "step": 4336 }, { "crossentropy": 2.670290946960449, "epoch": 0.2358411049783844, "grad_norm": 0.03386901691555977, "grad_norm_var": 3.562692154541926e-05, "learning_rate": 0.007634620235060047, "loss": 2.6703, "step": 4337 }, { "crossentropy": 2.753156542778015, "epoch": 0.2358954838358846, "grad_norm": 0.035915520042181015, "grad_norm_var": 3.576155423487826e-05, "learning_rate": 0.007633561858295363, "loss": 2.7532, "step": 4338 }, { "crossentropy": 2.6750746965408325, "epoch": 0.2359498626933848, "grad_norm": 0.03537645563483238, "grad_norm_var": 1.9172178482723157e-05, "learning_rate": 0.00763250331819963, "loss": 2.6751, "step": 4339 }, { "crossentropy": 2.653647303581238, "epoch": 0.236004241550885, "grad_norm": 0.03700811415910721, "grad_norm_var": 1.9821195543273606e-05, "learning_rate": 0.007631444614838495, "loss": 2.6536, "step": 4340 }, { "crossentropy": 2.7068307399749756, "epoch": 0.2360586204083852, "grad_norm": 0.03722856193780899, "grad_norm_var": 1.4864077449777459e-05, "learning_rate": 0.007630385748277618, "loss": 2.7068, "step": 4341 }, { "crossentropy": 2.73065447807312, "epoch": 0.23611299926588541, "grad_norm": 0.03517758473753929, "grad_norm_var": 9.38920562979483e-06, "learning_rate": 0.007629326718582669, "loss": 2.7307, "step": 4342 }, { "crossentropy": 2.7894011735916138, "epoch": 0.23616737812338562, "grad_norm": 0.03791329264640808, "grad_norm_var": 9.119810916828561e-06, "learning_rate": 0.007628267525819329, "loss": 2.7894, "step": 4343 }, { "crossentropy": 2.7163474559783936, "epoch": 0.23622175698088582, "grad_norm": 0.03888380900025368, "grad_norm_var": 8.112829548728196e-06, "learning_rate": 0.007627208170053286, "loss": 2.7163, "step": 4344 }, { "crossentropy": 2.7745869159698486, "epoch": 0.23627613583838603, "grad_norm": 0.038252636790275574, "grad_norm_var": 7.487901059869776e-06, "learning_rate": 0.007626148651350242, "loss": 2.7746, "step": 4345 }, { "crossentropy": 2.599792003631592, "epoch": 0.23633051469588623, "grad_norm": 0.03760434687137604, "grad_norm_var": 6.823438953147224e-06, "learning_rate": 0.007625088969775908, "loss": 2.5998, "step": 4346 }, { "crossentropy": 2.647233247756958, "epoch": 0.23638489355338643, "grad_norm": 0.03749646246433258, "grad_norm_var": 6.001222362877513e-06, "learning_rate": 0.0076240291253960035, "loss": 2.6472, "step": 4347 }, { "crossentropy": 2.5875308513641357, "epoch": 0.23643927241088664, "grad_norm": 0.03363789990544319, "grad_norm_var": 3.310520440535867e-06, "learning_rate": 0.007622969118276257, "loss": 2.5875, "step": 4348 }, { "crossentropy": 2.654037833213806, "epoch": 0.23649365126838684, "grad_norm": 0.035541974008083344, "grad_norm_var": 2.68720505518331e-06, "learning_rate": 0.007621908948482414, "loss": 2.654, "step": 4349 }, { "crossentropy": 2.586637020111084, "epoch": 0.23654803012588704, "grad_norm": 0.03769093006849289, "grad_norm_var": 2.331259504948872e-06, "learning_rate": 0.00762084861608022, "loss": 2.5866, "step": 4350 }, { "crossentropy": 2.6076931953430176, "epoch": 0.23660240898338725, "grad_norm": 0.038848090916872025, "grad_norm_var": 2.6453462287134278e-06, "learning_rate": 0.00761978812113544, "loss": 2.6077, "step": 4351 }, { "crossentropy": 2.7608299255371094, "epoch": 0.23665678784088745, "grad_norm": 0.035745009779930115, "grad_norm_var": 2.6893075086330148e-06, "learning_rate": 0.007618727463713843, "loss": 2.7608, "step": 4352 }, { "crossentropy": 2.70325243473053, "epoch": 0.23671116669838765, "grad_norm": 0.0367625430226326, "grad_norm_var": 2.1447461502378806e-06, "learning_rate": 0.007617666643881209, "loss": 2.7033, "step": 4353 }, { "crossentropy": 2.799039840698242, "epoch": 0.23676554555588786, "grad_norm": 0.04010022431612015, "grad_norm_var": 2.73584855771062e-06, "learning_rate": 0.00761660566170333, "loss": 2.799, "step": 4354 }, { "crossentropy": 2.620996832847595, "epoch": 0.23681992441338806, "grad_norm": 0.03867864981293678, "grad_norm_var": 2.667653025886727e-06, "learning_rate": 0.007615544517246009, "loss": 2.621, "step": 4355 }, { "crossentropy": 2.747582197189331, "epoch": 0.2368743032708883, "grad_norm": 0.03849561884999275, "grad_norm_var": 2.7509034731566795e-06, "learning_rate": 0.007614483210575055, "loss": 2.7476, "step": 4356 }, { "crossentropy": 2.7158563137054443, "epoch": 0.2369286821283885, "grad_norm": 0.04213673621416092, "grad_norm_var": 4.158349562480984e-06, "learning_rate": 0.007613421741756289, "loss": 2.7159, "step": 4357 }, { "crossentropy": 2.6979931592941284, "epoch": 0.2369830609858887, "grad_norm": 0.04031673073768616, "grad_norm_var": 4.090647369422474e-06, "learning_rate": 0.007612360110855544, "loss": 2.698, "step": 4358 }, { "crossentropy": 2.647719621658325, "epoch": 0.2370374398433889, "grad_norm": 0.03804647549986839, "grad_norm_var": 4.090099764079286e-06, "learning_rate": 0.00761129831793866, "loss": 2.6477, "step": 4359 }, { "crossentropy": 2.6495484113693237, "epoch": 0.2370918187008891, "grad_norm": 0.03483131527900696, "grad_norm_var": 4.647010240861095e-06, "learning_rate": 0.007610236363071488, "loss": 2.6495, "step": 4360 }, { "crossentropy": 2.7413394451141357, "epoch": 0.2371461975583893, "grad_norm": 0.03727039322257042, "grad_norm_var": 4.6430017265573316e-06, "learning_rate": 0.007609174246319892, "loss": 2.7413, "step": 4361 }, { "crossentropy": 2.6334537267684937, "epoch": 0.2372005764158895, "grad_norm": 0.036850471049547195, "grad_norm_var": 4.6881583837610555e-06, "learning_rate": 0.0076081119677497405, "loss": 2.6335, "step": 4362 }, { "crossentropy": 2.767336845397949, "epoch": 0.23725495527338972, "grad_norm": 0.04163574054837227, "grad_norm_var": 5.672563564308961e-06, "learning_rate": 0.007607049527426916, "loss": 2.7673, "step": 4363 }, { "crossentropy": 2.6769752502441406, "epoch": 0.23730933413088992, "grad_norm": 0.03993155434727669, "grad_norm_var": 4.561733742515737e-06, "learning_rate": 0.00760598692541731, "loss": 2.677, "step": 4364 }, { "crossentropy": 2.5713212490081787, "epoch": 0.23736371298839012, "grad_norm": 0.03669668361544609, "grad_norm_var": 4.21964571458751e-06, "learning_rate": 0.007604924161786826, "loss": 2.5713, "step": 4365 }, { "crossentropy": 2.709255576133728, "epoch": 0.23741809184589033, "grad_norm": 0.036772746592760086, "grad_norm_var": 4.356368301372306e-06, "learning_rate": 0.007603861236601373, "loss": 2.7093, "step": 4366 }, { "crossentropy": 2.672771692276001, "epoch": 0.23747247070339053, "grad_norm": 0.036245815455913544, "grad_norm_var": 4.5963543750836165e-06, "learning_rate": 0.007602798149926875, "loss": 2.6728, "step": 4367 }, { "crossentropy": 2.7100452184677124, "epoch": 0.23752684956089073, "grad_norm": 0.03572703152894974, "grad_norm_var": 4.602157063647987e-06, "learning_rate": 0.00760173490182926, "loss": 2.71, "step": 4368 }, { "crossentropy": 2.6693183183670044, "epoch": 0.23758122841839094, "grad_norm": 0.04178902506828308, "grad_norm_var": 5.247246286562273e-06, "learning_rate": 0.0076006714923744735, "loss": 2.6693, "step": 4369 }, { "crossentropy": 2.542009711265564, "epoch": 0.23763560727589114, "grad_norm": 0.040020786225795746, "grad_norm_var": 5.230377216689357e-06, "learning_rate": 0.007599607921628467, "loss": 2.542, "step": 4370 }, { "crossentropy": 2.676853656768799, "epoch": 0.23768998613339135, "grad_norm": 0.03449924290180206, "grad_norm_var": 6.203236183262527e-06, "learning_rate": 0.007598544189657199, "loss": 2.6769, "step": 4371 }, { "crossentropy": 2.6153037548065186, "epoch": 0.23774436499089155, "grad_norm": 0.037190280854701996, "grad_norm_var": 6.259001315666162e-06, "learning_rate": 0.007597480296526645, "loss": 2.6153, "step": 4372 }, { "crossentropy": 2.622314453125, "epoch": 0.23779874384839175, "grad_norm": 0.03757332265377045, "grad_norm_var": 5.118104246798425e-06, "learning_rate": 0.007596416242302782, "loss": 2.6223, "step": 4373 }, { "crossentropy": 2.707187056541443, "epoch": 0.23785312270589196, "grad_norm": 0.04426528885960579, "grad_norm_var": 7.397878713272012e-06, "learning_rate": 0.007595352027051605, "loss": 2.7072, "step": 4374 }, { "crossentropy": 2.7330514192581177, "epoch": 0.23790750156339216, "grad_norm": 0.04198124259710312, "grad_norm_var": 8.345770255839488e-06, "learning_rate": 0.007594287650839114, "loss": 2.7331, "step": 4375 }, { "crossentropy": 2.703155517578125, "epoch": 0.23796188042089236, "grad_norm": 0.03893495723605156, "grad_norm_var": 7.4839171598265844e-06, "learning_rate": 0.007593223113731323, "loss": 2.7032, "step": 4376 }, { "crossentropy": 2.7662532329559326, "epoch": 0.23801625927839257, "grad_norm": 0.04057219251990318, "grad_norm_var": 7.585865813345786e-06, "learning_rate": 0.007592158415794252, "loss": 2.7663, "step": 4377 }, { "crossentropy": 2.591365933418274, "epoch": 0.23807063813589277, "grad_norm": 0.04488179087638855, "grad_norm_var": 9.537212397140298e-06, "learning_rate": 0.007591093557093934, "loss": 2.5914, "step": 4378 }, { "crossentropy": 2.6959322690963745, "epoch": 0.23812501699339297, "grad_norm": 0.04157795384526253, "grad_norm_var": 9.519384839692426e-06, "learning_rate": 0.007590028537696408, "loss": 2.6959, "step": 4379 }, { "crossentropy": 2.7520898580551147, "epoch": 0.23817939585089318, "grad_norm": 0.04275169596076012, "grad_norm_var": 1.0257228277765242e-05, "learning_rate": 0.007588963357667725, "loss": 2.7521, "step": 4380 }, { "crossentropy": 2.7352265119552612, "epoch": 0.23823377470839338, "grad_norm": 0.04478694126009941, "grad_norm_var": 1.1359108787911167e-05, "learning_rate": 0.007587898017073951, "loss": 2.7352, "step": 4381 }, { "crossentropy": 2.7149044275283813, "epoch": 0.23828815356589358, "grad_norm": 0.04384513571858406, "grad_norm_var": 1.1467348726636125e-05, "learning_rate": 0.007586832515981154, "loss": 2.7149, "step": 4382 }, { "crossentropy": 2.766009211540222, "epoch": 0.2383425324233938, "grad_norm": 0.05196360498666763, "grad_norm_var": 1.81701693191621e-05, "learning_rate": 0.007585766854455413, "loss": 2.766, "step": 4383 }, { "crossentropy": 2.822037696838379, "epoch": 0.238396911280894, "grad_norm": 0.04723081365227699, "grad_norm_var": 1.7743606901362193e-05, "learning_rate": 0.007584701032562827, "loss": 2.822, "step": 4384 }, { "crossentropy": 2.738744020462036, "epoch": 0.2384512901383942, "grad_norm": 0.034930236637592316, "grad_norm_var": 2.0983286305953133e-05, "learning_rate": 0.007583635050369491, "loss": 2.7387, "step": 4385 }, { "crossentropy": 2.766207695007324, "epoch": 0.2385056689958944, "grad_norm": 0.03557777777314186, "grad_norm_var": 2.3204622829761137e-05, "learning_rate": 0.0075825689079415184, "loss": 2.7662, "step": 4386 }, { "crossentropy": 2.701047897338867, "epoch": 0.2385600478533946, "grad_norm": 0.03531802445650101, "grad_norm_var": 2.249205272965519e-05, "learning_rate": 0.007581502605345028, "loss": 2.701, "step": 4387 }, { "crossentropy": 2.8048064708709717, "epoch": 0.2386144267108948, "grad_norm": 0.037573594599962234, "grad_norm_var": 2.2282948970560554e-05, "learning_rate": 0.007580436142646155, "loss": 2.8048, "step": 4388 }, { "crossentropy": 2.7755658626556396, "epoch": 0.238668805568395, "grad_norm": 0.03530777245759964, "grad_norm_var": 2.3785443691790956e-05, "learning_rate": 0.007579369519911036, "loss": 2.7756, "step": 4389 }, { "crossentropy": 2.7722359895706177, "epoch": 0.2387231844258952, "grad_norm": 0.035041678696870804, "grad_norm_var": 2.5509603754819698e-05, "learning_rate": 0.0075783027372058245, "loss": 2.7722, "step": 4390 }, { "crossentropy": 2.708992600440979, "epoch": 0.23877756328339542, "grad_norm": 0.03661024942994118, "grad_norm_var": 2.644317100991226e-05, "learning_rate": 0.007577235794596682, "loss": 2.709, "step": 4391 }, { "crossentropy": 2.699063301086426, "epoch": 0.23883194214089562, "grad_norm": 0.036721352487802505, "grad_norm_var": 2.719113218912026e-05, "learning_rate": 0.007576168692149776, "loss": 2.6991, "step": 4392 }, { "crossentropy": 2.7720264196395874, "epoch": 0.23888632099839582, "grad_norm": 0.03564620390534401, "grad_norm_var": 2.8524459775535783e-05, "learning_rate": 0.007575101429931293, "loss": 2.772, "step": 4393 }, { "crossentropy": 2.715427875518799, "epoch": 0.23894069985589603, "grad_norm": 0.03761495277285576, "grad_norm_var": 2.7080627366669192e-05, "learning_rate": 0.007574034008007418, "loss": 2.7154, "step": 4394 }, { "crossentropy": 2.606290578842163, "epoch": 0.23899507871339623, "grad_norm": 0.03864643722772598, "grad_norm_var": 2.681769724102856e-05, "learning_rate": 0.007572966426444354, "loss": 2.6063, "step": 4395 }, { "crossentropy": 2.6714484691619873, "epoch": 0.23904945757089643, "grad_norm": 0.045252203941345215, "grad_norm_var": 2.8343308695511864e-05, "learning_rate": 0.0075718986853083115, "loss": 2.6714, "step": 4396 }, { "crossentropy": 2.7615654468536377, "epoch": 0.23910383642839664, "grad_norm": 0.03582848235964775, "grad_norm_var": 2.7049136994225508e-05, "learning_rate": 0.00757083078466551, "loss": 2.7616, "step": 4397 }, { "crossentropy": 2.767727255821228, "epoch": 0.23915821528589684, "grad_norm": 0.03652996942400932, "grad_norm_var": 2.5613541510780685e-05, "learning_rate": 0.007569762724582179, "loss": 2.7677, "step": 4398 }, { "crossentropy": 2.691305160522461, "epoch": 0.23921259414339704, "grad_norm": 0.0428890734910965, "grad_norm_var": 1.4454488892132189e-05, "learning_rate": 0.007568694505124562, "loss": 2.6913, "step": 4399 }, { "crossentropy": 2.7316200733184814, "epoch": 0.23926697300089725, "grad_norm": 0.04361148923635483, "grad_norm_var": 1.077999203761276e-05, "learning_rate": 0.007567626126358904, "loss": 2.7316, "step": 4400 }, { "crossentropy": 2.64241623878479, "epoch": 0.23932135185839745, "grad_norm": 0.038129109889268875, "grad_norm_var": 1.0240870912230545e-05, "learning_rate": 0.007566557588351468, "loss": 2.6424, "step": 4401 }, { "crossentropy": 2.8943629264831543, "epoch": 0.23937573071589766, "grad_norm": 0.039078567177057266, "grad_norm_var": 9.925858000867093e-06, "learning_rate": 0.007565488891168524, "loss": 2.8944, "step": 4402 }, { "crossentropy": 2.770977258682251, "epoch": 0.23943010957339786, "grad_norm": 0.03754088282585144, "grad_norm_var": 9.406462460263138e-06, "learning_rate": 0.007564420034876351, "loss": 2.771, "step": 4403 }, { "crossentropy": 2.638210892677307, "epoch": 0.23948448843089806, "grad_norm": 0.035416267812252045, "grad_norm_var": 9.892300672830791e-06, "learning_rate": 0.007563351019541236, "loss": 2.6382, "step": 4404 }, { "crossentropy": 2.722753167152405, "epoch": 0.23953886728839827, "grad_norm": 0.03787531331181526, "grad_norm_var": 9.342766107976047e-06, "learning_rate": 0.0075622818452294824, "loss": 2.7228, "step": 4405 }, { "crossentropy": 2.720533847808838, "epoch": 0.23959324614589847, "grad_norm": 0.03596040979027748, "grad_norm_var": 8.999199789127878e-06, "learning_rate": 0.007561212512007399, "loss": 2.7205, "step": 4406 }, { "crossentropy": 2.603296399116516, "epoch": 0.23964762500339867, "grad_norm": 0.0370698943734169, "grad_norm_var": 8.906735937936189e-06, "learning_rate": 0.007560143019941301, "loss": 2.6033, "step": 4407 }, { "crossentropy": 2.6441503763198853, "epoch": 0.23970200386089888, "grad_norm": 0.04092215374112129, "grad_norm_var": 9.090067277479387e-06, "learning_rate": 0.00755907336909752, "loss": 2.6442, "step": 4408 }, { "crossentropy": 2.593457341194153, "epoch": 0.23975638271839908, "grad_norm": 0.03545943647623062, "grad_norm_var": 9.166444112603441e-06, "learning_rate": 0.007558003559542395, "loss": 2.5935, "step": 4409 }, { "crossentropy": 2.552873969078064, "epoch": 0.23981076157589928, "grad_norm": 0.03438553586602211, "grad_norm_var": 1.024846095463878e-05, "learning_rate": 0.0075569335913422755, "loss": 2.5529, "step": 4410 }, { "crossentropy": 2.682005524635315, "epoch": 0.2398651404333995, "grad_norm": 0.03594378009438515, "grad_norm_var": 1.0620575396651175e-05, "learning_rate": 0.007555863464563515, "loss": 2.682, "step": 4411 }, { "crossentropy": 2.6097054481506348, "epoch": 0.2399195192908997, "grad_norm": 0.039772387593984604, "grad_norm_var": 7.37633823240195e-06, "learning_rate": 0.007554793179272489, "loss": 2.6097, "step": 4412 }, { "crossentropy": 2.7268893718719482, "epoch": 0.2399738981483999, "grad_norm": 0.040056683123111725, "grad_norm_var": 7.3254049742736575e-06, "learning_rate": 0.00755372273553557, "loss": 2.7269, "step": 4413 }, { "crossentropy": 2.6786484718322754, "epoch": 0.2400282770059001, "grad_norm": 0.04148855060338974, "grad_norm_var": 7.781095088993153e-06, "learning_rate": 0.007552652133419148, "loss": 2.6786, "step": 4414 }, { "crossentropy": 2.615518093109131, "epoch": 0.2400826558634003, "grad_norm": 0.041607946157455444, "grad_norm_var": 7.129671888148753e-06, "learning_rate": 0.00755158137298962, "loss": 2.6155, "step": 4415 }, { "crossentropy": 2.55035400390625, "epoch": 0.2401370347209005, "grad_norm": 0.04093201085925102, "grad_norm_var": 5.7146989335842816e-06, "learning_rate": 0.007550510454313395, "loss": 2.5504, "step": 4416 }, { "crossentropy": 2.780103921890259, "epoch": 0.2401914135784007, "grad_norm": 0.03696219623088837, "grad_norm_var": 5.815102362060944e-06, "learning_rate": 0.007549439377456889, "loss": 2.7801, "step": 4417 }, { "crossentropy": 2.707034707069397, "epoch": 0.2402457924359009, "grad_norm": 0.05685098469257355, "grad_norm_var": 2.7745997533018484e-05, "learning_rate": 0.007548368142486528, "loss": 2.707, "step": 4418 }, { "crossentropy": 2.7313902378082275, "epoch": 0.24030017129340112, "grad_norm": 0.045502498745918274, "grad_norm_var": 2.9877176885796607e-05, "learning_rate": 0.007547296749468754, "loss": 2.7314, "step": 4419 }, { "crossentropy": 2.5559568405151367, "epoch": 0.24035455015090132, "grad_norm": 0.04279287904500961, "grad_norm_var": 2.9002975903875904e-05, "learning_rate": 0.007546225198470008, "loss": 2.556, "step": 4420 }, { "crossentropy": 2.7177454233169556, "epoch": 0.24040892900840152, "grad_norm": 0.039317186921834946, "grad_norm_var": 2.8681394885013366e-05, "learning_rate": 0.007545153489556751, "loss": 2.7177, "step": 4421 }, { "crossentropy": 2.7304505109786987, "epoch": 0.24046330786590173, "grad_norm": 0.037280142307281494, "grad_norm_var": 2.802416826978733e-05, "learning_rate": 0.007544081622795445, "loss": 2.7305, "step": 4422 }, { "crossentropy": 2.8161776065826416, "epoch": 0.24051768672340193, "grad_norm": 0.04381398484110832, "grad_norm_var": 2.7875501632275828e-05, "learning_rate": 0.007543009598252571, "loss": 2.8162, "step": 4423 }, { "crossentropy": 2.800346851348877, "epoch": 0.24057206558090213, "grad_norm": 0.03834730386734009, "grad_norm_var": 2.8254117674623298e-05, "learning_rate": 0.007541937415994611, "loss": 2.8003, "step": 4424 }, { "crossentropy": 2.7153372764587402, "epoch": 0.24062644443840234, "grad_norm": 0.04223478585481644, "grad_norm_var": 2.642774300814846e-05, "learning_rate": 0.007540865076088063, "loss": 2.7153, "step": 4425 }, { "crossentropy": 2.7063040733337402, "epoch": 0.24068082329590254, "grad_norm": 0.04012712463736534, "grad_norm_var": 2.336276966137067e-05, "learning_rate": 0.007539792578599431, "loss": 2.7063, "step": 4426 }, { "crossentropy": 2.6454803943634033, "epoch": 0.24073520215340274, "grad_norm": 0.04224755987524986, "grad_norm_var": 2.1227279384984875e-05, "learning_rate": 0.007538719923595232, "loss": 2.6455, "step": 4427 }, { "crossentropy": 2.648109197616577, "epoch": 0.24078958101090295, "grad_norm": 0.051759593188762665, "grad_norm_var": 2.6914011528017937e-05, "learning_rate": 0.0075376471111419895, "loss": 2.6481, "step": 4428 }, { "crossentropy": 2.627424955368042, "epoch": 0.24084395986840315, "grad_norm": 0.0440351739525795, "grad_norm_var": 2.6563379845255472e-05, "learning_rate": 0.007536574141306238, "loss": 2.6274, "step": 4429 }, { "crossentropy": 2.6429824829101562, "epoch": 0.24089833872590338, "grad_norm": 0.034467943012714386, "grad_norm_var": 3.0900808863039826e-05, "learning_rate": 0.007535501014154526, "loss": 2.643, "step": 4430 }, { "crossentropy": 2.6355512142181396, "epoch": 0.2409527175834036, "grad_norm": 0.03614373877644539, "grad_norm_var": 3.333847059819479e-05, "learning_rate": 0.007534427729753403, "loss": 2.6356, "step": 4431 }, { "crossentropy": 2.68513023853302, "epoch": 0.2410070964409038, "grad_norm": 0.056682929396629333, "grad_norm_var": 4.6494288269507907e-05, "learning_rate": 0.007533354288169435, "loss": 2.6851, "step": 4432 }, { "crossentropy": 2.639678716659546, "epoch": 0.241061475298404, "grad_norm": 0.035958610475063324, "grad_norm_var": 4.7369898265268476e-05, "learning_rate": 0.007532280689469197, "loss": 2.6397, "step": 4433 }, { "crossentropy": 2.6223654747009277, "epoch": 0.2411158541559042, "grad_norm": 0.036374349147081375, "grad_norm_var": 3.568481252781026e-05, "learning_rate": 0.00753120693371927, "loss": 2.6224, "step": 4434 }, { "crossentropy": 2.791897177696228, "epoch": 0.2411702330134044, "grad_norm": 0.0385417602956295, "grad_norm_var": 3.517733771514619e-05, "learning_rate": 0.00753013302098625, "loss": 2.7919, "step": 4435 }, { "crossentropy": 2.7034621238708496, "epoch": 0.2412246118709046, "grad_norm": 0.03843536600470543, "grad_norm_var": 3.547220875891298e-05, "learning_rate": 0.007529058951336738, "loss": 2.7035, "step": 4436 }, { "crossentropy": 2.635038375854492, "epoch": 0.2412789907284048, "grad_norm": 0.0362732894718647, "grad_norm_var": 3.672836924605201e-05, "learning_rate": 0.007527984724837349, "loss": 2.635, "step": 4437 }, { "crossentropy": 2.6630252599716187, "epoch": 0.241333369585905, "grad_norm": 0.037165317684412, "grad_norm_var": 3.6783009081429026e-05, "learning_rate": 0.007526910341554702, "loss": 2.663, "step": 4438 }, { "crossentropy": 2.6812925338745117, "epoch": 0.24138774844340521, "grad_norm": 0.03747449815273285, "grad_norm_var": 3.673711242274461e-05, "learning_rate": 0.007525835801555434, "loss": 2.6813, "step": 4439 }, { "crossentropy": 2.6149232387542725, "epoch": 0.24144212730090542, "grad_norm": 0.03551434725522995, "grad_norm_var": 3.801099047014515e-05, "learning_rate": 0.007524761104906182, "loss": 2.6149, "step": 4440 }, { "crossentropy": 2.780308961868286, "epoch": 0.24149650615840562, "grad_norm": 0.040621768683195114, "grad_norm_var": 3.773916269649424e-05, "learning_rate": 0.007523686251673603, "loss": 2.7803, "step": 4441 }, { "crossentropy": 2.73456871509552, "epoch": 0.24155088501590583, "grad_norm": 0.04223479703068733, "grad_norm_var": 3.8020504768732494e-05, "learning_rate": 0.007522611241924353, "loss": 2.7346, "step": 4442 }, { "crossentropy": 2.8185884952545166, "epoch": 0.24160526387340603, "grad_norm": 0.03998097404837608, "grad_norm_var": 3.7736605036167196e-05, "learning_rate": 0.007521536075725106, "loss": 2.8186, "step": 4443 }, { "crossentropy": 2.7236084938049316, "epoch": 0.24165964273090623, "grad_norm": 0.039411138743162155, "grad_norm_var": 2.8076447598884836e-05, "learning_rate": 0.007520460753142544, "loss": 2.7236, "step": 4444 }, { "crossentropy": 2.688820004463196, "epoch": 0.24171402158840644, "grad_norm": 0.03522633761167526, "grad_norm_var": 2.7402534287399028e-05, "learning_rate": 0.007519385274243355, "loss": 2.6888, "step": 4445 }, { "crossentropy": 2.602357268333435, "epoch": 0.24176840044590664, "grad_norm": 0.04322539269924164, "grad_norm_var": 2.7158843293809826e-05, "learning_rate": 0.007518309639094239, "loss": 2.6024, "step": 4446 }, { "crossentropy": 2.627118468284607, "epoch": 0.24182277930340684, "grad_norm": 0.036121711134910583, "grad_norm_var": 2.7168228905174487e-05, "learning_rate": 0.007517233847761908, "loss": 2.6271, "step": 4447 }, { "crossentropy": 2.704210877418518, "epoch": 0.24187715816090705, "grad_norm": 0.035764746367931366, "grad_norm_var": 6.110955683492382e-06, "learning_rate": 0.007516157900313081, "loss": 2.7042, "step": 4448 }, { "crossentropy": 2.725653290748596, "epoch": 0.24193153701840725, "grad_norm": 0.03842718526721001, "grad_norm_var": 5.813238851838191e-06, "learning_rate": 0.007515081796814486, "loss": 2.7257, "step": 4449 }, { "crossentropy": 2.8018884658813477, "epoch": 0.24198591587590745, "grad_norm": 0.03724835067987442, "grad_norm_var": 5.651196187249757e-06, "learning_rate": 0.007514005537332862, "loss": 2.8019, "step": 4450 }, { "crossentropy": 2.642876982688904, "epoch": 0.24204029473340766, "grad_norm": 0.03606799244880676, "grad_norm_var": 5.930568587382788e-06, "learning_rate": 0.007512929121934962, "loss": 2.6429, "step": 4451 }, { "crossentropy": 2.6334335803985596, "epoch": 0.24209467359090786, "grad_norm": 0.03573256731033325, "grad_norm_var": 6.257119541282161e-06, "learning_rate": 0.007511852550687537, "loss": 2.6334, "step": 4452 }, { "crossentropy": 2.708417057991028, "epoch": 0.24214905244840806, "grad_norm": 0.03567172959446907, "grad_norm_var": 6.410665103856866e-06, "learning_rate": 0.007510775823657358, "loss": 2.7084, "step": 4453 }, { "crossentropy": 2.6218650341033936, "epoch": 0.24220343130590827, "grad_norm": 0.035837527364492416, "grad_norm_var": 6.645265727248974e-06, "learning_rate": 0.007509698940911206, "loss": 2.6219, "step": 4454 }, { "crossentropy": 2.7382547855377197, "epoch": 0.24225781016340847, "grad_norm": 0.039455022662878036, "grad_norm_var": 6.808408791050824e-06, "learning_rate": 0.007508621902515864, "loss": 2.7383, "step": 4455 }, { "crossentropy": 2.600300669670105, "epoch": 0.24231218902090867, "grad_norm": 0.04165240377187729, "grad_norm_var": 7.203463881069738e-06, "learning_rate": 0.007507544708538132, "loss": 2.6003, "step": 4456 }, { "crossentropy": 2.5262757539749146, "epoch": 0.24236656787840888, "grad_norm": 0.04994744062423706, "grad_norm_var": 1.5535267339209824e-05, "learning_rate": 0.007506467359044813, "loss": 2.5263, "step": 4457 }, { "crossentropy": 2.694175601005554, "epoch": 0.24242094673590908, "grad_norm": 0.04125218093395233, "grad_norm_var": 1.5155471360622785e-05, "learning_rate": 0.007505389854102726, "loss": 2.6942, "step": 4458 }, { "crossentropy": 2.687375545501709, "epoch": 0.24247532559340929, "grad_norm": 0.0351158007979393, "grad_norm_var": 1.587778400488528e-05, "learning_rate": 0.007504312193778696, "loss": 2.6874, "step": 4459 }, { "crossentropy": 2.7125961780548096, "epoch": 0.2425297044509095, "grad_norm": 0.3212074339389801, "grad_norm_var": 0.0050128139293459965, "learning_rate": 0.00750323437813956, "loss": 2.7126, "step": 4460 }, { "crossentropy": 2.752168893814087, "epoch": 0.2425840833084097, "grad_norm": 0.04184368997812271, "grad_norm_var": 0.004997114135200046, "learning_rate": 0.0075021564072521585, "loss": 2.7522, "step": 4461 }, { "crossentropy": 2.655245780944824, "epoch": 0.2426384621659099, "grad_norm": 0.04801097512245178, "grad_norm_var": 0.00499005248893088, "learning_rate": 0.007501078281183351, "loss": 2.6552, "step": 4462 }, { "crossentropy": 2.6911003589630127, "epoch": 0.2426928410234101, "grad_norm": 0.05012795329093933, "grad_norm_var": 0.004963631748249072, "learning_rate": 0.0075, "loss": 2.6911, "step": 4463 }, { "crossentropy": 2.672515630722046, "epoch": 0.2427472198809103, "grad_norm": 0.042284779250621796, "grad_norm_var": 0.00494721067533264, "learning_rate": 0.00749892156376898, "loss": 2.6725, "step": 4464 }, { "crossentropy": 2.683703064918518, "epoch": 0.2428015987384105, "grad_norm": 0.04487254470586777, "grad_norm_var": 0.004932885441112442, "learning_rate": 0.007497842972557174, "loss": 2.6837, "step": 4465 }, { "crossentropy": 2.715338349342346, "epoch": 0.2428559775959107, "grad_norm": 0.043676890432834625, "grad_norm_var": 0.004917235121792055, "learning_rate": 0.007496764226431476, "loss": 2.7153, "step": 4466 }, { "crossentropy": 2.661231279373169, "epoch": 0.2429103564534109, "grad_norm": 0.04823683947324753, "grad_norm_var": 0.00488940875109347, "learning_rate": 0.0074956853254587895, "loss": 2.6612, "step": 4467 }, { "crossentropy": 2.6572747230529785, "epoch": 0.24296473531091112, "grad_norm": 0.04304744303226471, "grad_norm_var": 0.004869393840909378, "learning_rate": 0.007494606269706023, "loss": 2.6573, "step": 4468 }, { "crossentropy": 2.650161862373352, "epoch": 0.24301911416841132, "grad_norm": 0.06343299895524979, "grad_norm_var": 0.004826992328098092, "learning_rate": 0.007493527059240104, "loss": 2.6502, "step": 4469 }, { "crossentropy": 2.6495331525802612, "epoch": 0.24307349302591152, "grad_norm": 0.03809035196900368, "grad_norm_var": 0.00481948844527449, "learning_rate": 0.007492447694127961, "loss": 2.6495, "step": 4470 }, { "crossentropy": 2.73660409450531, "epoch": 0.24312787188341173, "grad_norm": 0.04299536347389221, "grad_norm_var": 0.004809622051493369, "learning_rate": 0.007491368174436535, "loss": 2.7366, "step": 4471 }, { "crossentropy": 2.6488109827041626, "epoch": 0.24318225074091193, "grad_norm": 0.03789646178483963, "grad_norm_var": 0.004820812448532144, "learning_rate": 0.0074902885002327765, "loss": 2.6488, "step": 4472 }, { "crossentropy": 2.6921634674072266, "epoch": 0.24323662959841214, "grad_norm": 0.04008982703089714, "grad_norm_var": 0.004842730211589601, "learning_rate": 0.00748920867158365, "loss": 2.6922, "step": 4473 }, { "crossentropy": 2.660141706466675, "epoch": 0.24329100845591234, "grad_norm": 0.03555009141564369, "grad_norm_var": 0.0048600699004351065, "learning_rate": 0.007488128688556119, "loss": 2.6601, "step": 4474 }, { "crossentropy": 2.778874397277832, "epoch": 0.24334538731341254, "grad_norm": 0.03825104236602783, "grad_norm_var": 0.004849851303736375, "learning_rate": 0.007487048551217168, "loss": 2.7789, "step": 4475 }, { "crossentropy": 2.669983386993408, "epoch": 0.24339976617091275, "grad_norm": 0.038678042590618134, "grad_norm_var": 4.5123932518744265e-05, "learning_rate": 0.007485968259633785, "loss": 2.67, "step": 4476 }, { "crossentropy": 2.7632596492767334, "epoch": 0.24345414502841295, "grad_norm": 0.04003113880753517, "grad_norm_var": 4.574594553783395e-05, "learning_rate": 0.007484887813872968, "loss": 2.7633, "step": 4477 }, { "crossentropy": 2.6847840547561646, "epoch": 0.24350852388591315, "grad_norm": 0.040329091250896454, "grad_norm_var": 4.4767226851029586e-05, "learning_rate": 0.007483807214001725, "loss": 2.6848, "step": 4478 }, { "crossentropy": 2.7392786741256714, "epoch": 0.24356290274341336, "grad_norm": 0.03973797708749771, "grad_norm_var": 4.160420888045246e-05, "learning_rate": 0.007482726460087074, "loss": 2.7393, "step": 4479 }, { "crossentropy": 2.6765830516815186, "epoch": 0.24361728160091356, "grad_norm": 0.03812108933925629, "grad_norm_var": 4.27100880420211e-05, "learning_rate": 0.0074816455521960435, "loss": 2.6766, "step": 4480 }, { "crossentropy": 2.7029500007629395, "epoch": 0.24367166045841376, "grad_norm": 0.03862626105546951, "grad_norm_var": 4.281021627082377e-05, "learning_rate": 0.00748056449039567, "loss": 2.703, "step": 4481 }, { "crossentropy": 2.7328439950942993, "epoch": 0.24372603931591397, "grad_norm": 0.03883364051580429, "grad_norm_var": 4.298316216750156e-05, "learning_rate": 0.007479483274752998, "loss": 2.7328, "step": 4482 }, { "crossentropy": 2.7768094539642334, "epoch": 0.24378041817341417, "grad_norm": 0.03859337791800499, "grad_norm_var": 3.996830992475566e-05, "learning_rate": 0.007478401905335084, "loss": 2.7768, "step": 4483 }, { "crossentropy": 2.641988158226013, "epoch": 0.24383479703091437, "grad_norm": 0.04582240432500839, "grad_norm_var": 4.129259313134046e-05, "learning_rate": 0.007477320382208996, "loss": 2.642, "step": 4484 }, { "crossentropy": 2.7715483903884888, "epoch": 0.24388917588841458, "grad_norm": 0.04245985299348831, "grad_norm_var": 5.891630541326962e-06, "learning_rate": 0.007476238705441807, "loss": 2.7715, "step": 4485 }, { "crossentropy": 2.701271176338196, "epoch": 0.24394355474591478, "grad_norm": 0.04451083019375801, "grad_norm_var": 7.148610594814343e-06, "learning_rate": 0.007475156875100602, "loss": 2.7013, "step": 4486 }, { "crossentropy": 2.5922337770462036, "epoch": 0.24399793360341498, "grad_norm": 0.03796731308102608, "grad_norm_var": 6.742639710122002e-06, "learning_rate": 0.007474074891252476, "loss": 2.5922, "step": 4487 }, { "crossentropy": 2.614925742149353, "epoch": 0.2440523124609152, "grad_norm": 0.03517201915383339, "grad_norm_var": 7.868478686914812e-06, "learning_rate": 0.007472992753964532, "loss": 2.6149, "step": 4488 }, { "crossentropy": 2.661274552345276, "epoch": 0.2441066913184154, "grad_norm": 0.03495502099394798, "grad_norm_var": 9.145668060019039e-06, "learning_rate": 0.007471910463303881, "loss": 2.6613, "step": 4489 }, { "crossentropy": 2.609672427177429, "epoch": 0.2441610701759156, "grad_norm": 0.03672695904970169, "grad_norm_var": 8.655196477497008e-06, "learning_rate": 0.00747082801933765, "loss": 2.6097, "step": 4490 }, { "crossentropy": 2.7257566452026367, "epoch": 0.2442154490334158, "grad_norm": 0.037050165235996246, "grad_norm_var": 8.913444717434383e-06, "learning_rate": 0.007469745422132967, "loss": 2.7258, "step": 4491 }, { "crossentropy": 2.7644554376602173, "epoch": 0.244269827890916, "grad_norm": 0.05807671323418617, "grad_norm_var": 3.101556707397057e-05, "learning_rate": 0.0074686626717569765, "loss": 2.7645, "step": 4492 }, { "crossentropy": 2.694704055786133, "epoch": 0.2443242067484162, "grad_norm": 0.03808419033885002, "grad_norm_var": 3.1358193447743264e-05, "learning_rate": 0.007467579768276829, "loss": 2.6947, "step": 4493 }, { "crossentropy": 2.7175393104553223, "epoch": 0.2443785856059164, "grad_norm": 0.04090627655386925, "grad_norm_var": 3.137996989897931e-05, "learning_rate": 0.007466496711759684, "loss": 2.7175, "step": 4494 }, { "crossentropy": 2.763167977333069, "epoch": 0.2444329644634166, "grad_norm": 0.05043312907218933, "grad_norm_var": 3.7652425399384295e-05, "learning_rate": 0.007465413502272714, "loss": 2.7632, "step": 4495 }, { "crossentropy": 2.761210083961487, "epoch": 0.24448734332091682, "grad_norm": 0.04674585908651352, "grad_norm_var": 3.896654990411044e-05, "learning_rate": 0.007464330139883093, "loss": 2.7612, "step": 4496 }, { "crossentropy": 2.706866979598999, "epoch": 0.24454172217841702, "grad_norm": 0.039771661162376404, "grad_norm_var": 3.860046728629459e-05, "learning_rate": 0.007463246624658018, "loss": 2.7069, "step": 4497 }, { "crossentropy": 2.7035967111587524, "epoch": 0.24459610103591722, "grad_norm": 0.03592195734381676, "grad_norm_var": 4.021666468982382e-05, "learning_rate": 0.007462162956664682, "loss": 2.7036, "step": 4498 }, { "crossentropy": 2.6793243885040283, "epoch": 0.24465047989341743, "grad_norm": 0.03522447124123573, "grad_norm_var": 4.220910581024754e-05, "learning_rate": 0.007461079135970296, "loss": 2.6793, "step": 4499 }, { "crossentropy": 2.6112927198410034, "epoch": 0.24470485875091763, "grad_norm": 0.03801336511969566, "grad_norm_var": 4.1248473461949485e-05, "learning_rate": 0.007459995162642076, "loss": 2.6113, "step": 4500 }, { "crossentropy": 2.7400633096694946, "epoch": 0.24475923760841783, "grad_norm": 0.03760787844657898, "grad_norm_var": 4.161447185389554e-05, "learning_rate": 0.007458911036747249, "loss": 2.7401, "step": 4501 }, { "crossentropy": 2.762709617614746, "epoch": 0.24481361646591804, "grad_norm": 0.040878914296627045, "grad_norm_var": 4.047144422440422e-05, "learning_rate": 0.007457826758353054, "loss": 2.7627, "step": 4502 }, { "crossentropy": 2.5953110456466675, "epoch": 0.24486799532341824, "grad_norm": 0.040277447551488876, "grad_norm_var": 4.0110815377561856e-05, "learning_rate": 0.007456742327526731, "loss": 2.5953, "step": 4503 }, { "crossentropy": 2.722172260284424, "epoch": 0.24492237418091845, "grad_norm": 0.04055727273225784, "grad_norm_var": 3.819436879556303e-05, "learning_rate": 0.007455657744335543, "loss": 2.7222, "step": 4504 }, { "crossentropy": 2.6132417917251587, "epoch": 0.24497675303841868, "grad_norm": 0.04455357417464256, "grad_norm_var": 3.6597665735426986e-05, "learning_rate": 0.00745457300884675, "loss": 2.6132, "step": 4505 }, { "crossentropy": 2.691275477409363, "epoch": 0.24503113189591888, "grad_norm": 0.036559347063302994, "grad_norm_var": 3.670166279891995e-05, "learning_rate": 0.0074534881211276265, "loss": 2.6913, "step": 4506 }, { "crossentropy": 2.50054669380188, "epoch": 0.24508551075341908, "grad_norm": 0.03589232265949249, "grad_norm_var": 3.7440206177152764e-05, "learning_rate": 0.007452403081245458, "loss": 2.5005, "step": 4507 }, { "crossentropy": 2.60702383518219, "epoch": 0.2451398896109193, "grad_norm": 0.04151557758450508, "grad_norm_var": 1.7357826091534723e-05, "learning_rate": 0.007451317889267537, "loss": 2.607, "step": 4508 }, { "crossentropy": 2.7214958667755127, "epoch": 0.2451942684684195, "grad_norm": 0.040114279836416245, "grad_norm_var": 1.7047044295710225e-05, "learning_rate": 0.007450232545261165, "loss": 2.7215, "step": 4509 }, { "crossentropy": 2.5915491580963135, "epoch": 0.2452486473259197, "grad_norm": 0.03774286061525345, "grad_norm_var": 1.7421343059766346e-05, "learning_rate": 0.007449147049293656, "loss": 2.5915, "step": 4510 }, { "crossentropy": 2.68216872215271, "epoch": 0.2453030261834199, "grad_norm": 0.0395955890417099, "grad_norm_var": 9.849641806125344e-06, "learning_rate": 0.007448061401432329, "loss": 2.6822, "step": 4511 }, { "crossentropy": 2.7059860229492188, "epoch": 0.2453574050409201, "grad_norm": 0.039331261068582535, "grad_norm_var": 6.058812265691214e-06, "learning_rate": 0.007446975601744516, "loss": 2.706, "step": 4512 }, { "crossentropy": 2.6763776540756226, "epoch": 0.2454117838984203, "grad_norm": 0.03928300738334656, "grad_norm_var": 6.021658716305554e-06, "learning_rate": 0.007445889650297558, "loss": 2.6764, "step": 4513 }, { "crossentropy": 2.7509982585906982, "epoch": 0.2454661627559205, "grad_norm": 0.03961081802845001, "grad_norm_var": 5.386825665505921e-06, "learning_rate": 0.007444803547158805, "loss": 2.751, "step": 4514 }, { "crossentropy": 2.7304271459579468, "epoch": 0.2455205416134207, "grad_norm": 0.03660336881875992, "grad_norm_var": 4.779826734197372e-06, "learning_rate": 0.007443717292395615, "loss": 2.7304, "step": 4515 }, { "crossentropy": 2.7142070531845093, "epoch": 0.24557492047092092, "grad_norm": 0.04058936610817909, "grad_norm_var": 4.766881575914092e-06, "learning_rate": 0.007442630886075357, "loss": 2.7142, "step": 4516 }, { "crossentropy": 2.7039129734039307, "epoch": 0.24562929932842112, "grad_norm": 0.043337631970644, "grad_norm_var": 5.434699684168237e-06, "learning_rate": 0.0074415443282654075, "loss": 2.7039, "step": 4517 }, { "crossentropy": 2.805156707763672, "epoch": 0.24568367818592132, "grad_norm": 0.038252778351306915, "grad_norm_var": 5.480132487687555e-06, "learning_rate": 0.007440457619033156, "loss": 2.8052, "step": 4518 }, { "crossentropy": 2.696630597114563, "epoch": 0.24573805704342153, "grad_norm": 0.03822094202041626, "grad_norm_var": 5.562412136867791e-06, "learning_rate": 0.007439370758445998, "loss": 2.6966, "step": 4519 }, { "crossentropy": 2.6943622827529907, "epoch": 0.24579243590092173, "grad_norm": 0.0374457873404026, "grad_norm_var": 5.7226477792528265e-06, "learning_rate": 0.007438283746571339, "loss": 2.6944, "step": 4520 }, { "crossentropy": 2.6635749340057373, "epoch": 0.24584681475842193, "grad_norm": 0.0368206612765789, "grad_norm_var": 4.03353292282004e-06, "learning_rate": 0.007437196583476596, "loss": 2.6636, "step": 4521 }, { "crossentropy": 2.7083953619003296, "epoch": 0.24590119361592214, "grad_norm": 0.03622160106897354, "grad_norm_var": 4.141890678617475e-06, "learning_rate": 0.007436109269229193, "loss": 2.7084, "step": 4522 }, { "crossentropy": 2.714311122894287, "epoch": 0.24595557247342234, "grad_norm": 0.038338806480169296, "grad_norm_var": 3.572021783996432e-06, "learning_rate": 0.007435021803896565, "loss": 2.7143, "step": 4523 }, { "crossentropy": 2.6622039079666138, "epoch": 0.24600995133092254, "grad_norm": 0.03566554933786392, "grad_norm_var": 3.701224898055612e-06, "learning_rate": 0.0074339341875461545, "loss": 2.6622, "step": 4524 }, { "crossentropy": 2.618623733520508, "epoch": 0.24606433018842275, "grad_norm": 0.03788888081908226, "grad_norm_var": 3.5535385917453397e-06, "learning_rate": 0.007432846420245415, "loss": 2.6186, "step": 4525 }, { "crossentropy": 2.586141586303711, "epoch": 0.24611870904592295, "grad_norm": 0.039108846336603165, "grad_norm_var": 3.5442243110359503e-06, "learning_rate": 0.007431758502061807, "loss": 2.5861, "step": 4526 }, { "crossentropy": 2.759344696998596, "epoch": 0.24617308790342315, "grad_norm": 0.037348151206970215, "grad_norm_var": 3.5375054934503233e-06, "learning_rate": 0.007430670433062806, "loss": 2.7593, "step": 4527 }, { "crossentropy": 2.677112340927124, "epoch": 0.24622746676092336, "grad_norm": 0.036531608551740646, "grad_norm_var": 3.6719978699473186e-06, "learning_rate": 0.00742958221331589, "loss": 2.6771, "step": 4528 }, { "crossentropy": 2.6290313005447388, "epoch": 0.24628184561842356, "grad_norm": 0.037354305386543274, "grad_norm_var": 3.6270744135249074e-06, "learning_rate": 0.007428493842888549, "loss": 2.629, "step": 4529 }, { "crossentropy": 2.492087483406067, "epoch": 0.24633622447592377, "grad_norm": 0.03884594142436981, "grad_norm_var": 3.5078976460702534e-06, "learning_rate": 0.0074274053218482865, "loss": 2.4921, "step": 4530 }, { "crossentropy": 2.5384403467178345, "epoch": 0.24639060333342397, "grad_norm": 0.03809184953570366, "grad_norm_var": 3.362067220684326e-06, "learning_rate": 0.007426316650262606, "loss": 2.5384, "step": 4531 }, { "crossentropy": 2.7263076305389404, "epoch": 0.24644498219092417, "grad_norm": 0.039324019104242325, "grad_norm_var": 3.047026944390183e-06, "learning_rate": 0.007425227828199032, "loss": 2.7263, "step": 4532 }, { "crossentropy": 2.716739773750305, "epoch": 0.24649936104842438, "grad_norm": 0.044765833765268326, "grad_norm_var": 4.181450781168215e-06, "learning_rate": 0.007424138855725087, "loss": 2.7167, "step": 4533 }, { "crossentropy": 2.6953235864639282, "epoch": 0.24655373990592458, "grad_norm": 0.046658456325531006, "grad_norm_var": 8.724823145590289e-06, "learning_rate": 0.0074230497329083114, "loss": 2.6953, "step": 4534 }, { "crossentropy": 2.5874578952789307, "epoch": 0.24660811876342478, "grad_norm": 0.036532703787088394, "grad_norm_var": 9.002790929230065e-06, "learning_rate": 0.007421960459816249, "loss": 2.5875, "step": 4535 }, { "crossentropy": 2.6885409355163574, "epoch": 0.246662497620925, "grad_norm": 0.035656556487083435, "grad_norm_var": 9.468432827008412e-06, "learning_rate": 0.007420871036516459, "loss": 2.6885, "step": 4536 }, { "crossentropy": 2.6035637855529785, "epoch": 0.2467168764784252, "grad_norm": 0.03655853495001793, "grad_norm_var": 9.52957190955833e-06, "learning_rate": 0.007419781463076503, "loss": 2.6036, "step": 4537 }, { "crossentropy": 2.661062002182007, "epoch": 0.2467712553359254, "grad_norm": 0.05228159576654434, "grad_norm_var": 2.0919311151210094e-05, "learning_rate": 0.007418691739563957, "loss": 2.6611, "step": 4538 }, { "crossentropy": 2.727553606033325, "epoch": 0.2468256341934256, "grad_norm": 0.03672188147902489, "grad_norm_var": 2.1318929760344383e-05, "learning_rate": 0.007417601866046403, "loss": 2.7276, "step": 4539 }, { "crossentropy": 2.682394862174988, "epoch": 0.2468800130509258, "grad_norm": 0.03841321915388107, "grad_norm_var": 2.0447039145985294e-05, "learning_rate": 0.007416511842591436, "loss": 2.6824, "step": 4540 }, { "crossentropy": 2.7369827032089233, "epoch": 0.246934391908426, "grad_norm": 0.03897121176123619, "grad_norm_var": 2.028700922030117e-05, "learning_rate": 0.007415421669266658, "loss": 2.737, "step": 4541 }, { "crossentropy": 2.6336318254470825, "epoch": 0.2469887707659262, "grad_norm": 0.037657465785741806, "grad_norm_var": 2.0508447896275205e-05, "learning_rate": 0.007414331346139679, "loss": 2.6336, "step": 4542 }, { "crossentropy": 2.6935441493988037, "epoch": 0.2470431496234264, "grad_norm": 0.03850090876221657, "grad_norm_var": 2.0263513491697075e-05, "learning_rate": 0.007413240873278122, "loss": 2.6935, "step": 4543 }, { "crossentropy": 2.643980026245117, "epoch": 0.24709752848092661, "grad_norm": 0.03586117550730705, "grad_norm_var": 2.056179252226837e-05, "learning_rate": 0.007412150250749614, "loss": 2.644, "step": 4544 }, { "crossentropy": 2.758555293083191, "epoch": 0.24715190733842682, "grad_norm": 0.038582704961299896, "grad_norm_var": 2.0302663925124882e-05, "learning_rate": 0.007411059478621798, "loss": 2.7586, "step": 4545 }, { "crossentropy": 2.6904388666152954, "epoch": 0.24720628619592702, "grad_norm": 0.038409292697906494, "grad_norm_var": 2.035784126419027e-05, "learning_rate": 0.007409968556962321, "loss": 2.6904, "step": 4546 }, { "crossentropy": 2.597109794616699, "epoch": 0.24726066505342723, "grad_norm": 0.0490947924554348, "grad_norm_var": 2.576801164598889e-05, "learning_rate": 0.007408877485838841, "loss": 2.5971, "step": 4547 }, { "crossentropy": 2.568037509918213, "epoch": 0.24731504391092743, "grad_norm": 0.040541987866163254, "grad_norm_var": 2.5710449616441154e-05, "learning_rate": 0.007407786265319023, "loss": 2.568, "step": 4548 }, { "crossentropy": 2.707917094230652, "epoch": 0.24736942276842763, "grad_norm": 0.03642386198043823, "grad_norm_var": 2.5120934264125498e-05, "learning_rate": 0.007406694895470547, "loss": 2.7079, "step": 4549 }, { "crossentropy": 2.670075297355652, "epoch": 0.24742380162592784, "grad_norm": 0.03769327700138092, "grad_norm_var": 2.195098831439529e-05, "learning_rate": 0.007405603376361098, "loss": 2.6701, "step": 4550 }, { "crossentropy": 2.764753580093384, "epoch": 0.24747818048342804, "grad_norm": 0.03954852744936943, "grad_norm_var": 2.1429270113490898e-05, "learning_rate": 0.0074045117080583705, "loss": 2.7648, "step": 4551 }, { "crossentropy": 2.650825619697571, "epoch": 0.24753255934092824, "grad_norm": 0.04973771795630455, "grad_norm_var": 2.6732777735223822e-05, "learning_rate": 0.007403419890630067, "loss": 2.6508, "step": 4552 }, { "crossentropy": 2.663129448890686, "epoch": 0.24758693819842845, "grad_norm": 0.03774520009756088, "grad_norm_var": 2.622684692870399e-05, "learning_rate": 0.007402327924143903, "loss": 2.6631, "step": 4553 }, { "crossentropy": 2.7611531019210815, "epoch": 0.24764131705592865, "grad_norm": 0.038050178438425064, "grad_norm_var": 1.6314062103973497e-05, "learning_rate": 0.007401235808667602, "loss": 2.7612, "step": 4554 }, { "crossentropy": 2.5858813524246216, "epoch": 0.24769569591342885, "grad_norm": 0.041122615337371826, "grad_norm_var": 1.589607322334859e-05, "learning_rate": 0.0074001435442688935, "loss": 2.5859, "step": 4555 }, { "crossentropy": 2.6879035234451294, "epoch": 0.24775007477092906, "grad_norm": 0.043908290565013885, "grad_norm_var": 1.678766709804902e-05, "learning_rate": 0.007399051131015521, "loss": 2.6879, "step": 4556 }, { "crossentropy": 2.656158924102783, "epoch": 0.24780445362842926, "grad_norm": 0.042752668261528015, "grad_norm_var": 1.7104398847566974e-05, "learning_rate": 0.007397958568975234, "loss": 2.6562, "step": 4557 }, { "crossentropy": 2.675718307495117, "epoch": 0.24785883248592946, "grad_norm": 0.041049279272556305, "grad_norm_var": 1.660488050010533e-05, "learning_rate": 0.0073968658582157926, "loss": 2.6757, "step": 4558 }, { "crossentropy": 2.6937915086746216, "epoch": 0.24791321134342967, "grad_norm": 0.03738602623343468, "grad_norm_var": 1.6989232139664966e-05, "learning_rate": 0.0073957729988049625, "loss": 2.6938, "step": 4559 }, { "crossentropy": 2.6480886936187744, "epoch": 0.24796759020092987, "grad_norm": 0.039547912776470184, "grad_norm_var": 1.556128940951376e-05, "learning_rate": 0.007394679990810528, "loss": 2.6481, "step": 4560 }, { "crossentropy": 2.68385112285614, "epoch": 0.24802196905843008, "grad_norm": 0.03594893217086792, "grad_norm_var": 1.674702159936703e-05, "learning_rate": 0.007393586834300271, "loss": 2.6839, "step": 4561 }, { "crossentropy": 2.6918694972991943, "epoch": 0.24807634791593028, "grad_norm": 0.035800911486148834, "grad_norm_var": 1.7920244003941792e-05, "learning_rate": 0.00739249352934199, "loss": 2.6919, "step": 4562 }, { "crossentropy": 2.771610140800476, "epoch": 0.24813072677343048, "grad_norm": 0.03961705043911934, "grad_norm_var": 1.2543091709876775e-05, "learning_rate": 0.007391400076003492, "loss": 2.7716, "step": 4563 }, { "crossentropy": 2.484560489654541, "epoch": 0.24818510563093069, "grad_norm": 0.03790056332945824, "grad_norm_var": 1.2719479778034819e-05, "learning_rate": 0.0073903064743525895, "loss": 2.4846, "step": 4564 }, { "crossentropy": 2.6908695697784424, "epoch": 0.2482394844884309, "grad_norm": 0.03836733475327492, "grad_norm_var": 1.2122263951261774e-05, "learning_rate": 0.00738921272445711, "loss": 2.6909, "step": 4565 }, { "crossentropy": 2.632898688316345, "epoch": 0.2482938633459311, "grad_norm": 0.04471246898174286, "grad_norm_var": 1.3266385593757995e-05, "learning_rate": 0.007388118826384882, "loss": 2.6329, "step": 4566 }, { "crossentropy": 2.7052340507507324, "epoch": 0.2483482422034313, "grad_norm": 0.0354502834379673, "grad_norm_var": 1.4671949019045595e-05, "learning_rate": 0.007387024780203753, "loss": 2.7052, "step": 4567 }, { "crossentropy": 2.691611170768738, "epoch": 0.2484026210609315, "grad_norm": 0.03625689446926117, "grad_norm_var": 8.42584788594188e-06, "learning_rate": 0.007385930585981572, "loss": 2.6916, "step": 4568 }, { "crossentropy": 2.820786476135254, "epoch": 0.2484569999184317, "grad_norm": 0.037424251437187195, "grad_norm_var": 8.490306475217082e-06, "learning_rate": 0.0073848362437862, "loss": 2.8208, "step": 4569 }, { "crossentropy": 2.6287835836410522, "epoch": 0.2485113787759319, "grad_norm": 0.03846605867147446, "grad_norm_var": 8.44395764690812e-06, "learning_rate": 0.007383741753685508, "loss": 2.6288, "step": 4570 }, { "crossentropy": 2.6111422777175903, "epoch": 0.2485657576334321, "grad_norm": 0.03754592686891556, "grad_norm_var": 8.282257341794771e-06, "learning_rate": 0.007382647115747375, "loss": 2.6111, "step": 4571 }, { "crossentropy": 2.664872169494629, "epoch": 0.24862013649093231, "grad_norm": 0.03625207021832466, "grad_norm_var": 6.816337210862241e-06, "learning_rate": 0.00738155233003969, "loss": 2.6649, "step": 4572 }, { "crossentropy": 2.6952311992645264, "epoch": 0.24867451534843252, "grad_norm": 0.03963451087474823, "grad_norm_var": 5.616421423908687e-06, "learning_rate": 0.007380457396630347, "loss": 2.6952, "step": 4573 }, { "crossentropy": 2.6095880270004272, "epoch": 0.24872889420593272, "grad_norm": 0.04247136414051056, "grad_norm_var": 6.281170604149668e-06, "learning_rate": 0.007379362315587257, "loss": 2.6096, "step": 4574 }, { "crossentropy": 2.541927218437195, "epoch": 0.24878327306343292, "grad_norm": 0.040536537766456604, "grad_norm_var": 6.518054834396489e-06, "learning_rate": 0.007378267086978335, "loss": 2.5419, "step": 4575 }, { "crossentropy": 2.7747284173965454, "epoch": 0.24883765192093313, "grad_norm": 0.04015728458762169, "grad_norm_var": 6.626745545073217e-06, "learning_rate": 0.007377171710871507, "loss": 2.7747, "step": 4576 }, { "crossentropy": 2.757659435272217, "epoch": 0.24889203077843333, "grad_norm": 0.041671667248010635, "grad_norm_var": 6.701188036058123e-06, "learning_rate": 0.007376076187334701, "loss": 2.7577, "step": 4577 }, { "crossentropy": 2.6141903400421143, "epoch": 0.24894640963593354, "grad_norm": 0.03751303628087044, "grad_norm_var": 6.17885216229741e-06, "learning_rate": 0.0073749805164358695, "loss": 2.6142, "step": 4578 }, { "crossentropy": 2.7197344303131104, "epoch": 0.24900078849343377, "grad_norm": 0.03627306595444679, "grad_norm_var": 6.601988258221151e-06, "learning_rate": 0.007373884698242959, "loss": 2.7197, "step": 4579 }, { "crossentropy": 2.6962368488311768, "epoch": 0.24905516735093397, "grad_norm": 0.035837266594171524, "grad_norm_var": 7.112637543615676e-06, "learning_rate": 0.0073727887328239325, "loss": 2.6962, "step": 4580 }, { "crossentropy": 2.680193543434143, "epoch": 0.24910954620843417, "grad_norm": 0.03635336086153984, "grad_norm_var": 7.444900781528041e-06, "learning_rate": 0.007371692620246762, "loss": 2.6802, "step": 4581 }, { "crossentropy": 2.689908504486084, "epoch": 0.24916392506593438, "grad_norm": 0.036273788660764694, "grad_norm_var": 4.9447059521645055e-06, "learning_rate": 0.007370596360579426, "loss": 2.6899, "step": 4582 }, { "crossentropy": 2.7734345197677612, "epoch": 0.24921830392343458, "grad_norm": 0.045395705848932266, "grad_norm_var": 7.735875078291633e-06, "learning_rate": 0.007369499953889913, "loss": 2.7734, "step": 4583 }, { "crossentropy": 2.6384698152542114, "epoch": 0.24927268278093478, "grad_norm": 0.03945920616388321, "grad_norm_var": 7.3640028629551674e-06, "learning_rate": 0.0073684034002462215, "loss": 2.6385, "step": 4584 }, { "crossentropy": 2.6377826929092407, "epoch": 0.249327061638435, "grad_norm": 0.03676271066069603, "grad_norm_var": 7.51526766218924e-06, "learning_rate": 0.007367306699716361, "loss": 2.6378, "step": 4585 }, { "crossentropy": 2.6971852779388428, "epoch": 0.2493814404959352, "grad_norm": 0.0369936041533947, "grad_norm_var": 7.713926704638364e-06, "learning_rate": 0.007366209852368344, "loss": 2.6972, "step": 4586 }, { "crossentropy": 2.7857182025909424, "epoch": 0.2494358193534354, "grad_norm": 0.036353450268507004, "grad_norm_var": 7.985611147579703e-06, "learning_rate": 0.007365112858270199, "loss": 2.7857, "step": 4587 }, { "crossentropy": 2.7268359661102295, "epoch": 0.2494901982109356, "grad_norm": 0.043460406363010406, "grad_norm_var": 8.956154398714368e-06, "learning_rate": 0.007364015717489959, "loss": 2.7268, "step": 4588 }, { "crossentropy": 2.6823865175247192, "epoch": 0.2495445770684358, "grad_norm": 0.038330405950546265, "grad_norm_var": 8.964583039268853e-06, "learning_rate": 0.0073629184300956696, "loss": 2.6824, "step": 4589 }, { "crossentropy": 2.6796988248825073, "epoch": 0.249598955925936, "grad_norm": 0.036868125200271606, "grad_norm_var": 8.326062551531193e-06, "learning_rate": 0.007361820996155382, "loss": 2.6797, "step": 4590 }, { "crossentropy": 2.700947403907776, "epoch": 0.2496533347834362, "grad_norm": 0.037738699465990067, "grad_norm_var": 8.10780331518916e-06, "learning_rate": 0.007360723415737156, "loss": 2.7009, "step": 4591 }, { "crossentropy": 2.6483579874038696, "epoch": 0.2497077136409364, "grad_norm": 0.03537853807210922, "grad_norm_var": 8.456883967386936e-06, "learning_rate": 0.007359625688909066, "loss": 2.6484, "step": 4592 }, { "crossentropy": 2.718071222305298, "epoch": 0.24976209249843662, "grad_norm": 0.03640168905258179, "grad_norm_var": 7.729679599112703e-06, "learning_rate": 0.007358527815739191, "loss": 2.7181, "step": 4593 }, { "crossentropy": 2.6674509048461914, "epoch": 0.24981647135593682, "grad_norm": 0.04588029906153679, "grad_norm_var": 1.1743873346179805e-05, "learning_rate": 0.007357429796295618, "loss": 2.6675, "step": 4594 }, { "crossentropy": 2.6101983785629272, "epoch": 0.24987085021343702, "grad_norm": 0.06397505849599838, "grad_norm_var": 5.1998025843747314e-05, "learning_rate": 0.00735633163064645, "loss": 2.6102, "step": 4595 }, { "crossentropy": 2.6492879390716553, "epoch": 0.24992522907093723, "grad_norm": 0.03732939064502716, "grad_norm_var": 5.129081973596442e-05, "learning_rate": 0.0073552333188597885, "loss": 2.6493, "step": 4596 }, { "crossentropy": 2.649802565574646, "epoch": 0.24997960792843743, "grad_norm": 0.04746710881590843, "grad_norm_var": 5.333319732051811e-05, "learning_rate": 0.0073541348610037536, "loss": 2.6498, "step": 4597 }, { "crossentropy": 2.6893537044525146, "epoch": 0.25003398678593763, "grad_norm": 0.0396851971745491, "grad_norm_var": 5.19657340911606e-05, "learning_rate": 0.0073530362571464685, "loss": 2.6894, "step": 4598 }, { "crossentropy": 2.695223331451416, "epoch": 0.25008836564343784, "grad_norm": 0.039273474365472794, "grad_norm_var": 5.0795624534324434e-05, "learning_rate": 0.00735193750735607, "loss": 2.6952, "step": 4599 }, { "crossentropy": 2.618735194206238, "epoch": 0.25014274450093804, "grad_norm": 0.046284135431051254, "grad_norm_var": 5.256879258470996e-05, "learning_rate": 0.007350838611700699, "loss": 2.6187, "step": 4600 }, { "crossentropy": 2.614606022834778, "epoch": 0.25019712335843824, "grad_norm": 0.0366491973400116, "grad_norm_var": 5.263579408002385e-05, "learning_rate": 0.007349739570248508, "loss": 2.6146, "step": 4601 }, { "crossentropy": 2.6480653285980225, "epoch": 0.25025150221593845, "grad_norm": 0.03905168175697327, "grad_norm_var": 5.176564690645586e-05, "learning_rate": 0.007348640383067659, "loss": 2.6481, "step": 4602 }, { "crossentropy": 2.618582010269165, "epoch": 0.25030588107343865, "grad_norm": 0.03616886958479881, "grad_norm_var": 5.188847921152757e-05, "learning_rate": 0.007347541050226325, "loss": 2.6186, "step": 4603 }, { "crossentropy": 2.678531050682068, "epoch": 0.25036025993093886, "grad_norm": 0.037720005959272385, "grad_norm_var": 5.2253414547249036e-05, "learning_rate": 0.007346441571792685, "loss": 2.6785, "step": 4604 }, { "crossentropy": 2.6334545612335205, "epoch": 0.25041463878843906, "grad_norm": 0.041603319346904755, "grad_norm_var": 5.180697468578071e-05, "learning_rate": 0.0073453419478349226, "loss": 2.6335, "step": 4605 }, { "crossentropy": 2.6739877462387085, "epoch": 0.25046901764593926, "grad_norm": 0.03787006437778473, "grad_norm_var": 5.1305418605808394e-05, "learning_rate": 0.007344242178421242, "loss": 2.674, "step": 4606 }, { "crossentropy": 2.713192343711853, "epoch": 0.25052339650343947, "grad_norm": 0.03923957794904709, "grad_norm_var": 5.076258901998305e-05, "learning_rate": 0.007343142263619846, "loss": 2.7132, "step": 4607 }, { "crossentropy": 2.7371256351470947, "epoch": 0.25057777536093967, "grad_norm": 0.03728117048740387, "grad_norm_var": 4.9499696853421336e-05, "learning_rate": 0.007342042203498951, "loss": 2.7371, "step": 4608 }, { "crossentropy": 2.6406800746917725, "epoch": 0.2506321542184399, "grad_norm": 0.05063129588961601, "grad_norm_var": 5.273323701253009e-05, "learning_rate": 0.007340941998126782, "loss": 2.6407, "step": 4609 }, { "crossentropy": 2.7068077325820923, "epoch": 0.2506865330759401, "grad_norm": 0.03561597689986229, "grad_norm_var": 5.435906098538209e-05, "learning_rate": 0.007339841647571574, "loss": 2.7068, "step": 4610 }, { "crossentropy": 2.6086381673812866, "epoch": 0.2507409119334403, "grad_norm": 0.03525294363498688, "grad_norm_var": 2.0289953458893447e-05, "learning_rate": 0.007338741151901567, "loss": 2.6086, "step": 4611 }, { "crossentropy": 2.5224218368530273, "epoch": 0.2507952907909405, "grad_norm": 0.03792836144566536, "grad_norm_var": 2.0113452345504646e-05, "learning_rate": 0.007337640511185016, "loss": 2.5224, "step": 4612 }, { "crossentropy": 2.631919264793396, "epoch": 0.2508496696484407, "grad_norm": 0.04359348118305206, "grad_norm_var": 1.7121102329218688e-05, "learning_rate": 0.007336539725490178, "loss": 2.6319, "step": 4613 }, { "crossentropy": 2.745334029197693, "epoch": 0.2509040485059409, "grad_norm": 0.038821205496788025, "grad_norm_var": 1.7159733813549607e-05, "learning_rate": 0.007335438794885324, "loss": 2.7453, "step": 4614 }, { "crossentropy": 2.6904197931289673, "epoch": 0.2509584273634411, "grad_norm": 0.03492918983101845, "grad_norm_var": 1.850614725081943e-05, "learning_rate": 0.007334337719438735, "loss": 2.6904, "step": 4615 }, { "crossentropy": 2.8139747381210327, "epoch": 0.2510128062209413, "grad_norm": 0.0378054678440094, "grad_norm_var": 1.5092375429004203e-05, "learning_rate": 0.007333236499218696, "loss": 2.814, "step": 4616 }, { "crossentropy": 2.6305665969848633, "epoch": 0.2510671850784415, "grad_norm": 0.038656748831272125, "grad_norm_var": 1.4779230597880555e-05, "learning_rate": 0.007332135134293505, "loss": 2.6306, "step": 4617 }, { "crossentropy": 2.653324007987976, "epoch": 0.2511215639359417, "grad_norm": 0.03878576681017876, "grad_norm_var": 1.4777761006756705e-05, "learning_rate": 0.007331033624731468, "loss": 2.6533, "step": 4618 }, { "crossentropy": 2.65482497215271, "epoch": 0.2511759427934419, "grad_norm": 0.03642551228404045, "grad_norm_var": 1.4689482950290486e-05, "learning_rate": 0.007329931970600897, "loss": 2.6548, "step": 4619 }, { "crossentropy": 2.730894684791565, "epoch": 0.2512303216509421, "grad_norm": 0.03763047605752945, "grad_norm_var": 1.4703890898287488e-05, "learning_rate": 0.007328830171970119, "loss": 2.7309, "step": 4620 }, { "crossentropy": 2.6430881023406982, "epoch": 0.2512847005084423, "grad_norm": 0.035412658005952835, "grad_norm_var": 1.4850785432032828e-05, "learning_rate": 0.0073277282289074645, "loss": 2.6431, "step": 4621 }, { "crossentropy": 2.6078954935073853, "epoch": 0.2513390793659425, "grad_norm": 0.03804340958595276, "grad_norm_var": 1.4838277451520849e-05, "learning_rate": 0.007326626141481274, "loss": 2.6079, "step": 4622 }, { "crossentropy": 2.505293130874634, "epoch": 0.2513934582234427, "grad_norm": 0.03476895019412041, "grad_norm_var": 1.5648567675254307e-05, "learning_rate": 0.0073255239097599015, "loss": 2.5053, "step": 4623 }, { "crossentropy": 2.7526315450668335, "epoch": 0.2514478370809429, "grad_norm": 0.03475896641612053, "grad_norm_var": 1.6363200931350923e-05, "learning_rate": 0.0073244215338117025, "loss": 2.7526, "step": 4624 }, { "crossentropy": 2.714635491371155, "epoch": 0.25150221593844313, "grad_norm": 0.037510983645915985, "grad_norm_var": 5.141181165934279e-06, "learning_rate": 0.0073233190137050475, "loss": 2.7146, "step": 4625 }, { "crossentropy": 2.633998990058899, "epoch": 0.25155659479594333, "grad_norm": 0.0374833345413208, "grad_norm_var": 4.953211615910519e-06, "learning_rate": 0.007322216349508314, "loss": 2.634, "step": 4626 }, { "crossentropy": 2.717471957206726, "epoch": 0.25161097365344354, "grad_norm": 0.04037882760167122, "grad_norm_var": 5.15328221858749e-06, "learning_rate": 0.007321113541289887, "loss": 2.7175, "step": 4627 }, { "crossentropy": 2.752997875213623, "epoch": 0.25166535251094374, "grad_norm": 0.04235786199569702, "grad_norm_var": 6.524275297107619e-06, "learning_rate": 0.007320010589118162, "loss": 2.753, "step": 4628 }, { "crossentropy": 2.6201943159103394, "epoch": 0.25171973136844394, "grad_norm": 0.04064524918794632, "grad_norm_var": 4.8530916444410856e-06, "learning_rate": 0.0073189074930615425, "loss": 2.6202, "step": 4629 }, { "crossentropy": 2.690687656402588, "epoch": 0.25177411022594415, "grad_norm": 0.03656025603413582, "grad_norm_var": 4.85747112900133e-06, "learning_rate": 0.007317804253188445, "loss": 2.6907, "step": 4630 }, { "crossentropy": 2.742868661880493, "epoch": 0.25182848908344435, "grad_norm": 0.034800294786691666, "grad_norm_var": 4.905004758163726e-06, "learning_rate": 0.007316700869567285, "loss": 2.7429, "step": 4631 }, { "crossentropy": 2.8042632341384888, "epoch": 0.25188286794094455, "grad_norm": 0.0383685939013958, "grad_norm_var": 4.938258136061707e-06, "learning_rate": 0.0073155973422665, "loss": 2.8043, "step": 4632 }, { "crossentropy": 2.72170627117157, "epoch": 0.25193724679844476, "grad_norm": 0.07294336706399918, "grad_norm_var": 8.296023747387474e-05, "learning_rate": 0.007314493671354524, "loss": 2.7217, "step": 4633 }, { "crossentropy": 2.584333896636963, "epoch": 0.25199162565594496, "grad_norm": 0.03770532086491585, "grad_norm_var": 8.317997842381541e-05, "learning_rate": 0.007313389856899811, "loss": 2.5843, "step": 4634 }, { "crossentropy": 2.7030948400497437, "epoch": 0.25204600451344517, "grad_norm": 0.11963855475187302, "grad_norm_var": 0.00047921300500205116, "learning_rate": 0.007312285898970814, "loss": 2.7031, "step": 4635 }, { "crossentropy": 2.694566488265991, "epoch": 0.25210038337094537, "grad_norm": 0.03503705561161041, "grad_norm_var": 0.00048216021426901527, "learning_rate": 0.0073111817976360014, "loss": 2.6946, "step": 4636 }, { "crossentropy": 2.684777617454529, "epoch": 0.2521547622284456, "grad_norm": 0.03698908910155296, "grad_norm_var": 0.00048034747716534283, "learning_rate": 0.007310077552963848, "loss": 2.6848, "step": 4637 }, { "crossentropy": 2.756254553794861, "epoch": 0.2522091410859458, "grad_norm": 0.04317750409245491, "grad_norm_var": 0.0004773187955322361, "learning_rate": 0.00730897316502284, "loss": 2.7563, "step": 4638 }, { "crossentropy": 2.709243059158325, "epoch": 0.252263519943446, "grad_norm": 0.04271681606769562, "grad_norm_var": 0.0004702179063951648, "learning_rate": 0.0073078686338814685, "loss": 2.7092, "step": 4639 }, { "crossentropy": 2.61380398273468, "epoch": 0.2523178988009462, "grad_norm": 0.03907034173607826, "grad_norm_var": 0.0004650947961764018, "learning_rate": 0.007306763959608235, "loss": 2.6138, "step": 4640 }, { "crossentropy": 2.6653449535369873, "epoch": 0.2523722776584464, "grad_norm": 0.03657980263233185, "grad_norm_var": 0.0004661981802639897, "learning_rate": 0.007305659142271652, "loss": 2.6653, "step": 4641 }, { "crossentropy": 2.6278587579727173, "epoch": 0.2524266565159466, "grad_norm": 0.03683120012283325, "grad_norm_var": 0.00046695688391022055, "learning_rate": 0.0073045541819402385, "loss": 2.6279, "step": 4642 }, { "crossentropy": 2.663241386413574, "epoch": 0.2524810353734468, "grad_norm": 0.035569656640291214, "grad_norm_var": 0.00047191865291748363, "learning_rate": 0.007303449078682522, "loss": 2.6632, "step": 4643 }, { "crossentropy": 2.6223970651626587, "epoch": 0.252535414230947, "grad_norm": 0.0400470532476902, "grad_norm_var": 0.0004732395928131137, "learning_rate": 0.007302343832567043, "loss": 2.6224, "step": 4644 }, { "crossentropy": 2.6335608959198, "epoch": 0.2525897930884472, "grad_norm": 0.03551829606294632, "grad_norm_var": 0.0004781447333216402, "learning_rate": 0.007301238443662345, "loss": 2.6336, "step": 4645 }, { "crossentropy": 2.680748701095581, "epoch": 0.2526441719459474, "grad_norm": 0.036360617727041245, "grad_norm_var": 0.0004783744611035256, "learning_rate": 0.007300132912036985, "loss": 2.6807, "step": 4646 }, { "crossentropy": 2.620506763458252, "epoch": 0.2526985508034476, "grad_norm": 0.03650274872779846, "grad_norm_var": 0.0004762211343216823, "learning_rate": 0.007299027237759526, "loss": 2.6205, "step": 4647 }, { "crossentropy": 2.689873218536377, "epoch": 0.2527529296609478, "grad_norm": 0.036513883620500565, "grad_norm_var": 0.00047812327639200233, "learning_rate": 0.00729792142089854, "loss": 2.6899, "step": 4648 }, { "crossentropy": 2.642221689224243, "epoch": 0.252807308518448, "grad_norm": 0.0348719023168087, "grad_norm_var": 0.00042724819424496584, "learning_rate": 0.00729681546152261, "loss": 2.6422, "step": 4649 }, { "crossentropy": 2.7075910568237305, "epoch": 0.2528616873759482, "grad_norm": 0.03677378222346306, "grad_norm_var": 0.0004279222497783447, "learning_rate": 0.007295709359700328, "loss": 2.7076, "step": 4650 }, { "crossentropy": 2.729104161262512, "epoch": 0.2529160662334484, "grad_norm": 0.04393688216805458, "grad_norm_var": 8.87814756678762e-06, "learning_rate": 0.00729460311550029, "loss": 2.7291, "step": 4651 }, { "crossentropy": 2.6394518613815308, "epoch": 0.2529704450909486, "grad_norm": 0.0506979376077652, "grad_norm_var": 1.8216323359948606e-05, "learning_rate": 0.007293496728991107, "loss": 2.6395, "step": 4652 }, { "crossentropy": 2.7290079593658447, "epoch": 0.25302482394844883, "grad_norm": 0.04515048488974571, "grad_norm_var": 2.0316412359920136e-05, "learning_rate": 0.007292390200241396, "loss": 2.729, "step": 4653 }, { "crossentropy": 2.691314220428467, "epoch": 0.25307920280594903, "grad_norm": 0.0407417006790638, "grad_norm_var": 1.9458753190191672e-05, "learning_rate": 0.00729128352931978, "loss": 2.6913, "step": 4654 }, { "crossentropy": 2.766525626182556, "epoch": 0.25313358166344924, "grad_norm": 0.03979288414120674, "grad_norm_var": 1.8638676512432577e-05, "learning_rate": 0.0072901767162948964, "loss": 2.7665, "step": 4655 }, { "crossentropy": 2.6255284547805786, "epoch": 0.25318796052094944, "grad_norm": 0.039012230932712555, "grad_norm_var": 1.8638807037912643e-05, "learning_rate": 0.0072890697612353885, "loss": 2.6255, "step": 4656 }, { "crossentropy": 2.6563068628311157, "epoch": 0.25324233937844964, "grad_norm": 0.03768226504325867, "grad_norm_var": 1.8350735874202392e-05, "learning_rate": 0.007287962664209906, "loss": 2.6563, "step": 4657 }, { "crossentropy": 2.672554850578308, "epoch": 0.25329671823594985, "grad_norm": 0.037187326699495316, "grad_norm_var": 1.8249734296856868e-05, "learning_rate": 0.0072868554252871155, "loss": 2.6726, "step": 4658 }, { "crossentropy": 2.487873673439026, "epoch": 0.25335109709345005, "grad_norm": 0.03427286446094513, "grad_norm_var": 1.897346415467363e-05, "learning_rate": 0.0072857480445356814, "loss": 2.4879, "step": 4659 }, { "crossentropy": 2.682829976081848, "epoch": 0.25340547595095025, "grad_norm": 0.03710825368762016, "grad_norm_var": 1.9129000250843928e-05, "learning_rate": 0.007284640522024285, "loss": 2.6828, "step": 4660 }, { "crossentropy": 2.545536160469055, "epoch": 0.25345985480845046, "grad_norm": 0.035472165793180466, "grad_norm_var": 1.91498270300983e-05, "learning_rate": 0.007283532857821613, "loss": 2.5455, "step": 4661 }, { "crossentropy": 2.7685540914535522, "epoch": 0.25351423366595066, "grad_norm": 0.039184462279081345, "grad_norm_var": 1.8699677665418892e-05, "learning_rate": 0.0072824250519963616, "loss": 2.7686, "step": 4662 }, { "crossentropy": 2.8266831636428833, "epoch": 0.25356861252345086, "grad_norm": 0.0368613600730896, "grad_norm_var": 1.8585614721020918e-05, "learning_rate": 0.007281317104617239, "loss": 2.8267, "step": 4663 }, { "crossentropy": 2.763224959373474, "epoch": 0.25362299138095107, "grad_norm": 0.035703253000974655, "grad_norm_var": 1.890390868742129e-05, "learning_rate": 0.007280209015752955, "loss": 2.7632, "step": 4664 }, { "crossentropy": 2.675117015838623, "epoch": 0.25367737023845127, "grad_norm": 0.03637426346540451, "grad_norm_var": 1.821242680818194e-05, "learning_rate": 0.0072791007854722325, "loss": 2.6751, "step": 4665 }, { "crossentropy": 2.6494492292404175, "epoch": 0.2537317490959515, "grad_norm": 0.03717055171728134, "grad_norm_var": 1.809803872399405e-05, "learning_rate": 0.007277992413843806, "loss": 2.6494, "step": 4666 }, { "crossentropy": 2.6513372659683228, "epoch": 0.2537861279534517, "grad_norm": 0.0419139601290226, "grad_norm_var": 1.7061808494820732e-05, "learning_rate": 0.007276883900936413, "loss": 2.6513, "step": 4667 }, { "crossentropy": 2.7302275896072388, "epoch": 0.2538405068109519, "grad_norm": 0.037201955914497375, "grad_norm_var": 7.432293473201047e-06, "learning_rate": 0.007275775246818802, "loss": 2.7302, "step": 4668 }, { "crossentropy": 2.703416347503662, "epoch": 0.2538948856684521, "grad_norm": 0.03614841774106026, "grad_norm_var": 4.126864067409824e-06, "learning_rate": 0.007274666451559734, "loss": 2.7034, "step": 4669 }, { "crossentropy": 2.6991981267929077, "epoch": 0.2539492645259523, "grad_norm": 0.045410145074129105, "grad_norm_var": 7.435726274072387e-06, "learning_rate": 0.007273557515227971, "loss": 2.6992, "step": 4670 }, { "crossentropy": 2.597445249557495, "epoch": 0.2540036433834525, "grad_norm": 0.049676086753606796, "grad_norm_var": 1.6027013582382324e-05, "learning_rate": 0.007272448437892292, "loss": 2.5974, "step": 4671 }, { "crossentropy": 2.745440363883972, "epoch": 0.2540580222409527, "grad_norm": 0.04207771271467209, "grad_norm_var": 1.681400566490154e-05, "learning_rate": 0.007271339219621479, "loss": 2.7454, "step": 4672 }, { "crossentropy": 2.7254685163497925, "epoch": 0.2541124010984529, "grad_norm": 0.035889383405447006, "grad_norm_var": 1.726185877474288e-05, "learning_rate": 0.007270229860484327, "loss": 2.7255, "step": 4673 }, { "crossentropy": 2.7211456298828125, "epoch": 0.2541667799559531, "grad_norm": 0.04507890343666077, "grad_norm_var": 1.966431053972978e-05, "learning_rate": 0.007269120360549634, "loss": 2.7211, "step": 4674 }, { "crossentropy": 2.7500884532928467, "epoch": 0.2542211588134533, "grad_norm": 0.035857830196619034, "grad_norm_var": 1.8801948312338614e-05, "learning_rate": 0.0072680107198862115, "loss": 2.7501, "step": 4675 }, { "crossentropy": 2.6942578554153442, "epoch": 0.2542755376709535, "grad_norm": 0.0389220155775547, "grad_norm_var": 1.8502776870621174e-05, "learning_rate": 0.007266900938562878, "loss": 2.6943, "step": 4676 }, { "crossentropy": 2.6529994010925293, "epoch": 0.2543299165284537, "grad_norm": 0.03739888221025467, "grad_norm_var": 1.7749150712091135e-05, "learning_rate": 0.007265791016648462, "loss": 2.653, "step": 4677 }, { "crossentropy": 2.676923155784607, "epoch": 0.2543842953859539, "grad_norm": 0.03713873401284218, "grad_norm_var": 1.807750287134141e-05, "learning_rate": 0.007264680954211801, "loss": 2.6769, "step": 4678 }, { "crossentropy": 2.889059066772461, "epoch": 0.2544386742434541, "grad_norm": 0.03837799280881882, "grad_norm_var": 1.7727831203422028e-05, "learning_rate": 0.007263570751321738, "loss": 2.8891, "step": 4679 }, { "crossentropy": 2.5806127786636353, "epoch": 0.2544930531009543, "grad_norm": 0.03757735341787338, "grad_norm_var": 1.7024539304492474e-05, "learning_rate": 0.0072624604080471266, "loss": 2.5806, "step": 4680 }, { "crossentropy": 2.648027181625366, "epoch": 0.25454743195845453, "grad_norm": 0.03505668416619301, "grad_norm_var": 1.7684512775748412e-05, "learning_rate": 0.007261349924456831, "loss": 2.648, "step": 4681 }, { "crossentropy": 2.6831194162368774, "epoch": 0.25460181081595473, "grad_norm": 0.0339786596596241, "grad_norm_var": 1.9283304169736885e-05, "learning_rate": 0.00726023930061972, "loss": 2.6831, "step": 4682 }, { "crossentropy": 2.7094842195510864, "epoch": 0.25465618967345494, "grad_norm": 0.040780071169137955, "grad_norm_var": 1.8958119200875265e-05, "learning_rate": 0.007259128536604676, "loss": 2.7095, "step": 4683 }, { "crossentropy": 2.7553277015686035, "epoch": 0.25471056853095514, "grad_norm": 0.054617270827293396, "grad_norm_var": 3.3365711734241696e-05, "learning_rate": 0.007258017632480587, "loss": 2.7553, "step": 4684 }, { "crossentropy": 2.76176381111145, "epoch": 0.25476494738845534, "grad_norm": 0.041585393249988556, "grad_norm_var": 3.2240523173703425e-05, "learning_rate": 0.0072569065883163475, "loss": 2.7618, "step": 4685 }, { "crossentropy": 2.6550604104995728, "epoch": 0.25481932624595555, "grad_norm": 0.04595198482275009, "grad_norm_var": 3.2607181619063547e-05, "learning_rate": 0.007255795404180866, "loss": 2.6551, "step": 4686 }, { "crossentropy": 2.6379836797714233, "epoch": 0.25487370510345575, "grad_norm": 0.04107808694243431, "grad_norm_var": 2.6848855311182676e-05, "learning_rate": 0.007254684080143057, "loss": 2.638, "step": 4687 }, { "crossentropy": 2.72083842754364, "epoch": 0.25492808396095595, "grad_norm": 0.03629979491233826, "grad_norm_var": 2.7400547211166535e-05, "learning_rate": 0.007253572616271844, "loss": 2.7208, "step": 4688 }, { "crossentropy": 2.496917963027954, "epoch": 0.25498246281845616, "grad_norm": 0.03557043522596359, "grad_norm_var": 2.7569991136396483e-05, "learning_rate": 0.007252461012636158, "loss": 2.4969, "step": 4689 }, { "crossentropy": 2.6783807277679443, "epoch": 0.25503684167595636, "grad_norm": 0.036661092191934586, "grad_norm_var": 2.5966483814617257e-05, "learning_rate": 0.007251349269304938, "loss": 2.6784, "step": 4690 }, { "crossentropy": 2.6342668533325195, "epoch": 0.2550912205334566, "grad_norm": 0.035453829914331436, "grad_norm_var": 2.6155545847942468e-05, "learning_rate": 0.007250237386347138, "loss": 2.6343, "step": 4691 }, { "crossentropy": 2.577548027038574, "epoch": 0.2551455993909568, "grad_norm": 0.0366649255156517, "grad_norm_var": 2.6543468292986908e-05, "learning_rate": 0.0072491253638317125, "loss": 2.5775, "step": 4692 }, { "crossentropy": 2.6333104372024536, "epoch": 0.255199978248457, "grad_norm": 0.04022775590419769, "grad_norm_var": 2.643520439762435e-05, "learning_rate": 0.007248013201827628, "loss": 2.6333, "step": 4693 }, { "crossentropy": 2.5678915977478027, "epoch": 0.25525435710595723, "grad_norm": 0.03678346797823906, "grad_norm_var": 2.6540199768254093e-05, "learning_rate": 0.007246900900403861, "loss": 2.5679, "step": 4694 }, { "crossentropy": 2.6813563108444214, "epoch": 0.25530873596345743, "grad_norm": 0.03402552753686905, "grad_norm_var": 2.818181921476013e-05, "learning_rate": 0.007245788459629397, "loss": 2.6814, "step": 4695 }, { "crossentropy": 2.626889944076538, "epoch": 0.25536311482095764, "grad_norm": 0.03702601045370102, "grad_norm_var": 2.8297646038355438e-05, "learning_rate": 0.007244675879573222, "loss": 2.6269, "step": 4696 }, { "crossentropy": 2.7648985385894775, "epoch": 0.25541749367845784, "grad_norm": 0.03704417496919632, "grad_norm_var": 2.7536638082431563e-05, "learning_rate": 0.007243563160304345, "loss": 2.7649, "step": 4697 }, { "crossentropy": 2.72378408908844, "epoch": 0.25547187253595804, "grad_norm": 0.03650178387761116, "grad_norm_var": 2.625054920678364e-05, "learning_rate": 0.007242450301891772, "loss": 2.7238, "step": 4698 }, { "crossentropy": 2.628893256187439, "epoch": 0.25552625139345825, "grad_norm": 0.036752134561538696, "grad_norm_var": 2.6384813469768932e-05, "learning_rate": 0.007241337304404522, "loss": 2.6289, "step": 4699 }, { "crossentropy": 2.6374948024749756, "epoch": 0.25558063025095845, "grad_norm": 0.03762419894337654, "grad_norm_var": 8.799161275631813e-06, "learning_rate": 0.007240224167911621, "loss": 2.6375, "step": 4700 }, { "crossentropy": 2.6648099422454834, "epoch": 0.25563500910845866, "grad_norm": 0.039802197366952896, "grad_norm_var": 8.104580832717495e-06, "learning_rate": 0.007239110892482106, "loss": 2.6648, "step": 4701 }, { "crossentropy": 2.6448534727096558, "epoch": 0.25568938796595886, "grad_norm": 0.036404773592948914, "grad_norm_var": 3.318223976548201e-06, "learning_rate": 0.0072379974781850214, "loss": 2.6449, "step": 4702 }, { "crossentropy": 2.7714483737945557, "epoch": 0.25574376682345906, "grad_norm": 0.03641221672296524, "grad_norm_var": 2.2164885842483144e-06, "learning_rate": 0.007236883925089418, "loss": 2.7714, "step": 4703 }, { "crossentropy": 2.633497953414917, "epoch": 0.25579814568095927, "grad_norm": 0.0373559333384037, "grad_norm_var": 2.211766211996681e-06, "learning_rate": 0.007235770233264362, "loss": 2.6335, "step": 4704 }, { "crossentropy": 2.714403748512268, "epoch": 0.25585252453845947, "grad_norm": 0.03815259784460068, "grad_norm_var": 2.1726620523613086e-06, "learning_rate": 0.007234656402778919, "loss": 2.7144, "step": 4705 }, { "crossentropy": 2.624888062477112, "epoch": 0.2559069033959597, "grad_norm": 0.03684910759329796, "grad_norm_var": 2.1649768756209472e-06, "learning_rate": 0.00723354243370217, "loss": 2.6249, "step": 4706 }, { "crossentropy": 2.641821503639221, "epoch": 0.2559612822534599, "grad_norm": 0.03426944091916084, "grad_norm_var": 2.507485146922781e-06, "learning_rate": 0.0072324283261032, "loss": 2.6418, "step": 4707 }, { "crossentropy": 2.682077646255493, "epoch": 0.2560156611109601, "grad_norm": 0.03799673914909363, "grad_norm_var": 2.559993712713572e-06, "learning_rate": 0.007231314080051108, "loss": 2.6821, "step": 4708 }, { "crossentropy": 2.699362635612488, "epoch": 0.2560700399684603, "grad_norm": 0.0367252491414547, "grad_norm_var": 1.8551949888423458e-06, "learning_rate": 0.007230199695614995, "loss": 2.6994, "step": 4709 }, { "crossentropy": 2.6636143922805786, "epoch": 0.2561244188259605, "grad_norm": 0.03677677363157272, "grad_norm_var": 1.8552641790398409e-06, "learning_rate": 0.007229085172863978, "loss": 2.6636, "step": 4710 }, { "crossentropy": 2.606309175491333, "epoch": 0.2561787976834607, "grad_norm": 0.035663504153490067, "grad_norm_var": 1.40447125051175e-06, "learning_rate": 0.007227970511867176, "loss": 2.6063, "step": 4711 }, { "crossentropy": 2.5833598375320435, "epoch": 0.2562331765409609, "grad_norm": 0.03554878756403923, "grad_norm_var": 1.5278174004479741e-06, "learning_rate": 0.00722685571269372, "loss": 2.5834, "step": 4712 }, { "crossentropy": 2.666675567626953, "epoch": 0.2562875553984611, "grad_norm": 0.03654598444700241, "grad_norm_var": 1.531592201571968e-06, "learning_rate": 0.00722574077541275, "loss": 2.6667, "step": 4713 }, { "crossentropy": 2.5984593629837036, "epoch": 0.2563419342559613, "grad_norm": 0.036094944924116135, "grad_norm_var": 1.5600850759952521e-06, "learning_rate": 0.0072246257000934105, "loss": 2.5985, "step": 4714 }, { "crossentropy": 2.7054020166397095, "epoch": 0.2563963131134615, "grad_norm": 0.03443164378404617, "grad_norm_var": 1.9148129269563306e-06, "learning_rate": 0.007223510486804859, "loss": 2.7054, "step": 4715 }, { "crossentropy": 2.6702431440353394, "epoch": 0.2564506919709617, "grad_norm": 0.035571884363889694, "grad_norm_var": 1.915826588534617e-06, "learning_rate": 0.007222395135616261, "loss": 2.6702, "step": 4716 }, { "crossentropy": 2.7797226905822754, "epoch": 0.2565050708284619, "grad_norm": 0.03620214760303497, "grad_norm_var": 1.1588259412794362e-06, "learning_rate": 0.007221279646596789, "loss": 2.7797, "step": 4717 }, { "crossentropy": 2.6525338888168335, "epoch": 0.2565594496859621, "grad_norm": 0.0366859957575798, "grad_norm_var": 1.1672246762589138e-06, "learning_rate": 0.0072201640198156225, "loss": 2.6525, "step": 4718 }, { "crossentropy": 2.67218554019928, "epoch": 0.2566138285434623, "grad_norm": 0.03520756959915161, "grad_norm_var": 1.2447470997082692e-06, "learning_rate": 0.0072190482553419555, "loss": 2.6722, "step": 4719 }, { "crossentropy": 2.7410000562667847, "epoch": 0.2566682074009625, "grad_norm": 0.038009028881788254, "grad_norm_var": 1.3672833138538934e-06, "learning_rate": 0.007217932353244983, "loss": 2.741, "step": 4720 }, { "crossentropy": 2.6455907821655273, "epoch": 0.2567225862584627, "grad_norm": 0.03522421419620514, "grad_norm_var": 1.1782246986724586e-06, "learning_rate": 0.007216816313593915, "loss": 2.6456, "step": 4721 }, { "crossentropy": 2.6258385181427, "epoch": 0.25677696511596293, "grad_norm": 0.037614982575178146, "grad_norm_var": 1.2900856423201748e-06, "learning_rate": 0.007215700136457966, "loss": 2.6258, "step": 4722 }, { "crossentropy": 2.7784229516983032, "epoch": 0.25683134397346313, "grad_norm": 0.034032441675662994, "grad_norm_var": 1.3533552177267053e-06, "learning_rate": 0.007214583821906357, "loss": 2.7784, "step": 4723 }, { "crossentropy": 2.7176109552383423, "epoch": 0.25688572283096334, "grad_norm": 0.03640657663345337, "grad_norm_var": 1.1189425226579632e-06, "learning_rate": 0.007213467370008327, "loss": 2.7176, "step": 4724 }, { "crossentropy": 2.694791078567505, "epoch": 0.25694010168846354, "grad_norm": 0.03414361551403999, "grad_norm_var": 1.3018081276544002e-06, "learning_rate": 0.0072123507808331145, "loss": 2.6948, "step": 4725 }, { "crossentropy": 2.785314202308655, "epoch": 0.25699448054596374, "grad_norm": 0.03649795055389404, "grad_norm_var": 1.2735143009136192e-06, "learning_rate": 0.007211234054449969, "loss": 2.7853, "step": 4726 }, { "crossentropy": 2.7173911333084106, "epoch": 0.25704885940346395, "grad_norm": 0.038229625672101974, "grad_norm_var": 1.6152512390065743e-06, "learning_rate": 0.0072101171909281486, "loss": 2.7174, "step": 4727 }, { "crossentropy": 2.7283912897109985, "epoch": 0.25710323826096415, "grad_norm": 0.04181774705648422, "grad_norm_var": 3.6709686171918407e-06, "learning_rate": 0.00720900019033692, "loss": 2.7284, "step": 4728 }, { "crossentropy": 2.692627429962158, "epoch": 0.25715761711846435, "grad_norm": 0.03677266836166382, "grad_norm_var": 3.677994925552208e-06, "learning_rate": 0.007207883052745559, "loss": 2.6926, "step": 4729 }, { "crossentropy": 2.6916282176971436, "epoch": 0.25721199597596456, "grad_norm": 0.03542477265000343, "grad_norm_var": 3.736356936249324e-06, "learning_rate": 0.00720676577822335, "loss": 2.6916, "step": 4730 }, { "crossentropy": 2.6749207973480225, "epoch": 0.25726637483346476, "grad_norm": 0.03591321408748627, "grad_norm_var": 3.4862828445234956e-06, "learning_rate": 0.007205648366839583, "loss": 2.6749, "step": 4731 }, { "crossentropy": 2.6028342247009277, "epoch": 0.25732075369096497, "grad_norm": 0.03885303810238838, "grad_norm_var": 3.7598317479604905e-06, "learning_rate": 0.007204530818663564, "loss": 2.6028, "step": 4732 }, { "crossentropy": 2.689779043197632, "epoch": 0.25737513254846517, "grad_norm": 0.039377547800540924, "grad_norm_var": 4.183596160364789e-06, "learning_rate": 0.007203413133764597, "loss": 2.6898, "step": 4733 }, { "crossentropy": 2.5872881412506104, "epoch": 0.25742951140596537, "grad_norm": 0.038920652121305466, "grad_norm_var": 4.435458054328524e-06, "learning_rate": 0.007202295312212002, "loss": 2.5873, "step": 4734 }, { "crossentropy": 2.6629189252853394, "epoch": 0.2574838902634656, "grad_norm": 0.03739691525697708, "grad_norm_var": 4.203671309582248e-06, "learning_rate": 0.007201177354075105, "loss": 2.6629, "step": 4735 }, { "crossentropy": 2.722872257232666, "epoch": 0.2575382691209658, "grad_norm": 0.037098899483680725, "grad_norm_var": 4.1529808906454194e-06, "learning_rate": 0.00720005925942324, "loss": 2.7229, "step": 4736 }, { "crossentropy": 2.6890816688537598, "epoch": 0.257592647978466, "grad_norm": 0.037690646946430206, "grad_norm_var": 3.91375358336013e-06, "learning_rate": 0.007198941028325752, "loss": 2.6891, "step": 4737 }, { "crossentropy": 2.6179364919662476, "epoch": 0.2576470268359662, "grad_norm": 0.03950950503349304, "grad_norm_var": 4.227255136252973e-06, "learning_rate": 0.007197822660851991, "loss": 2.6179, "step": 4738 }, { "crossentropy": 2.656209945678711, "epoch": 0.2577014056934664, "grad_norm": 0.04120349884033203, "grad_norm_var": 4.240173711834936e-06, "learning_rate": 0.007196704157071317, "loss": 2.6562, "step": 4739 }, { "crossentropy": 2.7043094635009766, "epoch": 0.2577557845509666, "grad_norm": 0.04272092133760452, "grad_norm_var": 5.5349267942871846e-06, "learning_rate": 0.007195585517053099, "loss": 2.7043, "step": 4740 }, { "crossentropy": 2.625313639640808, "epoch": 0.2578101634084668, "grad_norm": 0.03853645920753479, "grad_norm_var": 4.3515297401206425e-06, "learning_rate": 0.007194466740866715, "loss": 2.6253, "step": 4741 }, { "crossentropy": 2.6695494651794434, "epoch": 0.257864542265967, "grad_norm": 0.03555511310696602, "grad_norm_var": 4.658487241701534e-06, "learning_rate": 0.007193347828581549, "loss": 2.6695, "step": 4742 }, { "crossentropy": 2.6618751287460327, "epoch": 0.2579189211234672, "grad_norm": 0.038914065808057785, "grad_norm_var": 4.668674486947056e-06, "learning_rate": 0.007192228780266996, "loss": 2.6619, "step": 4743 }, { "crossentropy": 2.651402473449707, "epoch": 0.2579732999809674, "grad_norm": 0.038985297083854675, "grad_norm_var": 3.910170043722588e-06, "learning_rate": 0.007191109595992458, "loss": 2.6514, "step": 4744 }, { "crossentropy": 2.68716037273407, "epoch": 0.2580276788384676, "grad_norm": 0.041738905012607574, "grad_norm_var": 4.4372637823169555e-06, "learning_rate": 0.007189990275827345, "loss": 2.6872, "step": 4745 }, { "crossentropy": 2.658191680908203, "epoch": 0.2580820576959678, "grad_norm": 0.03841876611113548, "grad_norm_var": 3.7239912840089074e-06, "learning_rate": 0.007188870819841077, "loss": 2.6582, "step": 4746 }, { "crossentropy": 2.781126379966736, "epoch": 0.258136436553468, "grad_norm": 0.0375446118414402, "grad_norm_var": 3.2619449525315056e-06, "learning_rate": 0.0071877512281030825, "loss": 2.7811, "step": 4747 }, { "crossentropy": 2.599375367164612, "epoch": 0.2581908154109682, "grad_norm": 0.035378966480493546, "grad_norm_var": 4.03989875038327e-06, "learning_rate": 0.007186631500682795, "loss": 2.5994, "step": 4748 }, { "crossentropy": 2.5572856664657593, "epoch": 0.2582451942684684, "grad_norm": 0.03727685660123825, "grad_norm_var": 4.122266688779084e-06, "learning_rate": 0.007185511637649661, "loss": 2.5573, "step": 4749 }, { "crossentropy": 2.757123827934265, "epoch": 0.25829957312596863, "grad_norm": 0.039148326963186264, "grad_norm_var": 4.136587275029256e-06, "learning_rate": 0.007184391639073133, "loss": 2.7571, "step": 4750 }, { "crossentropy": 2.7469226121902466, "epoch": 0.25835395198346883, "grad_norm": 0.04346727579832077, "grad_norm_var": 5.490307676314412e-06, "learning_rate": 0.007183271505022672, "loss": 2.7469, "step": 4751 }, { "crossentropy": 2.698594093322754, "epoch": 0.25840833084096904, "grad_norm": 0.045704640448093414, "grad_norm_var": 7.995821347160185e-06, "learning_rate": 0.007182151235567746, "loss": 2.6986, "step": 4752 }, { "crossentropy": 2.617879629135132, "epoch": 0.25846270969846924, "grad_norm": 0.06102188304066658, "grad_norm_var": 3.642896984846437e-05, "learning_rate": 0.007181030830777837, "loss": 2.6179, "step": 4753 }, { "crossentropy": 2.745938301086426, "epoch": 0.25851708855596944, "grad_norm": 0.034904059022665024, "grad_norm_var": 3.863627788585599e-05, "learning_rate": 0.007179910290722429, "loss": 2.7459, "step": 4754 }, { "crossentropy": 2.803023338317871, "epoch": 0.25857146741346965, "grad_norm": 0.03548619896173477, "grad_norm_var": 4.026301237509596e-05, "learning_rate": 0.0071787896154710176, "loss": 2.803, "step": 4755 }, { "crossentropy": 2.777299165725708, "epoch": 0.25862584627096985, "grad_norm": 0.03580615669488907, "grad_norm_var": 4.10195072329191e-05, "learning_rate": 0.007177668805093104, "loss": 2.7773, "step": 4756 }, { "crossentropy": 2.719481945037842, "epoch": 0.25868022512847005, "grad_norm": 0.03519440442323685, "grad_norm_var": 4.2310923036309224e-05, "learning_rate": 0.007176547859658202, "loss": 2.7195, "step": 4757 }, { "crossentropy": 2.6546274423599243, "epoch": 0.25873460398597026, "grad_norm": 0.041362326592206955, "grad_norm_var": 4.1240962508225694e-05, "learning_rate": 0.007175426779235832, "loss": 2.6546, "step": 4758 }, { "crossentropy": 2.765852689743042, "epoch": 0.25878898284347046, "grad_norm": 0.040633317083120346, "grad_norm_var": 4.117171531639781e-05, "learning_rate": 0.007174305563895518, "loss": 2.7659, "step": 4759 }, { "crossentropy": 2.6533154249191284, "epoch": 0.25884336170097066, "grad_norm": 0.04190585017204285, "grad_norm_var": 4.1259256688840525e-05, "learning_rate": 0.007173184213706804, "loss": 2.6533, "step": 4760 }, { "crossentropy": 2.608802556991577, "epoch": 0.25889774055847087, "grad_norm": 0.038376420736312866, "grad_norm_var": 4.1326189659662606e-05, "learning_rate": 0.0071720627287392286, "loss": 2.6088, "step": 4761 }, { "crossentropy": 2.7305996417999268, "epoch": 0.25895211941597107, "grad_norm": 0.0335749015212059, "grad_norm_var": 4.387966467083165e-05, "learning_rate": 0.007170941109062349, "loss": 2.7306, "step": 4762 }, { "crossentropy": 2.58743155002594, "epoch": 0.2590064982734713, "grad_norm": 0.03693905845284462, "grad_norm_var": 4.4084614504516445e-05, "learning_rate": 0.007169819354745725, "loss": 2.5874, "step": 4763 }, { "crossentropy": 2.7657634019851685, "epoch": 0.2590608771309715, "grad_norm": 0.03780936077237129, "grad_norm_var": 4.303368726085854e-05, "learning_rate": 0.007168697465858928, "loss": 2.7658, "step": 4764 }, { "crossentropy": 2.7339192628860474, "epoch": 0.2591152559884717, "grad_norm": 0.037165068089962006, "grad_norm_var": 4.3073763205065246e-05, "learning_rate": 0.007167575442471537, "loss": 2.7339, "step": 4765 }, { "crossentropy": 2.6934670209884644, "epoch": 0.2591696348459719, "grad_norm": 0.03709900379180908, "grad_norm_var": 4.354333044933022e-05, "learning_rate": 0.007166453284653135, "loss": 2.6935, "step": 4766 }, { "crossentropy": 2.6835427284240723, "epoch": 0.2592240137034721, "grad_norm": 0.04569770395755768, "grad_norm_var": 4.49513755529124e-05, "learning_rate": 0.007165330992473322, "loss": 2.6835, "step": 4767 }, { "crossentropy": 2.630625605583191, "epoch": 0.2592783925609723, "grad_norm": 0.0373721569776535, "grad_norm_var": 4.286129238288208e-05, "learning_rate": 0.0071642085660016995, "loss": 2.6306, "step": 4768 }, { "crossentropy": 2.7346460819244385, "epoch": 0.2593327714184725, "grad_norm": 0.03604089841246605, "grad_norm_var": 9.835416133169295e-06, "learning_rate": 0.007163086005307881, "loss": 2.7346, "step": 4769 }, { "crossentropy": 2.6998506784439087, "epoch": 0.2593871502759727, "grad_norm": 0.03585151582956314, "grad_norm_var": 9.521207753720486e-06, "learning_rate": 0.0071619633104614815, "loss": 2.6999, "step": 4770 }, { "crossentropy": 2.7328680753707886, "epoch": 0.2594415291334729, "grad_norm": 0.03931557014584541, "grad_norm_var": 9.20800108482033e-06, "learning_rate": 0.007160840481532136, "loss": 2.7329, "step": 4771 }, { "crossentropy": 2.659379243850708, "epoch": 0.2594959079909731, "grad_norm": 0.037638451904058456, "grad_norm_var": 8.849130914013956e-06, "learning_rate": 0.007159717518589477, "loss": 2.6594, "step": 4772 }, { "crossentropy": 2.551084280014038, "epoch": 0.2595502868484733, "grad_norm": 0.03750703111290932, "grad_norm_var": 8.241664821932031e-06, "learning_rate": 0.0071585944217031515, "loss": 2.5511, "step": 4773 }, { "crossentropy": 2.698429226875305, "epoch": 0.2596046657059735, "grad_norm": 0.03503621369600296, "grad_norm_var": 8.238357150445497e-06, "learning_rate": 0.007157471190942812, "loss": 2.6984, "step": 4774 }, { "crossentropy": 2.725244164466858, "epoch": 0.2596590445634737, "grad_norm": 0.03675958514213562, "grad_norm_var": 7.81490785472863e-06, "learning_rate": 0.007156347826378121, "loss": 2.7252, "step": 4775 }, { "crossentropy": 2.7879955768585205, "epoch": 0.2597134234209739, "grad_norm": 0.03714586794376373, "grad_norm_var": 6.596949570800842e-06, "learning_rate": 0.007155224328078749, "loss": 2.788, "step": 4776 }, { "crossentropy": 2.564718246459961, "epoch": 0.2597678022784741, "grad_norm": 0.03835507854819298, "grad_norm_var": 6.594364701569856e-06, "learning_rate": 0.007154100696114372, "loss": 2.5647, "step": 4777 }, { "crossentropy": 2.6994292736053467, "epoch": 0.25982218113597433, "grad_norm": 0.03955762833356857, "grad_norm_var": 5.734916536708802e-06, "learning_rate": 0.00715297693055468, "loss": 2.6994, "step": 4778 }, { "crossentropy": 2.615734815597534, "epoch": 0.25987655999347453, "grad_norm": 0.04540563374757767, "grad_norm_var": 9.208615281258482e-06, "learning_rate": 0.007151853031469364, "loss": 2.6157, "step": 4779 }, { "crossentropy": 2.7317005395889282, "epoch": 0.25993093885097474, "grad_norm": 0.03956517204642296, "grad_norm_var": 9.272433000216222e-06, "learning_rate": 0.00715072899892813, "loss": 2.7317, "step": 4780 }, { "crossentropy": 2.652787446975708, "epoch": 0.25998531770847494, "grad_norm": 0.03662737458944321, "grad_norm_var": 9.38402318273283e-06, "learning_rate": 0.007149604833000688, "loss": 2.6528, "step": 4781 }, { "crossentropy": 2.730714201927185, "epoch": 0.26003969656597514, "grad_norm": 0.03549407050013542, "grad_norm_var": 9.831101762131493e-06, "learning_rate": 0.00714848053375676, "loss": 2.7307, "step": 4782 }, { "crossentropy": 2.7472842931747437, "epoch": 0.26009407542347535, "grad_norm": 0.03635521978139877, "grad_norm_var": 6.115542933629997e-06, "learning_rate": 0.007147356101266071, "loss": 2.7473, "step": 4783 }, { "crossentropy": 2.630472779273987, "epoch": 0.26014845428097555, "grad_norm": 0.035473644733428955, "grad_norm_var": 6.436894591251424e-06, "learning_rate": 0.007146231535598358, "loss": 2.6305, "step": 4784 }, { "crossentropy": 2.7418198585510254, "epoch": 0.26020283313847575, "grad_norm": 0.03700320050120354, "grad_norm_var": 6.290485811952645e-06, "learning_rate": 0.007145106836823368, "loss": 2.7418, "step": 4785 }, { "crossentropy": 2.7298054695129395, "epoch": 0.26025721199597596, "grad_norm": 0.037473708391189575, "grad_norm_var": 6.056612132949018e-06, "learning_rate": 0.0071439820050108495, "loss": 2.7298, "step": 4786 }, { "crossentropy": 2.6725330352783203, "epoch": 0.26031159085347616, "grad_norm": 0.035577766597270966, "grad_norm_var": 6.171794274857182e-06, "learning_rate": 0.007142857040230567, "loss": 2.6725, "step": 4787 }, { "crossentropy": 2.507550597190857, "epoch": 0.26036596971097636, "grad_norm": 0.036401499062776566, "grad_norm_var": 6.254644999033563e-06, "learning_rate": 0.007141731942552287, "loss": 2.5076, "step": 4788 }, { "crossentropy": 2.6207082271575928, "epoch": 0.26042034856847657, "grad_norm": 0.036187946796417236, "grad_norm_var": 6.359284982166737e-06, "learning_rate": 0.007140606712045792, "loss": 2.6207, "step": 4789 }, { "crossentropy": 2.696410298347473, "epoch": 0.26047472742597677, "grad_norm": 0.03544497489929199, "grad_norm_var": 6.240831155430483e-06, "learning_rate": 0.007139481348780863, "loss": 2.6964, "step": 4790 }, { "crossentropy": 2.685770869255066, "epoch": 0.260529106283477, "grad_norm": 0.035410672426223755, "grad_norm_var": 6.474551132137849e-06, "learning_rate": 0.007138355852827295, "loss": 2.6858, "step": 4791 }, { "crossentropy": 2.686863899230957, "epoch": 0.2605834851409772, "grad_norm": 0.03497852012515068, "grad_norm_var": 6.824951319023044e-06, "learning_rate": 0.00713723022425489, "loss": 2.6869, "step": 4792 }, { "crossentropy": 2.6731427907943726, "epoch": 0.2606378639984774, "grad_norm": 0.036746710538864136, "grad_norm_var": 6.74042641162307e-06, "learning_rate": 0.007136104463133461, "loss": 2.6731, "step": 4793 }, { "crossentropy": 2.7046107053756714, "epoch": 0.2606922428559776, "grad_norm": 0.033901017159223557, "grad_norm_var": 6.891565101116437e-06, "learning_rate": 0.007134978569532823, "loss": 2.7046, "step": 4794 }, { "crossentropy": 2.7334094047546387, "epoch": 0.2607466217134778, "grad_norm": 0.03438550606369972, "grad_norm_var": 1.7679357715196775e-06, "learning_rate": 0.007133852543522805, "loss": 2.7334, "step": 4795 }, { "crossentropy": 2.6173095703125, "epoch": 0.260801000570978, "grad_norm": 0.036027491092681885, "grad_norm_var": 8.98752873954023e-07, "learning_rate": 0.007132726385173241, "loss": 2.6173, "step": 4796 }, { "crossentropy": 2.4946157932281494, "epoch": 0.2608553794284782, "grad_norm": 0.03733726218342781, "grad_norm_var": 1.0044836926815028e-06, "learning_rate": 0.007131600094553976, "loss": 2.4946, "step": 4797 }, { "crossentropy": 2.621387243270874, "epoch": 0.2609097582859784, "grad_norm": 0.044740159064531326, "grad_norm_var": 5.8626545725993035e-06, "learning_rate": 0.0071304736717348605, "loss": 2.6214, "step": 4798 }, { "crossentropy": 2.6719777584075928, "epoch": 0.2609641371434786, "grad_norm": 0.035235729068517685, "grad_norm_var": 5.957419118703353e-06, "learning_rate": 0.007129347116785754, "loss": 2.672, "step": 4799 }, { "crossentropy": 2.6744030714035034, "epoch": 0.2610185160009788, "grad_norm": 0.03453381732106209, "grad_norm_var": 6.128124659484681e-06, "learning_rate": 0.0071282204297765245, "loss": 2.6744, "step": 4800 }, { "crossentropy": 2.6737375259399414, "epoch": 0.261072894858479, "grad_norm": 0.039221301674842834, "grad_norm_var": 6.6327608931983825e-06, "learning_rate": 0.007127093610777047, "loss": 2.6737, "step": 4801 }, { "crossentropy": 2.6554808616638184, "epoch": 0.2611272737159792, "grad_norm": 0.03757162019610405, "grad_norm_var": 6.6463947777705675e-06, "learning_rate": 0.007125966659857208, "loss": 2.6555, "step": 4802 }, { "crossentropy": 2.6781513690948486, "epoch": 0.2611816525734794, "grad_norm": 0.03541036695241928, "grad_norm_var": 6.6683146816061295e-06, "learning_rate": 0.007124839577086898, "loss": 2.6782, "step": 4803 }, { "crossentropy": 2.5517085790634155, "epoch": 0.2612360314309796, "grad_norm": 0.03647640347480774, "grad_norm_var": 6.6679721022012976e-06, "learning_rate": 0.007123712362536019, "loss": 2.5517, "step": 4804 }, { "crossentropy": 2.624628782272339, "epoch": 0.2612904102884798, "grad_norm": 0.03819847106933594, "grad_norm_var": 6.843500618119771e-06, "learning_rate": 0.007122585016274479, "loss": 2.6246, "step": 4805 }, { "crossentropy": 2.6706095933914185, "epoch": 0.26134478914598, "grad_norm": 0.03868182376027107, "grad_norm_var": 6.99929936629756e-06, "learning_rate": 0.007121457538372196, "loss": 2.6706, "step": 4806 }, { "crossentropy": 2.5922383069992065, "epoch": 0.26139916800348023, "grad_norm": 0.03750236704945564, "grad_norm_var": 6.884283987182438e-06, "learning_rate": 0.007120329928899092, "loss": 2.5922, "step": 4807 }, { "crossentropy": 2.7234649658203125, "epoch": 0.26145354686098043, "grad_norm": 0.04041609540581703, "grad_norm_var": 7.314286072510751e-06, "learning_rate": 0.007119202187925104, "loss": 2.7235, "step": 4808 }, { "crossentropy": 2.622104287147522, "epoch": 0.26150792571848064, "grad_norm": 0.036559391766786575, "grad_norm_var": 7.329651931668215e-06, "learning_rate": 0.007118074315520172, "loss": 2.6221, "step": 4809 }, { "crossentropy": 2.693069815635681, "epoch": 0.26156230457598084, "grad_norm": 0.03338142856955528, "grad_norm_var": 7.579398522713533e-06, "learning_rate": 0.007116946311754246, "loss": 2.6931, "step": 4810 }, { "crossentropy": 2.6384772062301636, "epoch": 0.26161668343348105, "grad_norm": 0.03738910332322121, "grad_norm_var": 7.004105566669969e-06, "learning_rate": 0.007115818176697284, "loss": 2.6385, "step": 4811 }, { "crossentropy": 2.6623387336730957, "epoch": 0.26167106229098125, "grad_norm": 0.03696553036570549, "grad_norm_var": 6.885227224691602e-06, "learning_rate": 0.00711468991041925, "loss": 2.6623, "step": 4812 }, { "crossentropy": 2.7616829872131348, "epoch": 0.26172544114848145, "grad_norm": 0.03501170128583908, "grad_norm_var": 7.266355319705299e-06, "learning_rate": 0.007113561512990119, "loss": 2.7617, "step": 4813 }, { "crossentropy": 2.7124446630477905, "epoch": 0.26177982000598166, "grad_norm": 0.03591177240014076, "grad_norm_var": 3.416124131649413e-06, "learning_rate": 0.0071124329844798745, "loss": 2.7124, "step": 4814 }, { "crossentropy": 2.611332058906555, "epoch": 0.26183419886348186, "grad_norm": 0.03492260351777077, "grad_norm_var": 3.486691411024902e-06, "learning_rate": 0.007111304324958506, "loss": 2.6113, "step": 4815 }, { "crossentropy": 2.6671539545059204, "epoch": 0.26188857772098206, "grad_norm": 0.03492891788482666, "grad_norm_var": 3.3791928883031288e-06, "learning_rate": 0.00711017553449601, "loss": 2.6672, "step": 4816 }, { "crossentropy": 2.78719162940979, "epoch": 0.26194295657848227, "grad_norm": 0.036559171974658966, "grad_norm_var": 2.9571132148906627e-06, "learning_rate": 0.007109046613162397, "loss": 2.7872, "step": 4817 }, { "crossentropy": 2.708674192428589, "epoch": 0.26199733543598247, "grad_norm": 0.03785530477762222, "grad_norm_var": 2.998216246725913e-06, "learning_rate": 0.007107917561027677, "loss": 2.7087, "step": 4818 }, { "crossentropy": 2.702614188194275, "epoch": 0.2620517142934827, "grad_norm": 0.03583231568336487, "grad_norm_var": 2.9404093871235674e-06, "learning_rate": 0.007106788378161876, "loss": 2.7026, "step": 4819 }, { "crossentropy": 2.786384701728821, "epoch": 0.2621060931509829, "grad_norm": 0.035703882575035095, "grad_norm_var": 2.9968282201304e-06, "learning_rate": 0.007105659064635024, "loss": 2.7864, "step": 4820 }, { "crossentropy": 2.643815040588379, "epoch": 0.2621604720084831, "grad_norm": 0.03696579858660698, "grad_norm_var": 2.831335651877718e-06, "learning_rate": 0.007104529620517159, "loss": 2.6438, "step": 4821 }, { "crossentropy": 2.601263642311096, "epoch": 0.2622148508659833, "grad_norm": 0.03718499839305878, "grad_norm_var": 2.543249394779528e-06, "learning_rate": 0.007103400045878331, "loss": 2.6013, "step": 4822 }, { "crossentropy": 2.7088104486465454, "epoch": 0.2622692297234835, "grad_norm": 0.03949200361967087, "grad_norm_var": 3.071659786117037e-06, "learning_rate": 0.007102270340788592, "loss": 2.7088, "step": 4823 }, { "crossentropy": 2.6944185495376587, "epoch": 0.2623236085809837, "grad_norm": 0.03934508562088013, "grad_norm_var": 2.5937668931880554e-06, "learning_rate": 0.007101140505318006, "loss": 2.6944, "step": 4824 }, { "crossentropy": 2.6872016191482544, "epoch": 0.2623779874384839, "grad_norm": 0.03759939223527908, "grad_norm_var": 2.669524526440157e-06, "learning_rate": 0.007100010539536646, "loss": 2.6872, "step": 4825 }, { "crossentropy": 2.750451922416687, "epoch": 0.2624323662959841, "grad_norm": 0.036973029375076294, "grad_norm_var": 1.9509305758423517e-06, "learning_rate": 0.007098880443514588, "loss": 2.7505, "step": 4826 }, { "crossentropy": 2.5792078971862793, "epoch": 0.2624867451534843, "grad_norm": 0.03313827142119408, "grad_norm_var": 2.7407421469245624e-06, "learning_rate": 0.007097750217321923, "loss": 2.5792, "step": 4827 }, { "crossentropy": 2.5983983278274536, "epoch": 0.2625411240109845, "grad_norm": 0.03708355873823166, "grad_norm_var": 2.7485555454463222e-06, "learning_rate": 0.007096619861028746, "loss": 2.5984, "step": 4828 }, { "crossentropy": 2.62829327583313, "epoch": 0.2625955028684847, "grad_norm": 0.0466713048517704, "grad_norm_var": 8.882132678237655e-06, "learning_rate": 0.007095489374705158, "loss": 2.6283, "step": 4829 }, { "crossentropy": 2.597829222679138, "epoch": 0.2626498817259849, "grad_norm": 0.037489861249923706, "grad_norm_var": 8.753999926622985e-06, "learning_rate": 0.007094358758421274, "loss": 2.5978, "step": 4830 }, { "crossentropy": 2.5932910442352295, "epoch": 0.2627042605834851, "grad_norm": 0.04051128029823303, "grad_norm_var": 8.890514144660872e-06, "learning_rate": 0.007093228012247213, "loss": 2.5933, "step": 4831 }, { "crossentropy": 2.6447728872299194, "epoch": 0.2627586394409853, "grad_norm": 0.04439608007669449, "grad_norm_var": 1.0983721480732101e-05, "learning_rate": 0.007092097136253102, "loss": 2.6448, "step": 4832 }, { "crossentropy": 2.7977486848831177, "epoch": 0.2628130182984855, "grad_norm": 0.03865502029657364, "grad_norm_var": 1.0771766148580791e-05, "learning_rate": 0.007090966130509075, "loss": 2.7977, "step": 4833 }, { "crossentropy": 2.6455272436141968, "epoch": 0.2628673971559857, "grad_norm": 0.03599206358194351, "grad_norm_var": 1.1131785033395974e-05, "learning_rate": 0.00708983499508528, "loss": 2.6455, "step": 4834 }, { "crossentropy": 2.7120052576065063, "epoch": 0.26292177601348593, "grad_norm": 0.036741387099027634, "grad_norm_var": 1.0882556589384557e-05, "learning_rate": 0.007088703730051866, "loss": 2.712, "step": 4835 }, { "crossentropy": 2.497440814971924, "epoch": 0.26297615487098613, "grad_norm": 0.03859419375658035, "grad_norm_var": 1.03766663154925e-05, "learning_rate": 0.007087572335478995, "loss": 2.4974, "step": 4836 }, { "crossentropy": 2.666420578956604, "epoch": 0.26303053372848634, "grad_norm": 0.03616515174508095, "grad_norm_var": 1.0586071514519467e-05, "learning_rate": 0.007086440811436832, "loss": 2.6664, "step": 4837 }, { "crossentropy": 2.5529141426086426, "epoch": 0.26308491258598654, "grad_norm": 0.03731599077582359, "grad_norm_var": 1.056414091694559e-05, "learning_rate": 0.0070853091579955564, "loss": 2.5529, "step": 4838 }, { "crossentropy": 2.6331511735916138, "epoch": 0.26313929144348674, "grad_norm": 0.03941219300031662, "grad_norm_var": 1.0554091560463439e-05, "learning_rate": 0.007084177375225351, "loss": 2.6332, "step": 4839 }, { "crossentropy": 2.7592780590057373, "epoch": 0.263193670300987, "grad_norm": 0.04011817276477814, "grad_norm_var": 1.0678015235291288e-05, "learning_rate": 0.007083045463196407, "loss": 2.7593, "step": 4840 }, { "crossentropy": 2.65324866771698, "epoch": 0.2632480491584872, "grad_norm": 0.0424242839217186, "grad_norm_var": 1.1519155174329459e-05, "learning_rate": 0.007081913421978927, "loss": 2.6532, "step": 4841 }, { "crossentropy": 2.608792781829834, "epoch": 0.2633024280159874, "grad_norm": 0.04084374010562897, "grad_norm_var": 1.1484220639993286e-05, "learning_rate": 0.007080781251643117, "loss": 2.6088, "step": 4842 }, { "crossentropy": 2.6643115282058716, "epoch": 0.2633568068734876, "grad_norm": 0.03616253659129143, "grad_norm_var": 9.653072884777067e-06, "learning_rate": 0.007079648952259194, "loss": 2.6643, "step": 4843 }, { "crossentropy": 2.8095418214797974, "epoch": 0.2634111857309878, "grad_norm": 0.03681529313325882, "grad_norm_var": 9.736351182539013e-06, "learning_rate": 0.00707851652389738, "loss": 2.8095, "step": 4844 }, { "crossentropy": 2.500259518623352, "epoch": 0.263465564588488, "grad_norm": 0.03638671338558197, "grad_norm_var": 6.196918595554231e-06, "learning_rate": 0.00707738396662791, "loss": 2.5003, "step": 4845 }, { "crossentropy": 2.70217502117157, "epoch": 0.2635199434459882, "grad_norm": 0.03634190186858177, "grad_norm_var": 6.45325675012459e-06, "learning_rate": 0.007076251280521024, "loss": 2.7022, "step": 4846 }, { "crossentropy": 2.769518494606018, "epoch": 0.26357432230348843, "grad_norm": 0.03772738575935364, "grad_norm_var": 6.211399571066402e-06, "learning_rate": 0.007075118465646968, "loss": 2.7695, "step": 4847 }, { "crossentropy": 2.6466184854507446, "epoch": 0.26362870116098863, "grad_norm": 0.03911162167787552, "grad_norm_var": 3.7183799005042923e-06, "learning_rate": 0.007073985522076001, "loss": 2.6466, "step": 4848 }, { "crossentropy": 2.552661180496216, "epoch": 0.26368308001848884, "grad_norm": 0.03662419319152832, "grad_norm_var": 3.8124499783284116e-06, "learning_rate": 0.007072852449878384, "loss": 2.5527, "step": 4849 }, { "crossentropy": 2.556345582008362, "epoch": 0.26373745887598904, "grad_norm": 0.03500468656420708, "grad_norm_var": 4.127662954811253e-06, "learning_rate": 0.007071719249124392, "loss": 2.5563, "step": 4850 }, { "crossentropy": 2.70858633518219, "epoch": 0.26379183773348924, "grad_norm": 0.03517900034785271, "grad_norm_var": 4.513639565900442e-06, "learning_rate": 0.0070705859198843014, "loss": 2.7086, "step": 4851 }, { "crossentropy": 2.7292412519454956, "epoch": 0.26384621659098945, "grad_norm": 0.03632710129022598, "grad_norm_var": 4.583978980976279e-06, "learning_rate": 0.007069452462228405, "loss": 2.7292, "step": 4852 }, { "crossentropy": 2.6557401418685913, "epoch": 0.26390059544848965, "grad_norm": 0.03828894719481468, "grad_norm_var": 4.453204996062351e-06, "learning_rate": 0.007068318876226994, "loss": 2.6557, "step": 4853 }, { "crossentropy": 2.6203354597091675, "epoch": 0.26395497430598985, "grad_norm": 0.03626558184623718, "grad_norm_var": 4.583683083002275e-06, "learning_rate": 0.007067185161950376, "loss": 2.6203, "step": 4854 }, { "crossentropy": 2.572000741958618, "epoch": 0.26400935316349006, "grad_norm": 0.03625890240073204, "grad_norm_var": 4.480884341469491e-06, "learning_rate": 0.0070660513194688605, "loss": 2.572, "step": 4855 }, { "crossentropy": 2.6852450370788574, "epoch": 0.26406373202099026, "grad_norm": 0.03613912686705589, "grad_norm_var": 4.077413797308876e-06, "learning_rate": 0.00706491734885277, "loss": 2.6852, "step": 4856 }, { "crossentropy": 2.7069064378738403, "epoch": 0.26411811087849046, "grad_norm": 0.03458602353930473, "grad_norm_var": 2.5031923615727312e-06, "learning_rate": 0.0070637832501724285, "loss": 2.7069, "step": 4857 }, { "crossentropy": 2.6973453760147095, "epoch": 0.26417248973599067, "grad_norm": 0.03790521249175072, "grad_norm_var": 1.440470695444375e-06, "learning_rate": 0.0070626490234981755, "loss": 2.6973, "step": 4858 }, { "crossentropy": 2.6886128187179565, "epoch": 0.26422686859349087, "grad_norm": 0.035443857312202454, "grad_norm_var": 1.5118219964543415e-06, "learning_rate": 0.007061514668900353, "loss": 2.6886, "step": 4859 }, { "crossentropy": 2.6652694940567017, "epoch": 0.2642812474509911, "grad_norm": 0.0348748154938221, "grad_norm_var": 1.672144922574131e-06, "learning_rate": 0.0070603801864493115, "loss": 2.6653, "step": 4860 }, { "crossentropy": 2.5836586952209473, "epoch": 0.2643356263084913, "grad_norm": 0.03679404780268669, "grad_norm_var": 1.6815725115554292e-06, "learning_rate": 0.007059245576215412, "loss": 2.5837, "step": 4861 }, { "crossentropy": 2.7601804733276367, "epoch": 0.2643900051659915, "grad_norm": 0.04032100364565849, "grad_norm_var": 2.6246623277270675e-06, "learning_rate": 0.007058110838269022, "loss": 2.7602, "step": 4862 }, { "crossentropy": 2.7267080545425415, "epoch": 0.2644443840234917, "grad_norm": 0.03786534443497658, "grad_norm_var": 2.6451507486007752e-06, "learning_rate": 0.007056975972680517, "loss": 2.7267, "step": 4863 }, { "crossentropy": 2.6836622953414917, "epoch": 0.2644987628809919, "grad_norm": 0.03894040733575821, "grad_norm_var": 2.5916286128079813e-06, "learning_rate": 0.007055840979520281, "loss": 2.6837, "step": 4864 }, { "crossentropy": 2.6513736248016357, "epoch": 0.2645531417384921, "grad_norm": 0.037389494478702545, "grad_norm_var": 2.6229332553236982e-06, "learning_rate": 0.007054705858858701, "loss": 2.6514, "step": 4865 }, { "crossentropy": 2.66079044342041, "epoch": 0.2646075205959923, "grad_norm": 0.04224298521876335, "grad_norm_var": 4.238200127955281e-06, "learning_rate": 0.007053570610766181, "loss": 2.6608, "step": 4866 }, { "crossentropy": 2.6228755712509155, "epoch": 0.2646618994534925, "grad_norm": 0.036208976060152054, "grad_norm_var": 4.030204873147793e-06, "learning_rate": 0.007052435235313127, "loss": 2.6229, "step": 4867 }, { "crossentropy": 2.528764247894287, "epoch": 0.2647162783109927, "grad_norm": 0.041048914194107056, "grad_norm_var": 4.84847104723022e-06, "learning_rate": 0.007051299732569951, "loss": 2.5288, "step": 4868 }, { "crossentropy": 2.6366571187973022, "epoch": 0.2647706571684929, "grad_norm": 0.04162590205669403, "grad_norm_var": 5.879497673253834e-06, "learning_rate": 0.00705016410260708, "loss": 2.6367, "step": 4869 }, { "crossentropy": 2.70465624332428, "epoch": 0.2648250360259931, "grad_norm": 0.038339629769325256, "grad_norm_var": 5.73939701131824e-06, "learning_rate": 0.007049028345494943, "loss": 2.7047, "step": 4870 }, { "crossentropy": 2.700448513031006, "epoch": 0.2648794148834933, "grad_norm": 0.04077677056193352, "grad_norm_var": 6.042162191575532e-06, "learning_rate": 0.007047892461303978, "loss": 2.7004, "step": 4871 }, { "crossentropy": 2.744537353515625, "epoch": 0.2649337937409935, "grad_norm": 0.044372569769620895, "grad_norm_var": 8.064456346602392e-06, "learning_rate": 0.00704675645010463, "loss": 2.7445, "step": 4872 }, { "crossentropy": 2.565277934074402, "epoch": 0.2649881725984937, "grad_norm": 0.04060885310173035, "grad_norm_var": 7.051197794604297e-06, "learning_rate": 0.007045620311967355, "loss": 2.5653, "step": 4873 }, { "crossentropy": 2.642234683036804, "epoch": 0.2650425514559939, "grad_norm": 0.03534944728016853, "grad_norm_var": 7.848673702294933e-06, "learning_rate": 0.007044484046962619, "loss": 2.6422, "step": 4874 }, { "crossentropy": 2.7163058519363403, "epoch": 0.26509693031349413, "grad_norm": 0.035373084247112274, "grad_norm_var": 7.881484153958857e-06, "learning_rate": 0.007043347655160886, "loss": 2.7163, "step": 4875 }, { "crossentropy": 2.7604023218154907, "epoch": 0.26515130917099433, "grad_norm": 0.035534556955099106, "grad_norm_var": 7.556082420767142e-06, "learning_rate": 0.007042211136632637, "loss": 2.7604, "step": 4876 }, { "crossentropy": 2.699199676513672, "epoch": 0.26520568802849454, "grad_norm": 0.0358571857213974, "grad_norm_var": 7.877064545838696e-06, "learning_rate": 0.0070410744914483556, "loss": 2.6992, "step": 4877 }, { "crossentropy": 2.6955673694610596, "epoch": 0.26526006688599474, "grad_norm": 0.03357089310884476, "grad_norm_var": 9.415240107699523e-06, "learning_rate": 0.0070399377196785394, "loss": 2.6956, "step": 4878 }, { "crossentropy": 2.646539330482483, "epoch": 0.26531444574349494, "grad_norm": 0.0360836386680603, "grad_norm_var": 9.751125743443639e-06, "learning_rate": 0.007038800821393686, "loss": 2.6465, "step": 4879 }, { "crossentropy": 2.798827052116394, "epoch": 0.26536882460099515, "grad_norm": 0.03434726223349571, "grad_norm_var": 1.0697519538403234e-05, "learning_rate": 0.007037663796664309, "loss": 2.7988, "step": 4880 }, { "crossentropy": 2.688087224960327, "epoch": 0.26542320345849535, "grad_norm": 0.03625643998384476, "grad_norm_var": 1.0876883580186216e-05, "learning_rate": 0.00703652664556092, "loss": 2.6881, "step": 4881 }, { "crossentropy": 2.7180291414260864, "epoch": 0.26547758231599555, "grad_norm": 0.037595588713884354, "grad_norm_var": 9.58199572758476e-06, "learning_rate": 0.0070353893681540484, "loss": 2.718, "step": 4882 }, { "crossentropy": 2.6187299489974976, "epoch": 0.26553196117349576, "grad_norm": 0.04051201045513153, "grad_norm_var": 9.892770284453123e-06, "learning_rate": 0.0070342519645142255, "loss": 2.6187, "step": 4883 }, { "crossentropy": 2.728121280670166, "epoch": 0.26558634003099596, "grad_norm": 0.0375605933368206, "grad_norm_var": 9.213493230847818e-06, "learning_rate": 0.007033114434711994, "loss": 2.7281, "step": 4884 }, { "crossentropy": 2.5983798503875732, "epoch": 0.26564071888849616, "grad_norm": 0.04053324833512306, "grad_norm_var": 8.721297309259116e-06, "learning_rate": 0.007031976778817899, "loss": 2.5984, "step": 4885 }, { "crossentropy": 2.563662052154541, "epoch": 0.26569509774599637, "grad_norm": 0.036309339106082916, "grad_norm_var": 8.796838941475121e-06, "learning_rate": 0.007030838996902497, "loss": 2.5637, "step": 4886 }, { "crossentropy": 2.642132520675659, "epoch": 0.26574947660349657, "grad_norm": 0.035556357353925705, "grad_norm_var": 8.24722738509027e-06, "learning_rate": 0.007029701089036357, "loss": 2.6421, "step": 4887 }, { "crossentropy": 2.7054041624069214, "epoch": 0.2658038554609968, "grad_norm": 0.04225543141365051, "grad_norm_var": 6.506560154601453e-06, "learning_rate": 0.007028563055290045, "loss": 2.7054, "step": 4888 }, { "crossentropy": 2.7589735984802246, "epoch": 0.265858234318497, "grad_norm": 0.04028647020459175, "grad_norm_var": 6.361434529067456e-06, "learning_rate": 0.007027424895734145, "loss": 2.759, "step": 4889 }, { "crossentropy": 2.736244797706604, "epoch": 0.2659126131759972, "grad_norm": 0.03685012459754944, "grad_norm_var": 6.159652071920039e-06, "learning_rate": 0.007026286610439243, "loss": 2.7362, "step": 4890 }, { "crossentropy": 2.6965736150741577, "epoch": 0.2659669920334974, "grad_norm": 0.03344271332025528, "grad_norm_var": 6.8512180482870865e-06, "learning_rate": 0.007025148199475935, "loss": 2.6966, "step": 4891 }, { "crossentropy": 2.697041869163513, "epoch": 0.2660213708909976, "grad_norm": 0.03386634588241577, "grad_norm_var": 7.358778569345933e-06, "learning_rate": 0.007024009662914823, "loss": 2.697, "step": 4892 }, { "crossentropy": 2.6894274950027466, "epoch": 0.2660757497484978, "grad_norm": 0.03568520396947861, "grad_norm_var": 7.385232994518385e-06, "learning_rate": 0.0070228710008265185, "loss": 2.6894, "step": 4893 }, { "crossentropy": 2.671463131904602, "epoch": 0.266130128605998, "grad_norm": 0.03419532999396324, "grad_norm_var": 7.130805694956626e-06, "learning_rate": 0.007021732213281641, "loss": 2.6715, "step": 4894 }, { "crossentropy": 2.7882914543151855, "epoch": 0.2661845074634982, "grad_norm": 0.03499311953783035, "grad_norm_var": 7.332340649131701e-06, "learning_rate": 0.0070205933003508175, "loss": 2.7883, "step": 4895 }, { "crossentropy": 2.8013498783111572, "epoch": 0.2662388863209984, "grad_norm": 0.038339968770742416, "grad_norm_var": 6.974857539823685e-06, "learning_rate": 0.007019454262104679, "loss": 2.8013, "step": 4896 }, { "crossentropy": 2.657273054122925, "epoch": 0.2662932651784986, "grad_norm": 0.04046466574072838, "grad_norm_var": 7.585977765524564e-06, "learning_rate": 0.007018315098613872, "loss": 2.6573, "step": 4897 }, { "crossentropy": 2.6982122659683228, "epoch": 0.2663476440359988, "grad_norm": 0.037193089723587036, "grad_norm_var": 7.585762541724844e-06, "learning_rate": 0.007017175809949043, "loss": 2.6982, "step": 4898 }, { "crossentropy": 2.675496220588684, "epoch": 0.266402022893499, "grad_norm": 0.03719408065080643, "grad_norm_var": 6.887236512552869e-06, "learning_rate": 0.007016036396180852, "loss": 2.6755, "step": 4899 }, { "crossentropy": 2.6381053924560547, "epoch": 0.2664564017509992, "grad_norm": 0.04137686267495155, "grad_norm_var": 7.996035488775216e-06, "learning_rate": 0.007014896857379961, "loss": 2.6381, "step": 4900 }, { "crossentropy": 2.6837360858917236, "epoch": 0.2665107806084994, "grad_norm": 0.03889339044690132, "grad_norm_var": 7.480974056227734e-06, "learning_rate": 0.0070137571936170475, "loss": 2.6837, "step": 4901 }, { "crossentropy": 2.6330546140670776, "epoch": 0.2665651594659996, "grad_norm": 0.036071084439754486, "grad_norm_var": 7.5161959934330036e-06, "learning_rate": 0.00701261740496279, "loss": 2.6331, "step": 4902 }, { "crossentropy": 2.688931107521057, "epoch": 0.2666195383234998, "grad_norm": 0.037281595170497894, "grad_norm_var": 7.3030825785562005e-06, "learning_rate": 0.007011477491487875, "loss": 2.6889, "step": 4903 }, { "crossentropy": 2.6639434099197388, "epoch": 0.26667391718100003, "grad_norm": 0.038615625351667404, "grad_norm_var": 5.774397985367346e-06, "learning_rate": 0.0070103374532630046, "loss": 2.6639, "step": 4904 }, { "crossentropy": 2.6560568809509277, "epoch": 0.26672829603850023, "grad_norm": 0.03817129507660866, "grad_norm_var": 5.175626466166703e-06, "learning_rate": 0.007009197290358878, "loss": 2.6561, "step": 4905 }, { "crossentropy": 2.6133828163146973, "epoch": 0.26678267489600044, "grad_norm": 0.03703952953219414, "grad_norm_var": 5.173082182139549e-06, "learning_rate": 0.00700805700284621, "loss": 2.6134, "step": 4906 }, { "crossentropy": 2.642019271850586, "epoch": 0.26683705375350064, "grad_norm": 0.03801889345049858, "grad_norm_var": 4.279997301630198e-06, "learning_rate": 0.007006916590795716, "loss": 2.642, "step": 4907 }, { "crossentropy": 2.5657529830932617, "epoch": 0.26689143261100085, "grad_norm": 0.036093756556510925, "grad_norm_var": 3.5591892724145513e-06, "learning_rate": 0.007005776054278129, "loss": 2.5658, "step": 4908 }, { "crossentropy": 2.720238208770752, "epoch": 0.26694581146850105, "grad_norm": 0.036200400441884995, "grad_norm_var": 3.4527142441472444e-06, "learning_rate": 0.00700463539336418, "loss": 2.7202, "step": 4909 }, { "crossentropy": 2.6248351335525513, "epoch": 0.26700019032600125, "grad_norm": 0.0382862463593483, "grad_norm_var": 2.691274199857705e-06, "learning_rate": 0.007003494608124612, "loss": 2.6248, "step": 4910 }, { "crossentropy": 2.7120440006256104, "epoch": 0.26705456918350146, "grad_norm": 0.037569060921669006, "grad_norm_var": 2.15410174582379e-06, "learning_rate": 0.007002353698630177, "loss": 2.712, "step": 4911 }, { "crossentropy": 2.598015785217285, "epoch": 0.26710894804100166, "grad_norm": 0.0416085347533226, "grad_norm_var": 3.0024090151949586e-06, "learning_rate": 0.0070012126649516325, "loss": 2.598, "step": 4912 }, { "crossentropy": 2.6950372457504272, "epoch": 0.26716332689850186, "grad_norm": 0.04532758146524429, "grad_norm_var": 5.994253420064412e-06, "learning_rate": 0.007000071507159744, "loss": 2.695, "step": 4913 }, { "crossentropy": 2.6433194875717163, "epoch": 0.26721770575600207, "grad_norm": 0.03976593539118767, "grad_norm_var": 5.982348671675307e-06, "learning_rate": 0.006998930225325285, "loss": 2.6433, "step": 4914 }, { "crossentropy": 2.711455821990967, "epoch": 0.26727208461350227, "grad_norm": 0.03759784251451492, "grad_norm_var": 5.917139891499732e-06, "learning_rate": 0.006997788819519037, "loss": 2.7115, "step": 4915 }, { "crossentropy": 2.7080196142196655, "epoch": 0.2673264634710025, "grad_norm": 0.04278305545449257, "grad_norm_var": 6.5576444622144715e-06, "learning_rate": 0.006996647289811791, "loss": 2.708, "step": 4916 }, { "crossentropy": 2.699569344520569, "epoch": 0.2673808423285027, "grad_norm": 0.04294614866375923, "grad_norm_var": 7.684517479564917e-06, "learning_rate": 0.00699550563627434, "loss": 2.6996, "step": 4917 }, { "crossentropy": 2.6725250482559204, "epoch": 0.2674352211860029, "grad_norm": 0.03647412732243538, "grad_norm_var": 7.539366915989126e-06, "learning_rate": 0.006994363858977488, "loss": 2.6725, "step": 4918 }, { "crossentropy": 2.6886579990386963, "epoch": 0.2674896000435031, "grad_norm": 0.03461756557226181, "grad_norm_var": 8.588424645961044e-06, "learning_rate": 0.006993221957992052, "loss": 2.6887, "step": 4919 }, { "crossentropy": 2.6829023361206055, "epoch": 0.2675439789010033, "grad_norm": 0.04234578087925911, "grad_norm_var": 9.356543657801769e-06, "learning_rate": 0.006992079933388847, "loss": 2.6829, "step": 4920 }, { "crossentropy": 2.682217597961426, "epoch": 0.2675983577585035, "grad_norm": 0.04410211369395256, "grad_norm_var": 1.0857836807437865e-05, "learning_rate": 0.006990937785238702, "loss": 2.6822, "step": 4921 }, { "crossentropy": 2.7438271045684814, "epoch": 0.2676527366160037, "grad_norm": 0.03924614563584328, "grad_norm_var": 1.046074741900497e-05, "learning_rate": 0.006989795513612451, "loss": 2.7438, "step": 4922 }, { "crossentropy": 2.7197707891464233, "epoch": 0.2677071154735039, "grad_norm": 0.038455985486507416, "grad_norm_var": 1.0382789491573258e-05, "learning_rate": 0.006988653118580938, "loss": 2.7198, "step": 4923 }, { "crossentropy": 2.5392942428588867, "epoch": 0.2677614943310041, "grad_norm": 0.04103271663188934, "grad_norm_var": 9.605809758213521e-06, "learning_rate": 0.006987510600215012, "loss": 2.5393, "step": 4924 }, { "crossentropy": 2.7410486936569214, "epoch": 0.2678158731885043, "grad_norm": 0.04146261885762215, "grad_norm_var": 8.742534232182113e-06, "learning_rate": 0.006986367958585531, "loss": 2.741, "step": 4925 }, { "crossentropy": 2.754223346710205, "epoch": 0.2678702520460045, "grad_norm": 0.03484543412923813, "grad_norm_var": 1.0372550484631186e-05, "learning_rate": 0.006985225193763364, "loss": 2.7542, "step": 4926 }, { "crossentropy": 2.6641207933425903, "epoch": 0.2679246309035047, "grad_norm": 0.0354207381606102, "grad_norm_var": 1.1360565793379583e-05, "learning_rate": 0.006984082305819379, "loss": 2.6641, "step": 4927 }, { "crossentropy": 2.6632988452911377, "epoch": 0.2679790097610049, "grad_norm": 0.03724715858697891, "grad_norm_var": 1.1542511133120354e-05, "learning_rate": 0.00698293929482446, "loss": 2.6633, "step": 4928 }, { "crossentropy": 2.7432336807250977, "epoch": 0.2680333886185051, "grad_norm": 0.042806606739759445, "grad_norm_var": 1.0015996976412735e-05, "learning_rate": 0.006981796160849493, "loss": 2.7432, "step": 4929 }, { "crossentropy": 2.5321662425994873, "epoch": 0.2680877674760053, "grad_norm": 0.03850965574383736, "grad_norm_var": 1.0061192734049131e-05, "learning_rate": 0.006980652903965378, "loss": 2.5322, "step": 4930 }, { "crossentropy": 2.617563009262085, "epoch": 0.2681421463335055, "grad_norm": 0.03366953507065773, "grad_norm_var": 1.1953017139071375e-05, "learning_rate": 0.006979509524243015, "loss": 2.6176, "step": 4931 }, { "crossentropy": 2.6487263441085815, "epoch": 0.26819652519100573, "grad_norm": 0.0347772054374218, "grad_norm_var": 1.2051780939888411e-05, "learning_rate": 0.006978366021753318, "loss": 2.6487, "step": 4932 }, { "crossentropy": 2.7937464714050293, "epoch": 0.26825090404850593, "grad_norm": 0.03560210019350052, "grad_norm_var": 1.1188948362199762e-05, "learning_rate": 0.006977222396567202, "loss": 2.7937, "step": 4933 }, { "crossentropy": 2.6401182413101196, "epoch": 0.26830528290600614, "grad_norm": 0.03648043051362038, "grad_norm_var": 1.118753108036204e-05, "learning_rate": 0.0069760786487556, "loss": 2.6401, "step": 4934 }, { "crossentropy": 2.5620816946029663, "epoch": 0.26835966176350634, "grad_norm": 0.03754488006234169, "grad_norm_var": 1.033895424016861e-05, "learning_rate": 0.006974934778389439, "loss": 2.5621, "step": 4935 }, { "crossentropy": 2.6580406427383423, "epoch": 0.26841404062100654, "grad_norm": 0.04019954800605774, "grad_norm_var": 9.482488535207146e-06, "learning_rate": 0.006973790785539667, "loss": 2.658, "step": 4936 }, { "crossentropy": 2.6485440731048584, "epoch": 0.26846841947850675, "grad_norm": 0.05083362013101578, "grad_norm_var": 1.7600530722239326e-05, "learning_rate": 0.006972646670277229, "loss": 2.6485, "step": 4937 }, { "crossentropy": 2.596972942352295, "epoch": 0.26852279833600695, "grad_norm": 0.04427511990070343, "grad_norm_var": 1.9592057242026096e-05, "learning_rate": 0.006971502432673085, "loss": 2.597, "step": 4938 }, { "crossentropy": 2.6939581632614136, "epoch": 0.26857717719350716, "grad_norm": 0.0385240763425827, "grad_norm_var": 1.9587882759343882e-05, "learning_rate": 0.0069703580727981974, "loss": 2.694, "step": 4939 }, { "crossentropy": 2.6838289499282837, "epoch": 0.26863155605100736, "grad_norm": 0.03723551705479622, "grad_norm_var": 1.9435582629034654e-05, "learning_rate": 0.006969213590723538, "loss": 2.6838, "step": 4940 }, { "crossentropy": 2.5765371322631836, "epoch": 0.26868593490850756, "grad_norm": 0.038089569658041, "grad_norm_var": 1.8910798355900595e-05, "learning_rate": 0.00696806898652009, "loss": 2.5765, "step": 4941 }, { "crossentropy": 2.598284125328064, "epoch": 0.26874031376600777, "grad_norm": 0.036117181181907654, "grad_norm_var": 1.8391542478860685e-05, "learning_rate": 0.0069669242602588355, "loss": 2.5983, "step": 4942 }, { "crossentropy": 2.5686826705932617, "epoch": 0.26879469262350797, "grad_norm": 0.03433406725525856, "grad_norm_var": 1.8923568983684968e-05, "learning_rate": 0.0069657794120107745, "loss": 2.5687, "step": 4943 }, { "crossentropy": 2.628138303756714, "epoch": 0.2688490714810082, "grad_norm": 0.03729686513543129, "grad_norm_var": 1.891531813763281e-05, "learning_rate": 0.006964634441846906, "loss": 2.6281, "step": 4944 }, { "crossentropy": 2.405398726463318, "epoch": 0.2689034503385084, "grad_norm": 0.03575509786605835, "grad_norm_var": 1.7991369926947884e-05, "learning_rate": 0.0069634893498382415, "loss": 2.4054, "step": 4945 }, { "crossentropy": 2.662747859954834, "epoch": 0.2689578291960086, "grad_norm": 0.040620043873786926, "grad_norm_var": 1.8391252131814508e-05, "learning_rate": 0.006962344136055797, "loss": 2.6627, "step": 4946 }, { "crossentropy": 2.660486936569214, "epoch": 0.2690122080535088, "grad_norm": 0.03562697395682335, "grad_norm_var": 1.744578460177703e-05, "learning_rate": 0.0069611988005705994, "loss": 2.6605, "step": 4947 }, { "crossentropy": 2.7510132789611816, "epoch": 0.269066586911009, "grad_norm": 0.03577621281147003, "grad_norm_var": 1.7034655985324366e-05, "learning_rate": 0.006960053343453682, "loss": 2.751, "step": 4948 }, { "crossentropy": 2.676095724105835, "epoch": 0.2691209657685092, "grad_norm": 0.03744541481137276, "grad_norm_var": 1.6560726875228943e-05, "learning_rate": 0.006958907764776081, "loss": 2.6761, "step": 4949 }, { "crossentropy": 2.7425596714019775, "epoch": 0.2691753446260094, "grad_norm": 0.04108908399939537, "grad_norm_var": 1.6641269654541862e-05, "learning_rate": 0.006957762064608848, "loss": 2.7426, "step": 4950 }, { "crossentropy": 2.7077730894088745, "epoch": 0.2692297234835096, "grad_norm": 0.04928857833147049, "grad_norm_var": 2.3299217090702536e-05, "learning_rate": 0.006956616243023037, "loss": 2.7078, "step": 4951 }, { "crossentropy": 2.638666033744812, "epoch": 0.2692841023410098, "grad_norm": 0.0399886779487133, "grad_norm_var": 2.3283218603763662e-05, "learning_rate": 0.006955470300089713, "loss": 2.6387, "step": 4952 }, { "crossentropy": 2.639299988746643, "epoch": 0.26933848119851, "grad_norm": 0.03611508384346962, "grad_norm_var": 1.461733697110523e-05, "learning_rate": 0.006954324235879939, "loss": 2.6393, "step": 4953 }, { "crossentropy": 2.725861668586731, "epoch": 0.2693928600560102, "grad_norm": 0.03366335481405258, "grad_norm_var": 1.3623711749798962e-05, "learning_rate": 0.0069531780504648024, "loss": 2.7259, "step": 4954 }, { "crossentropy": 2.614095687866211, "epoch": 0.2694472389135104, "grad_norm": 0.03650893643498421, "grad_norm_var": 1.3719332266239291e-05, "learning_rate": 0.0069520317439153816, "loss": 2.6141, "step": 4955 }, { "crossentropy": 2.725669503211975, "epoch": 0.2695016177710106, "grad_norm": 0.036768607795238495, "grad_norm_var": 1.3768685374739426e-05, "learning_rate": 0.0069508853163027726, "loss": 2.7257, "step": 4956 }, { "crossentropy": 2.6579349040985107, "epoch": 0.2695559966285108, "grad_norm": 0.03363024443387985, "grad_norm_var": 1.4827610654449587e-05, "learning_rate": 0.006949738767698075, "loss": 2.6579, "step": 4957 }, { "crossentropy": 2.703279137611389, "epoch": 0.269610375486011, "grad_norm": 0.034538634121418, "grad_norm_var": 1.527471606833549e-05, "learning_rate": 0.006948592098172398, "loss": 2.7033, "step": 4958 }, { "crossentropy": 2.664657711982727, "epoch": 0.2696647543435112, "grad_norm": 0.039282653480768204, "grad_norm_var": 1.4780418093495359e-05, "learning_rate": 0.006947445307796855, "loss": 2.6647, "step": 4959 }, { "crossentropy": 2.687731981277466, "epoch": 0.26971913320101143, "grad_norm": 0.04039819538593292, "grad_norm_var": 1.5209832295875041e-05, "learning_rate": 0.006946298396642569, "loss": 2.6877, "step": 4960 }, { "crossentropy": 2.6841920614242554, "epoch": 0.26977351205851163, "grad_norm": 0.035777498036623, "grad_norm_var": 1.5203439618200846e-05, "learning_rate": 0.006945151364780671, "loss": 2.6842, "step": 4961 }, { "crossentropy": 2.6314101219177246, "epoch": 0.26982789091601184, "grad_norm": 0.035003941506147385, "grad_norm_var": 1.514345308606061e-05, "learning_rate": 0.006944004212282299, "loss": 2.6314, "step": 4962 }, { "crossentropy": 2.697144389152527, "epoch": 0.26988226977351204, "grad_norm": 0.037768084555864334, "grad_norm_var": 1.4879165475572126e-05, "learning_rate": 0.006942856939218598, "loss": 2.6971, "step": 4963 }, { "crossentropy": 2.6476640701293945, "epoch": 0.26993664863101224, "grad_norm": 0.0344950333237648, "grad_norm_var": 1.530870912198236e-05, "learning_rate": 0.0069417095456607205, "loss": 2.6477, "step": 4964 }, { "crossentropy": 2.5670406818389893, "epoch": 0.26999102748851245, "grad_norm": 0.036896344274282455, "grad_norm_var": 1.5339609964728066e-05, "learning_rate": 0.006940562031679829, "loss": 2.567, "step": 4965 }, { "crossentropy": 2.6911027431488037, "epoch": 0.27004540634601265, "grad_norm": 0.03720702603459358, "grad_norm_var": 1.4463010317888822e-05, "learning_rate": 0.006939414397347088, "loss": 2.6911, "step": 4966 }, { "crossentropy": 2.7210628986358643, "epoch": 0.27009978520351285, "grad_norm": 0.0374133326113224, "grad_norm_var": 4.347080659777616e-06, "learning_rate": 0.006938266642733675, "loss": 2.7211, "step": 4967 }, { "crossentropy": 2.605348229408264, "epoch": 0.27015416406101306, "grad_norm": 0.03595184534788132, "grad_norm_var": 3.5367889810020493e-06, "learning_rate": 0.0069371187679107705, "loss": 2.6053, "step": 4968 }, { "crossentropy": 2.669694423675537, "epoch": 0.27020854291851326, "grad_norm": 0.03365127742290497, "grad_norm_var": 3.989637066328933e-06, "learning_rate": 0.006935970772949568, "loss": 2.6697, "step": 4969 }, { "crossentropy": 2.698922038078308, "epoch": 0.27026292177601347, "grad_norm": 0.03601213917136192, "grad_norm_var": 3.5448272231688426e-06, "learning_rate": 0.00693482265792126, "loss": 2.6989, "step": 4970 }, { "crossentropy": 2.6230491399765015, "epoch": 0.27031730063351367, "grad_norm": 0.03784935176372528, "grad_norm_var": 3.6888359055751534e-06, "learning_rate": 0.006933674422897058, "loss": 2.623, "step": 4971 }, { "crossentropy": 2.6393011808395386, "epoch": 0.27037167949101387, "grad_norm": 0.03726120665669441, "grad_norm_var": 3.7272093852851068e-06, "learning_rate": 0.00693252606794817, "loss": 2.6393, "step": 4972 }, { "crossentropy": 2.7543314695358276, "epoch": 0.2704260583485141, "grad_norm": 0.036155298352241516, "grad_norm_var": 3.177694677860126e-06, "learning_rate": 0.006931377593145815, "loss": 2.7543, "step": 4973 }, { "crossentropy": 2.6074330806732178, "epoch": 0.2704804372060143, "grad_norm": 0.04172680899500847, "grad_norm_var": 4.4276940551884565e-06, "learning_rate": 0.006930228998561225, "loss": 2.6074, "step": 4974 }, { "crossentropy": 2.7267919778823853, "epoch": 0.2705348160635145, "grad_norm": 0.036144476383924484, "grad_norm_var": 4.110317376860665e-06, "learning_rate": 0.006929080284265631, "loss": 2.7268, "step": 4975 }, { "crossentropy": 2.7359542846679688, "epoch": 0.2705891949210147, "grad_norm": 0.035740479826927185, "grad_norm_var": 3.2670224623767194e-06, "learning_rate": 0.006927931450330277, "loss": 2.736, "step": 4976 }, { "crossentropy": 2.601251244544983, "epoch": 0.2706435737785149, "grad_norm": 0.034807756543159485, "grad_norm_var": 3.4277347856367355e-06, "learning_rate": 0.006926782496826409, "loss": 2.6013, "step": 4977 }, { "crossentropy": 2.7547223567962646, "epoch": 0.2706979526360151, "grad_norm": 0.03420975059270859, "grad_norm_var": 3.6261353935860183e-06, "learning_rate": 0.0069256334238252905, "loss": 2.7547, "step": 4978 }, { "crossentropy": 2.7231879234313965, "epoch": 0.2707523314935153, "grad_norm": 0.03608240187168121, "grad_norm_var": 3.5087483719909233e-06, "learning_rate": 0.00692448423139818, "loss": 2.7232, "step": 4979 }, { "crossentropy": 2.633504867553711, "epoch": 0.2708067103510155, "grad_norm": 0.035708971321582794, "grad_norm_var": 3.3005634578859093e-06, "learning_rate": 0.006923334919616354, "loss": 2.6335, "step": 4980 }, { "crossentropy": 2.6343213319778442, "epoch": 0.2708610892085157, "grad_norm": 0.035989560186862946, "grad_norm_var": 3.2951064269944504e-06, "learning_rate": 0.006922185488551087, "loss": 2.6343, "step": 4981 }, { "crossentropy": 2.5405062437057495, "epoch": 0.2709154680660159, "grad_norm": 0.03774520382285118, "grad_norm_var": 3.3733084444003786e-06, "learning_rate": 0.0069210359382736696, "loss": 2.5405, "step": 4982 }, { "crossentropy": 2.698856234550476, "epoch": 0.2709698469235161, "grad_norm": 0.04113060608506203, "grad_norm_var": 4.737641111692235e-06, "learning_rate": 0.006919886268855393, "loss": 2.6989, "step": 4983 }, { "crossentropy": 2.5876107215881348, "epoch": 0.2710242257810163, "grad_norm": 0.03905678912997246, "grad_norm_var": 5.0571778517499145e-06, "learning_rate": 0.00691873648036756, "loss": 2.5876, "step": 4984 }, { "crossentropy": 2.7286088466644287, "epoch": 0.2710786046385165, "grad_norm": 0.03734947368502617, "grad_norm_var": 4.3448076379513e-06, "learning_rate": 0.00691758657288148, "loss": 2.7286, "step": 4985 }, { "crossentropy": 2.6973772048950195, "epoch": 0.2711329834960167, "grad_norm": 0.05181886628270149, "grad_norm_var": 1.7750809784865724e-05, "learning_rate": 0.0069164365464684675, "loss": 2.6974, "step": 4986 }, { "crossentropy": 2.6911840438842773, "epoch": 0.2711873623535169, "grad_norm": 0.03762005642056465, "grad_norm_var": 1.776018622243325e-05, "learning_rate": 0.00691528640119985, "loss": 2.6912, "step": 4987 }, { "crossentropy": 2.6997255086898804, "epoch": 0.27124174121101713, "grad_norm": 0.038400955498218536, "grad_norm_var": 1.772390152652269e-05, "learning_rate": 0.006914136137146951, "loss": 2.6997, "step": 4988 }, { "crossentropy": 2.62570321559906, "epoch": 0.2712961200685174, "grad_norm": 0.037316881120204926, "grad_norm_var": 1.7506193709573848e-05, "learning_rate": 0.0069129857543811166, "loss": 2.6257, "step": 4989 }, { "crossentropy": 2.6709479093551636, "epoch": 0.2713504989260176, "grad_norm": 0.03851248696446419, "grad_norm_var": 1.663102781480977e-05, "learning_rate": 0.006911835252973689, "loss": 2.6709, "step": 4990 }, { "crossentropy": 2.7054189443588257, "epoch": 0.2714048777835178, "grad_norm": 0.03908633068203926, "grad_norm_var": 1.6453065576593763e-05, "learning_rate": 0.0069106846329960205, "loss": 2.7054, "step": 4991 }, { "crossentropy": 2.6944668292999268, "epoch": 0.271459256641018, "grad_norm": 0.045493729412555695, "grad_norm_var": 1.9250661605028223e-05, "learning_rate": 0.006909533894519473, "loss": 2.6945, "step": 4992 }, { "crossentropy": 2.6762378215789795, "epoch": 0.2715136354985182, "grad_norm": 0.04075051471590996, "grad_norm_var": 1.8317894768815102e-05, "learning_rate": 0.006908383037615413, "loss": 2.6762, "step": 4993 }, { "crossentropy": 2.6969728469848633, "epoch": 0.2715680143560184, "grad_norm": 0.09409578889608383, "grad_norm_var": 0.00020308065514399956, "learning_rate": 0.006907232062355217, "loss": 2.697, "step": 4994 }, { "crossentropy": 2.608454465866089, "epoch": 0.2716223932135186, "grad_norm": 0.051143523305654526, "grad_norm_var": 0.00020359753081801711, "learning_rate": 0.006906080968810266, "loss": 2.6085, "step": 4995 }, { "crossentropy": 2.6539324522018433, "epoch": 0.2716767720710188, "grad_norm": 0.038371238857507706, "grad_norm_var": 0.00020115913358403898, "learning_rate": 0.0069049297570519495, "loss": 2.6539, "step": 4996 }, { "crossentropy": 2.7439048290252686, "epoch": 0.271731150928519, "grad_norm": 0.04267990589141846, "grad_norm_var": 0.00019681757594840332, "learning_rate": 0.006903778427151667, "loss": 2.7439, "step": 4997 }, { "crossentropy": 2.6611629724502563, "epoch": 0.2717855297860192, "grad_norm": 0.0435127392411232, "grad_norm_var": 0.0001937707516516248, "learning_rate": 0.006902626979180821, "loss": 2.6612, "step": 4998 }, { "crossentropy": 2.686303496360779, "epoch": 0.2718399086435194, "grad_norm": 0.0483306422829628, "grad_norm_var": 0.00019351575532904522, "learning_rate": 0.006901475413210823, "loss": 2.6863, "step": 4999 }, { "crossentropy": 2.6413432359695435, "epoch": 0.2718942875010196, "grad_norm": 0.041341789066791534, "grad_norm_var": 0.00019196397765825307, "learning_rate": 0.006900323729313092, "loss": 2.6413, "step": 5000 }, { "crossentropy": 2.585599899291992, "epoch": 0.27194866635851983, "grad_norm": 0.04026968404650688, "grad_norm_var": 0.00018937638495983105, "learning_rate": 0.006899171927559057, "loss": 2.5856, "step": 5001 }, { "crossentropy": 2.643540143966675, "epoch": 0.27200304521602003, "grad_norm": 0.03970527648925781, "grad_norm_var": 0.00018841690744800284, "learning_rate": 0.006898020008020148, "loss": 2.6435, "step": 5002 }, { "crossentropy": 2.698397994041443, "epoch": 0.27205742407352024, "grad_norm": 0.03903541341423988, "grad_norm_var": 0.00018718913877170296, "learning_rate": 0.006896867970767808, "loss": 2.6984, "step": 5003 }, { "crossentropy": 2.5189061164855957, "epoch": 0.27211180293102044, "grad_norm": 0.03558751195669174, "grad_norm_var": 0.00019011353605042028, "learning_rate": 0.006895715815873485, "loss": 2.5189, "step": 5004 }, { "crossentropy": 2.7320404052734375, "epoch": 0.27216618178852064, "grad_norm": 0.03907899931073189, "grad_norm_var": 0.00018857245397170796, "learning_rate": 0.006894563543408634, "loss": 2.732, "step": 5005 }, { "crossentropy": 2.6186944246292114, "epoch": 0.27222056064602085, "grad_norm": 0.04077867418527603, "grad_norm_var": 0.00018698991162437168, "learning_rate": 0.00689341115344472, "loss": 2.6187, "step": 5006 }, { "crossentropy": 2.644091486930847, "epoch": 0.27227493950352105, "grad_norm": 0.04114344343543053, "grad_norm_var": 0.00018564503773051775, "learning_rate": 0.00689225864605321, "loss": 2.6441, "step": 5007 }, { "crossentropy": 2.689604640007019, "epoch": 0.27232931836102126, "grad_norm": 0.036024387925863266, "grad_norm_var": 0.000190730016113688, "learning_rate": 0.006891106021305583, "loss": 2.6896, "step": 5008 }, { "crossentropy": 2.551033854484558, "epoch": 0.27238369721852146, "grad_norm": 0.03567285090684891, "grad_norm_var": 0.0001948735495647423, "learning_rate": 0.006889953279273323, "loss": 2.551, "step": 5009 }, { "crossentropy": 2.583721160888672, "epoch": 0.27243807607602166, "grad_norm": 0.03472694382071495, "grad_norm_var": 1.9985589875687097e-05, "learning_rate": 0.006888800420027924, "loss": 2.5837, "step": 5010 }, { "crossentropy": 2.428903818130493, "epoch": 0.27249245493352187, "grad_norm": 0.03405604138970375, "grad_norm_var": 1.3899992211759601e-05, "learning_rate": 0.006887647443640885, "loss": 2.4289, "step": 5011 }, { "crossentropy": 2.6643359661102295, "epoch": 0.27254683379102207, "grad_norm": 0.0351620651781559, "grad_norm_var": 1.4981604703375358e-05, "learning_rate": 0.0068864943501837075, "loss": 2.6643, "step": 5012 }, { "crossentropy": 2.6842801570892334, "epoch": 0.2726012126485223, "grad_norm": 0.034823961555957794, "grad_norm_var": 1.5187656748205427e-05, "learning_rate": 0.006885341139727912, "loss": 2.6843, "step": 5013 }, { "crossentropy": 2.607212781906128, "epoch": 0.2726555915060225, "grad_norm": 0.035764217376708984, "grad_norm_var": 1.3971171768108655e-05, "learning_rate": 0.006884187812345017, "loss": 2.6072, "step": 5014 }, { "crossentropy": 2.66702938079834, "epoch": 0.2727099703635227, "grad_norm": 0.036029428243637085, "grad_norm_var": 6.843718279597501e-06, "learning_rate": 0.00688303436810655, "loss": 2.667, "step": 5015 }, { "crossentropy": 2.7157416343688965, "epoch": 0.2727643492210229, "grad_norm": 0.042803261429071426, "grad_norm_var": 7.735569359870221e-06, "learning_rate": 0.006881880807084045, "loss": 2.7157, "step": 5016 }, { "crossentropy": 2.617043972015381, "epoch": 0.2728187280785231, "grad_norm": 0.041257910430431366, "grad_norm_var": 8.1560965961458e-06, "learning_rate": 0.00688072712934905, "loss": 2.617, "step": 5017 }, { "crossentropy": 2.6971912384033203, "epoch": 0.2728731069360233, "grad_norm": 0.03568563610315323, "grad_norm_var": 8.039301460027506e-06, "learning_rate": 0.006879573334973111, "loss": 2.6972, "step": 5018 }, { "crossentropy": 2.619683623313904, "epoch": 0.2729274857935235, "grad_norm": 0.03360676392912865, "grad_norm_var": 8.66264628070305e-06, "learning_rate": 0.0068784194240277844, "loss": 2.6197, "step": 5019 }, { "crossentropy": 2.6299002170562744, "epoch": 0.2729818646510237, "grad_norm": 0.04249437525868416, "grad_norm_var": 1.0331780098795223e-05, "learning_rate": 0.006877265396584638, "loss": 2.6299, "step": 5020 }, { "crossentropy": 2.7178423404693604, "epoch": 0.2730362435085239, "grad_norm": 0.03488035872578621, "grad_norm_var": 1.0518436820327175e-05, "learning_rate": 0.00687611125271524, "loss": 2.7178, "step": 5021 }, { "crossentropy": 2.581214189529419, "epoch": 0.2730906223660241, "grad_norm": 0.03379451110959053, "grad_norm_var": 1.021769619729898e-05, "learning_rate": 0.006874956992491174, "loss": 2.5812, "step": 5022 }, { "crossentropy": 2.707928776741028, "epoch": 0.2731450012235243, "grad_norm": 0.03448200970888138, "grad_norm_var": 9.08479808745203e-06, "learning_rate": 0.006873802615984022, "loss": 2.7079, "step": 5023 }, { "crossentropy": 2.7139333486557007, "epoch": 0.2731993800810245, "grad_norm": 0.14451812207698822, "grad_norm_var": 0.0007403583243555924, "learning_rate": 0.006872648123265378, "loss": 2.7139, "step": 5024 }, { "crossentropy": 2.5948981046676636, "epoch": 0.2732537589385247, "grad_norm": 0.09926142543554306, "grad_norm_var": 0.0009300226229461878, "learning_rate": 0.006871493514406844, "loss": 2.5949, "step": 5025 }, { "crossentropy": 2.6438063383102417, "epoch": 0.2733081377960249, "grad_norm": 0.03778671473264694, "grad_norm_var": 0.0009255663815449487, "learning_rate": 0.006870338789480027, "loss": 2.6438, "step": 5026 }, { "crossentropy": 2.682355284690857, "epoch": 0.2733625166535251, "grad_norm": 0.042939282953739166, "grad_norm_var": 0.000914840916414013, "learning_rate": 0.006869183948556541, "loss": 2.6824, "step": 5027 }, { "crossentropy": 2.680722713470459, "epoch": 0.2734168955110253, "grad_norm": 0.04493970796465874, "grad_norm_var": 0.0009043002361214488, "learning_rate": 0.006868028991708011, "loss": 2.6807, "step": 5028 }, { "crossentropy": 2.7994768619537354, "epoch": 0.27347127436852553, "grad_norm": 0.044336672872304916, "grad_norm_var": 0.0008926837120197525, "learning_rate": 0.006866873919006062, "loss": 2.7995, "step": 5029 }, { "crossentropy": 2.654723644256592, "epoch": 0.27352565322602573, "grad_norm": 0.03779815509915352, "grad_norm_var": 0.000889342996643281, "learning_rate": 0.006865718730522336, "loss": 2.6547, "step": 5030 }, { "crossentropy": 2.7563165426254272, "epoch": 0.27358003208352594, "grad_norm": 0.04614216834306717, "grad_norm_var": 0.0008780253291710903, "learning_rate": 0.006864563426328471, "loss": 2.7563, "step": 5031 }, { "crossentropy": 2.6883323192596436, "epoch": 0.27363441094102614, "grad_norm": 0.03547358512878418, "grad_norm_var": 0.0008882164784550649, "learning_rate": 0.006863408006496124, "loss": 2.6883, "step": 5032 }, { "crossentropy": 2.672357678413391, "epoch": 0.27368878979852634, "grad_norm": 0.04151018708944321, "grad_norm_var": 0.0008879486893924967, "learning_rate": 0.006862252471096945, "loss": 2.6724, "step": 5033 }, { "crossentropy": 2.5436567068099976, "epoch": 0.27374316865602655, "grad_norm": 0.03499084711074829, "grad_norm_var": 0.0008892449943655548, "learning_rate": 0.006861096820202608, "loss": 2.5437, "step": 5034 }, { "crossentropy": 2.803316593170166, "epoch": 0.27379754751352675, "grad_norm": 0.03603426739573479, "grad_norm_var": 0.0008845307747708525, "learning_rate": 0.00685994105388478, "loss": 2.8033, "step": 5035 }, { "crossentropy": 2.665331721305847, "epoch": 0.27385192637102695, "grad_norm": 0.039815545082092285, "grad_norm_var": 0.0008874677462471707, "learning_rate": 0.006858785172215141, "loss": 2.6653, "step": 5036 }, { "crossentropy": 2.7189712524414062, "epoch": 0.27390630522852716, "grad_norm": 0.051231592893600464, "grad_norm_var": 0.0008727538749396435, "learning_rate": 0.006857629175265378, "loss": 2.719, "step": 5037 }, { "crossentropy": 2.754996180534363, "epoch": 0.27396068408602736, "grad_norm": 0.038315653800964355, "grad_norm_var": 0.000864071998208802, "learning_rate": 0.006856473063107187, "loss": 2.755, "step": 5038 }, { "crossentropy": 2.7468576431274414, "epoch": 0.27401506294352757, "grad_norm": 0.03700915724039078, "grad_norm_var": 0.0008590406543656612, "learning_rate": 0.0068553168358122685, "loss": 2.7469, "step": 5039 }, { "crossentropy": 2.6600645780563354, "epoch": 0.27406944180102777, "grad_norm": 0.037395503371953964, "grad_norm_var": 0.00023704457051847237, "learning_rate": 0.006854160493452328, "loss": 2.6601, "step": 5040 }, { "crossentropy": 2.570898413658142, "epoch": 0.274123820658528, "grad_norm": 0.03601055219769478, "grad_norm_var": 2.1558970191050873e-05, "learning_rate": 0.006853004036099082, "loss": 2.5709, "step": 5041 }, { "crossentropy": 2.7365872859954834, "epoch": 0.2741781995160282, "grad_norm": 0.03560015186667442, "grad_norm_var": 2.253456678457105e-05, "learning_rate": 0.006851847463824254, "loss": 2.7366, "step": 5042 }, { "crossentropy": 2.7151825428009033, "epoch": 0.2742325783735284, "grad_norm": 0.03883376345038414, "grad_norm_var": 2.196341715543666e-05, "learning_rate": 0.006850690776699573, "loss": 2.7152, "step": 5043 }, { "crossentropy": 2.665181875228882, "epoch": 0.2742869572310286, "grad_norm": 0.0387209914624691, "grad_norm_var": 2.0048184995659588e-05, "learning_rate": 0.006849533974796773, "loss": 2.6652, "step": 5044 }, { "crossentropy": 2.585607886314392, "epoch": 0.2743413360885288, "grad_norm": 0.04116339236497879, "grad_norm_var": 1.8557579640662327e-05, "learning_rate": 0.006848377058187603, "loss": 2.5856, "step": 5045 }, { "crossentropy": 2.5322805643081665, "epoch": 0.274395714946029, "grad_norm": 0.03644021973013878, "grad_norm_var": 1.8913579910254975e-05, "learning_rate": 0.00684722002694381, "loss": 2.5323, "step": 5046 }, { "crossentropy": 2.6235159635543823, "epoch": 0.2744500938035292, "grad_norm": 0.03451015055179596, "grad_norm_var": 1.6359675261505038e-05, "learning_rate": 0.006846062881137154, "loss": 2.6235, "step": 5047 }, { "crossentropy": 2.691640615463257, "epoch": 0.2745044726610294, "grad_norm": 0.03785237669944763, "grad_norm_var": 1.581181454785857e-05, "learning_rate": 0.006844905620839398, "loss": 2.6916, "step": 5048 }, { "crossentropy": 2.7512311935424805, "epoch": 0.2745588515185296, "grad_norm": 0.038539573550224304, "grad_norm_var": 1.5157065588933203e-05, "learning_rate": 0.006843748246122315, "loss": 2.7512, "step": 5049 }, { "crossentropy": 2.7350414991378784, "epoch": 0.2746132303760298, "grad_norm": 0.0408337228000164, "grad_norm_var": 1.4729142267875388e-05, "learning_rate": 0.006842590757057687, "loss": 2.735, "step": 5050 }, { "crossentropy": 2.695876717567444, "epoch": 0.27466760923353, "grad_norm": 0.03710462152957916, "grad_norm_var": 1.4428277493650348e-05, "learning_rate": 0.006841433153717296, "loss": 2.6959, "step": 5051 }, { "crossentropy": 2.639054536819458, "epoch": 0.2747219880910302, "grad_norm": 0.03418659418821335, "grad_norm_var": 1.5579650268375948e-05, "learning_rate": 0.006840275436172938, "loss": 2.6391, "step": 5052 }, { "crossentropy": 2.697211980819702, "epoch": 0.2747763669485304, "grad_norm": 0.03596563637256622, "grad_norm_var": 3.9440915429589735e-06, "learning_rate": 0.0068391176044964135, "loss": 2.6972, "step": 5053 }, { "crossentropy": 2.7175267934799194, "epoch": 0.2748307458060306, "grad_norm": 0.034192491322755814, "grad_norm_var": 4.506055188026557e-06, "learning_rate": 0.006837959658759531, "loss": 2.7175, "step": 5054 }, { "crossentropy": 2.628621459007263, "epoch": 0.2748851246635308, "grad_norm": 0.0349121056497097, "grad_norm_var": 4.819569115409721e-06, "learning_rate": 0.006836801599034102, "loss": 2.6286, "step": 5055 }, { "crossentropy": 2.6769790649414062, "epoch": 0.274939503521031, "grad_norm": 0.039224736392498016, "grad_norm_var": 5.121170837842313e-06, "learning_rate": 0.006835643425391951, "loss": 2.677, "step": 5056 }, { "crossentropy": 2.6774266958236694, "epoch": 0.27499388237853123, "grad_norm": 0.039296794682741165, "grad_norm_var": 5.305325668626913e-06, "learning_rate": 0.006834485137904906, "loss": 2.6774, "step": 5057 }, { "crossentropy": 2.6374164819717407, "epoch": 0.27504826123603143, "grad_norm": 0.041788045316934586, "grad_norm_var": 6.266218278834963e-06, "learning_rate": 0.006833326736644802, "loss": 2.6374, "step": 5058 }, { "crossentropy": 2.692039370536804, "epoch": 0.27510264009353164, "grad_norm": 0.04315068572759628, "grad_norm_var": 8.070400981988133e-06, "learning_rate": 0.0068321682216834845, "loss": 2.692, "step": 5059 }, { "crossentropy": 2.7572401762008667, "epoch": 0.27515701895103184, "grad_norm": 0.04069867730140686, "grad_norm_var": 8.506915286982109e-06, "learning_rate": 0.0068310095930928, "loss": 2.7572, "step": 5060 }, { "crossentropy": 2.5986112356185913, "epoch": 0.27521139780853204, "grad_norm": 0.03598732128739357, "grad_norm_var": 8.078426295427855e-06, "learning_rate": 0.006829850850944611, "loss": 2.5986, "step": 5061 }, { "crossentropy": 2.6682709455490112, "epoch": 0.27526577666603225, "grad_norm": 0.035040006041526794, "grad_norm_var": 8.453471665967327e-06, "learning_rate": 0.006828691995310775, "loss": 2.6683, "step": 5062 }, { "crossentropy": 2.602038264274597, "epoch": 0.27532015552353245, "grad_norm": 0.03520386666059494, "grad_norm_var": 8.188019695215311e-06, "learning_rate": 0.006827533026263169, "loss": 2.602, "step": 5063 }, { "crossentropy": 2.703832507133484, "epoch": 0.27537453438103265, "grad_norm": 0.038702260702848434, "grad_norm_var": 8.244925815366749e-06, "learning_rate": 0.006826373943873667, "loss": 2.7038, "step": 5064 }, { "crossentropy": 2.6443748474121094, "epoch": 0.27542891323853286, "grad_norm": 0.03628404811024666, "grad_norm_var": 8.340981191652895e-06, "learning_rate": 0.006825214748214156, "loss": 2.6444, "step": 5065 }, { "crossentropy": 2.678526282310486, "epoch": 0.27548329209603306, "grad_norm": 0.03592832013964653, "grad_norm_var": 7.769606807990724e-06, "learning_rate": 0.006824055439356527, "loss": 2.6785, "step": 5066 }, { "crossentropy": 2.672461986541748, "epoch": 0.27553767095353326, "grad_norm": 0.03661853447556496, "grad_norm_var": 7.800545924345885e-06, "learning_rate": 0.006822896017372682, "loss": 2.6725, "step": 5067 }, { "crossentropy": 2.7559770345687866, "epoch": 0.27559204981103347, "grad_norm": 0.03579655662178993, "grad_norm_var": 7.289115878202311e-06, "learning_rate": 0.006821736482334525, "loss": 2.756, "step": 5068 }, { "crossentropy": 2.62909471988678, "epoch": 0.27564642866853367, "grad_norm": 0.03541867807507515, "grad_norm_var": 7.41419654454774e-06, "learning_rate": 0.0068205768343139695, "loss": 2.6291, "step": 5069 }, { "crossentropy": 2.6685373783111572, "epoch": 0.2757008075260339, "grad_norm": 0.03648154065012932, "grad_norm_var": 6.7657203651566e-06, "learning_rate": 0.0068194170733829355, "loss": 2.6685, "step": 5070 }, { "crossentropy": 2.598480463027954, "epoch": 0.2757551863835341, "grad_norm": 0.03410925343632698, "grad_norm_var": 7.086592823835001e-06, "learning_rate": 0.006818257199613351, "loss": 2.5985, "step": 5071 }, { "crossentropy": 2.634673595428467, "epoch": 0.2758095652410343, "grad_norm": 0.03566722944378853, "grad_norm_var": 7.051457861171478e-06, "learning_rate": 0.00681709721307715, "loss": 2.6347, "step": 5072 }, { "crossentropy": 2.662062406539917, "epoch": 0.2758639440985345, "grad_norm": 0.03389772027730942, "grad_norm_var": 7.407624082107246e-06, "learning_rate": 0.006815937113846275, "loss": 2.6621, "step": 5073 }, { "crossentropy": 2.5460582971572876, "epoch": 0.2759183229560347, "grad_norm": 0.03895061835646629, "grad_norm_var": 6.070361813321062e-06, "learning_rate": 0.006814776901992673, "loss": 2.5461, "step": 5074 }, { "crossentropy": 2.568833351135254, "epoch": 0.2759727018135349, "grad_norm": 0.03467855975031853, "grad_norm_var": 3.321530448248631e-06, "learning_rate": 0.006813616577588298, "loss": 2.5688, "step": 5075 }, { "crossentropy": 2.5944453477859497, "epoch": 0.2760270806710351, "grad_norm": 0.0387473963201046, "grad_norm_var": 2.3933542474105852e-06, "learning_rate": 0.006812456140705115, "loss": 2.5944, "step": 5076 }, { "crossentropy": 2.6554876565933228, "epoch": 0.2760814595285353, "grad_norm": 0.036630723625421524, "grad_norm_var": 2.4100331007229426e-06, "learning_rate": 0.0068112955914150924, "loss": 2.6555, "step": 5077 }, { "crossentropy": 2.733309268951416, "epoch": 0.2761358383860355, "grad_norm": 0.041878990828990936, "grad_norm_var": 4.3350460256825e-06, "learning_rate": 0.006810134929790205, "loss": 2.7333, "step": 5078 }, { "crossentropy": 2.593825578689575, "epoch": 0.2761902172435357, "grad_norm": 0.03821815550327301, "grad_norm_var": 4.3570185690297926e-06, "learning_rate": 0.006808974155902436, "loss": 2.5938, "step": 5079 }, { "crossentropy": 2.5866878032684326, "epoch": 0.2762445961010359, "grad_norm": 0.034318070858716965, "grad_norm_var": 4.417441433372284e-06, "learning_rate": 0.006807813269823778, "loss": 2.5867, "step": 5080 }, { "crossentropy": 2.595155358314514, "epoch": 0.2762989749585361, "grad_norm": 0.03579423949122429, "grad_norm_var": 4.445006194456145e-06, "learning_rate": 0.006806652271626226, "loss": 2.5952, "step": 5081 }, { "crossentropy": 2.6572623252868652, "epoch": 0.2763533538160363, "grad_norm": 0.0350768081843853, "grad_norm_var": 4.5490879574310835e-06, "learning_rate": 0.006805491161381782, "loss": 2.6573, "step": 5082 }, { "crossentropy": 2.679197311401367, "epoch": 0.2764077326735365, "grad_norm": 0.03687281161546707, "grad_norm_var": 4.560785880329081e-06, "learning_rate": 0.006804329939162463, "loss": 2.6792, "step": 5083 }, { "crossentropy": 2.6694802045822144, "epoch": 0.2764621115310367, "grad_norm": 0.036180902272462845, "grad_norm_var": 4.538654441957335e-06, "learning_rate": 0.006803168605040282, "loss": 2.6695, "step": 5084 }, { "crossentropy": 2.600563406944275, "epoch": 0.27651649038853693, "grad_norm": 0.0366322360932827, "grad_norm_var": 4.466638224474157e-06, "learning_rate": 0.006802007159087266, "loss": 2.6006, "step": 5085 }, { "crossentropy": 2.7043758630752563, "epoch": 0.27657086924603713, "grad_norm": 0.04183995723724365, "grad_norm_var": 6.24194942812021e-06, "learning_rate": 0.006800845601375444, "loss": 2.7044, "step": 5086 }, { "crossentropy": 2.624133348464966, "epoch": 0.27662524810353734, "grad_norm": 0.043604958802461624, "grad_norm_var": 8.41584651510707e-06, "learning_rate": 0.0067996839319768585, "loss": 2.6241, "step": 5087 }, { "crossentropy": 2.6775132417678833, "epoch": 0.27667962696103754, "grad_norm": 0.03810984641313553, "grad_norm_var": 8.212415597524946e-06, "learning_rate": 0.006798522150963552, "loss": 2.6775, "step": 5088 }, { "crossentropy": 2.682750940322876, "epoch": 0.27673400581853774, "grad_norm": 0.033980220556259155, "grad_norm_var": 8.172231279771763e-06, "learning_rate": 0.00679736025840758, "loss": 2.6828, "step": 5089 }, { "crossentropy": 2.6510818004608154, "epoch": 0.27678838467603795, "grad_norm": 0.03552595525979996, "grad_norm_var": 8.286089211013922e-06, "learning_rate": 0.006796198254380999, "loss": 2.6511, "step": 5090 }, { "crossentropy": 2.7188122272491455, "epoch": 0.27684276353353815, "grad_norm": 0.03551208972930908, "grad_norm_var": 8.029213317656039e-06, "learning_rate": 0.006795036138955878, "loss": 2.7188, "step": 5091 }, { "crossentropy": 2.678753614425659, "epoch": 0.27689714239103835, "grad_norm": 0.03516864404082298, "grad_norm_var": 8.202355254571571e-06, "learning_rate": 0.006793873912204289, "loss": 2.6788, "step": 5092 }, { "crossentropy": 2.532585024833679, "epoch": 0.27695152124853856, "grad_norm": 0.034741200506687164, "grad_norm_var": 8.571197345899096e-06, "learning_rate": 0.006792711574198311, "loss": 2.5326, "step": 5093 }, { "crossentropy": 2.6098257303237915, "epoch": 0.27700590010603876, "grad_norm": 0.03372083231806755, "grad_norm_var": 7.522698778180311e-06, "learning_rate": 0.006791549125010034, "loss": 2.6098, "step": 5094 }, { "crossentropy": 2.6252329349517822, "epoch": 0.27706027896353896, "grad_norm": 0.035656750202178955, "grad_norm_var": 7.373645920360124e-06, "learning_rate": 0.006790386564711551, "loss": 2.6252, "step": 5095 }, { "crossentropy": 2.5936659574508667, "epoch": 0.27711465782103917, "grad_norm": 0.03445659950375557, "grad_norm_var": 7.3360037336598256e-06, "learning_rate": 0.006789223893374961, "loss": 2.5937, "step": 5096 }, { "crossentropy": 2.8167742490768433, "epoch": 0.27716903667853937, "grad_norm": 0.03718977048993111, "grad_norm_var": 7.339495609500387e-06, "learning_rate": 0.006788061111072374, "loss": 2.8168, "step": 5097 }, { "crossentropy": 2.7222901582717896, "epoch": 0.2772234155360396, "grad_norm": 0.03749970719218254, "grad_norm_var": 7.241188305155225e-06, "learning_rate": 0.0067868982178759035, "loss": 2.7223, "step": 5098 }, { "crossentropy": 2.6895004510879517, "epoch": 0.2772777943935398, "grad_norm": 0.04045687988400459, "grad_norm_var": 8.1417755261108e-06, "learning_rate": 0.006785735213857671, "loss": 2.6895, "step": 5099 }, { "crossentropy": 2.6185959577560425, "epoch": 0.27733217325104, "grad_norm": 0.036663834005594254, "grad_norm_var": 8.11054543304503e-06, "learning_rate": 0.006784572099089807, "loss": 2.6186, "step": 5100 }, { "crossentropy": 2.650025486946106, "epoch": 0.2773865521085402, "grad_norm": 0.034991610795259476, "grad_norm_var": 8.34226180588208e-06, "learning_rate": 0.006783408873644444, "loss": 2.65, "step": 5101 }, { "crossentropy": 2.6916868686676025, "epoch": 0.2774409309660404, "grad_norm": 0.03437822684645653, "grad_norm_var": 6.827687175167685e-06, "learning_rate": 0.006782245537593726, "loss": 2.6917, "step": 5102 }, { "crossentropy": 2.5966551303863525, "epoch": 0.2774953098235406, "grad_norm": 0.03715943545103073, "grad_norm_var": 3.1923696171715436e-06, "learning_rate": 0.0067810820910098, "loss": 2.5967, "step": 5103 }, { "crossentropy": 2.6832226514816284, "epoch": 0.2775496886810408, "grad_norm": 0.04256010800600052, "grad_norm_var": 5.711325277238793e-06, "learning_rate": 0.006779918533964824, "loss": 2.6832, "step": 5104 }, { "crossentropy": 2.6712390184402466, "epoch": 0.277604067538541, "grad_norm": 0.045582473278045654, "grad_norm_var": 1.0646010828730251e-05, "learning_rate": 0.00677875486653096, "loss": 2.6712, "step": 5105 }, { "crossentropy": 2.6374599933624268, "epoch": 0.2776584463960412, "grad_norm": 0.0427868515253067, "grad_norm_var": 1.2558524166890867e-05, "learning_rate": 0.006777591088780378, "loss": 2.6375, "step": 5106 }, { "crossentropy": 2.7691997289657593, "epoch": 0.2777128252535414, "grad_norm": 0.03853970021009445, "grad_norm_var": 1.236615737858811e-05, "learning_rate": 0.006776427200785252, "loss": 2.7692, "step": 5107 }, { "crossentropy": 2.679848790168762, "epoch": 0.2777672041110416, "grad_norm": 0.0371575765311718, "grad_norm_var": 1.1969409668058315e-05, "learning_rate": 0.00677526320261777, "loss": 2.6798, "step": 5108 }, { "crossentropy": 2.6261669397354126, "epoch": 0.2778215829685418, "grad_norm": 0.03392992168664932, "grad_norm_var": 1.2332909496967579e-05, "learning_rate": 0.006774099094350118, "loss": 2.6262, "step": 5109 }, { "crossentropy": 2.6317834854125977, "epoch": 0.277875961826042, "grad_norm": 0.037057843059301376, "grad_norm_var": 1.1271479230535026e-05, "learning_rate": 0.006772934876054494, "loss": 2.6318, "step": 5110 }, { "crossentropy": 2.7218148708343506, "epoch": 0.2779303406835422, "grad_norm": 0.03421364724636078, "grad_norm_var": 1.1829269280301941e-05, "learning_rate": 0.006771770547803101, "loss": 2.7218, "step": 5111 }, { "crossentropy": 2.7156851291656494, "epoch": 0.2779847195410424, "grad_norm": 0.03614750877022743, "grad_norm_var": 1.1256660130518476e-05, "learning_rate": 0.0067706061096681515, "loss": 2.7157, "step": 5112 }, { "crossentropy": 2.6509498357772827, "epoch": 0.27803909839854263, "grad_norm": 0.036127131432294846, "grad_norm_var": 1.1427112394974254e-05, "learning_rate": 0.006769441561721863, "loss": 2.6509, "step": 5113 }, { "crossentropy": 2.645459771156311, "epoch": 0.27809347725604283, "grad_norm": 0.04250771924853325, "grad_norm_var": 1.2775225402808914e-05, "learning_rate": 0.006768276904036455, "loss": 2.6455, "step": 5114 }, { "crossentropy": 2.6093744039535522, "epoch": 0.27814785611354303, "grad_norm": 0.04131828248500824, "grad_norm_var": 1.3087556560077939e-05, "learning_rate": 0.006767112136684166, "loss": 2.6094, "step": 5115 }, { "crossentropy": 2.6342562437057495, "epoch": 0.27820223497104324, "grad_norm": 0.037240225821733475, "grad_norm_var": 1.299063825577924e-05, "learning_rate": 0.006765947259737227, "loss": 2.6343, "step": 5116 }, { "crossentropy": 2.605642318725586, "epoch": 0.27825661382854344, "grad_norm": 0.03587789088487625, "grad_norm_var": 1.2656914000697532e-05, "learning_rate": 0.006764782273267888, "loss": 2.6056, "step": 5117 }, { "crossentropy": 2.6503812074661255, "epoch": 0.27831099268604365, "grad_norm": 0.03604750707745552, "grad_norm_var": 1.1961195403321392e-05, "learning_rate": 0.006763617177348393, "loss": 2.6504, "step": 5118 }, { "crossentropy": 2.6535909175872803, "epoch": 0.27836537154354385, "grad_norm": 0.034513045102357864, "grad_norm_var": 1.2833418825919374e-05, "learning_rate": 0.006762451972051009, "loss": 2.6536, "step": 5119 }, { "crossentropy": 2.6812994480133057, "epoch": 0.27841975040104405, "grad_norm": 0.033598802983760834, "grad_norm_var": 1.2673272235233512e-05, "learning_rate": 0.006761286657447996, "loss": 2.6813, "step": 5120 }, { "crossentropy": 2.6870925426483154, "epoch": 0.27847412925854426, "grad_norm": 0.03411233797669411, "grad_norm_var": 8.788009520948638e-06, "learning_rate": 0.006760121233611626, "loss": 2.6871, "step": 5121 }, { "crossentropy": 2.660062789916992, "epoch": 0.27852850811604446, "grad_norm": 0.03822730481624603, "grad_norm_var": 6.537985991101185e-06, "learning_rate": 0.0067589557006141774, "loss": 2.6601, "step": 5122 }, { "crossentropy": 2.7532646656036377, "epoch": 0.27858288697354466, "grad_norm": 0.04089553654193878, "grad_norm_var": 7.474186132511567e-06, "learning_rate": 0.006757790058527937, "loss": 2.7533, "step": 5123 }, { "crossentropy": 2.6172139644622803, "epoch": 0.27863726583104487, "grad_norm": 0.038595233112573624, "grad_norm_var": 7.669843611933918e-06, "learning_rate": 0.006756624307425197, "loss": 2.6172, "step": 5124 }, { "crossentropy": 2.7191667556762695, "epoch": 0.27869164468854507, "grad_norm": 0.03752840310335159, "grad_norm_var": 7.053826156401948e-06, "learning_rate": 0.006755458447378253, "loss": 2.7192, "step": 5125 }, { "crossentropy": 2.7241803407669067, "epoch": 0.2787460235460453, "grad_norm": 0.03583940863609314, "grad_norm_var": 7.157608226295252e-06, "learning_rate": 0.006754292478459415, "loss": 2.7242, "step": 5126 }, { "crossentropy": 2.6499907970428467, "epoch": 0.2788004024035455, "grad_norm": 0.038520220667123795, "grad_norm_var": 6.688467033985566e-06, "learning_rate": 0.006753126400740992, "loss": 2.65, "step": 5127 }, { "crossentropy": 2.7251358032226562, "epoch": 0.2788547812610457, "grad_norm": 0.0431271493434906, "grad_norm_var": 8.643399562065454e-06, "learning_rate": 0.006751960214295303, "loss": 2.7251, "step": 5128 }, { "crossentropy": 2.762390375137329, "epoch": 0.2789091601185459, "grad_norm": 0.04714430868625641, "grad_norm_var": 1.3838616800831923e-05, "learning_rate": 0.006750793919194676, "loss": 2.7624, "step": 5129 }, { "crossentropy": 2.5504848957061768, "epoch": 0.2789635389760461, "grad_norm": 0.044954560697078705, "grad_norm_var": 1.553879321074143e-05, "learning_rate": 0.006749627515511443, "loss": 2.5505, "step": 5130 }, { "crossentropy": 2.738020658493042, "epoch": 0.2790179178335463, "grad_norm": 0.038567472249269485, "grad_norm_var": 1.5013360373242595e-05, "learning_rate": 0.006748461003317941, "loss": 2.738, "step": 5131 }, { "crossentropy": 2.655145049095154, "epoch": 0.2790722966910465, "grad_norm": 0.03321237489581108, "grad_norm_var": 1.6663257950257487e-05, "learning_rate": 0.006747294382686519, "loss": 2.6551, "step": 5132 }, { "crossentropy": 2.6944767236709595, "epoch": 0.2791266755485467, "grad_norm": 0.036562465131282806, "grad_norm_var": 1.6483095147100334e-05, "learning_rate": 0.006746127653689528, "loss": 2.6945, "step": 5133 }, { "crossentropy": 2.635429859161377, "epoch": 0.2791810544060469, "grad_norm": 0.03525971993803978, "grad_norm_var": 1.6749593172546372e-05, "learning_rate": 0.006744960816399327, "loss": 2.6354, "step": 5134 }, { "crossentropy": 2.5538166761398315, "epoch": 0.2792354332635471, "grad_norm": 0.03584709018468857, "grad_norm_var": 1.6211036014166883e-05, "learning_rate": 0.006743793870888284, "loss": 2.5538, "step": 5135 }, { "crossentropy": 2.646332621574402, "epoch": 0.2792898121210473, "grad_norm": 0.03541257232427597, "grad_norm_var": 1.529193453168968e-05, "learning_rate": 0.00674262681722877, "loss": 2.6463, "step": 5136 }, { "crossentropy": 2.714392900466919, "epoch": 0.2793441909785475, "grad_norm": 0.04294499382376671, "grad_norm_var": 1.5162106869405732e-05, "learning_rate": 0.006741459655493166, "loss": 2.7144, "step": 5137 }, { "crossentropy": 2.6591631174087524, "epoch": 0.27939856983604777, "grad_norm": 0.03493855893611908, "grad_norm_var": 1.6139618921597274e-05, "learning_rate": 0.006740292385753858, "loss": 2.6592, "step": 5138 }, { "crossentropy": 2.6383227109909058, "epoch": 0.279452948693548, "grad_norm": 0.03463209420442581, "grad_norm_var": 1.6765822286954334e-05, "learning_rate": 0.0067391250080832375, "loss": 2.6383, "step": 5139 }, { "crossentropy": 2.617570996284485, "epoch": 0.2795073275510482, "grad_norm": 0.03626682609319687, "grad_norm_var": 1.7018569875023163e-05, "learning_rate": 0.006737957522553707, "loss": 2.6176, "step": 5140 }, { "crossentropy": 2.6553750038146973, "epoch": 0.2795617064085484, "grad_norm": 0.0368754044175148, "grad_norm_var": 1.710128989514109e-05, "learning_rate": 0.006736789929237671, "loss": 2.6554, "step": 5141 }, { "crossentropy": 2.7341924905776978, "epoch": 0.2796160852660486, "grad_norm": 0.03575429692864418, "grad_norm_var": 1.712775468539184e-05, "learning_rate": 0.006735622228207541, "loss": 2.7342, "step": 5142 }, { "crossentropy": 2.7541779279708862, "epoch": 0.2796704641235488, "grad_norm": 0.03519771248102188, "grad_norm_var": 1.76431695449466e-05, "learning_rate": 0.00673445441953574, "loss": 2.7542, "step": 5143 }, { "crossentropy": 2.6509737968444824, "epoch": 0.279724842981049, "grad_norm": 0.037307094782590866, "grad_norm_var": 1.5718362072795337e-05, "learning_rate": 0.006733286503294693, "loss": 2.651, "step": 5144 }, { "crossentropy": 2.739877223968506, "epoch": 0.2797792218385492, "grad_norm": 0.04148758575320244, "grad_norm_var": 1.048561188080003e-05, "learning_rate": 0.006732118479556834, "loss": 2.7399, "step": 5145 }, { "crossentropy": 2.683379292488098, "epoch": 0.2798336006960494, "grad_norm": 0.03486635908484459, "grad_norm_var": 6.417491094654754e-06, "learning_rate": 0.0067309503483946, "loss": 2.6834, "step": 5146 }, { "crossentropy": 2.6361517906188965, "epoch": 0.2798879795535496, "grad_norm": 0.03518615663051605, "grad_norm_var": 6.231883139473128e-06, "learning_rate": 0.006729782109880439, "loss": 2.6362, "step": 5147 }, { "crossentropy": 2.671446919441223, "epoch": 0.2799423584110498, "grad_norm": 0.03616846725344658, "grad_norm_var": 5.537629709293351e-06, "learning_rate": 0.0067286137640868065, "loss": 2.6714, "step": 5148 }, { "crossentropy": 2.730422258377075, "epoch": 0.27999673726855, "grad_norm": 0.05718569457530975, "grad_norm_var": 3.2170170029450994e-05, "learning_rate": 0.006727445311086156, "loss": 2.7304, "step": 5149 }, { "crossentropy": 2.7004557847976685, "epoch": 0.2800511161260502, "grad_norm": 0.037146396934986115, "grad_norm_var": 3.1745274161204307e-05, "learning_rate": 0.006726276750950962, "loss": 2.7005, "step": 5150 }, { "crossentropy": 2.7982319593429565, "epoch": 0.2801054949835504, "grad_norm": 0.040955010801553726, "grad_norm_var": 3.194301637578495e-05, "learning_rate": 0.00672510808375369, "loss": 2.7982, "step": 5151 }, { "crossentropy": 2.699378728866577, "epoch": 0.2801598738410506, "grad_norm": 0.03916562721133232, "grad_norm_var": 3.139331405193231e-05, "learning_rate": 0.006723939309566827, "loss": 2.6994, "step": 5152 }, { "crossentropy": 2.644388794898987, "epoch": 0.2802142526985508, "grad_norm": 0.038817934691905975, "grad_norm_var": 3.0014577854337437e-05, "learning_rate": 0.0067227704284628514, "loss": 2.6444, "step": 5153 }, { "crossentropy": 2.768078565597534, "epoch": 0.28026863155605103, "grad_norm": 0.036976948380470276, "grad_norm_var": 2.937509504878684e-05, "learning_rate": 0.006721601440514261, "loss": 2.7681, "step": 5154 }, { "crossentropy": 2.5580694675445557, "epoch": 0.28032301041355123, "grad_norm": 0.037248048931360245, "grad_norm_var": 2.8497519790011507e-05, "learning_rate": 0.0067204323457935545, "loss": 2.5581, "step": 5155 }, { "crossentropy": 2.774612069129944, "epoch": 0.28037738927105144, "grad_norm": 0.038279931992292404, "grad_norm_var": 2.814123270486555e-05, "learning_rate": 0.006719263144373238, "loss": 2.7746, "step": 5156 }, { "crossentropy": 2.7509756088256836, "epoch": 0.28043176812855164, "grad_norm": 0.03447410464286804, "grad_norm_var": 2.9074176658032817e-05, "learning_rate": 0.006718093836325823, "loss": 2.751, "step": 5157 }, { "crossentropy": 2.5613179206848145, "epoch": 0.28048614698605184, "grad_norm": 0.04722227901220322, "grad_norm_var": 3.307471009421544e-05, "learning_rate": 0.006716924421723831, "loss": 2.5613, "step": 5158 }, { "crossentropy": 2.6233338117599487, "epoch": 0.28054052584355205, "grad_norm": 0.034538913518190384, "grad_norm_var": 3.3456061065651775e-05, "learning_rate": 0.006715754900639789, "loss": 2.6233, "step": 5159 }, { "crossentropy": 2.6765729188919067, "epoch": 0.28059490470105225, "grad_norm": 0.14097978174686432, "grad_norm_var": 0.0006791918580611011, "learning_rate": 0.006714585273146224, "loss": 2.6766, "step": 5160 }, { "crossentropy": 2.6281814575195312, "epoch": 0.28064928355855245, "grad_norm": 0.04077118635177612, "grad_norm_var": 0.000679623314762392, "learning_rate": 0.006713415539315682, "loss": 2.6282, "step": 5161 }, { "crossentropy": 2.5747982263565063, "epoch": 0.28070366241605266, "grad_norm": 0.03671953082084656, "grad_norm_var": 0.0006771798721798984, "learning_rate": 0.006712245699220705, "loss": 2.5748, "step": 5162 }, { "crossentropy": 2.6571210622787476, "epoch": 0.28075804127355286, "grad_norm": 0.035916708409786224, "grad_norm_var": 0.000676185235861454, "learning_rate": 0.006711075752933846, "loss": 2.6571, "step": 5163 }, { "crossentropy": 2.7075897455215454, "epoch": 0.28081242013105306, "grad_norm": 0.03652450814843178, "grad_norm_var": 0.0006757366220120666, "learning_rate": 0.006709905700527662, "loss": 2.7076, "step": 5164 }, { "crossentropy": 2.644153118133545, "epoch": 0.28086679898855327, "grad_norm": 0.043155450373888016, "grad_norm_var": 0.0006667547314490188, "learning_rate": 0.006708735542074723, "loss": 2.6442, "step": 5165 }, { "crossentropy": 2.691435694694519, "epoch": 0.28092117784605347, "grad_norm": 0.10964320600032806, "grad_norm_var": 0.000919995786180025, "learning_rate": 0.006707565277647597, "loss": 2.6914, "step": 5166 }, { "crossentropy": 2.672147512435913, "epoch": 0.2809755567035537, "grad_norm": 0.03585037961602211, "grad_norm_var": 0.0009274142502087134, "learning_rate": 0.006706394907318866, "loss": 2.6721, "step": 5167 }, { "crossentropy": 2.7241153717041016, "epoch": 0.2810299355610539, "grad_norm": 0.035754818469285965, "grad_norm_var": 0.0009326787075872991, "learning_rate": 0.006705224431161112, "loss": 2.7241, "step": 5168 }, { "crossentropy": 2.7488105297088623, "epoch": 0.2810843144185541, "grad_norm": 0.037047337740659714, "grad_norm_var": 0.0009352618057050455, "learning_rate": 0.00670405384924693, "loss": 2.7488, "step": 5169 }, { "crossentropy": 2.6394859552383423, "epoch": 0.2811386932760543, "grad_norm": 0.036564670503139496, "grad_norm_var": 0.0009359233881664743, "learning_rate": 0.006702883161648917, "loss": 2.6395, "step": 5170 }, { "crossentropy": 2.652304768562317, "epoch": 0.2811930721335545, "grad_norm": 0.03674807772040367, "grad_norm_var": 0.0009367086423565133, "learning_rate": 0.0067017123684396775, "loss": 2.6523, "step": 5171 }, { "crossentropy": 2.6541190147399902, "epoch": 0.2812474509910547, "grad_norm": 0.03847070038318634, "grad_norm_var": 0.0009364442990322056, "learning_rate": 0.006700541469691824, "loss": 2.6541, "step": 5172 }, { "crossentropy": 2.6783676147460938, "epoch": 0.2813018298485549, "grad_norm": 0.03894810751080513, "grad_norm_var": 0.0009291650606612003, "learning_rate": 0.006699370465477972, "loss": 2.6784, "step": 5173 }, { "crossentropy": 2.6599276065826416, "epoch": 0.2813562087060551, "grad_norm": 0.042105674743652344, "grad_norm_var": 0.0009320505578764293, "learning_rate": 0.006698199355870749, "loss": 2.6599, "step": 5174 }, { "crossentropy": 2.742187261581421, "epoch": 0.2814105875635553, "grad_norm": 0.036771584302186966, "grad_norm_var": 0.0009281364737974473, "learning_rate": 0.006697028140942785, "loss": 2.7422, "step": 5175 }, { "crossentropy": 2.6211187839508057, "epoch": 0.2814649664210555, "grad_norm": 0.03543946146965027, "grad_norm_var": 0.0003281816322741632, "learning_rate": 0.006695856820766719, "loss": 2.6211, "step": 5176 }, { "crossentropy": 2.648835778236389, "epoch": 0.2815193452785557, "grad_norm": 0.03559798747301102, "grad_norm_var": 0.00033089288048054124, "learning_rate": 0.006694685395415194, "loss": 2.6488, "step": 5177 }, { "crossentropy": 2.618270754814148, "epoch": 0.2815737241360559, "grad_norm": 0.03475329652428627, "grad_norm_var": 0.0003325067077146572, "learning_rate": 0.00669351386496086, "loss": 2.6183, "step": 5178 }, { "crossentropy": 2.58356511592865, "epoch": 0.2816281029935561, "grad_norm": 0.03693554177880287, "grad_norm_var": 0.0003317681945917877, "learning_rate": 0.006692342229476375, "loss": 2.5836, "step": 5179 }, { "crossentropy": 2.70199191570282, "epoch": 0.2816824818510563, "grad_norm": 0.0372733473777771, "grad_norm_var": 0.0003312670815003045, "learning_rate": 0.006691170489034403, "loss": 2.702, "step": 5180 }, { "crossentropy": 2.7685364484786987, "epoch": 0.2817368607085565, "grad_norm": 0.03953483700752258, "grad_norm_var": 0.0003315002199231263, "learning_rate": 0.006689998643707612, "loss": 2.7685, "step": 5181 }, { "crossentropy": 2.701617956161499, "epoch": 0.28179123956605673, "grad_norm": 0.04229367896914482, "grad_norm_var": 5.006093038238622e-06, "learning_rate": 0.0066888266935686835, "loss": 2.7016, "step": 5182 }, { "crossentropy": 2.6661477088928223, "epoch": 0.28184561842355693, "grad_norm": 0.034508801996707916, "grad_norm_var": 5.414662227775276e-06, "learning_rate": 0.006687654638690297, "loss": 2.6661, "step": 5183 }, { "crossentropy": 2.683983087539673, "epoch": 0.28189999728105714, "grad_norm": 0.03498917818069458, "grad_norm_var": 5.621468883497809e-06, "learning_rate": 0.0066864824791451425, "loss": 2.684, "step": 5184 }, { "crossentropy": 2.6033636331558228, "epoch": 0.28195437613855734, "grad_norm": 0.03798941895365715, "grad_norm_var": 5.6359198539986e-06, "learning_rate": 0.0066853102150059174, "loss": 2.6034, "step": 5185 }, { "crossentropy": 2.757973313331604, "epoch": 0.28200875499605754, "grad_norm": 0.03694288432598114, "grad_norm_var": 5.6010831045787015e-06, "learning_rate": 0.006684137846345324, "loss": 2.758, "step": 5186 }, { "crossentropy": 2.571847438812256, "epoch": 0.28206313385355775, "grad_norm": 0.040009573101997375, "grad_norm_var": 5.957887442475708e-06, "learning_rate": 0.006682965373236072, "loss": 2.5718, "step": 5187 }, { "crossentropy": 2.751840353012085, "epoch": 0.28211751271105795, "grad_norm": 0.041306182742118835, "grad_norm_var": 6.7667855731513845e-06, "learning_rate": 0.006681792795750875, "loss": 2.7518, "step": 5188 }, { "crossentropy": 2.6022510528564453, "epoch": 0.28217189156855815, "grad_norm": 0.05496887490153313, "grad_norm_var": 2.518077780465089e-05, "learning_rate": 0.006680620113962459, "loss": 2.6023, "step": 5189 }, { "crossentropy": 2.715896964073181, "epoch": 0.28222627042605836, "grad_norm": 0.03897310420870781, "grad_norm_var": 2.442958235117128e-05, "learning_rate": 0.006679447327943549, "loss": 2.7159, "step": 5190 }, { "crossentropy": 2.6162242889404297, "epoch": 0.28228064928355856, "grad_norm": 0.04413875937461853, "grad_norm_var": 2.5983528836372403e-05, "learning_rate": 0.006678274437766882, "loss": 2.6162, "step": 5191 }, { "crossentropy": 2.6334279775619507, "epoch": 0.28233502814105876, "grad_norm": 0.04157271981239319, "grad_norm_var": 2.5338304574497547e-05, "learning_rate": 0.006677101443505198, "loss": 2.6334, "step": 5192 }, { "crossentropy": 2.712804436683655, "epoch": 0.28238940699855897, "grad_norm": 0.03720290958881378, "grad_norm_var": 2.466713321679174e-05, "learning_rate": 0.006675928345231248, "loss": 2.7128, "step": 5193 }, { "crossentropy": 2.6293071508407593, "epoch": 0.28244378585605917, "grad_norm": 0.040287621319293976, "grad_norm_var": 2.3014541011458628e-05, "learning_rate": 0.0066747551430177835, "loss": 2.6293, "step": 5194 }, { "crossentropy": 2.682602286338806, "epoch": 0.2824981647135594, "grad_norm": 0.03605344519019127, "grad_norm_var": 2.341570743725572e-05, "learning_rate": 0.006673581836937566, "loss": 2.6826, "step": 5195 }, { "crossentropy": 2.739699363708496, "epoch": 0.2825525435710596, "grad_norm": 0.034454911947250366, "grad_norm_var": 2.489092456622223e-05, "learning_rate": 0.006672408427063363, "loss": 2.7397, "step": 5196 }, { "crossentropy": 2.6098822355270386, "epoch": 0.2826069224285598, "grad_norm": 0.03380363807082176, "grad_norm_var": 2.7071335394553373e-05, "learning_rate": 0.006671234913467949, "loss": 2.6099, "step": 5197 }, { "crossentropy": 2.6463513374328613, "epoch": 0.28266130128606, "grad_norm": 0.038071148097515106, "grad_norm_var": 2.652472246457791e-05, "learning_rate": 0.006670061296224104, "loss": 2.6464, "step": 5198 }, { "crossentropy": 2.615369200706482, "epoch": 0.2827156801435602, "grad_norm": 0.03465060517191887, "grad_norm_var": 2.6439559239996995e-05, "learning_rate": 0.006668887575404613, "loss": 2.6154, "step": 5199 }, { "crossentropy": 2.606862783432007, "epoch": 0.2827700590010604, "grad_norm": 0.03315097838640213, "grad_norm_var": 2.765544603041403e-05, "learning_rate": 0.006667713751082272, "loss": 2.6069, "step": 5200 }, { "crossentropy": 2.710893392562866, "epoch": 0.2828244378585606, "grad_norm": 0.03479701280593872, "grad_norm_var": 2.871131090071971e-05, "learning_rate": 0.006666539823329879, "loss": 2.7109, "step": 5201 }, { "crossentropy": 2.7745137214660645, "epoch": 0.2828788167160608, "grad_norm": 0.04233500733971596, "grad_norm_var": 2.9212001101208464e-05, "learning_rate": 0.006665365792220239, "loss": 2.7745, "step": 5202 }, { "crossentropy": 2.6981327533721924, "epoch": 0.282933195573561, "grad_norm": 0.042392879724502563, "grad_norm_var": 2.9852544095335804e-05, "learning_rate": 0.006664191657826167, "loss": 2.6981, "step": 5203 }, { "crossentropy": 2.691408634185791, "epoch": 0.2829875744310612, "grad_norm": 0.04849347099661827, "grad_norm_var": 3.504199303996687e-05, "learning_rate": 0.0066630174202204785, "loss": 2.6914, "step": 5204 }, { "crossentropy": 2.570750594139099, "epoch": 0.2830419532885614, "grad_norm": 0.03728146106004715, "grad_norm_var": 1.860753972392585e-05, "learning_rate": 0.006661843079476, "loss": 2.5708, "step": 5205 }, { "crossentropy": 2.6146734952926636, "epoch": 0.2830963321460616, "grad_norm": 0.0339433029294014, "grad_norm_var": 1.9941003558990716e-05, "learning_rate": 0.0066606686356655625, "loss": 2.6147, "step": 5206 }, { "crossentropy": 2.7455538511276245, "epoch": 0.2831507110035618, "grad_norm": 0.037016190588474274, "grad_norm_var": 1.7556663574072073e-05, "learning_rate": 0.006659494088862005, "loss": 2.7456, "step": 5207 }, { "crossentropy": 2.6729965209960938, "epoch": 0.283205089861062, "grad_norm": 0.0977429524064064, "grad_norm_var": 0.0002426742980474729, "learning_rate": 0.0066583194391381705, "loss": 2.673, "step": 5208 }, { "crossentropy": 2.637049913406372, "epoch": 0.2832594687185622, "grad_norm": 0.046461861580610275, "grad_norm_var": 0.00024290663275513153, "learning_rate": 0.006657144686566911, "loss": 2.637, "step": 5209 }, { "crossentropy": 2.6099703311920166, "epoch": 0.2833138475760624, "grad_norm": 0.04558693617582321, "grad_norm_var": 0.00024349884509740973, "learning_rate": 0.006655969831221084, "loss": 2.61, "step": 5210 }, { "crossentropy": 2.6951589584350586, "epoch": 0.28336822643356263, "grad_norm": 0.044784627854824066, "grad_norm_var": 0.00024103251813905688, "learning_rate": 0.006654794873173552, "loss": 2.6952, "step": 5211 }, { "crossentropy": 2.6283726692199707, "epoch": 0.28342260529106283, "grad_norm": 0.040780071169137955, "grad_norm_var": 0.00023648632577091613, "learning_rate": 0.006653619812497184, "loss": 2.6284, "step": 5212 }, { "crossentropy": 2.5844624042510986, "epoch": 0.28347698414856304, "grad_norm": 0.08079437911510468, "grad_norm_var": 0.0003155860839771706, "learning_rate": 0.006652444649264856, "loss": 2.5845, "step": 5213 }, { "crossentropy": 2.666469931602478, "epoch": 0.28353136300606324, "grad_norm": 0.03953211382031441, "grad_norm_var": 0.00031414718768876224, "learning_rate": 0.006651269383549452, "loss": 2.6665, "step": 5214 }, { "crossentropy": 2.7031863927841187, "epoch": 0.28358574186356345, "grad_norm": 0.039713602513074875, "grad_norm_var": 0.00030792975572099975, "learning_rate": 0.006650094015423861, "loss": 2.7032, "step": 5215 }, { "crossentropy": 2.811950445175171, "epoch": 0.28364012072106365, "grad_norm": 0.049275804311037064, "grad_norm_var": 0.00029537187568470693, "learning_rate": 0.006648918544960976, "loss": 2.812, "step": 5216 }, { "crossentropy": 2.628702402114868, "epoch": 0.28369449957856385, "grad_norm": 0.035886213183403015, "grad_norm_var": 0.00029359275339905274, "learning_rate": 0.006647742972233702, "loss": 2.6287, "step": 5217 }, { "crossentropy": 2.6751381158828735, "epoch": 0.28374887843606406, "grad_norm": 0.034417346119880676, "grad_norm_var": 0.0003030967983083721, "learning_rate": 0.006646567297314945, "loss": 2.6751, "step": 5218 }, { "crossentropy": 2.6594579219818115, "epoch": 0.28380325729356426, "grad_norm": 0.037583015859127045, "grad_norm_var": 0.0003075816403105352, "learning_rate": 0.0066453915202776175, "loss": 2.6595, "step": 5219 }, { "crossentropy": 2.620916724205017, "epoch": 0.28385763615106446, "grad_norm": 0.04348253458738327, "grad_norm_var": 0.0003080401341773846, "learning_rate": 0.006644215641194643, "loss": 2.6209, "step": 5220 }, { "crossentropy": 2.684292793273926, "epoch": 0.28391201500856467, "grad_norm": 0.03738999739289284, "grad_norm_var": 0.00030790720881358204, "learning_rate": 0.006643039660138948, "loss": 2.6843, "step": 5221 }, { "crossentropy": 2.720747709274292, "epoch": 0.28396639386606487, "grad_norm": 0.04990020766854286, "grad_norm_var": 0.0002970536789541608, "learning_rate": 0.006641863577183465, "loss": 2.7207, "step": 5222 }, { "crossentropy": 2.6774951219558716, "epoch": 0.2840207727235651, "grad_norm": 0.038493603467941284, "grad_norm_var": 0.00029512062937769954, "learning_rate": 0.006640687392401132, "loss": 2.6775, "step": 5223 }, { "crossentropy": 2.65605628490448, "epoch": 0.2840751515810653, "grad_norm": 0.04065161570906639, "grad_norm_var": 0.00011724450762653655, "learning_rate": 0.006639511105864897, "loss": 2.6561, "step": 5224 }, { "crossentropy": 2.742418050765991, "epoch": 0.2841295304385655, "grad_norm": 0.048162929713726044, "grad_norm_var": 0.00011797332836524135, "learning_rate": 0.006638334717647713, "loss": 2.7424, "step": 5225 }, { "crossentropy": 2.6299303770065308, "epoch": 0.2841839092960657, "grad_norm": 0.0450478196144104, "grad_norm_var": 0.00011788836087288779, "learning_rate": 0.0066371582278225375, "loss": 2.6299, "step": 5226 }, { "crossentropy": 2.7530099153518677, "epoch": 0.2842382881535659, "grad_norm": 0.03934602066874504, "grad_norm_var": 0.00011925396750548379, "learning_rate": 0.0066359816364623325, "loss": 2.753, "step": 5227 }, { "crossentropy": 2.5879493951797485, "epoch": 0.2842926670110661, "grad_norm": 0.037752293050289154, "grad_norm_var": 0.00012103744165518019, "learning_rate": 0.006634804943640073, "loss": 2.5879, "step": 5228 }, { "crossentropy": 2.5352890491485596, "epoch": 0.2843470458685663, "grad_norm": 0.03839624300599098, "grad_norm_var": 2.3064341670995385e-05, "learning_rate": 0.006633628149428737, "loss": 2.5353, "step": 5229 }, { "crossentropy": 2.631226897239685, "epoch": 0.2844014247260665, "grad_norm": 0.04184648394584656, "grad_norm_var": 2.2964828314534352e-05, "learning_rate": 0.006632451253901304, "loss": 2.6312, "step": 5230 }, { "crossentropy": 2.6263991594314575, "epoch": 0.2844558035835667, "grad_norm": 0.052674539387226105, "grad_norm_var": 3.109554112164052e-05, "learning_rate": 0.006631274257130766, "loss": 2.6264, "step": 5231 }, { "crossentropy": 2.524649143218994, "epoch": 0.2845101824410669, "grad_norm": 0.03954377397894859, "grad_norm_var": 2.743662424257882e-05, "learning_rate": 0.006630097159190119, "loss": 2.5246, "step": 5232 }, { "crossentropy": 2.5474523305892944, "epoch": 0.2845645612985671, "grad_norm": 0.034932661801576614, "grad_norm_var": 2.8179972070232057e-05, "learning_rate": 0.006628919960152369, "loss": 2.5475, "step": 5233 }, { "crossentropy": 2.6723402738571167, "epoch": 0.2846189401560673, "grad_norm": 0.036341145634651184, "grad_norm_var": 2.666473864869062e-05, "learning_rate": 0.006627742660090518, "loss": 2.6723, "step": 5234 }, { "crossentropy": 2.707667827606201, "epoch": 0.2846733190135675, "grad_norm": 0.04077916592359543, "grad_norm_var": 2.5699354672362117e-05, "learning_rate": 0.006626565259077587, "loss": 2.7077, "step": 5235 }, { "crossentropy": 2.700133442878723, "epoch": 0.2847276978710677, "grad_norm": 0.03601415082812309, "grad_norm_var": 2.7257344187831915e-05, "learning_rate": 0.006625387757186594, "loss": 2.7001, "step": 5236 }, { "crossentropy": 2.6410369873046875, "epoch": 0.2847820767285679, "grad_norm": 0.03335736691951752, "grad_norm_var": 3.0257534627547004e-05, "learning_rate": 0.006624210154490569, "loss": 2.641, "step": 5237 }, { "crossentropy": 2.6848409175872803, "epoch": 0.2848364555860681, "grad_norm": 0.037173930555582047, "grad_norm_var": 2.49850143666648e-05, "learning_rate": 0.006623032451062542, "loss": 2.6848, "step": 5238 }, { "crossentropy": 2.681291103363037, "epoch": 0.28489083444356833, "grad_norm": 0.03951010853052139, "grad_norm_var": 2.4841074702197595e-05, "learning_rate": 0.006621854646975559, "loss": 2.6813, "step": 5239 }, { "crossentropy": 2.6633304357528687, "epoch": 0.28494521330106853, "grad_norm": 0.052494656294584274, "grad_norm_var": 3.4485100150198565e-05, "learning_rate": 0.00662067674230266, "loss": 2.6633, "step": 5240 }, { "crossentropy": 2.6287580728530884, "epoch": 0.28499959215856874, "grad_norm": 0.03577607125043869, "grad_norm_var": 3.197344296576628e-05, "learning_rate": 0.006619498737116903, "loss": 2.6288, "step": 5241 }, { "crossentropy": 2.612077474594116, "epoch": 0.28505397101606894, "grad_norm": 0.03478910028934479, "grad_norm_var": 3.173079967533071e-05, "learning_rate": 0.0066183206314913414, "loss": 2.6121, "step": 5242 }, { "crossentropy": 2.7649800777435303, "epoch": 0.28510834987356914, "grad_norm": 0.03650084510445595, "grad_norm_var": 3.2264986084370945e-05, "learning_rate": 0.006617142425499045, "loss": 2.765, "step": 5243 }, { "crossentropy": 2.6789186000823975, "epoch": 0.28516272873106935, "grad_norm": 0.03490472212433815, "grad_norm_var": 3.333763353388248e-05, "learning_rate": 0.006615964119213081, "loss": 2.6789, "step": 5244 }, { "crossentropy": 2.641705632209778, "epoch": 0.28521710758856955, "grad_norm": 0.035477183759212494, "grad_norm_var": 3.413035325435145e-05, "learning_rate": 0.006614785712706531, "loss": 2.6417, "step": 5245 }, { "crossentropy": 2.6854015588760376, "epoch": 0.28527148644606976, "grad_norm": 0.035070840269327164, "grad_norm_var": 3.432173643510931e-05, "learning_rate": 0.006613607206052475, "loss": 2.6854, "step": 5246 }, { "crossentropy": 2.6463104486465454, "epoch": 0.28532586530356996, "grad_norm": 0.03298662230372429, "grad_norm_var": 2.12304217725713e-05, "learning_rate": 0.006612428599324005, "loss": 2.6463, "step": 5247 }, { "crossentropy": 2.727044105529785, "epoch": 0.28538024416107016, "grad_norm": 0.03782964497804642, "grad_norm_var": 2.0884852357674898e-05, "learning_rate": 0.006611249892594217, "loss": 2.727, "step": 5248 }, { "crossentropy": 2.7729363441467285, "epoch": 0.28543462301857037, "grad_norm": 0.049167700111866, "grad_norm_var": 2.9395882068941558e-05, "learning_rate": 0.006610071085936212, "loss": 2.7729, "step": 5249 }, { "crossentropy": 2.6325442790985107, "epoch": 0.28548900187607057, "grad_norm": 0.03916456922888756, "grad_norm_var": 2.926555171196462e-05, "learning_rate": 0.0066088921794231, "loss": 2.6325, "step": 5250 }, { "crossentropy": 2.6389414072036743, "epoch": 0.2855433807335708, "grad_norm": 0.03389490395784378, "grad_norm_var": 2.9848533299418426e-05, "learning_rate": 0.006607713173127994, "loss": 2.6389, "step": 5251 }, { "crossentropy": 2.706786036491394, "epoch": 0.285597759591071, "grad_norm": 0.03917022794485092, "grad_norm_var": 2.973766491606618e-05, "learning_rate": 0.006606534067124017, "loss": 2.7068, "step": 5252 }, { "crossentropy": 2.751317024230957, "epoch": 0.2856521384485712, "grad_norm": 0.03530050069093704, "grad_norm_var": 2.878266130870253e-05, "learning_rate": 0.006605354861484295, "loss": 2.7513, "step": 5253 }, { "crossentropy": 2.6530662775039673, "epoch": 0.2857065173060714, "grad_norm": 0.04227021709084511, "grad_norm_var": 2.979314495544592e-05, "learning_rate": 0.006604175556281962, "loss": 2.6531, "step": 5254 }, { "crossentropy": 2.5560895204544067, "epoch": 0.2857608961635716, "grad_norm": 0.04014172405004501, "grad_norm_var": 2.9912051519153694e-05, "learning_rate": 0.0066029961515901535, "loss": 2.5561, "step": 5255 }, { "crossentropy": 2.7914414405822754, "epoch": 0.2858152750210718, "grad_norm": 0.037779174745082855, "grad_norm_var": 1.5857680190062285e-05, "learning_rate": 0.006601816647482021, "loss": 2.7914, "step": 5256 }, { "crossentropy": 2.647158145904541, "epoch": 0.285869653878572, "grad_norm": 0.039631400257349014, "grad_norm_var": 1.5893279644620985e-05, "learning_rate": 0.006600637044030712, "loss": 2.6472, "step": 5257 }, { "crossentropy": 2.6406219005584717, "epoch": 0.2859240327360722, "grad_norm": 0.03848422318696976, "grad_norm_var": 1.528542113375473e-05, "learning_rate": 0.0065994573413093865, "loss": 2.6406, "step": 5258 }, { "crossentropy": 2.712322235107422, "epoch": 0.2859784115935724, "grad_norm": 0.0368531197309494, "grad_norm_var": 1.5223423974561334e-05, "learning_rate": 0.006598277539391207, "loss": 2.7123, "step": 5259 }, { "crossentropy": 2.595384120941162, "epoch": 0.2860327904510726, "grad_norm": 0.03499678522348404, "grad_norm_var": 1.5185861656865675e-05, "learning_rate": 0.006597097638349345, "loss": 2.5954, "step": 5260 }, { "crossentropy": 2.7083592414855957, "epoch": 0.2860871693085728, "grad_norm": 0.034682389348745346, "grad_norm_var": 1.5494141581499228e-05, "learning_rate": 0.0065959176382569765, "loss": 2.7084, "step": 5261 }, { "crossentropy": 2.5966092348098755, "epoch": 0.286141548166073, "grad_norm": 0.03477643430233002, "grad_norm_var": 1.5613127331965716e-05, "learning_rate": 0.006594737539187283, "loss": 2.5966, "step": 5262 }, { "crossentropy": 2.636875629425049, "epoch": 0.2861959270235732, "grad_norm": 0.03756242245435715, "grad_norm_var": 1.389624215201076e-05, "learning_rate": 0.006593557341213457, "loss": 2.6369, "step": 5263 }, { "crossentropy": 2.6471885442733765, "epoch": 0.2862503058810734, "grad_norm": 0.038063935935497284, "grad_norm_var": 1.3887116645202221e-05, "learning_rate": 0.006592377044408688, "loss": 2.6472, "step": 5264 }, { "crossentropy": 2.5806528329849243, "epoch": 0.2863046847385736, "grad_norm": 0.03935406357049942, "grad_norm_var": 5.615758553934036e-06, "learning_rate": 0.006591196648846179, "loss": 2.5807, "step": 5265 }, { "crossentropy": 2.7088980674743652, "epoch": 0.2863590635960738, "grad_norm": 0.038740742951631546, "grad_norm_var": 5.540429370731505e-06, "learning_rate": 0.006590016154599138, "loss": 2.7089, "step": 5266 }, { "crossentropy": 2.636008143424988, "epoch": 0.28641344245357403, "grad_norm": 0.03658527880907059, "grad_norm_var": 4.661439275047835e-06, "learning_rate": 0.006588835561740778, "loss": 2.636, "step": 5267 }, { "crossentropy": 2.6132514476776123, "epoch": 0.28646782131107423, "grad_norm": 0.0337572880089283, "grad_norm_var": 5.485380874814943e-06, "learning_rate": 0.0065876548703443175, "loss": 2.6133, "step": 5268 }, { "crossentropy": 2.551379919052124, "epoch": 0.28652220016857444, "grad_norm": 0.03630850091576576, "grad_norm_var": 5.261842648985696e-06, "learning_rate": 0.0065864740804829815, "loss": 2.5514, "step": 5269 }, { "crossentropy": 2.743395209312439, "epoch": 0.28657657902607464, "grad_norm": 0.04065195098519325, "grad_norm_var": 4.3960867742174704e-06, "learning_rate": 0.006585293192230002, "loss": 2.7434, "step": 5270 }, { "crossentropy": 2.56011426448822, "epoch": 0.28663095788357484, "grad_norm": 0.04285961762070656, "grad_norm_var": 5.852025036655733e-06, "learning_rate": 0.0065841122056586165, "loss": 2.5601, "step": 5271 }, { "crossentropy": 2.606668710708618, "epoch": 0.28668533674107505, "grad_norm": 0.04207657650113106, "grad_norm_var": 7.127278328957163e-06, "learning_rate": 0.006582931120842071, "loss": 2.6067, "step": 5272 }, { "crossentropy": 2.697809100151062, "epoch": 0.28673971559857525, "grad_norm": 0.04050345718860626, "grad_norm_var": 7.3835039257998e-06, "learning_rate": 0.00658174993785361, "loss": 2.6978, "step": 5273 }, { "crossentropy": 2.827061653137207, "epoch": 0.28679409445607545, "grad_norm": 0.03632644563913345, "grad_norm_var": 7.503845829233818e-06, "learning_rate": 0.006580568656766495, "loss": 2.8271, "step": 5274 }, { "crossentropy": 2.7656127214431763, "epoch": 0.28684847331357566, "grad_norm": 0.03676990047097206, "grad_norm_var": 7.514299026791191e-06, "learning_rate": 0.006579387277653986, "loss": 2.7656, "step": 5275 }, { "crossentropy": 2.749284029006958, "epoch": 0.28690285217107586, "grad_norm": 0.04208453372120857, "grad_norm_var": 8.05124838836781e-06, "learning_rate": 0.006578205800589348, "loss": 2.7493, "step": 5276 }, { "crossentropy": 2.610278367996216, "epoch": 0.28695723102857607, "grad_norm": 0.0378667488694191, "grad_norm_var": 7.194055626183471e-06, "learning_rate": 0.006577024225645858, "loss": 2.6103, "step": 5277 }, { "crossentropy": 2.6212782859802246, "epoch": 0.28701160988607627, "grad_norm": 0.03379913046956062, "grad_norm_var": 7.725014437908774e-06, "learning_rate": 0.006575842552896798, "loss": 2.6213, "step": 5278 }, { "crossentropy": 2.7363834381103516, "epoch": 0.2870659887435765, "grad_norm": 0.03565467894077301, "grad_norm_var": 8.14821412884418e-06, "learning_rate": 0.0065746607824154505, "loss": 2.7364, "step": 5279 }, { "crossentropy": 2.6202831268310547, "epoch": 0.2871203676010767, "grad_norm": 0.034996990114450455, "grad_norm_var": 8.796923500825384e-06, "learning_rate": 0.0065734789142751085, "loss": 2.6203, "step": 5280 }, { "crossentropy": 2.715228796005249, "epoch": 0.2871747464585769, "grad_norm": 0.03700046241283417, "grad_norm_var": 8.724803205401032e-06, "learning_rate": 0.006572296948549073, "loss": 2.7152, "step": 5281 }, { "crossentropy": 2.6432883739471436, "epoch": 0.2872291253160771, "grad_norm": 0.039701323956251144, "grad_norm_var": 8.893496768341604e-06, "learning_rate": 0.006571114885310645, "loss": 2.6433, "step": 5282 }, { "crossentropy": 2.706163763999939, "epoch": 0.2872835041735773, "grad_norm": 0.038016337901353836, "grad_norm_var": 8.764159072835395e-06, "learning_rate": 0.006569932724633136, "loss": 2.7062, "step": 5283 }, { "crossentropy": 2.5303395986557007, "epoch": 0.2873378830310775, "grad_norm": 0.035056374967098236, "grad_norm_var": 8.13070063528353e-06, "learning_rate": 0.006568750466589865, "loss": 2.5303, "step": 5284 }, { "crossentropy": 2.798384666442871, "epoch": 0.2873922618885777, "grad_norm": 0.0366252101957798, "grad_norm_var": 8.061125686168597e-06, "learning_rate": 0.006567568111254152, "loss": 2.7984, "step": 5285 }, { "crossentropy": 2.609755754470825, "epoch": 0.2874466407460779, "grad_norm": 0.04592229798436165, "grad_norm_var": 1.1573332284978838e-05, "learning_rate": 0.006566385658699326, "loss": 2.6098, "step": 5286 }, { "crossentropy": 2.626213788986206, "epoch": 0.28750101960357816, "grad_norm": 0.034865912050008774, "grad_norm_var": 1.0871151651811739e-05, "learning_rate": 0.006565203108998722, "loss": 2.6262, "step": 5287 }, { "crossentropy": 2.6793678998947144, "epoch": 0.28755539846107836, "grad_norm": 0.03541836142539978, "grad_norm_var": 9.982156231387075e-06, "learning_rate": 0.006564020462225679, "loss": 2.6794, "step": 5288 }, { "crossentropy": 2.6555811166763306, "epoch": 0.28760977731857856, "grad_norm": 0.04482831805944443, "grad_norm_var": 1.2861201885817387e-05, "learning_rate": 0.006562837718453549, "loss": 2.6556, "step": 5289 }, { "crossentropy": 2.6858125925064087, "epoch": 0.28766415617607877, "grad_norm": 0.03834802284836769, "grad_norm_var": 1.2717197059029332e-05, "learning_rate": 0.006561654877755676, "loss": 2.6858, "step": 5290 }, { "crossentropy": 2.7338513135910034, "epoch": 0.28771853503357897, "grad_norm": 0.0339093878865242, "grad_norm_var": 1.3672847621229387e-05, "learning_rate": 0.006560471940205426, "loss": 2.7339, "step": 5291 }, { "crossentropy": 2.5989134311676025, "epoch": 0.2877729138910792, "grad_norm": 0.03506356105208397, "grad_norm_var": 1.2701545625316776e-05, "learning_rate": 0.0065592889058761605, "loss": 2.5989, "step": 5292 }, { "crossentropy": 2.794486880302429, "epoch": 0.2878272927485794, "grad_norm": 0.03453616052865982, "grad_norm_var": 1.3150746231928243e-05, "learning_rate": 0.006558105774841251, "loss": 2.7945, "step": 5293 }, { "crossentropy": 2.6175934076309204, "epoch": 0.2878816716060796, "grad_norm": 0.037695493549108505, "grad_norm_var": 1.2380119639738134e-05, "learning_rate": 0.006556922547174073, "loss": 2.6176, "step": 5294 }, { "crossentropy": 2.7711888551712036, "epoch": 0.2879360504635798, "grad_norm": 0.03797275573015213, "grad_norm_var": 1.2191226228578228e-05, "learning_rate": 0.006555739222948012, "loss": 2.7712, "step": 5295 }, { "crossentropy": 2.657811164855957, "epoch": 0.28799042932108, "grad_norm": 0.05725950747728348, "grad_norm_var": 3.574566556010424e-05, "learning_rate": 0.006554555802236454, "loss": 2.6578, "step": 5296 }, { "crossentropy": 2.6895776987075806, "epoch": 0.2880448081785802, "grad_norm": 0.04118265584111214, "grad_norm_var": 3.578589691187317e-05, "learning_rate": 0.006553372285112793, "loss": 2.6896, "step": 5297 }, { "crossentropy": 2.636578679084778, "epoch": 0.2880991870360804, "grad_norm": 0.03905220329761505, "grad_norm_var": 3.5764524089983385e-05, "learning_rate": 0.006552188671650433, "loss": 2.6366, "step": 5298 }, { "crossentropy": 2.7564141750335693, "epoch": 0.2881535658935806, "grad_norm": 0.03675542026758194, "grad_norm_var": 3.604768454479364e-05, "learning_rate": 0.0065510049619227784, "loss": 2.7564, "step": 5299 }, { "crossentropy": 2.730988383293152, "epoch": 0.2882079447510808, "grad_norm": 0.04798349738121033, "grad_norm_var": 3.964183117903571e-05, "learning_rate": 0.0065498211560032415, "loss": 2.731, "step": 5300 }, { "crossentropy": 2.6941936016082764, "epoch": 0.288262323608581, "grad_norm": 0.03577513247728348, "grad_norm_var": 4.00512214203672e-05, "learning_rate": 0.006548637253965242, "loss": 2.6942, "step": 5301 }, { "crossentropy": 2.703581690788269, "epoch": 0.2883167024660812, "grad_norm": 0.03435070440173149, "grad_norm_var": 3.8951811254736565e-05, "learning_rate": 0.006547453255882203, "loss": 2.7036, "step": 5302 }, { "crossentropy": 2.6805187463760376, "epoch": 0.2883710813235814, "grad_norm": 0.03544914722442627, "grad_norm_var": 3.8646739221048254e-05, "learning_rate": 0.006546269161827556, "loss": 2.6805, "step": 5303 }, { "crossentropy": 2.7583900690078735, "epoch": 0.2884254601810816, "grad_norm": 0.03756679967045784, "grad_norm_var": 3.788094166081516e-05, "learning_rate": 0.006545084971874737, "loss": 2.7584, "step": 5304 }, { "crossentropy": 2.638530731201172, "epoch": 0.2884798390385818, "grad_norm": 0.03702158480882645, "grad_norm_var": 3.586590528450613e-05, "learning_rate": 0.0065439006860971895, "loss": 2.6385, "step": 5305 }, { "crossentropy": 2.6733975410461426, "epoch": 0.288534217896082, "grad_norm": 0.0347273126244545, "grad_norm_var": 3.6876958307617035e-05, "learning_rate": 0.006542716304568359, "loss": 2.6734, "step": 5306 }, { "crossentropy": 2.6961745023727417, "epoch": 0.2885885967535822, "grad_norm": 0.034895703196525574, "grad_norm_var": 3.6331577285398685e-05, "learning_rate": 0.006541531827361704, "loss": 2.6962, "step": 5307 }, { "crossentropy": 2.6255427598953247, "epoch": 0.28864297561108243, "grad_norm": 0.03442496433854103, "grad_norm_var": 3.6656517314441416e-05, "learning_rate": 0.00654034725455068, "loss": 2.6255, "step": 5308 }, { "crossentropy": 2.5411020517349243, "epoch": 0.28869735446858263, "grad_norm": 0.035541899502277374, "grad_norm_var": 3.618275196864551e-05, "learning_rate": 0.006539162586208756, "loss": 2.5411, "step": 5309 }, { "crossentropy": 2.593411684036255, "epoch": 0.28875173332608284, "grad_norm": 0.034555524587631226, "grad_norm_var": 3.717908118621808e-05, "learning_rate": 0.006537977822409405, "loss": 2.5934, "step": 5310 }, { "crossentropy": 2.728251099586487, "epoch": 0.28880611218358304, "grad_norm": 0.04065742343664169, "grad_norm_var": 3.7474043091327104e-05, "learning_rate": 0.006536792963226102, "loss": 2.7283, "step": 5311 }, { "crossentropy": 2.6106969118118286, "epoch": 0.28886049104108324, "grad_norm": 0.038653723895549774, "grad_norm_var": 1.2757924473326872e-05, "learning_rate": 0.006535608008732331, "loss": 2.6107, "step": 5312 }, { "crossentropy": 2.5881232023239136, "epoch": 0.28891486989858345, "grad_norm": 0.037455130368471146, "grad_norm_var": 1.1752351310070907e-05, "learning_rate": 0.0065344229590015845, "loss": 2.5881, "step": 5313 }, { "crossentropy": 2.701719641685486, "epoch": 0.28896924875608365, "grad_norm": 0.03566397726535797, "grad_norm_var": 1.1623672526553385e-05, "learning_rate": 0.006533237814107357, "loss": 2.7017, "step": 5314 }, { "crossentropy": 2.579287052154541, "epoch": 0.28902362761358386, "grad_norm": 0.03707708790898323, "grad_norm_var": 1.1621049021741786e-05, "learning_rate": 0.006532052574123148, "loss": 2.5793, "step": 5315 }, { "crossentropy": 2.6439173221588135, "epoch": 0.28907800647108406, "grad_norm": 0.03715958446264267, "grad_norm_var": 3.0740365542869866e-06, "learning_rate": 0.006530867239122468, "loss": 2.6439, "step": 5316 }, { "crossentropy": 2.666656017303467, "epoch": 0.28913238532858426, "grad_norm": 0.037679847329854965, "grad_norm_var": 3.1646975029588074e-06, "learning_rate": 0.006529681809178829, "loss": 2.6667, "step": 5317 }, { "crossentropy": 2.6382079124450684, "epoch": 0.28918676418608447, "grad_norm": 0.03390156105160713, "grad_norm_var": 3.3018274044273783e-06, "learning_rate": 0.006528496284365749, "loss": 2.6382, "step": 5318 }, { "crossentropy": 2.664350390434265, "epoch": 0.28924114304358467, "grad_norm": 0.034457430243492126, "grad_norm_var": 3.4892849996600213e-06, "learning_rate": 0.006527310664756756, "loss": 2.6644, "step": 5319 }, { "crossentropy": 2.587544322013855, "epoch": 0.2892955219010849, "grad_norm": 0.034360434859991074, "grad_norm_var": 3.6073459182085764e-06, "learning_rate": 0.006526124950425379, "loss": 2.5875, "step": 5320 }, { "crossentropy": 2.6805520057678223, "epoch": 0.2893499007585851, "grad_norm": 0.03429919481277466, "grad_norm_var": 3.750402020306446e-06, "learning_rate": 0.0065249391414451555, "loss": 2.6806, "step": 5321 }, { "crossentropy": 2.6547054052352905, "epoch": 0.2894042796160853, "grad_norm": 0.03807663172483444, "grad_norm_var": 3.896825748551473e-06, "learning_rate": 0.006523753237889627, "loss": 2.6547, "step": 5322 }, { "crossentropy": 2.615403890609741, "epoch": 0.2894586584735855, "grad_norm": 0.036293886601924896, "grad_norm_var": 3.7798153645257165e-06, "learning_rate": 0.006522567239832344, "loss": 2.6154, "step": 5323 }, { "crossentropy": 2.634695053100586, "epoch": 0.2895130373310857, "grad_norm": 0.03422674536705017, "grad_norm_var": 3.830931932918314e-06, "learning_rate": 0.006521381147346862, "loss": 2.6347, "step": 5324 }, { "crossentropy": 2.6899237632751465, "epoch": 0.2895674161885859, "grad_norm": 0.03729363530874252, "grad_norm_var": 3.856453660400159e-06, "learning_rate": 0.006520194960506737, "loss": 2.6899, "step": 5325 }, { "crossentropy": 2.544209122657776, "epoch": 0.2896217950460861, "grad_norm": 0.03381732106208801, "grad_norm_var": 4.068440812500923e-06, "learning_rate": 0.00651900867938554, "loss": 2.5442, "step": 5326 }, { "crossentropy": 2.6131783723831177, "epoch": 0.2896761739035863, "grad_norm": 0.03637731820344925, "grad_norm_var": 2.736458865609283e-06, "learning_rate": 0.00651782230405684, "loss": 2.6132, "step": 5327 }, { "crossentropy": 2.578842878341675, "epoch": 0.2897305527610865, "grad_norm": 0.03419126942753792, "grad_norm_var": 2.4316113701766675e-06, "learning_rate": 0.006516635834594216, "loss": 2.5788, "step": 5328 }, { "crossentropy": 2.724477529525757, "epoch": 0.2897849316185867, "grad_norm": 0.03355970233678818, "grad_norm_var": 2.50512719753241e-06, "learning_rate": 0.00651544927107125, "loss": 2.7245, "step": 5329 }, { "crossentropy": 2.661424160003662, "epoch": 0.2898393104760869, "grad_norm": 0.0370650552213192, "grad_norm_var": 2.6533623305050974e-06, "learning_rate": 0.006514262613561535, "loss": 2.6614, "step": 5330 }, { "crossentropy": 2.6862618923187256, "epoch": 0.2898936893335871, "grad_norm": 0.0386737659573555, "grad_norm_var": 3.124006948934145e-06, "learning_rate": 0.0065130758621386635, "loss": 2.6863, "step": 5331 }, { "crossentropy": 2.650704860687256, "epoch": 0.2899480681910873, "grad_norm": 0.04500770568847656, "grad_norm_var": 8.485639098516385e-06, "learning_rate": 0.006511889016876236, "loss": 2.6507, "step": 5332 }, { "crossentropy": 2.613181948661804, "epoch": 0.2900024470485875, "grad_norm": 0.03648662567138672, "grad_norm_var": 8.339997548933998e-06, "learning_rate": 0.006510702077847863, "loss": 2.6132, "step": 5333 }, { "crossentropy": 2.7565985918045044, "epoch": 0.2900568259060877, "grad_norm": 0.035720255225896835, "grad_norm_var": 8.006220178866792e-06, "learning_rate": 0.006509515045127157, "loss": 2.7566, "step": 5334 }, { "crossentropy": 2.5931674242019653, "epoch": 0.2901112047635879, "grad_norm": 0.038839299231767654, "grad_norm_var": 8.162357987170388e-06, "learning_rate": 0.006508327918787733, "loss": 2.5932, "step": 5335 }, { "crossentropy": 2.6792962551116943, "epoch": 0.29016558362108813, "grad_norm": 0.03504031524062157, "grad_norm_var": 7.995658195471208e-06, "learning_rate": 0.006507140698903218, "loss": 2.6793, "step": 5336 }, { "crossentropy": 2.6417551040649414, "epoch": 0.29021996247858833, "grad_norm": 0.036937300115823746, "grad_norm_var": 7.635209024689505e-06, "learning_rate": 0.006505953385547243, "loss": 2.6418, "step": 5337 }, { "crossentropy": 2.7236762046813965, "epoch": 0.29027434133608854, "grad_norm": 0.03943869471549988, "grad_norm_var": 7.996550121195518e-06, "learning_rate": 0.006504765978793442, "loss": 2.7237, "step": 5338 }, { "crossentropy": 2.7586121559143066, "epoch": 0.29032872019358874, "grad_norm": 0.03580925613641739, "grad_norm_var": 8.044615117467485e-06, "learning_rate": 0.006503578478715458, "loss": 2.7586, "step": 5339 }, { "crossentropy": 2.676964282989502, "epoch": 0.29038309905108894, "grad_norm": 0.03572465106844902, "grad_norm_var": 7.67485651742324e-06, "learning_rate": 0.006502390885386939, "loss": 2.677, "step": 5340 }, { "crossentropy": 2.650404453277588, "epoch": 0.29043747790858915, "grad_norm": 0.03675396367907524, "grad_norm_var": 7.662855762465235e-06, "learning_rate": 0.006501203198881539, "loss": 2.6504, "step": 5341 }, { "crossentropy": 2.6506993770599365, "epoch": 0.29049185676608935, "grad_norm": 0.06508085876703262, "grad_norm_var": 5.6150302475561446e-05, "learning_rate": 0.006500015419272915, "loss": 2.6507, "step": 5342 }, { "crossentropy": 2.4850131273269653, "epoch": 0.29054623562358955, "grad_norm": 0.035934075713157654, "grad_norm_var": 5.6305412466887275e-05, "learning_rate": 0.006498827546634733, "loss": 2.485, "step": 5343 }, { "crossentropy": 2.56263530254364, "epoch": 0.29060061448108976, "grad_norm": 0.0431167297065258, "grad_norm_var": 5.583968706206384e-05, "learning_rate": 0.006497639581040664, "loss": 2.5626, "step": 5344 }, { "crossentropy": 2.7222094535827637, "epoch": 0.29065499333858996, "grad_norm": 0.04030391573905945, "grad_norm_var": 5.349880337994194e-05, "learning_rate": 0.006496451522564384, "loss": 2.7222, "step": 5345 }, { "crossentropy": 2.70354962348938, "epoch": 0.29070937219609017, "grad_norm": 0.04045752063393593, "grad_norm_var": 5.300553625880629e-05, "learning_rate": 0.006495263371279576, "loss": 2.7035, "step": 5346 }, { "crossentropy": 2.5217570066452026, "epoch": 0.29076375105359037, "grad_norm": 0.037492942065000534, "grad_norm_var": 5.329484654711201e-05, "learning_rate": 0.006494075127259928, "loss": 2.5218, "step": 5347 }, { "crossentropy": 2.680072784423828, "epoch": 0.2908181299110906, "grad_norm": 0.03566782921552658, "grad_norm_var": 5.236629874972674e-05, "learning_rate": 0.006492886790579133, "loss": 2.6801, "step": 5348 }, { "crossentropy": 2.652519106864929, "epoch": 0.2908725087685908, "grad_norm": 0.03509059548377991, "grad_norm_var": 5.301182832718644e-05, "learning_rate": 0.006491698361310892, "loss": 2.6525, "step": 5349 }, { "crossentropy": 2.7474085092544556, "epoch": 0.290926887626091, "grad_norm": 0.03375244140625, "grad_norm_var": 5.417025937823154e-05, "learning_rate": 0.006490509839528908, "loss": 2.7474, "step": 5350 }, { "crossentropy": 2.6811704635620117, "epoch": 0.2909812664835912, "grad_norm": 0.03673188015818596, "grad_norm_var": 5.451828636608586e-05, "learning_rate": 0.006489321225306895, "loss": 2.6812, "step": 5351 }, { "crossentropy": 2.6485589742660522, "epoch": 0.2910356453410914, "grad_norm": 0.03496411442756653, "grad_norm_var": 5.455845653407615e-05, "learning_rate": 0.006488132518718568, "loss": 2.6486, "step": 5352 }, { "crossentropy": 2.582337975502014, "epoch": 0.2910900241985916, "grad_norm": 0.07160519063472748, "grad_norm_var": 0.0001203549960128778, "learning_rate": 0.006486943719837648, "loss": 2.5823, "step": 5353 }, { "crossentropy": 2.696232318878174, "epoch": 0.2911444030560918, "grad_norm": 0.035314224660396576, "grad_norm_var": 0.00012234295859814097, "learning_rate": 0.006485754828737866, "loss": 2.6962, "step": 5354 }, { "crossentropy": 2.594061017036438, "epoch": 0.291198781913592, "grad_norm": 0.03623373061418533, "grad_norm_var": 0.00012206822268256423, "learning_rate": 0.006484565845492955, "loss": 2.5941, "step": 5355 }, { "crossentropy": 2.6180386543273926, "epoch": 0.2912531607710922, "grad_norm": 0.03781844675540924, "grad_norm_var": 0.00012090046433519402, "learning_rate": 0.006483376770176653, "loss": 2.618, "step": 5356 }, { "crossentropy": 2.7765287160873413, "epoch": 0.2913075396285924, "grad_norm": 0.03839428350329399, "grad_norm_var": 0.00012013562910123301, "learning_rate": 0.006482187602862707, "loss": 2.7765, "step": 5357 }, { "crossentropy": 2.693589687347412, "epoch": 0.2913619184860926, "grad_norm": 0.033962350338697433, "grad_norm_var": 8.125145862507173e-05, "learning_rate": 0.006480998343624869, "loss": 2.6936, "step": 5358 }, { "crossentropy": 2.606403946876526, "epoch": 0.2914162973435928, "grad_norm": 0.0366230346262455, "grad_norm_var": 8.098317878333551e-05, "learning_rate": 0.006479808992536893, "loss": 2.6064, "step": 5359 }, { "crossentropy": 2.729335069656372, "epoch": 0.291470676201093, "grad_norm": 0.03402622416615486, "grad_norm_var": 8.142560961370831e-05, "learning_rate": 0.006478619549672543, "loss": 2.7293, "step": 5360 }, { "crossentropy": 2.627047300338745, "epoch": 0.2915250550585932, "grad_norm": 0.03890397399663925, "grad_norm_var": 8.123983309416821e-05, "learning_rate": 0.006477430015105588, "loss": 2.627, "step": 5361 }, { "crossentropy": 2.6370164155960083, "epoch": 0.2915794339160934, "grad_norm": 0.03974779695272446, "grad_norm_var": 8.109221873119817e-05, "learning_rate": 0.0064762403889098, "loss": 2.637, "step": 5362 }, { "crossentropy": 2.671533465385437, "epoch": 0.2916338127735936, "grad_norm": 0.03617699816823006, "grad_norm_var": 8.13807565947564e-05, "learning_rate": 0.00647505067115896, "loss": 2.6715, "step": 5363 }, { "crossentropy": 2.6130707263946533, "epoch": 0.29168819163109383, "grad_norm": 0.03617743402719498, "grad_norm_var": 8.120874029077588e-05, "learning_rate": 0.0064738608619268525, "loss": 2.6131, "step": 5364 }, { "crossentropy": 2.5775257349014282, "epoch": 0.29174257048859403, "grad_norm": 0.039121344685554504, "grad_norm_var": 8.040787835113689e-05, "learning_rate": 0.006472670961287268, "loss": 2.5775, "step": 5365 }, { "crossentropy": 2.5617220401763916, "epoch": 0.29179694934609424, "grad_norm": 0.04088462144136429, "grad_norm_var": 7.88612023260545e-05, "learning_rate": 0.006471480969314004, "loss": 2.5617, "step": 5366 }, { "crossentropy": 2.6808433532714844, "epoch": 0.29185132820359444, "grad_norm": 0.037841856479644775, "grad_norm_var": 7.857768899010986e-05, "learning_rate": 0.0064702908860808615, "loss": 2.6808, "step": 5367 }, { "crossentropy": 2.691027522087097, "epoch": 0.29190570706109464, "grad_norm": 0.0369107611477375, "grad_norm_var": 7.770542994352513e-05, "learning_rate": 0.00646910071166165, "loss": 2.691, "step": 5368 }, { "crossentropy": 2.6948745250701904, "epoch": 0.29196008591859485, "grad_norm": 0.034980230033397675, "grad_norm_var": 4.072908089829593e-06, "learning_rate": 0.006467910446130181, "loss": 2.6949, "step": 5369 }, { "crossentropy": 2.6905566453933716, "epoch": 0.29201446477609505, "grad_norm": 0.036902956664562225, "grad_norm_var": 3.85877048919292e-06, "learning_rate": 0.0064667200895602754, "loss": 2.6906, "step": 5370 }, { "crossentropy": 2.6789112091064453, "epoch": 0.29206884363359525, "grad_norm": 0.03551952913403511, "grad_norm_var": 3.9797256573159115e-06, "learning_rate": 0.006465529642025756, "loss": 2.6789, "step": 5371 }, { "crossentropy": 2.680739641189575, "epoch": 0.29212322249109546, "grad_norm": 0.03345385193824768, "grad_norm_var": 4.76648588826478e-06, "learning_rate": 0.006464339103600456, "loss": 2.6807, "step": 5372 }, { "crossentropy": 2.6794310808181763, "epoch": 0.29217760134859566, "grad_norm": 0.034813448786735535, "grad_norm_var": 4.831387670790425e-06, "learning_rate": 0.006463148474358209, "loss": 2.6794, "step": 5373 }, { "crossentropy": 2.6597702503204346, "epoch": 0.29223198020609586, "grad_norm": 0.03395788371562958, "grad_norm_var": 4.832976385593137e-06, "learning_rate": 0.006461957754372858, "loss": 2.6598, "step": 5374 }, { "crossentropy": 2.6946674585342407, "epoch": 0.29228635906359607, "grad_norm": 0.033858608454465866, "grad_norm_var": 5.31229536105491e-06, "learning_rate": 0.00646076694371825, "loss": 2.6947, "step": 5375 }, { "crossentropy": 2.565891981124878, "epoch": 0.29234073792109627, "grad_norm": 0.0360177643597126, "grad_norm_var": 4.915292085176139e-06, "learning_rate": 0.006459576042468238, "loss": 2.5659, "step": 5376 }, { "crossentropy": 2.7400660514831543, "epoch": 0.2923951167785965, "grad_norm": 0.033889852464199066, "grad_norm_var": 4.932481644235632e-06, "learning_rate": 0.006458385050696681, "loss": 2.7401, "step": 5377 }, { "crossentropy": 2.698845386505127, "epoch": 0.2924494956360967, "grad_norm": 0.051526132971048355, "grad_norm_var": 1.907113072560617e-05, "learning_rate": 0.006457193968477443, "loss": 2.6988, "step": 5378 }, { "crossentropy": 2.62203586101532, "epoch": 0.2925038744935969, "grad_norm": 0.0356438048183918, "grad_norm_var": 1.9147556233915093e-05, "learning_rate": 0.006456002795884393, "loss": 2.622, "step": 5379 }, { "crossentropy": 2.608314275741577, "epoch": 0.2925582533510971, "grad_norm": 0.03450724110007286, "grad_norm_var": 1.949812393430833e-05, "learning_rate": 0.006454811532991407, "loss": 2.6083, "step": 5380 }, { "crossentropy": 2.6218178272247314, "epoch": 0.2926126322085973, "grad_norm": 0.0343056321144104, "grad_norm_var": 1.9498373448797492e-05, "learning_rate": 0.006453620179872366, "loss": 2.6218, "step": 5381 }, { "crossentropy": 2.458891272544861, "epoch": 0.2926670110660975, "grad_norm": 0.033137984573841095, "grad_norm_var": 1.8785683442389065e-05, "learning_rate": 0.006452428736601157, "loss": 2.4589, "step": 5382 }, { "crossentropy": 2.49637770652771, "epoch": 0.2927213899235977, "grad_norm": 0.03599997237324715, "grad_norm_var": 1.8564841014319842e-05, "learning_rate": 0.006451237203251672, "loss": 2.4964, "step": 5383 }, { "crossentropy": 2.4842021465301514, "epoch": 0.2927757687810979, "grad_norm": 0.034140948206186295, "grad_norm_var": 1.8694723852496115e-05, "learning_rate": 0.006450045579897808, "loss": 2.4842, "step": 5384 }, { "crossentropy": 2.545923352241516, "epoch": 0.2928301476385981, "grad_norm": 0.03674516826868057, "grad_norm_var": 1.8698619594644437e-05, "learning_rate": 0.0064488538666134685, "loss": 2.5459, "step": 5385 }, { "crossentropy": 2.636513113975525, "epoch": 0.2928845264960983, "grad_norm": 0.034432534128427505, "grad_norm_var": 1.8750120541253094e-05, "learning_rate": 0.006447662063472565, "loss": 2.6365, "step": 5386 }, { "crossentropy": 2.6198590993881226, "epoch": 0.2929389053535985, "grad_norm": 0.033098254352808, "grad_norm_var": 1.918993405056925e-05, "learning_rate": 0.006446470170549009, "loss": 2.6199, "step": 5387 }, { "crossentropy": 2.704488158226013, "epoch": 0.2929932842110987, "grad_norm": 0.03403688967227936, "grad_norm_var": 1.9044686385191224e-05, "learning_rate": 0.006445278187916721, "loss": 2.7045, "step": 5388 }, { "crossentropy": 2.6895545721054077, "epoch": 0.2930476630685989, "grad_norm": 0.04204372316598892, "grad_norm_var": 2.1522870042790276e-05, "learning_rate": 0.006444086115649628, "loss": 2.6896, "step": 5389 }, { "crossentropy": 2.6076771020889282, "epoch": 0.2931020419260991, "grad_norm": 0.03859063982963562, "grad_norm_var": 2.1551030079638468e-05, "learning_rate": 0.00644289395382166, "loss": 2.6077, "step": 5390 }, { "crossentropy": 2.6264050006866455, "epoch": 0.2931564207835993, "grad_norm": 0.03760312870144844, "grad_norm_var": 2.1171787761035434e-05, "learning_rate": 0.006441701702506755, "loss": 2.6264, "step": 5391 }, { "crossentropy": 2.597121834754944, "epoch": 0.29321079964109953, "grad_norm": 0.03464341163635254, "grad_norm_var": 2.1397904136619823e-05, "learning_rate": 0.006440509361778852, "loss": 2.5971, "step": 5392 }, { "crossentropy": 2.6481958627700806, "epoch": 0.29326517849859973, "grad_norm": 0.03540512174367905, "grad_norm_var": 2.1009702762262576e-05, "learning_rate": 0.006439316931711905, "loss": 2.6482, "step": 5393 }, { "crossentropy": 2.722602605819702, "epoch": 0.29331955735609994, "grad_norm": 0.03559814766049385, "grad_norm_var": 5.201489257797981e-06, "learning_rate": 0.006438124412379861, "loss": 2.7226, "step": 5394 }, { "crossentropy": 2.6823450326919556, "epoch": 0.29337393621360014, "grad_norm": 0.035138241946697235, "grad_norm_var": 5.215912321047435e-06, "learning_rate": 0.0064369318038566825, "loss": 2.6823, "step": 5395 }, { "crossentropy": 2.5911957025527954, "epoch": 0.29342831507110034, "grad_norm": 0.035413000732660294, "grad_norm_var": 5.136522605273843e-06, "learning_rate": 0.006435739106216333, "loss": 2.5912, "step": 5396 }, { "crossentropy": 2.6055864095687866, "epoch": 0.29348269392860055, "grad_norm": 0.035699211061000824, "grad_norm_var": 5.008884208136145e-06, "learning_rate": 0.006434546319532782, "loss": 2.6056, "step": 5397 }, { "crossentropy": 2.6628434658050537, "epoch": 0.29353707278610075, "grad_norm": 0.03703347221016884, "grad_norm_var": 4.609516621217659e-06, "learning_rate": 0.0064333534438800076, "loss": 2.6628, "step": 5398 }, { "crossentropy": 2.6474978923797607, "epoch": 0.29359145164360095, "grad_norm": 0.035109613090753555, "grad_norm_var": 4.656260500884625e-06, "learning_rate": 0.006432160479331987, "loss": 2.6475, "step": 5399 }, { "crossentropy": 2.646938681602478, "epoch": 0.29364583050110116, "grad_norm": 0.035876307636499405, "grad_norm_var": 4.432671883983913e-06, "learning_rate": 0.0064309674259627105, "loss": 2.6469, "step": 5400 }, { "crossentropy": 2.5982766151428223, "epoch": 0.29370020935860136, "grad_norm": 0.041390560567379, "grad_norm_var": 6.224874634689487e-06, "learning_rate": 0.006429774283846168, "loss": 2.5983, "step": 5401 }, { "crossentropy": 2.6151528358459473, "epoch": 0.29375458821610156, "grad_norm": 0.03636898472905159, "grad_norm_var": 5.972033367780914e-06, "learning_rate": 0.006428581053056357, "loss": 2.6152, "step": 5402 }, { "crossentropy": 2.7380876541137695, "epoch": 0.29380896707360177, "grad_norm": 0.03669136017560959, "grad_norm_var": 5.177707097769821e-06, "learning_rate": 0.00642738773366728, "loss": 2.7381, "step": 5403 }, { "crossentropy": 2.6624577045440674, "epoch": 0.29386334593110197, "grad_norm": 0.03345809504389763, "grad_norm_var": 5.401471703343995e-06, "learning_rate": 0.006426194325752948, "loss": 2.6625, "step": 5404 }, { "crossentropy": 2.7054781913757324, "epoch": 0.2939177247886022, "grad_norm": 0.03549182042479515, "grad_norm_var": 3.354150597477343e-06, "learning_rate": 0.006425000829387373, "loss": 2.7055, "step": 5405 }, { "crossentropy": 2.7161948680877686, "epoch": 0.2939721036461024, "grad_norm": 0.035500772297382355, "grad_norm_var": 2.9739652216572685e-06, "learning_rate": 0.006423807244644575, "loss": 2.7162, "step": 5406 }, { "crossentropy": 2.590997099876404, "epoch": 0.2940264825036026, "grad_norm": 0.03537203371524811, "grad_norm_var": 2.8160111862564802e-06, "learning_rate": 0.006422613571598579, "loss": 2.591, "step": 5407 }, { "crossentropy": 2.5928213596343994, "epoch": 0.2940808613611028, "grad_norm": 0.03350477293133736, "grad_norm_var": 3.085824520383286e-06, "learning_rate": 0.0064214198103234145, "loss": 2.5928, "step": 5408 }, { "crossentropy": 2.6102800369262695, "epoch": 0.294135240218603, "grad_norm": 0.03387023136019707, "grad_norm_var": 3.3170972747222083e-06, "learning_rate": 0.006420225960893121, "loss": 2.6103, "step": 5409 }, { "crossentropy": 2.5851374864578247, "epoch": 0.2941896190761032, "grad_norm": 0.036845166236162186, "grad_norm_var": 3.3940630149495906e-06, "learning_rate": 0.006419032023381735, "loss": 2.5851, "step": 5410 }, { "crossentropy": 2.684210181236267, "epoch": 0.2942439979336034, "grad_norm": 0.03620372340083122, "grad_norm_var": 3.3713268650837137e-06, "learning_rate": 0.0064178379978633085, "loss": 2.6842, "step": 5411 }, { "crossentropy": 2.7216140031814575, "epoch": 0.2942983767911036, "grad_norm": 0.03835737705230713, "grad_norm_var": 3.735980720061492e-06, "learning_rate": 0.006416643884411889, "loss": 2.7216, "step": 5412 }, { "crossentropy": 2.6666221618652344, "epoch": 0.2943527556486038, "grad_norm": 0.03944450244307518, "grad_norm_var": 4.438333977275072e-06, "learning_rate": 0.006415449683101537, "loss": 2.6666, "step": 5413 }, { "crossentropy": 2.593444585800171, "epoch": 0.294407134506104, "grad_norm": 0.04068593308329582, "grad_norm_var": 5.637869655816287e-06, "learning_rate": 0.006414255394006315, "loss": 2.5934, "step": 5414 }, { "crossentropy": 2.693515419960022, "epoch": 0.2944615133636042, "grad_norm": 0.035464316606521606, "grad_norm_var": 5.579470180136492e-06, "learning_rate": 0.006413061017200294, "loss": 2.6935, "step": 5415 }, { "crossentropy": 2.6867616176605225, "epoch": 0.2945158922211044, "grad_norm": 0.035761017352342606, "grad_norm_var": 5.590393658488901e-06, "learning_rate": 0.006411866552757545, "loss": 2.6868, "step": 5416 }, { "crossentropy": 2.679178833961487, "epoch": 0.2945702710786046, "grad_norm": 0.03542989119887352, "grad_norm_var": 3.944589196086477e-06, "learning_rate": 0.0064106720007521475, "loss": 2.6792, "step": 5417 }, { "crossentropy": 2.6935631036758423, "epoch": 0.2946246499361048, "grad_norm": 0.03425588831305504, "grad_norm_var": 4.162845034745508e-06, "learning_rate": 0.006409477361258188, "loss": 2.6936, "step": 5418 }, { "crossentropy": 2.6970460414886475, "epoch": 0.294679028793605, "grad_norm": 0.04004448652267456, "grad_norm_var": 5.165242841572749e-06, "learning_rate": 0.006408282634349758, "loss": 2.697, "step": 5419 }, { "crossentropy": 2.6151621341705322, "epoch": 0.29473340765110523, "grad_norm": 0.03773258998990059, "grad_norm_var": 4.727043163711177e-06, "learning_rate": 0.0064070878201009495, "loss": 2.6152, "step": 5420 }, { "crossentropy": 2.733304738998413, "epoch": 0.29478778650860543, "grad_norm": 0.036850497126579285, "grad_norm_var": 4.660181314713093e-06, "learning_rate": 0.006405892918585869, "loss": 2.7333, "step": 5421 }, { "crossentropy": 2.595036745071411, "epoch": 0.29484216536610564, "grad_norm": 0.035482898354530334, "grad_norm_var": 4.662779723879397e-06, "learning_rate": 0.00640469792987862, "loss": 2.595, "step": 5422 }, { "crossentropy": 2.6884772777557373, "epoch": 0.29489654422360584, "grad_norm": 0.03541741147637367, "grad_norm_var": 4.655590202610969e-06, "learning_rate": 0.0064035028540533135, "loss": 2.6885, "step": 5423 }, { "crossentropy": 2.658935785293579, "epoch": 0.29495092308110604, "grad_norm": 0.03396156430244446, "grad_norm_var": 4.481063918642993e-06, "learning_rate": 0.006402307691184069, "loss": 2.6589, "step": 5424 }, { "crossentropy": 2.6001731157302856, "epoch": 0.29500530193860625, "grad_norm": 0.03377242386341095, "grad_norm_var": 4.51742984569712e-06, "learning_rate": 0.00640111244134501, "loss": 2.6002, "step": 5425 }, { "crossentropy": 2.7766090631484985, "epoch": 0.29505968079610645, "grad_norm": 0.03578919544816017, "grad_norm_var": 4.5535687603565425e-06, "learning_rate": 0.006399917104610262, "loss": 2.7766, "step": 5426 }, { "crossentropy": 2.676080584526062, "epoch": 0.29511405965360665, "grad_norm": 0.03721212223172188, "grad_norm_var": 4.5717943012524645e-06, "learning_rate": 0.00639872168105396, "loss": 2.6761, "step": 5427 }, { "crossentropy": 2.627288818359375, "epoch": 0.29516843851110686, "grad_norm": 0.041933972388505936, "grad_norm_var": 6.207501968533841e-06, "learning_rate": 0.006397526170750245, "loss": 2.6273, "step": 5428 }, { "crossentropy": 2.605584502220154, "epoch": 0.29522281736860706, "grad_norm": 0.04241807386279106, "grad_norm_var": 7.797746009811229e-06, "learning_rate": 0.006396330573773259, "loss": 2.6056, "step": 5429 }, { "crossentropy": 2.727408766746521, "epoch": 0.29527719622610726, "grad_norm": 0.035204000771045685, "grad_norm_var": 6.99152942288065e-06, "learning_rate": 0.006395134890197153, "loss": 2.7274, "step": 5430 }, { "crossentropy": 2.7003484964370728, "epoch": 0.29533157508360747, "grad_norm": 0.03429402410984039, "grad_norm_var": 7.26536302223333e-06, "learning_rate": 0.006393939120096082, "loss": 2.7003, "step": 5431 }, { "crossentropy": 2.6730167865753174, "epoch": 0.29538595394110767, "grad_norm": 0.03886096179485321, "grad_norm_var": 7.520225048710897e-06, "learning_rate": 0.006392743263544205, "loss": 2.673, "step": 5432 }, { "crossentropy": 2.5742597579956055, "epoch": 0.2954403327986079, "grad_norm": 0.03606416657567024, "grad_norm_var": 7.430238935581211e-06, "learning_rate": 0.006391547320615691, "loss": 2.5743, "step": 5433 }, { "crossentropy": 2.7683671712875366, "epoch": 0.2954947116561081, "grad_norm": 0.03586184233427048, "grad_norm_var": 7.040053550037781e-06, "learning_rate": 0.006390351291384707, "loss": 2.7684, "step": 5434 }, { "crossentropy": 2.6049500703811646, "epoch": 0.29554909051360834, "grad_norm": 0.0349804051220417, "grad_norm_var": 6.540779742490038e-06, "learning_rate": 0.006389155175925435, "loss": 2.6049, "step": 5435 }, { "crossentropy": 2.688903331756592, "epoch": 0.29560346937110854, "grad_norm": 0.04148920252919197, "grad_norm_var": 7.982689228195478e-06, "learning_rate": 0.006387958974312054, "loss": 2.6889, "step": 5436 }, { "crossentropy": 2.641264319419861, "epoch": 0.29565784822860874, "grad_norm": 0.03346981853246689, "grad_norm_var": 8.696572976577e-06, "learning_rate": 0.00638676268661875, "loss": 2.6413, "step": 5437 }, { "crossentropy": 2.5876293182373047, "epoch": 0.29571222708610895, "grad_norm": 0.03412216901779175, "grad_norm_var": 9.021914063245262e-06, "learning_rate": 0.006385566312919716, "loss": 2.5876, "step": 5438 }, { "crossentropy": 2.7087583541870117, "epoch": 0.29576660594360915, "grad_norm": 0.03399761766195297, "grad_norm_var": 9.362915697008236e-06, "learning_rate": 0.006384369853289154, "loss": 2.7088, "step": 5439 }, { "crossentropy": 2.67197048664093, "epoch": 0.29582098480110935, "grad_norm": 0.0387401282787323, "grad_norm_var": 9.195375186804057e-06, "learning_rate": 0.006383173307801263, "loss": 2.672, "step": 5440 }, { "crossentropy": 2.5866613388061523, "epoch": 0.29587536365860956, "grad_norm": 0.036523811519145966, "grad_norm_var": 8.571361950172364e-06, "learning_rate": 0.006381976676530252, "loss": 2.5867, "step": 5441 }, { "crossentropy": 2.6432838439941406, "epoch": 0.29592974251610976, "grad_norm": 0.03873634338378906, "grad_norm_var": 8.663932468775532e-06, "learning_rate": 0.0063807799595503355, "loss": 2.6433, "step": 5442 }, { "crossentropy": 2.710160493850708, "epoch": 0.29598412137360997, "grad_norm": 0.03811637684702873, "grad_norm_var": 8.726229622654791e-06, "learning_rate": 0.0063795831569357355, "loss": 2.7102, "step": 5443 }, { "crossentropy": 2.6174421310424805, "epoch": 0.29603850023111017, "grad_norm": 0.03374476730823517, "grad_norm_var": 7.722260009676978e-06, "learning_rate": 0.006378386268760673, "loss": 2.6174, "step": 5444 }, { "crossentropy": 2.5855942964553833, "epoch": 0.2960928790886104, "grad_norm": 0.03150249645113945, "grad_norm_var": 6.794559496895297e-06, "learning_rate": 0.006377189295099378, "loss": 2.5856, "step": 5445 }, { "crossentropy": 2.735740542411804, "epoch": 0.2961472579461106, "grad_norm": 0.033730536699295044, "grad_norm_var": 7.0830527165489815e-06, "learning_rate": 0.006375992236026089, "loss": 2.7357, "step": 5446 }, { "crossentropy": 2.729699730873108, "epoch": 0.2962016368036108, "grad_norm": 0.03604995086789131, "grad_norm_var": 6.902180090103081e-06, "learning_rate": 0.0063747950916150445, "loss": 2.7297, "step": 5447 }, { "crossentropy": 2.6780773401260376, "epoch": 0.296256015661111, "grad_norm": 0.03477805480360985, "grad_norm_var": 6.3862710615096454e-06, "learning_rate": 0.006373597861940488, "loss": 2.6781, "step": 5448 }, { "crossentropy": 2.619633913040161, "epoch": 0.2963103945186112, "grad_norm": 0.0493277870118618, "grad_norm_var": 1.7947299184078946e-05, "learning_rate": 0.006372400547076674, "loss": 2.6196, "step": 5449 }, { "crossentropy": 2.6667410135269165, "epoch": 0.2963647733761114, "grad_norm": 0.034980978816747665, "grad_norm_var": 1.8079342881845615e-05, "learning_rate": 0.006371203147097858, "loss": 2.6667, "step": 5450 }, { "crossentropy": 2.6872538328170776, "epoch": 0.2964191522336116, "grad_norm": 0.0364050418138504, "grad_norm_var": 1.7914094669083982e-05, "learning_rate": 0.006370005662078301, "loss": 2.6873, "step": 5451 }, { "crossentropy": 2.6978628635406494, "epoch": 0.2964735310911118, "grad_norm": 0.035991888493299484, "grad_norm_var": 1.6224481289421554e-05, "learning_rate": 0.00636880809209227, "loss": 2.6979, "step": 5452 }, { "crossentropy": 2.6759923696517944, "epoch": 0.296527909948612, "grad_norm": 0.03518220782279968, "grad_norm_var": 1.576987402144564e-05, "learning_rate": 0.006367610437214037, "loss": 2.676, "step": 5453 }, { "crossentropy": 2.60818612575531, "epoch": 0.2965822888061122, "grad_norm": 0.03407452628016472, "grad_norm_var": 1.5784298961224232e-05, "learning_rate": 0.006366412697517879, "loss": 2.6082, "step": 5454 }, { "crossentropy": 2.5913782119750977, "epoch": 0.2966366676636124, "grad_norm": 0.03475541993975639, "grad_norm_var": 1.5580720971969327e-05, "learning_rate": 0.00636521487307808, "loss": 2.5914, "step": 5455 }, { "crossentropy": 2.610059976577759, "epoch": 0.2966910465211126, "grad_norm": 0.03802734985947609, "grad_norm_var": 1.5391502666930085e-05, "learning_rate": 0.006364016963968928, "loss": 2.6101, "step": 5456 }, { "crossentropy": 2.6243584156036377, "epoch": 0.2967454253786128, "grad_norm": 0.03989190608263016, "grad_norm_var": 1.616936847406621e-05, "learning_rate": 0.006362818970264715, "loss": 2.6244, "step": 5457 }, { "crossentropy": 2.7295050621032715, "epoch": 0.296799804236113, "grad_norm": 0.036004748195409775, "grad_norm_var": 1.5850707459681643e-05, "learning_rate": 0.00636162089203974, "loss": 2.7295, "step": 5458 }, { "crossentropy": 2.6520278453826904, "epoch": 0.2968541830936132, "grad_norm": 0.034499894827604294, "grad_norm_var": 1.5845452195549446e-05, "learning_rate": 0.006360422729368307, "loss": 2.652, "step": 5459 }, { "crossentropy": 2.6606268882751465, "epoch": 0.2969085619511134, "grad_norm": 0.03506036102771759, "grad_norm_var": 1.552571550063649e-05, "learning_rate": 0.006359224482324724, "loss": 2.6606, "step": 5460 }, { "crossentropy": 2.581699848175049, "epoch": 0.29696294080861363, "grad_norm": 0.03886757045984268, "grad_norm_var": 1.4237738933460762e-05, "learning_rate": 0.006358026150983307, "loss": 2.5817, "step": 5461 }, { "crossentropy": 2.7270604372024536, "epoch": 0.29701731966611383, "grad_norm": 0.04096487537026405, "grad_norm_var": 1.4618621090220901e-05, "learning_rate": 0.006356827735418373, "loss": 2.7271, "step": 5462 }, { "crossentropy": 2.8066728115081787, "epoch": 0.29707169852361404, "grad_norm": 0.04313816875219345, "grad_norm_var": 1.6691823598414626e-05, "learning_rate": 0.006355629235704248, "loss": 2.8067, "step": 5463 }, { "crossentropy": 2.763240098953247, "epoch": 0.29712607738111424, "grad_norm": 0.034084003418684006, "grad_norm_var": 1.6985102461099708e-05, "learning_rate": 0.006354430651915262, "loss": 2.7632, "step": 5464 }, { "crossentropy": 2.6118853092193604, "epoch": 0.29718045623861444, "grad_norm": 0.03651216998696327, "grad_norm_var": 7.1736010226806e-06, "learning_rate": 0.0063532319841257505, "loss": 2.6119, "step": 5465 }, { "crossentropy": 2.700343370437622, "epoch": 0.29723483509611465, "grad_norm": 0.03751831129193306, "grad_norm_var": 6.968173161612616e-06, "learning_rate": 0.00635203323241005, "loss": 2.7003, "step": 5466 }, { "crossentropy": 2.6896179914474487, "epoch": 0.29728921395361485, "grad_norm": 0.035750262439250946, "grad_norm_var": 7.041337226272824e-06, "learning_rate": 0.006350834396842513, "loss": 2.6896, "step": 5467 }, { "crossentropy": 2.649552822113037, "epoch": 0.29734359281111505, "grad_norm": 0.03447842225432396, "grad_norm_var": 7.3667885308249805e-06, "learning_rate": 0.0063496354774974844, "loss": 2.6496, "step": 5468 }, { "crossentropy": 2.7330751419067383, "epoch": 0.29739797166861526, "grad_norm": 0.035268135368824005, "grad_norm_var": 7.34870764495521e-06, "learning_rate": 0.006348436474449321, "loss": 2.7331, "step": 5469 }, { "crossentropy": 2.65856397151947, "epoch": 0.29745235052611546, "grad_norm": 0.03515395149588585, "grad_norm_var": 7.028406059096196e-06, "learning_rate": 0.006347237387772384, "loss": 2.6586, "step": 5470 }, { "crossentropy": 2.6496975421905518, "epoch": 0.29750672938361566, "grad_norm": 0.036063052713871, "grad_norm_var": 6.7659904673199205e-06, "learning_rate": 0.006346038217541042, "loss": 2.6497, "step": 5471 }, { "crossentropy": 2.7345398664474487, "epoch": 0.29756110824111587, "grad_norm": 0.03550196811556816, "grad_norm_var": 6.803575489639351e-06, "learning_rate": 0.006344838963829665, "loss": 2.7345, "step": 5472 }, { "crossentropy": 2.5857847929000854, "epoch": 0.29761548709861607, "grad_norm": 0.040658269077539444, "grad_norm_var": 7.156488318183866e-06, "learning_rate": 0.006343639626712626, "loss": 2.5858, "step": 5473 }, { "crossentropy": 2.6572147607803345, "epoch": 0.2976698659561163, "grad_norm": 0.03684403374791145, "grad_norm_var": 7.10645604431621e-06, "learning_rate": 0.0063424402062643125, "loss": 2.6572, "step": 5474 }, { "crossentropy": 2.5633492469787598, "epoch": 0.2977242448136165, "grad_norm": 0.0346798412501812, "grad_norm_var": 7.0509492685516075e-06, "learning_rate": 0.006341240702559109, "loss": 2.5633, "step": 5475 }, { "crossentropy": 2.6542469263076782, "epoch": 0.2977786236711167, "grad_norm": 0.03476180508732796, "grad_norm_var": 7.13010836540928e-06, "learning_rate": 0.006340041115671408, "loss": 2.6542, "step": 5476 }, { "crossentropy": 2.715271472930908, "epoch": 0.2978330025286169, "grad_norm": 0.034596651792526245, "grad_norm_var": 7.1441882843997195e-06, "learning_rate": 0.006338841445675605, "loss": 2.7153, "step": 5477 }, { "crossentropy": 2.549630880355835, "epoch": 0.2978873813861171, "grad_norm": 0.034676142036914825, "grad_norm_var": 5.975606226204846e-06, "learning_rate": 0.006337641692646105, "loss": 2.5496, "step": 5478 }, { "crossentropy": 2.7376816272735596, "epoch": 0.2979417602436173, "grad_norm": 0.034671489149332047, "grad_norm_var": 2.6576969338252983e-06, "learning_rate": 0.006336441856657314, "loss": 2.7377, "step": 5479 }, { "crossentropy": 2.7495505809783936, "epoch": 0.2979961391011175, "grad_norm": 0.04693702608346939, "grad_norm_var": 1.0211334322763703e-05, "learning_rate": 0.006335241937783644, "loss": 2.7496, "step": 5480 }, { "crossentropy": 2.6348185539245605, "epoch": 0.2980505179586177, "grad_norm": 0.04536863788962364, "grad_norm_var": 1.512274005201463e-05, "learning_rate": 0.006334041936099514, "loss": 2.6348, "step": 5481 }, { "crossentropy": 2.6800310611724854, "epoch": 0.2981048968161179, "grad_norm": 0.040372829884290695, "grad_norm_var": 1.5807202950348282e-05, "learning_rate": 0.006332841851679347, "loss": 2.68, "step": 5482 }, { "crossentropy": 2.7359085083007812, "epoch": 0.2981592756736181, "grad_norm": 0.035738665610551834, "grad_norm_var": 1.5809509298220168e-05, "learning_rate": 0.006331641684597571, "loss": 2.7359, "step": 5483 }, { "crossentropy": 2.6156240701675415, "epoch": 0.2982136545311183, "grad_norm": 0.03470287099480629, "grad_norm_var": 1.5730142727381617e-05, "learning_rate": 0.00633044143492862, "loss": 2.6156, "step": 5484 }, { "crossentropy": 2.670469880104065, "epoch": 0.2982680333886185, "grad_norm": 0.0362359955906868, "grad_norm_var": 1.553297143076221e-05, "learning_rate": 0.006329241102746932, "loss": 2.6705, "step": 5485 }, { "crossentropy": 2.6450304985046387, "epoch": 0.2983224122461187, "grad_norm": 0.03265706077218056, "grad_norm_var": 1.664048160625453e-05, "learning_rate": 0.00632804068812695, "loss": 2.645, "step": 5486 }, { "crossentropy": 2.6343507766723633, "epoch": 0.2983767911036189, "grad_norm": 0.03465365245938301, "grad_norm_var": 1.696967050553421e-05, "learning_rate": 0.006326840191143121, "loss": 2.6344, "step": 5487 }, { "crossentropy": 2.625399351119995, "epoch": 0.2984311699611191, "grad_norm": 0.036175038665533066, "grad_norm_var": 1.6857618726793393e-05, "learning_rate": 0.0063256396118699, "loss": 2.6254, "step": 5488 }, { "crossentropy": 2.5659210681915283, "epoch": 0.29848554881861933, "grad_norm": 0.03628011420369148, "grad_norm_var": 1.5983223355092494e-05, "learning_rate": 0.006324438950381749, "loss": 2.5659, "step": 5489 }, { "crossentropy": 2.560292363166809, "epoch": 0.29853992767611953, "grad_norm": 0.03374217450618744, "grad_norm_var": 1.6580622305666034e-05, "learning_rate": 0.006323238206753126, "loss": 2.5603, "step": 5490 }, { "crossentropy": 2.6607284545898438, "epoch": 0.29859430653361974, "grad_norm": 0.04672546684741974, "grad_norm_var": 2.2500008954334563e-05, "learning_rate": 0.006322037381058505, "loss": 2.6607, "step": 5491 }, { "crossentropy": 2.6690781116485596, "epoch": 0.29864868539111994, "grad_norm": 0.03650542348623276, "grad_norm_var": 2.207820438364736e-05, "learning_rate": 0.006320836473372358, "loss": 2.6691, "step": 5492 }, { "crossentropy": 2.695019006729126, "epoch": 0.29870306424862014, "grad_norm": 0.034453123807907104, "grad_norm_var": 2.2135100397162725e-05, "learning_rate": 0.006319635483769164, "loss": 2.695, "step": 5493 }, { "crossentropy": 2.8044124841690063, "epoch": 0.29875744310612035, "grad_norm": 0.03705081716179848, "grad_norm_var": 2.159550740203515e-05, "learning_rate": 0.006318434412323406, "loss": 2.8044, "step": 5494 }, { "crossentropy": 2.5450947284698486, "epoch": 0.29881182196362055, "grad_norm": 0.038534119725227356, "grad_norm_var": 2.0998189198910036e-05, "learning_rate": 0.006317233259109577, "loss": 2.5451, "step": 5495 }, { "crossentropy": 2.6844452619552612, "epoch": 0.29886620082112075, "grad_norm": 0.03501692786812782, "grad_norm_var": 1.5489249947022352e-05, "learning_rate": 0.00631603202420217, "loss": 2.6844, "step": 5496 }, { "crossentropy": 2.677176833152771, "epoch": 0.29892057967862096, "grad_norm": 0.036405231803655624, "grad_norm_var": 1.067442631522882e-05, "learning_rate": 0.006314830707675681, "loss": 2.6772, "step": 5497 }, { "crossentropy": 2.6621878147125244, "epoch": 0.29897495853612116, "grad_norm": 0.03802188113331795, "grad_norm_var": 9.830364222675424e-06, "learning_rate": 0.006313629309604621, "loss": 2.6622, "step": 5498 }, { "crossentropy": 2.6818281412124634, "epoch": 0.29902933739362136, "grad_norm": 0.036449361592531204, "grad_norm_var": 9.796311846539904e-06, "learning_rate": 0.0063124278300634926, "loss": 2.6818, "step": 5499 }, { "crossentropy": 2.700696110725403, "epoch": 0.29908371625112157, "grad_norm": 0.035667650401592255, "grad_norm_var": 9.626450539855907e-06, "learning_rate": 0.006311226269126815, "loss": 2.7007, "step": 5500 }, { "crossentropy": 2.650764226913452, "epoch": 0.29913809510862177, "grad_norm": 0.03428810089826584, "grad_norm_var": 9.941479015834233e-06, "learning_rate": 0.006310024626869106, "loss": 2.6508, "step": 5501 }, { "crossentropy": 2.70590877532959, "epoch": 0.299192473966122, "grad_norm": 0.034960150718688965, "grad_norm_var": 9.119275936034927e-06, "learning_rate": 0.006308822903364893, "loss": 2.7059, "step": 5502 }, { "crossentropy": 2.5461158752441406, "epoch": 0.2992468528236222, "grad_norm": 0.04490949958562851, "grad_norm_var": 1.308897741278166e-05, "learning_rate": 0.0063076210986887015, "loss": 2.5461, "step": 5503 }, { "crossentropy": 2.778051018714905, "epoch": 0.2993012316811224, "grad_norm": 0.03255976364016533, "grad_norm_var": 1.439948527897818e-05, "learning_rate": 0.00630641921291507, "loss": 2.7781, "step": 5504 }, { "crossentropy": 2.591703414916992, "epoch": 0.2993556105386226, "grad_norm": 0.034330349415540695, "grad_norm_var": 1.481724214298415e-05, "learning_rate": 0.006305217246118535, "loss": 2.5917, "step": 5505 }, { "crossentropy": 2.609813094139099, "epoch": 0.2994099893961228, "grad_norm": 0.03367352858185768, "grad_norm_var": 1.4845993398593152e-05, "learning_rate": 0.006304015198373645, "loss": 2.6098, "step": 5506 }, { "crossentropy": 2.6640087366104126, "epoch": 0.299464368253623, "grad_norm": 0.03494952991604805, "grad_norm_var": 8.00255034530539e-06, "learning_rate": 0.006302813069754949, "loss": 2.664, "step": 5507 }, { "crossentropy": 2.7394139766693115, "epoch": 0.2995187471111232, "grad_norm": 0.03688138350844383, "grad_norm_var": 8.03115782078053e-06, "learning_rate": 0.006301610860336998, "loss": 2.7394, "step": 5508 }, { "crossentropy": 2.6762263774871826, "epoch": 0.2995731259686234, "grad_norm": 0.035979289561510086, "grad_norm_var": 7.834597901093475e-06, "learning_rate": 0.006300408570194358, "loss": 2.6762, "step": 5509 }, { "crossentropy": 2.6269052028656006, "epoch": 0.2996275048261236, "grad_norm": 0.03959626331925392, "grad_norm_var": 8.51818457817301e-06, "learning_rate": 0.006299206199401589, "loss": 2.6269, "step": 5510 }, { "crossentropy": 2.5814138650894165, "epoch": 0.2996818836836238, "grad_norm": 0.03893584758043289, "grad_norm_var": 8.64317497875257e-06, "learning_rate": 0.006298003748033264, "loss": 2.5814, "step": 5511 }, { "crossentropy": 2.5184640884399414, "epoch": 0.299736262541124, "grad_norm": 0.03697531297802925, "grad_norm_var": 8.518066396913325e-06, "learning_rate": 0.006296801216163954, "loss": 2.5185, "step": 5512 }, { "crossentropy": 2.7379095554351807, "epoch": 0.2997906413986242, "grad_norm": 0.03558824956417084, "grad_norm_var": 8.574075993493062e-06, "learning_rate": 0.006295598603868246, "loss": 2.7379, "step": 5513 }, { "crossentropy": 2.6998465061187744, "epoch": 0.2998450202561244, "grad_norm": 0.03394606336951256, "grad_norm_var": 8.77734712969753e-06, "learning_rate": 0.006294395911220717, "loss": 2.6998, "step": 5514 }, { "crossentropy": 2.645721197128296, "epoch": 0.2998993991136246, "grad_norm": 0.03368963301181793, "grad_norm_var": 9.172874270256016e-06, "learning_rate": 0.0062931931382959605, "loss": 2.6457, "step": 5515 }, { "crossentropy": 2.6079673767089844, "epoch": 0.2999537779711248, "grad_norm": 0.03779727593064308, "grad_norm_var": 9.345444603386006e-06, "learning_rate": 0.006291990285168573, "loss": 2.608, "step": 5516 }, { "crossentropy": 2.749247193336487, "epoch": 0.30000815682862503, "grad_norm": 0.04110918194055557, "grad_norm_var": 1.0522506298539975e-05, "learning_rate": 0.0062907873519131505, "loss": 2.7492, "step": 5517 }, { "crossentropy": 2.5637309551239014, "epoch": 0.30006253568612523, "grad_norm": 0.03774115815758705, "grad_norm_var": 1.039130403777949e-05, "learning_rate": 0.006289584338604302, "loss": 2.5637, "step": 5518 }, { "crossentropy": 2.64310884475708, "epoch": 0.30011691454362543, "grad_norm": 0.03406780585646629, "grad_norm_var": 6.002499651933807e-06, "learning_rate": 0.0062883812453166325, "loss": 2.6431, "step": 5519 }, { "crossentropy": 2.651626944541931, "epoch": 0.30017129340112564, "grad_norm": 0.03633217141032219, "grad_norm_var": 5.104309487239528e-06, "learning_rate": 0.006287178072124762, "loss": 2.6516, "step": 5520 }, { "crossentropy": 2.6233829259872437, "epoch": 0.30022567225862584, "grad_norm": 0.03560102730989456, "grad_norm_var": 4.863120323970495e-06, "learning_rate": 0.006285974819103306, "loss": 2.6234, "step": 5521 }, { "crossentropy": 2.6264359951019287, "epoch": 0.30028005111612605, "grad_norm": 0.0362418107688427, "grad_norm_var": 4.3318037729023555e-06, "learning_rate": 0.006284771486326891, "loss": 2.6264, "step": 5522 }, { "crossentropy": 2.664497137069702, "epoch": 0.30033442997362625, "grad_norm": 0.03844408690929413, "grad_norm_var": 4.330919997184296e-06, "learning_rate": 0.006283568073870146, "loss": 2.6645, "step": 5523 }, { "crossentropy": 2.6297640800476074, "epoch": 0.30038880883112645, "grad_norm": 0.034972723573446274, "grad_norm_var": 4.539908312038016e-06, "learning_rate": 0.0062823645818077065, "loss": 2.6298, "step": 5524 }, { "crossentropy": 2.6882498264312744, "epoch": 0.30044318768862666, "grad_norm": 0.03581216558814049, "grad_norm_var": 4.557460086826448e-06, "learning_rate": 0.006281161010214212, "loss": 2.6882, "step": 5525 }, { "crossentropy": 2.6517333984375, "epoch": 0.30049756654612686, "grad_norm": 0.03482069447636604, "grad_norm_var": 4.12476690525021e-06, "learning_rate": 0.006279957359164303, "loss": 2.6517, "step": 5526 }, { "crossentropy": 2.6494903564453125, "epoch": 0.30055194540362706, "grad_norm": 0.03519371151924133, "grad_norm_var": 3.724597456646282e-06, "learning_rate": 0.006278753628732634, "loss": 2.6495, "step": 5527 }, { "crossentropy": 2.6288557052612305, "epoch": 0.30060632426112727, "grad_norm": 0.04296283423900604, "grad_norm_var": 6.62746475721254e-06, "learning_rate": 0.006277549818993857, "loss": 2.6289, "step": 5528 }, { "crossentropy": 2.6796499490737915, "epoch": 0.30066070311862747, "grad_norm": 0.03722146525979042, "grad_norm_var": 6.591268847344391e-06, "learning_rate": 0.00627634593002263, "loss": 2.6796, "step": 5529 }, { "crossentropy": 2.694367289543152, "epoch": 0.3007150819761277, "grad_norm": 0.04247232526540756, "grad_norm_var": 8.092613457200079e-06, "learning_rate": 0.00627514196189362, "loss": 2.6944, "step": 5530 }, { "crossentropy": 2.7198601961135864, "epoch": 0.3007694608336279, "grad_norm": 0.05413026362657547, "grad_norm_var": 2.4761741031786798e-05, "learning_rate": 0.006273937914681493, "loss": 2.7199, "step": 5531 }, { "crossentropy": 2.577507257461548, "epoch": 0.3008238396911281, "grad_norm": 0.037121593952178955, "grad_norm_var": 2.484750704453952e-05, "learning_rate": 0.006272733788460925, "loss": 2.5775, "step": 5532 }, { "crossentropy": 2.5215219259262085, "epoch": 0.3008782185486283, "grad_norm": 0.03554823622107506, "grad_norm_var": 2.476433391185717e-05, "learning_rate": 0.0062715295833065945, "loss": 2.5215, "step": 5533 }, { "crossentropy": 2.779023766517639, "epoch": 0.3009325974061285, "grad_norm": 0.03717264533042908, "grad_norm_var": 2.4807395867702115e-05, "learning_rate": 0.0062703252992931825, "loss": 2.779, "step": 5534 }, { "crossentropy": 2.635760188102722, "epoch": 0.3009869762636287, "grad_norm": 0.038361601531505585, "grad_norm_var": 2.3704348382649842e-05, "learning_rate": 0.0062691209364953825, "loss": 2.6358, "step": 5535 }, { "crossentropy": 2.617814064025879, "epoch": 0.3010413551211289, "grad_norm": 0.035039301961660385, "grad_norm_var": 2.41438284874953e-05, "learning_rate": 0.006267916494987883, "loss": 2.6178, "step": 5536 }, { "crossentropy": 2.569311022758484, "epoch": 0.3010957339786291, "grad_norm": 0.03414458781480789, "grad_norm_var": 2.478009042688045e-05, "learning_rate": 0.006266711974845387, "loss": 2.5693, "step": 5537 }, { "crossentropy": 2.7343963384628296, "epoch": 0.3011501128361293, "grad_norm": 0.032966408878564835, "grad_norm_var": 2.626375440480799e-05, "learning_rate": 0.006265507376142594, "loss": 2.7344, "step": 5538 }, { "crossentropy": 2.6797856092453003, "epoch": 0.3012044916936295, "grad_norm": 0.03669625520706177, "grad_norm_var": 2.632766668742009e-05, "learning_rate": 0.006264302698954214, "loss": 2.6798, "step": 5539 }, { "crossentropy": 2.635996460914612, "epoch": 0.3012588705511297, "grad_norm": 0.03568948432803154, "grad_norm_var": 2.609055308148278e-05, "learning_rate": 0.006263097943354956, "loss": 2.636, "step": 5540 }, { "crossentropy": 2.604708194732666, "epoch": 0.3013132494086299, "grad_norm": 0.03350033238530159, "grad_norm_var": 2.704799253758026e-05, "learning_rate": 0.006261893109419546, "loss": 2.6047, "step": 5541 }, { "crossentropy": 2.7447084188461304, "epoch": 0.3013676282661301, "grad_norm": 0.035599373281002045, "grad_norm_var": 2.6787975206957888e-05, "learning_rate": 0.0062606881972226995, "loss": 2.7447, "step": 5542 }, { "crossentropy": 2.7013490200042725, "epoch": 0.3014220071236303, "grad_norm": 0.03516143187880516, "grad_norm_var": 2.6798994167208782e-05, "learning_rate": 0.006259483206839145, "loss": 2.7013, "step": 5543 }, { "crossentropy": 2.6982405185699463, "epoch": 0.3014763859811305, "grad_norm": 0.03515308350324631, "grad_norm_var": 2.516909423306331e-05, "learning_rate": 0.006258278138343619, "loss": 2.6982, "step": 5544 }, { "crossentropy": 2.639539122581482, "epoch": 0.3015307648386307, "grad_norm": 0.0343022346496582, "grad_norm_var": 2.5712294346606073e-05, "learning_rate": 0.006257072991810853, "loss": 2.6395, "step": 5545 }, { "crossentropy": 2.7772737741470337, "epoch": 0.30158514369613093, "grad_norm": 0.03513837605714798, "grad_norm_var": 2.3787534496170354e-05, "learning_rate": 0.006255867767315595, "loss": 2.7773, "step": 5546 }, { "crossentropy": 2.7327669858932495, "epoch": 0.30163952255363113, "grad_norm": 0.035717591643333435, "grad_norm_var": 1.95870308434265e-06, "learning_rate": 0.006254662464932586, "loss": 2.7328, "step": 5547 }, { "crossentropy": 2.6218453645706177, "epoch": 0.30169390141113134, "grad_norm": 0.03346239775419235, "grad_norm_var": 1.9834333615869707e-06, "learning_rate": 0.006253457084736583, "loss": 2.6218, "step": 5548 }, { "crossentropy": 2.659378170967102, "epoch": 0.30174828026863154, "grad_norm": 0.03342315927147865, "grad_norm_var": 2.17503809149903e-06, "learning_rate": 0.006252251626802339, "loss": 2.6594, "step": 5549 }, { "crossentropy": 2.7053290605545044, "epoch": 0.30180265912613174, "grad_norm": 0.03941557556390762, "grad_norm_var": 3.1106397396024147e-06, "learning_rate": 0.0062510460912046165, "loss": 2.7053, "step": 5550 }, { "crossentropy": 2.7120026350021362, "epoch": 0.30185703798363195, "grad_norm": 0.04074457660317421, "grad_norm_var": 4.4587431852875664e-06, "learning_rate": 0.006249840478018182, "loss": 2.712, "step": 5551 }, { "crossentropy": 2.6797759532928467, "epoch": 0.30191141684113215, "grad_norm": 0.03694651275873184, "grad_norm_var": 4.598267441389507e-06, "learning_rate": 0.006248634787317808, "loss": 2.6798, "step": 5552 }, { "crossentropy": 2.6782898902893066, "epoch": 0.30196579569863236, "grad_norm": 0.03623764216899872, "grad_norm_var": 4.4927414161971085e-06, "learning_rate": 0.006247429019178265, "loss": 2.6783, "step": 5553 }, { "crossentropy": 2.665212869644165, "epoch": 0.30202017455613256, "grad_norm": 0.033765267580747604, "grad_norm_var": 4.248420776698884e-06, "learning_rate": 0.00624622317367434, "loss": 2.6652, "step": 5554 }, { "crossentropy": 2.6336065530776978, "epoch": 0.30207455341363276, "grad_norm": 0.03444039821624756, "grad_norm_var": 4.262184112732714e-06, "learning_rate": 0.0062450172508808155, "loss": 2.6336, "step": 5555 }, { "crossentropy": 2.562678098678589, "epoch": 0.30212893227113297, "grad_norm": 0.04146308824419975, "grad_norm_var": 6.45790208754308e-06, "learning_rate": 0.006243811250872481, "loss": 2.5627, "step": 5556 }, { "crossentropy": 2.6419137716293335, "epoch": 0.30218331112863317, "grad_norm": 0.041851162910461426, "grad_norm_var": 8.139585765437548e-06, "learning_rate": 0.0062426051737241315, "loss": 2.6419, "step": 5557 }, { "crossentropy": 2.5867117643356323, "epoch": 0.3022376899861334, "grad_norm": 0.03790070489048958, "grad_norm_var": 8.216835474169025e-06, "learning_rate": 0.006241399019510568, "loss": 2.5867, "step": 5558 }, { "crossentropy": 2.674081563949585, "epoch": 0.3022920688436336, "grad_norm": 0.03488962724804878, "grad_norm_var": 8.27250746890292e-06, "learning_rate": 0.0062401927883065965, "loss": 2.6741, "step": 5559 }, { "crossentropy": 2.7019463777542114, "epoch": 0.3023464477011338, "grad_norm": 0.03272043913602829, "grad_norm_var": 9.096502819158637e-06, "learning_rate": 0.006238986480187023, "loss": 2.7019, "step": 5560 }, { "crossentropy": 2.63956618309021, "epoch": 0.302400826558634, "grad_norm": 0.03561876341700554, "grad_norm_var": 8.836389268930024e-06, "learning_rate": 0.006237780095226664, "loss": 2.6396, "step": 5561 }, { "crossentropy": 2.6999447345733643, "epoch": 0.3024552054161342, "grad_norm": 0.03771249204874039, "grad_norm_var": 8.788866870188851e-06, "learning_rate": 0.006236573633500337, "loss": 2.6999, "step": 5562 }, { "crossentropy": 2.707074284553528, "epoch": 0.3025095842736344, "grad_norm": 0.039348896592855453, "grad_norm_var": 9.164309121765385e-06, "learning_rate": 0.006235367095082866, "loss": 2.7071, "step": 5563 }, { "crossentropy": 2.614726185798645, "epoch": 0.3025639631311346, "grad_norm": 0.0378999337553978, "grad_norm_var": 8.378095324882567e-06, "learning_rate": 0.0062341604800490785, "loss": 2.6147, "step": 5564 }, { "crossentropy": 2.61535108089447, "epoch": 0.3026183419886348, "grad_norm": 0.03536799177527428, "grad_norm_var": 7.648435524953978e-06, "learning_rate": 0.006232953788473811, "loss": 2.6154, "step": 5565 }, { "crossentropy": 2.5698230266571045, "epoch": 0.302672720846135, "grad_norm": 0.033844977617263794, "grad_norm_var": 7.99443225646008e-06, "learning_rate": 0.006231747020431897, "loss": 2.5698, "step": 5566 }, { "crossentropy": 2.6682599782943726, "epoch": 0.3027270997036352, "grad_norm": 0.03373591974377632, "grad_norm_var": 7.492389068682279e-06, "learning_rate": 0.006230540175998182, "loss": 2.6683, "step": 5567 }, { "crossentropy": 2.679377317428589, "epoch": 0.3027814785611354, "grad_norm": 0.03262801468372345, "grad_norm_var": 8.391656896436506e-06, "learning_rate": 0.006229333255247512, "loss": 2.6794, "step": 5568 }, { "crossentropy": 2.619991898536682, "epoch": 0.3028358574186356, "grad_norm": 0.03311173990368843, "grad_norm_var": 8.992541594939931e-06, "learning_rate": 0.006228126258254739, "loss": 2.62, "step": 5569 }, { "crossentropy": 2.624258875846863, "epoch": 0.3028902362761358, "grad_norm": 0.03547394275665283, "grad_norm_var": 8.661627134707445e-06, "learning_rate": 0.0062269191850947226, "loss": 2.6243, "step": 5570 }, { "crossentropy": 2.6761667728424072, "epoch": 0.302944615133636, "grad_norm": 0.0342799536883831, "grad_norm_var": 8.699284876558835e-06, "learning_rate": 0.006225712035842319, "loss": 2.6762, "step": 5571 }, { "crossentropy": 2.6644366979599, "epoch": 0.3029989939911362, "grad_norm": 0.033104367554187775, "grad_norm_var": 7.106157409900171e-06, "learning_rate": 0.006224504810572401, "loss": 2.6644, "step": 5572 }, { "crossentropy": 2.738580107688904, "epoch": 0.3030533728486364, "grad_norm": 0.03807663917541504, "grad_norm_var": 4.847081337513992e-06, "learning_rate": 0.006223297509359836, "loss": 2.7386, "step": 5573 }, { "crossentropy": 2.70255708694458, "epoch": 0.30310775170613663, "grad_norm": 0.040922340005636215, "grad_norm_var": 6.442482963444027e-06, "learning_rate": 0.0062220901322794986, "loss": 2.7026, "step": 5574 }, { "crossentropy": 2.652377724647522, "epoch": 0.30316213056363683, "grad_norm": 0.0327637754380703, "grad_norm_var": 6.910983353062653e-06, "learning_rate": 0.0062208826794062705, "loss": 2.6524, "step": 5575 }, { "crossentropy": 2.6584984064102173, "epoch": 0.30321650942113704, "grad_norm": 0.03406122699379921, "grad_norm_var": 6.54196219721555e-06, "learning_rate": 0.006219675150815038, "loss": 2.6585, "step": 5576 }, { "crossentropy": 2.6985472440719604, "epoch": 0.30327088827863724, "grad_norm": 0.0335768386721611, "grad_norm_var": 6.769384901907325e-06, "learning_rate": 0.006218467546580687, "loss": 2.6985, "step": 5577 }, { "crossentropy": 2.6156094074249268, "epoch": 0.30332526713613744, "grad_norm": 0.03533191233873367, "grad_norm_var": 6.3798332944969375e-06, "learning_rate": 0.0062172598667781164, "loss": 2.6156, "step": 5578 }, { "crossentropy": 2.6042176485061646, "epoch": 0.30337964599363765, "grad_norm": 0.034265320748090744, "grad_norm_var": 5.1967557355978486e-06, "learning_rate": 0.006216052111482224, "loss": 2.6042, "step": 5579 }, { "crossentropy": 2.616923213005066, "epoch": 0.30343402485113785, "grad_norm": 0.033514924347400665, "grad_norm_var": 4.646200458929597e-06, "learning_rate": 0.006214844280767914, "loss": 2.6169, "step": 5580 }, { "crossentropy": 2.6947357654571533, "epoch": 0.30348840370863805, "grad_norm": 0.03606502339243889, "grad_norm_var": 4.745270260236625e-06, "learning_rate": 0.006213636374710094, "loss": 2.6947, "step": 5581 }, { "crossentropy": 2.558526396751404, "epoch": 0.30354278256613826, "grad_norm": 0.03674788773059845, "grad_norm_var": 4.95172887226535e-06, "learning_rate": 0.006212428393383675, "loss": 2.5585, "step": 5582 }, { "crossentropy": 2.722580075263977, "epoch": 0.30359716142363846, "grad_norm": 0.03798322379589081, "grad_norm_var": 5.446174141047049e-06, "learning_rate": 0.006211220336863581, "loss": 2.7226, "step": 5583 }, { "crossentropy": 2.7299411296844482, "epoch": 0.3036515402811387, "grad_norm": 0.03487171232700348, "grad_norm_var": 5.0155494052086505e-06, "learning_rate": 0.00621001220522473, "loss": 2.7299, "step": 5584 }, { "crossentropy": 2.6229207515716553, "epoch": 0.3037059191386389, "grad_norm": 0.032571591436862946, "grad_norm_var": 5.1884603996389665e-06, "learning_rate": 0.006208803998542048, "loss": 2.6229, "step": 5585 }, { "crossentropy": 2.663128614425659, "epoch": 0.30376029799613913, "grad_norm": 0.033089399337768555, "grad_norm_var": 5.46490194765469e-06, "learning_rate": 0.00620759571689047, "loss": 2.6631, "step": 5586 }, { "crossentropy": 2.7096831798553467, "epoch": 0.30381467685363933, "grad_norm": 0.03475113958120346, "grad_norm_var": 5.4287267144528225e-06, "learning_rate": 0.006206387360344934, "loss": 2.7097, "step": 5587 }, { "crossentropy": 2.6863348484039307, "epoch": 0.30386905571113954, "grad_norm": 0.035427313297986984, "grad_norm_var": 5.145998105010365e-06, "learning_rate": 0.0062051789289803766, "loss": 2.6863, "step": 5588 }, { "crossentropy": 2.7692705392837524, "epoch": 0.30392343456863974, "grad_norm": 0.035382192581892014, "grad_norm_var": 4.584708757465692e-06, "learning_rate": 0.006203970422871745, "loss": 2.7693, "step": 5589 }, { "crossentropy": 2.651304006576538, "epoch": 0.30397781342613994, "grad_norm": 0.03544151410460472, "grad_norm_var": 2.1948209401045335e-06, "learning_rate": 0.006202761842093991, "loss": 2.6513, "step": 5590 }, { "crossentropy": 2.5848240852355957, "epoch": 0.30403219228364015, "grad_norm": 0.03435083106160164, "grad_norm_var": 1.9339926871373217e-06, "learning_rate": 0.006201553186722069, "loss": 2.5848, "step": 5591 }, { "crossentropy": 2.7281771898269653, "epoch": 0.30408657114114035, "grad_norm": 0.039753686636686325, "grad_norm_var": 3.368541257616336e-06, "learning_rate": 0.006200344456830938, "loss": 2.7282, "step": 5592 }, { "crossentropy": 2.6379963159561157, "epoch": 0.30414094999864055, "grad_norm": 0.03496472164988518, "grad_norm_var": 3.18943530280257e-06, "learning_rate": 0.0061991356524955625, "loss": 2.638, "step": 5593 }, { "crossentropy": 2.7306172847747803, "epoch": 0.30419532885614076, "grad_norm": 0.03842170163989067, "grad_norm_var": 3.806662500210831e-06, "learning_rate": 0.006197926773790914, "loss": 2.7306, "step": 5594 }, { "crossentropy": 2.685953974723816, "epoch": 0.30424970771364096, "grad_norm": 0.04063797742128372, "grad_norm_var": 5.316869194930268e-06, "learning_rate": 0.006196717820791961, "loss": 2.686, "step": 5595 }, { "crossentropy": 2.5697131156921387, "epoch": 0.30430408657114116, "grad_norm": 0.0372164212167263, "grad_norm_var": 5.009187688366443e-06, "learning_rate": 0.0061955087935736865, "loss": 2.5697, "step": 5596 }, { "crossentropy": 2.5966668128967285, "epoch": 0.30435846542864137, "grad_norm": 0.038104426115751266, "grad_norm_var": 5.258327200623134e-06, "learning_rate": 0.006194299692211069, "loss": 2.5967, "step": 5597 }, { "crossentropy": 2.6404606103897095, "epoch": 0.30441284428614157, "grad_norm": 0.034155335277318954, "grad_norm_var": 5.500162214177919e-06, "learning_rate": 0.006193090516779102, "loss": 2.6405, "step": 5598 }, { "crossentropy": 2.670303463935852, "epoch": 0.3044672231436418, "grad_norm": 0.03533371537923813, "grad_norm_var": 5.263095679454895e-06, "learning_rate": 0.006191881267352769, "loss": 2.6703, "step": 5599 }, { "crossentropy": 2.6988561153411865, "epoch": 0.304521602001142, "grad_norm": 0.03612269461154938, "grad_norm_var": 5.188621423359482e-06, "learning_rate": 0.006190671944007074, "loss": 2.6989, "step": 5600 }, { "crossentropy": 2.5804885625839233, "epoch": 0.3045759808586422, "grad_norm": 0.03601423650979996, "grad_norm_var": 4.36355250599448e-06, "learning_rate": 0.006189462546817014, "loss": 2.5805, "step": 5601 }, { "crossentropy": 2.5996795892715454, "epoch": 0.3046303597161424, "grad_norm": 0.04095960035920143, "grad_norm_var": 4.972810387313062e-06, "learning_rate": 0.006188253075857596, "loss": 2.5997, "step": 5602 }, { "crossentropy": 2.6792110204696655, "epoch": 0.3046847385736426, "grad_norm": 0.037419069558382034, "grad_norm_var": 4.7280322736524084e-06, "learning_rate": 0.006187043531203829, "loss": 2.6792, "step": 5603 }, { "crossentropy": 2.7500689029693604, "epoch": 0.3047391174311428, "grad_norm": 0.03533231094479561, "grad_norm_var": 4.746700982062822e-06, "learning_rate": 0.006185833912930731, "loss": 2.7501, "step": 5604 }, { "crossentropy": 2.651585102081299, "epoch": 0.304793496288643, "grad_norm": 0.03872257471084595, "grad_norm_var": 4.790056698295888e-06, "learning_rate": 0.0061846242211133174, "loss": 2.6516, "step": 5605 }, { "crossentropy": 2.557036280632019, "epoch": 0.3048478751461432, "grad_norm": 0.03603554144501686, "grad_norm_var": 4.68396645240763e-06, "learning_rate": 0.006183414455826615, "loss": 2.557, "step": 5606 }, { "crossentropy": 2.6298869848251343, "epoch": 0.3049022540036434, "grad_norm": 0.035798102617263794, "grad_norm_var": 4.285037977407499e-06, "learning_rate": 0.006182204617145651, "loss": 2.6299, "step": 5607 }, { "crossentropy": 2.632510781288147, "epoch": 0.3049566328611436, "grad_norm": 0.03625458851456642, "grad_norm_var": 3.852793389796707e-06, "learning_rate": 0.006180994705145457, "loss": 2.6325, "step": 5608 }, { "crossentropy": 2.7182165384292603, "epoch": 0.3050110117186438, "grad_norm": 0.03512876480817795, "grad_norm_var": 3.8106518672933442e-06, "learning_rate": 0.006179784719901076, "loss": 2.7182, "step": 5609 }, { "crossentropy": 2.624799609184265, "epoch": 0.305065390576144, "grad_norm": 0.033421844244003296, "grad_norm_var": 4.4109999677464005e-06, "learning_rate": 0.006178574661487544, "loss": 2.6248, "step": 5610 }, { "crossentropy": 2.7399874925613403, "epoch": 0.3051197694336442, "grad_norm": 0.042780108749866486, "grad_norm_var": 5.8322401968252205e-06, "learning_rate": 0.006177364529979912, "loss": 2.74, "step": 5611 }, { "crossentropy": 2.7550255060195923, "epoch": 0.3051741482911444, "grad_norm": 0.03919943794608116, "grad_norm_var": 6.188126092879914e-06, "learning_rate": 0.006176154325453228, "loss": 2.755, "step": 5612 }, { "crossentropy": 2.6970596313476562, "epoch": 0.3052285271486446, "grad_norm": 0.04220076650381088, "grad_norm_var": 7.881656265397383e-06, "learning_rate": 0.006174944047982549, "loss": 2.6971, "step": 5613 }, { "crossentropy": 2.641928195953369, "epoch": 0.3052829060061448, "grad_norm": 0.03820057585835457, "grad_norm_var": 7.273049031662702e-06, "learning_rate": 0.006173733697642937, "loss": 2.6419, "step": 5614 }, { "crossentropy": 2.6717424392700195, "epoch": 0.30533728486364503, "grad_norm": 0.036684755235910416, "grad_norm_var": 7.0090143243215405e-06, "learning_rate": 0.006172523274509455, "loss": 2.6717, "step": 5615 }, { "crossentropy": 2.6670641899108887, "epoch": 0.30539166372114523, "grad_norm": 0.034487396478652954, "grad_norm_var": 7.480206332377328e-06, "learning_rate": 0.006171312778657172, "loss": 2.6671, "step": 5616 }, { "crossentropy": 2.63256573677063, "epoch": 0.30544604257864544, "grad_norm": 0.035560667514801025, "grad_norm_var": 7.577775289221063e-06, "learning_rate": 0.006170102210161164, "loss": 2.6326, "step": 5617 }, { "crossentropy": 2.6112459897994995, "epoch": 0.30550042143614564, "grad_norm": 0.03466442599892616, "grad_norm_var": 7.05560670451281e-06, "learning_rate": 0.006168891569096505, "loss": 2.6112, "step": 5618 }, { "crossentropy": 2.6037416458129883, "epoch": 0.30555480029364585, "grad_norm": 0.03351050242781639, "grad_norm_var": 7.788465376034938e-06, "learning_rate": 0.006167680855538283, "loss": 2.6037, "step": 5619 }, { "crossentropy": 2.6226836442947388, "epoch": 0.30560917915114605, "grad_norm": 0.033202849328517914, "grad_norm_var": 8.474087263173e-06, "learning_rate": 0.006166470069561582, "loss": 2.6227, "step": 5620 }, { "crossentropy": 2.5784730911254883, "epoch": 0.30566355800864625, "grad_norm": 0.03500015288591385, "grad_norm_var": 8.294476399088532e-06, "learning_rate": 0.006165259211241495, "loss": 2.5785, "step": 5621 }, { "crossentropy": 2.6869399547576904, "epoch": 0.30571793686614646, "grad_norm": 0.037580180913209915, "grad_norm_var": 8.372004152601362e-06, "learning_rate": 0.006164048280653118, "loss": 2.6869, "step": 5622 }, { "crossentropy": 2.6816786527633667, "epoch": 0.30577231572364666, "grad_norm": 0.037627074867486954, "grad_norm_var": 8.414860229930682e-06, "learning_rate": 0.006162837277871552, "loss": 2.6817, "step": 5623 }, { "crossentropy": 2.6130915880203247, "epoch": 0.30582669458114686, "grad_norm": 0.03315235301852226, "grad_norm_var": 9.156745430223082e-06, "learning_rate": 0.006161626202971903, "loss": 2.6131, "step": 5624 }, { "crossentropy": 2.715200662612915, "epoch": 0.30588107343864707, "grad_norm": 0.03709230571985245, "grad_norm_var": 9.064867047348102e-06, "learning_rate": 0.006160415056029278, "loss": 2.7152, "step": 5625 }, { "crossentropy": 2.535356283187866, "epoch": 0.30593545229614727, "grad_norm": 0.0353488028049469, "grad_norm_var": 8.500208765112778e-06, "learning_rate": 0.0061592038371187955, "loss": 2.5354, "step": 5626 }, { "crossentropy": 2.634174942970276, "epoch": 0.3059898311536475, "grad_norm": 0.03431766480207443, "grad_norm_var": 6.051667311441726e-06, "learning_rate": 0.00615799254631557, "loss": 2.6342, "step": 5627 }, { "crossentropy": 2.656317353248596, "epoch": 0.3060442100111477, "grad_norm": 0.03391463682055473, "grad_norm_var": 5.623374415567965e-06, "learning_rate": 0.006156781183694726, "loss": 2.6563, "step": 5628 }, { "crossentropy": 2.676317811012268, "epoch": 0.3060985888686479, "grad_norm": 0.03564510866999626, "grad_norm_var": 2.7006589754976473e-06, "learning_rate": 0.0061555697493313934, "loss": 2.6763, "step": 5629 }, { "crossentropy": 2.5444581508636475, "epoch": 0.3061529677261481, "grad_norm": 0.03447960317134857, "grad_norm_var": 2.1638322231111913e-06, "learning_rate": 0.0061543582433007, "loss": 2.5445, "step": 5630 }, { "crossentropy": 2.6512798070907593, "epoch": 0.3062073465836483, "grad_norm": 0.051563799381256104, "grad_norm_var": 1.906152052894335e-05, "learning_rate": 0.006153146665677785, "loss": 2.6513, "step": 5631 }, { "crossentropy": 2.7007975578308105, "epoch": 0.3062617254411485, "grad_norm": 0.04083460941910744, "grad_norm_var": 2.0238659614275437e-05, "learning_rate": 0.006151935016537789, "loss": 2.7008, "step": 5632 }, { "crossentropy": 2.70012104511261, "epoch": 0.3063161042986487, "grad_norm": 0.03755280375480652, "grad_norm_var": 2.0245581694924854e-05, "learning_rate": 0.00615072329595586, "loss": 2.7001, "step": 5633 }, { "crossentropy": 2.762192964553833, "epoch": 0.3063704831561489, "grad_norm": 0.03391828387975693, "grad_norm_var": 2.047223556968693e-05, "learning_rate": 0.006149511504007142, "loss": 2.7622, "step": 5634 }, { "crossentropy": 2.6599104404449463, "epoch": 0.3064248620136491, "grad_norm": 0.03425037860870361, "grad_norm_var": 2.0206967628984506e-05, "learning_rate": 0.006148299640766792, "loss": 2.6599, "step": 5635 }, { "crossentropy": 2.58146071434021, "epoch": 0.3064792408711493, "grad_norm": 0.03568333387374878, "grad_norm_var": 1.94704418033607e-05, "learning_rate": 0.0061470877063099704, "loss": 2.5815, "step": 5636 }, { "crossentropy": 2.497122883796692, "epoch": 0.3065336197286495, "grad_norm": 0.03329605981707573, "grad_norm_var": 2.004897209067069e-05, "learning_rate": 0.0061458757007118395, "loss": 2.4971, "step": 5637 }, { "crossentropy": 2.660246253013611, "epoch": 0.3065879985861497, "grad_norm": 0.03282066062092781, "grad_norm_var": 2.086881957408289e-05, "learning_rate": 0.006144663624047563, "loss": 2.6602, "step": 5638 }, { "crossentropy": 2.5097551345825195, "epoch": 0.3066423774436499, "grad_norm": 0.03370685502886772, "grad_norm_var": 2.1158456200938828e-05, "learning_rate": 0.006143451476392317, "loss": 2.5098, "step": 5639 }, { "crossentropy": 2.683839201927185, "epoch": 0.3066967563011501, "grad_norm": 0.03747024014592171, "grad_norm_var": 2.0627519529261135e-05, "learning_rate": 0.006142239257821277, "loss": 2.6838, "step": 5640 }, { "crossentropy": 2.6685373783111572, "epoch": 0.3067511351586503, "grad_norm": 0.03435178101062775, "grad_norm_var": 2.0832423773170922e-05, "learning_rate": 0.006141026968409623, "loss": 2.6685, "step": 5641 }, { "crossentropy": 2.5500115156173706, "epoch": 0.3068055140161505, "grad_norm": 0.03403793275356293, "grad_norm_var": 2.1088101355250648e-05, "learning_rate": 0.00613981460823254, "loss": 2.55, "step": 5642 }, { "crossentropy": 2.596098780632019, "epoch": 0.30685989287365073, "grad_norm": 0.0361584909260273, "grad_norm_var": 2.0858689642179693e-05, "learning_rate": 0.006138602177365218, "loss": 2.5961, "step": 5643 }, { "crossentropy": 2.70418119430542, "epoch": 0.30691427173115093, "grad_norm": 0.04009092599153519, "grad_norm_var": 2.1335899597083446e-05, "learning_rate": 0.006137389675882851, "loss": 2.7042, "step": 5644 }, { "crossentropy": 2.6999257802963257, "epoch": 0.30696865058865114, "grad_norm": 0.038299042731523514, "grad_norm_var": 2.1432444789181287e-05, "learning_rate": 0.006136177103860634, "loss": 2.6999, "step": 5645 }, { "crossentropy": 2.5822322368621826, "epoch": 0.30702302944615134, "grad_norm": 0.035267606377601624, "grad_norm_var": 2.122932956652995e-05, "learning_rate": 0.006134964461373776, "loss": 2.5822, "step": 5646 }, { "crossentropy": 2.6092811822891235, "epoch": 0.30707740830365154, "grad_norm": 0.03661565110087395, "grad_norm_var": 5.831880598313142e-06, "learning_rate": 0.006133751748497477, "loss": 2.6093, "step": 5647 }, { "crossentropy": 2.7032586336135864, "epoch": 0.30713178716115175, "grad_norm": 0.036583226174116135, "grad_norm_var": 4.1627265882295445e-06, "learning_rate": 0.006132538965306954, "loss": 2.7033, "step": 5648 }, { "crossentropy": 2.7648574113845825, "epoch": 0.30718616601865195, "grad_norm": 0.038197435438632965, "grad_norm_var": 4.3538401401548944e-06, "learning_rate": 0.006131326111877417, "loss": 2.7649, "step": 5649 }, { "crossentropy": 2.620033621788025, "epoch": 0.30724054487615216, "grad_norm": 0.03252962976694107, "grad_norm_var": 4.799022608703342e-06, "learning_rate": 0.006130113188284094, "loss": 2.62, "step": 5650 }, { "crossentropy": 2.5104331970214844, "epoch": 0.30729492373365236, "grad_norm": 0.03713693842291832, "grad_norm_var": 4.806142933405586e-06, "learning_rate": 0.0061289001946022, "loss": 2.5104, "step": 5651 }, { "crossentropy": 2.6626542806625366, "epoch": 0.30734930259115256, "grad_norm": 0.03411993756890297, "grad_norm_var": 4.976005166577174e-06, "learning_rate": 0.006127687130906972, "loss": 2.6627, "step": 5652 }, { "crossentropy": 2.6700856685638428, "epoch": 0.30740368144865277, "grad_norm": 0.032853350043296814, "grad_norm_var": 5.128244868040678e-06, "learning_rate": 0.006126473997273637, "loss": 2.6701, "step": 5653 }, { "crossentropy": 2.5984833240509033, "epoch": 0.30745806030615297, "grad_norm": 0.03575175255537033, "grad_norm_var": 4.5633759524849046e-06, "learning_rate": 0.006125260793777437, "loss": 2.5985, "step": 5654 }, { "crossentropy": 2.7005724906921387, "epoch": 0.3075124391636532, "grad_norm": 0.03787742927670479, "grad_norm_var": 4.47364574461794e-06, "learning_rate": 0.00612404752049361, "loss": 2.7006, "step": 5655 }, { "crossentropy": 2.5234192609786987, "epoch": 0.3075668180211534, "grad_norm": 0.036576047539711, "grad_norm_var": 4.358324499890929e-06, "learning_rate": 0.006122834177497403, "loss": 2.5234, "step": 5656 }, { "crossentropy": 2.661360502243042, "epoch": 0.3076211968786536, "grad_norm": 0.037245962768793106, "grad_norm_var": 4.235024679643864e-06, "learning_rate": 0.00612162076486407, "loss": 2.6614, "step": 5657 }, { "crossentropy": 2.5421537160873413, "epoch": 0.3076755757361538, "grad_norm": 0.03869206830859184, "grad_norm_var": 4.241679367566973e-06, "learning_rate": 0.006120407282668861, "loss": 2.5422, "step": 5658 }, { "crossentropy": 2.635301351547241, "epoch": 0.307729954593654, "grad_norm": 0.03733363747596741, "grad_norm_var": 4.274524311634331e-06, "learning_rate": 0.006119193730987036, "loss": 2.6353, "step": 5659 }, { "crossentropy": 2.6235753297805786, "epoch": 0.3077843334511542, "grad_norm": 0.034817516803741455, "grad_norm_var": 3.539164740480836e-06, "learning_rate": 0.00611798010989386, "loss": 2.6236, "step": 5660 }, { "crossentropy": 2.6634775400161743, "epoch": 0.3078387123086544, "grad_norm": 0.0332297682762146, "grad_norm_var": 3.7559652341649537e-06, "learning_rate": 0.006116766419464601, "loss": 2.6635, "step": 5661 }, { "crossentropy": 2.686119556427002, "epoch": 0.3078930911661546, "grad_norm": 0.03574084863066673, "grad_norm_var": 3.728371507853249e-06, "learning_rate": 0.006115552659774529, "loss": 2.6861, "step": 5662 }, { "crossentropy": 2.688259720802307, "epoch": 0.3079474700236548, "grad_norm": 0.03744862601161003, "grad_norm_var": 3.8449639035912735e-06, "learning_rate": 0.006114338830898921, "loss": 2.6883, "step": 5663 }, { "crossentropy": 2.57821261882782, "epoch": 0.308001848881155, "grad_norm": 0.036678023636341095, "grad_norm_var": 3.852791349552055e-06, "learning_rate": 0.006113124932913058, "loss": 2.5782, "step": 5664 }, { "crossentropy": 2.659840703010559, "epoch": 0.3080562277386552, "grad_norm": 0.03749805688858032, "grad_norm_var": 3.679784589299441e-06, "learning_rate": 0.006111910965892224, "loss": 2.6598, "step": 5665 }, { "crossentropy": 2.5354106426239014, "epoch": 0.3081106065961554, "grad_norm": 0.0346897654235363, "grad_norm_var": 2.9803596603703483e-06, "learning_rate": 0.006110696929911709, "loss": 2.5354, "step": 5666 }, { "crossentropy": 2.6063915491104126, "epoch": 0.3081649854536556, "grad_norm": 0.08006487041711807, "grad_norm_var": 0.00012405886912590362, "learning_rate": 0.006109482825046805, "loss": 2.6064, "step": 5667 }, { "crossentropy": 2.5647268295288086, "epoch": 0.3082193643111558, "grad_norm": 0.03491934388875961, "grad_norm_var": 0.00012360118826902888, "learning_rate": 0.006108268651372812, "loss": 2.5647, "step": 5668 }, { "crossentropy": 2.773268699645996, "epoch": 0.308273743168656, "grad_norm": 0.03771255910396576, "grad_norm_var": 0.00012119914358120742, "learning_rate": 0.006107054408965029, "loss": 2.7733, "step": 5669 }, { "crossentropy": 2.7031638622283936, "epoch": 0.3083281220261562, "grad_norm": 0.049420248717069626, "grad_norm_var": 0.00012669678189567568, "learning_rate": 0.006105840097898764, "loss": 2.7032, "step": 5670 }, { "crossentropy": 2.5845459699630737, "epoch": 0.30838250088365643, "grad_norm": 0.036480069160461426, "grad_norm_var": 0.00012721364334315454, "learning_rate": 0.006104625718249327, "loss": 2.5845, "step": 5671 }, { "crossentropy": 2.6145495176315308, "epoch": 0.30843687974115663, "grad_norm": 0.03771918639540672, "grad_norm_var": 0.0001267872800710667, "learning_rate": 0.006103411270092034, "loss": 2.6145, "step": 5672 }, { "crossentropy": 2.632825016975403, "epoch": 0.30849125859865684, "grad_norm": 0.06590016186237335, "grad_norm_var": 0.00016765565283634876, "learning_rate": 0.0061021967535022014, "loss": 2.6328, "step": 5673 }, { "crossentropy": 2.6398123502731323, "epoch": 0.30854563745615704, "grad_norm": 0.03438350930809975, "grad_norm_var": 0.0001705849649261124, "learning_rate": 0.006100982168555155, "loss": 2.6398, "step": 5674 }, { "crossentropy": 2.7261838912963867, "epoch": 0.30860001631365724, "grad_norm": 0.03605267405509949, "grad_norm_var": 0.0001713994998688774, "learning_rate": 0.00609976751532622, "loss": 2.7262, "step": 5675 }, { "crossentropy": 2.6242177486419678, "epoch": 0.30865439517115745, "grad_norm": 0.03585895895957947, "grad_norm_var": 0.00017055016783022435, "learning_rate": 0.00609855279389073, "loss": 2.6242, "step": 5676 }, { "crossentropy": 2.644997000694275, "epoch": 0.30870877402865765, "grad_norm": 0.03435184434056282, "grad_norm_var": 0.00016939344946427417, "learning_rate": 0.006097338004324018, "loss": 2.645, "step": 5677 }, { "crossentropy": 2.640005946159363, "epoch": 0.30876315288615785, "grad_norm": 0.03288961201906204, "grad_norm_var": 0.0001721128031551257, "learning_rate": 0.006096123146701428, "loss": 2.64, "step": 5678 }, { "crossentropy": 2.620866060256958, "epoch": 0.30881753174365806, "grad_norm": 0.034582603722810745, "grad_norm_var": 0.00017412820581962456, "learning_rate": 0.006094908221098301, "loss": 2.6209, "step": 5679 }, { "crossentropy": 2.585369110107422, "epoch": 0.30887191060115826, "grad_norm": 0.03453095257282257, "grad_norm_var": 0.00017571088602269316, "learning_rate": 0.006093693227589987, "loss": 2.5854, "step": 5680 }, { "crossentropy": 2.7035797834396362, "epoch": 0.30892628945865847, "grad_norm": 0.03574617952108383, "grad_norm_var": 0.00017673609322591636, "learning_rate": 0.006092478166251839, "loss": 2.7036, "step": 5681 }, { "crossentropy": 2.6045295000076294, "epoch": 0.30898066831615867, "grad_norm": 0.03790178894996643, "grad_norm_var": 0.00017469709757987796, "learning_rate": 0.006091263037159214, "loss": 2.6045, "step": 5682 }, { "crossentropy": 2.614842414855957, "epoch": 0.30903504717365887, "grad_norm": 0.03589208796620369, "grad_norm_var": 6.749435763392837e-05, "learning_rate": 0.006090047840387473, "loss": 2.6148, "step": 5683 }, { "crossentropy": 2.7271578311920166, "epoch": 0.3090894260311591, "grad_norm": 0.034247517585754395, "grad_norm_var": 6.783402727754661e-05, "learning_rate": 0.00608883257601198, "loss": 2.7272, "step": 5684 }, { "crossentropy": 2.649125099182129, "epoch": 0.3091438048886593, "grad_norm": 0.03388328477740288, "grad_norm_var": 6.907817656849006e-05, "learning_rate": 0.006087617244108107, "loss": 2.6491, "step": 5685 }, { "crossentropy": 2.6765555143356323, "epoch": 0.3091981837461595, "grad_norm": 0.03520853444933891, "grad_norm_var": 6.0279296560700595e-05, "learning_rate": 0.0060864018447512255, "loss": 2.6766, "step": 5686 }, { "crossentropy": 2.516327977180481, "epoch": 0.3092525626036597, "grad_norm": 0.03345317766070366, "grad_norm_var": 6.115330001253602e-05, "learning_rate": 0.0060851863780167145, "loss": 2.5163, "step": 5687 }, { "crossentropy": 2.5856488943099976, "epoch": 0.3093069414611599, "grad_norm": 0.033188894391059875, "grad_norm_var": 6.202433479585105e-05, "learning_rate": 0.006083970843979956, "loss": 2.5856, "step": 5688 }, { "crossentropy": 2.6587623357772827, "epoch": 0.3093613203186601, "grad_norm": 0.036795467138290405, "grad_norm_var": 1.8635732550268195e-06, "learning_rate": 0.006082755242716338, "loss": 2.6588, "step": 5689 }, { "crossentropy": 2.6458834409713745, "epoch": 0.3094156991761603, "grad_norm": 0.035072945058345795, "grad_norm_var": 1.84254448586732e-06, "learning_rate": 0.0060815395743012484, "loss": 2.6459, "step": 5690 }, { "crossentropy": 2.6987416744232178, "epoch": 0.3094700780336605, "grad_norm": 0.03412461280822754, "grad_norm_var": 1.798748536694511e-06, "learning_rate": 0.0060803238388100825, "loss": 2.6987, "step": 5691 }, { "crossentropy": 2.6528210639953613, "epoch": 0.3095244568911607, "grad_norm": 0.03556521236896515, "grad_norm_var": 1.7649388377484476e-06, "learning_rate": 0.006079108036318238, "loss": 2.6528, "step": 5692 }, { "crossentropy": 2.6869451999664307, "epoch": 0.3095788357486609, "grad_norm": 0.1117786094546318, "grad_norm_var": 0.00037141033740239695, "learning_rate": 0.006077892166901121, "loss": 2.6869, "step": 5693 }, { "crossentropy": 2.527710795402527, "epoch": 0.3096332146061611, "grad_norm": 0.043248746544122696, "grad_norm_var": 0.0003687399097042889, "learning_rate": 0.006076676230634136, "loss": 2.5277, "step": 5694 }, { "crossentropy": 2.6954644918441772, "epoch": 0.3096875934636613, "grad_norm": 0.04734042286872864, "grad_norm_var": 0.00036914227943301333, "learning_rate": 0.006075460227592695, "loss": 2.6955, "step": 5695 }, { "crossentropy": 2.5957239866256714, "epoch": 0.3097419723211615, "grad_norm": 0.03997739031910896, "grad_norm_var": 0.0003662086959934046, "learning_rate": 0.006074244157852213, "loss": 2.5957, "step": 5696 }, { "crossentropy": 2.5681623220443726, "epoch": 0.3097963511786617, "grad_norm": 0.043515246361494064, "grad_norm_var": 0.0003640580889226747, "learning_rate": 0.006073028021488111, "loss": 2.5682, "step": 5697 }, { "crossentropy": 2.603311061859131, "epoch": 0.3098507300361619, "grad_norm": 0.04518356919288635, "grad_norm_var": 0.0003634420525180162, "learning_rate": 0.0060718118185758094, "loss": 2.6033, "step": 5698 }, { "crossentropy": 2.696494698524475, "epoch": 0.30990510889366213, "grad_norm": 0.041733257472515106, "grad_norm_var": 0.00036050231164028076, "learning_rate": 0.0060705955491907404, "loss": 2.6965, "step": 5699 }, { "crossentropy": 2.745758295059204, "epoch": 0.30995948775116233, "grad_norm": 0.03820100054144859, "grad_norm_var": 0.00035698682567463175, "learning_rate": 0.006069379213408333, "loss": 2.7458, "step": 5700 }, { "crossentropy": 2.634367346763611, "epoch": 0.31001386660866254, "grad_norm": 0.036478590220212936, "grad_norm_var": 0.00035424719909939954, "learning_rate": 0.0060681628113040224, "loss": 2.6344, "step": 5701 }, { "crossentropy": 2.5900213718414307, "epoch": 0.31006824546616274, "grad_norm": 0.03735998645424843, "grad_norm_var": 0.00035225005576509737, "learning_rate": 0.006066946342953253, "loss": 2.59, "step": 5702 }, { "crossentropy": 2.52286159992218, "epoch": 0.31012262432366294, "grad_norm": 0.03356816619634628, "grad_norm_var": 0.0003520997045441926, "learning_rate": 0.006065729808431463, "loss": 2.5229, "step": 5703 }, { "crossentropy": 2.6640409231185913, "epoch": 0.31017700318116315, "grad_norm": 0.036855313926935196, "grad_norm_var": 0.0003479868476922424, "learning_rate": 0.006064513207814104, "loss": 2.664, "step": 5704 }, { "crossentropy": 2.5860629081726074, "epoch": 0.31023138203866335, "grad_norm": 0.03703601658344269, "grad_norm_var": 0.0003477738272430248, "learning_rate": 0.00606329654117663, "loss": 2.5861, "step": 5705 }, { "crossentropy": 2.542502999305725, "epoch": 0.31028576089616355, "grad_norm": 0.03537716716527939, "grad_norm_var": 0.0003474351512228482, "learning_rate": 0.006062079808594494, "loss": 2.5425, "step": 5706 }, { "crossentropy": 2.6931530237197876, "epoch": 0.31034013975366376, "grad_norm": 0.034340888261795044, "grad_norm_var": 0.0003471652981437797, "learning_rate": 0.00606086301014316, "loss": 2.6932, "step": 5707 }, { "crossentropy": 2.6864705085754395, "epoch": 0.31039451861116396, "grad_norm": 0.034125134348869324, "grad_norm_var": 0.000348837189977165, "learning_rate": 0.00605964614589809, "loss": 2.6865, "step": 5708 }, { "crossentropy": 2.6098783016204834, "epoch": 0.31044889746866416, "grad_norm": 0.03406692296266556, "grad_norm_var": 1.8885934676604543e-05, "learning_rate": 0.006058429215934755, "loss": 2.6099, "step": 5709 }, { "crossentropy": 2.6462482213974, "epoch": 0.31050327632616437, "grad_norm": 0.03385983407497406, "grad_norm_var": 1.863906247386952e-05, "learning_rate": 0.006057212220328627, "loss": 2.6462, "step": 5710 }, { "crossentropy": 2.6489691734313965, "epoch": 0.31055765518366457, "grad_norm": 0.03821029141545296, "grad_norm_var": 1.255596988325071e-05, "learning_rate": 0.006055995159155184, "loss": 2.649, "step": 5711 }, { "crossentropy": 2.652207136154175, "epoch": 0.3106120340411648, "grad_norm": 0.038311757147312164, "grad_norm_var": 1.2177632123516515e-05, "learning_rate": 0.006054778032489902, "loss": 2.6522, "step": 5712 }, { "crossentropy": 2.7245200872421265, "epoch": 0.310666412898665, "grad_norm": 0.03719841316342354, "grad_norm_var": 9.51168897579957e-06, "learning_rate": 0.006053560840408272, "loss": 2.7245, "step": 5713 }, { "crossentropy": 2.6569820642471313, "epoch": 0.3107207917561652, "grad_norm": 0.03698857128620148, "grad_norm_var": 4.760754102459131e-06, "learning_rate": 0.006052343582985781, "loss": 2.657, "step": 5714 }, { "crossentropy": 2.622883439064026, "epoch": 0.3107751706136654, "grad_norm": 0.037960514426231384, "grad_norm_var": 3.0087791465846965e-06, "learning_rate": 0.006051126260297921, "loss": 2.6229, "step": 5715 }, { "crossentropy": 2.6498740911483765, "epoch": 0.3108295494711656, "grad_norm": 0.035792026668787, "grad_norm_var": 2.7435885336981234e-06, "learning_rate": 0.006049908872420191, "loss": 2.6499, "step": 5716 }, { "crossentropy": 2.6802685260772705, "epoch": 0.3108839283286658, "grad_norm": 0.03406749665737152, "grad_norm_var": 2.9838008124851687e-06, "learning_rate": 0.006048691419428091, "loss": 2.6803, "step": 5717 }, { "crossentropy": 2.6669284105300903, "epoch": 0.310938307186166, "grad_norm": 0.03628458082675934, "grad_norm_var": 2.8531772027551305e-06, "learning_rate": 0.006047473901397129, "loss": 2.6669, "step": 5718 }, { "crossentropy": 2.773347496986389, "epoch": 0.3109926860436662, "grad_norm": 0.034559465944767, "grad_norm_var": 2.6093365648655053e-06, "learning_rate": 0.00604625631840281, "loss": 2.7733, "step": 5719 }, { "crossentropy": 2.5924606323242188, "epoch": 0.3110470649011664, "grad_norm": 0.03527165204286575, "grad_norm_var": 2.5727386159324697e-06, "learning_rate": 0.00604503867052065, "loss": 2.5925, "step": 5720 }, { "crossentropy": 2.690038800239563, "epoch": 0.3111014437586666, "grad_norm": 0.03453565016388893, "grad_norm_var": 2.564971139882438e-06, "learning_rate": 0.006043820957826166, "loss": 2.69, "step": 5721 }, { "crossentropy": 2.64851176738739, "epoch": 0.3111558226161668, "grad_norm": 0.03374357894062996, "grad_norm_var": 2.7986777655622864e-06, "learning_rate": 0.006042603180394879, "loss": 2.6485, "step": 5722 }, { "crossentropy": 2.6080652475357056, "epoch": 0.311210201473667, "grad_norm": 0.03330264613032341, "grad_norm_var": 3.0379007108607045e-06, "learning_rate": 0.006041385338302316, "loss": 2.6081, "step": 5723 }, { "crossentropy": 2.6273396015167236, "epoch": 0.3112645803311672, "grad_norm": 0.034028299152851105, "grad_norm_var": 3.05646292901654e-06, "learning_rate": 0.0060401674316240056, "loss": 2.6273, "step": 5724 }, { "crossentropy": 2.6441245079040527, "epoch": 0.3113189591886674, "grad_norm": 0.03689829632639885, "grad_norm_var": 3.0122077758926517e-06, "learning_rate": 0.006038949460435479, "loss": 2.6441, "step": 5725 }, { "crossentropy": 2.460519313812256, "epoch": 0.3113733380461676, "grad_norm": 0.03489233925938606, "grad_norm_var": 2.827114526962604e-06, "learning_rate": 0.006037731424812276, "loss": 2.4605, "step": 5726 }, { "crossentropy": 2.5444215536117554, "epoch": 0.31142771690366783, "grad_norm": 0.036253128200769424, "grad_norm_var": 2.425237825287585e-06, "learning_rate": 0.0060365133248299375, "loss": 2.5444, "step": 5727 }, { "crossentropy": 2.556104063987732, "epoch": 0.31148209576116803, "grad_norm": 0.03437647596001625, "grad_norm_var": 1.98628708043911e-06, "learning_rate": 0.006035295160564009, "loss": 2.5561, "step": 5728 }, { "crossentropy": 2.761210083961487, "epoch": 0.31153647461866824, "grad_norm": 0.033702246844768524, "grad_norm_var": 1.904703221459562e-06, "learning_rate": 0.006034076932090039, "loss": 2.7612, "step": 5729 }, { "crossentropy": 2.7327758073806763, "epoch": 0.31159085347616844, "grad_norm": 0.03742344304919243, "grad_norm_var": 2.022197272906883e-06, "learning_rate": 0.006032858639483583, "loss": 2.7328, "step": 5730 }, { "crossentropy": 2.577093005180359, "epoch": 0.31164523233366864, "grad_norm": 0.03775275871157646, "grad_norm_var": 1.9482393161053545e-06, "learning_rate": 0.006031640282820197, "loss": 2.5771, "step": 5731 }, { "crossentropy": 2.5613105297088623, "epoch": 0.31169961119116885, "grad_norm": 0.03700124844908714, "grad_norm_var": 2.1382635325778444e-06, "learning_rate": 0.006030421862175442, "loss": 2.5613, "step": 5732 }, { "crossentropy": 2.6935181617736816, "epoch": 0.3117539900486691, "grad_norm": 0.0371716283261776, "grad_norm_var": 2.2486575099131893e-06, "learning_rate": 0.006029203377624885, "loss": 2.6935, "step": 5733 }, { "crossentropy": 2.597765326499939, "epoch": 0.3118083689061693, "grad_norm": 0.034069545567035675, "grad_norm_var": 2.3087755889905985e-06, "learning_rate": 0.006027984829244092, "loss": 2.5978, "step": 5734 }, { "crossentropy": 2.6259013414382935, "epoch": 0.3118627477636695, "grad_norm": 0.03406551107764244, "grad_norm_var": 2.3735479296439766e-06, "learning_rate": 0.00602676621710864, "loss": 2.6259, "step": 5735 }, { "crossentropy": 2.626110076904297, "epoch": 0.3119171266211697, "grad_norm": 0.03545881435275078, "grad_norm_var": 2.3755157881827707e-06, "learning_rate": 0.006025547541294103, "loss": 2.6261, "step": 5736 }, { "crossentropy": 2.6318775415420532, "epoch": 0.3119715054786699, "grad_norm": 0.03839583322405815, "grad_norm_var": 2.9174264631600644e-06, "learning_rate": 0.0060243288018760645, "loss": 2.6319, "step": 5737 }, { "crossentropy": 2.6227670907974243, "epoch": 0.3120258843361701, "grad_norm": 0.03260580822825432, "grad_norm_var": 3.269868110898575e-06, "learning_rate": 0.006023109998930108, "loss": 2.6228, "step": 5738 }, { "crossentropy": 2.6065789461135864, "epoch": 0.3120802631936703, "grad_norm": 0.03572681546211243, "grad_norm_var": 2.939081826299391e-06, "learning_rate": 0.006021891132531825, "loss": 2.6066, "step": 5739 }, { "crossentropy": 2.6693203449249268, "epoch": 0.31213464205117053, "grad_norm": 0.036335572600364685, "grad_norm_var": 2.7840166134160867e-06, "learning_rate": 0.006020672202756805, "loss": 2.6693, "step": 5740 }, { "crossentropy": 2.6015377044677734, "epoch": 0.31218902090867073, "grad_norm": 0.03748766705393791, "grad_norm_var": 2.8953269176787882e-06, "learning_rate": 0.006019453209680648, "loss": 2.6015, "step": 5741 }, { "crossentropy": 2.6256003379821777, "epoch": 0.31224339976617094, "grad_norm": 0.03446997329592705, "grad_norm_var": 2.95730614141565e-06, "learning_rate": 0.006018234153378952, "loss": 2.6256, "step": 5742 }, { "crossentropy": 2.740735173225403, "epoch": 0.31229777862367114, "grad_norm": 0.033291250467300415, "grad_norm_var": 3.3142248035688523e-06, "learning_rate": 0.006017015033927323, "loss": 2.7407, "step": 5743 }, { "crossentropy": 2.5712356567382812, "epoch": 0.31235215748117134, "grad_norm": 0.03229774534702301, "grad_norm_var": 3.918814194497679e-06, "learning_rate": 0.00601579585140137, "loss": 2.5712, "step": 5744 }, { "crossentropy": 2.706214666366577, "epoch": 0.31240653633867155, "grad_norm": 0.03388994559645653, "grad_norm_var": 3.877188599114787e-06, "learning_rate": 0.006014576605876706, "loss": 2.7062, "step": 5745 }, { "crossentropy": 2.7103723287582397, "epoch": 0.31246091519617175, "grad_norm": 0.034815967082977295, "grad_norm_var": 3.621320063031582e-06, "learning_rate": 0.006013357297428947, "loss": 2.7104, "step": 5746 }, { "crossentropy": 2.6237136125564575, "epoch": 0.31251529405367195, "grad_norm": 0.0336078405380249, "grad_norm_var": 3.3408069144299477e-06, "learning_rate": 0.006012137926133712, "loss": 2.6237, "step": 5747 }, { "crossentropy": 2.556052088737488, "epoch": 0.31256967291117216, "grad_norm": 0.03546027094125748, "grad_norm_var": 3.0869119004850454e-06, "learning_rate": 0.0060109184920666275, "loss": 2.5561, "step": 5748 }, { "crossentropy": 2.680623173713684, "epoch": 0.31262405176867236, "grad_norm": 0.03354961425065994, "grad_norm_var": 2.832442517799495e-06, "learning_rate": 0.00600969899530332, "loss": 2.6806, "step": 5749 }, { "crossentropy": 2.644422769546509, "epoch": 0.31267843062617257, "grad_norm": 0.031663525849580765, "grad_norm_var": 3.403082101797322e-06, "learning_rate": 0.006008479435919423, "loss": 2.6444, "step": 5750 }, { "crossentropy": 2.5971686840057373, "epoch": 0.31273280948367277, "grad_norm": 0.034178476780653, "grad_norm_var": 3.396278991466689e-06, "learning_rate": 0.0060072598139905686, "loss": 2.5972, "step": 5751 }, { "crossentropy": 2.63762366771698, "epoch": 0.312787188341173, "grad_norm": 0.03746705502271652, "grad_norm_var": 3.884410563935401e-06, "learning_rate": 0.0060060401295924035, "loss": 2.6376, "step": 5752 }, { "crossentropy": 2.645871877670288, "epoch": 0.3128415671986732, "grad_norm": 0.03571955859661102, "grad_norm_var": 3.0142218729542944e-06, "learning_rate": 0.006004820382800566, "loss": 2.6459, "step": 5753 }, { "crossentropy": 2.700477361679077, "epoch": 0.3128959460561734, "grad_norm": 0.03404427319765091, "grad_norm_var": 2.773450788953226e-06, "learning_rate": 0.006003600573690704, "loss": 2.7005, "step": 5754 }, { "crossentropy": 2.6490646600723267, "epoch": 0.3129503249136736, "grad_norm": 0.03489965945482254, "grad_norm_var": 2.6947343134316066e-06, "learning_rate": 0.006002380702338471, "loss": 2.6491, "step": 5755 }, { "crossentropy": 2.628902554512024, "epoch": 0.3130047037711738, "grad_norm": 0.03322313725948334, "grad_norm_var": 2.569004881865638e-06, "learning_rate": 0.006001160768819521, "loss": 2.6289, "step": 5756 }, { "crossentropy": 2.601754665374756, "epoch": 0.313059082628674, "grad_norm": 0.03435051068663597, "grad_norm_var": 1.8838489019155796e-06, "learning_rate": 0.005999940773209516, "loss": 2.6018, "step": 5757 }, { "crossentropy": 2.56512451171875, "epoch": 0.3131134614861742, "grad_norm": 0.0341416671872139, "grad_norm_var": 1.878025647407642e-06, "learning_rate": 0.005998720715584113, "loss": 2.5651, "step": 5758 }, { "crossentropy": 2.6723721027374268, "epoch": 0.3131678403436744, "grad_norm": 0.03307285159826279, "grad_norm_var": 1.9063783383714098e-06, "learning_rate": 0.005997500596018984, "loss": 2.6724, "step": 5759 }, { "crossentropy": 2.6233103275299072, "epoch": 0.3132222192011746, "grad_norm": 0.03667697310447693, "grad_norm_var": 2.024107831224975e-06, "learning_rate": 0.005996280414589798, "loss": 2.6233, "step": 5760 }, { "crossentropy": 2.726985216140747, "epoch": 0.3132765980586748, "grad_norm": 0.035705383867025375, "grad_norm_var": 2.101166974365539e-06, "learning_rate": 0.005995060171372229, "loss": 2.727, "step": 5761 }, { "crossentropy": 2.5677679777145386, "epoch": 0.313330976916175, "grad_norm": 0.034706395119428635, "grad_norm_var": 2.0978278434834995e-06, "learning_rate": 0.0059938398664419554, "loss": 2.5678, "step": 5762 }, { "crossentropy": 2.730402946472168, "epoch": 0.3133853557736752, "grad_norm": 0.1557426154613495, "grad_norm_var": 0.0009194002832853792, "learning_rate": 0.005992619499874662, "loss": 2.7304, "step": 5763 }, { "crossentropy": 2.6277859210968018, "epoch": 0.3134397346311754, "grad_norm": 0.034209493547677994, "grad_norm_var": 0.0009206158144366869, "learning_rate": 0.005991399071746031, "loss": 2.6278, "step": 5764 }, { "crossentropy": 2.7180460691452026, "epoch": 0.3134941134886756, "grad_norm": 0.034242380410432816, "grad_norm_var": 0.0009198574570800616, "learning_rate": 0.005990178582131753, "loss": 2.718, "step": 5765 }, { "crossentropy": 2.7014894485473633, "epoch": 0.3135484923461758, "grad_norm": 0.035954684019088745, "grad_norm_var": 0.0009150211838880586, "learning_rate": 0.005988958031107523, "loss": 2.7015, "step": 5766 }, { "crossentropy": 2.6630939245224, "epoch": 0.313602871203676, "grad_norm": 0.040128208696842194, "grad_norm_var": 0.0009107147432629206, "learning_rate": 0.005987737418749039, "loss": 2.6631, "step": 5767 }, { "crossentropy": 2.4696990251541138, "epoch": 0.31365725006117623, "grad_norm": 0.03671116381883621, "grad_norm_var": 0.0009112846924802437, "learning_rate": 0.005986516745132, "loss": 2.4697, "step": 5768 }, { "crossentropy": 2.7915679216384888, "epoch": 0.31371162891867643, "grad_norm": 0.03579023852944374, "grad_norm_var": 0.0009112190273363639, "learning_rate": 0.005985296010332113, "loss": 2.7916, "step": 5769 }, { "crossentropy": 2.6890329122543335, "epoch": 0.31376600777617664, "grad_norm": 0.03477231785655022, "grad_norm_var": 0.0009104094967045926, "learning_rate": 0.005984075214425086, "loss": 2.689, "step": 5770 }, { "crossentropy": 2.704712748527527, "epoch": 0.31382038663367684, "grad_norm": 0.035097166895866394, "grad_norm_var": 0.000910204662027933, "learning_rate": 0.005982854357486632, "loss": 2.7047, "step": 5771 }, { "crossentropy": 2.4977262020111084, "epoch": 0.31387476549117704, "grad_norm": 0.03438951075077057, "grad_norm_var": 0.0009088029999903379, "learning_rate": 0.005981633439592467, "loss": 2.4977, "step": 5772 }, { "crossentropy": 2.672772765159607, "epoch": 0.31392914434867725, "grad_norm": 0.038609713315963745, "grad_norm_var": 0.0009051067443060519, "learning_rate": 0.00598041246081831, "loss": 2.6728, "step": 5773 }, { "crossentropy": 2.605258345603943, "epoch": 0.31398352320617745, "grad_norm": 0.03748709708452225, "grad_norm_var": 0.0009018005293998895, "learning_rate": 0.005979191421239888, "loss": 2.6053, "step": 5774 }, { "crossentropy": 2.7449315786361694, "epoch": 0.31403790206367765, "grad_norm": 0.03870967775583267, "grad_norm_var": 0.0008960765967888723, "learning_rate": 0.005977970320932924, "loss": 2.7449, "step": 5775 }, { "crossentropy": 2.6071337461471558, "epoch": 0.31409228092117786, "grad_norm": 0.03620980679988861, "grad_norm_var": 0.0008965266538977834, "learning_rate": 0.005976749159973156, "loss": 2.6071, "step": 5776 }, { "crossentropy": 2.556643486022949, "epoch": 0.31414665977867806, "grad_norm": 0.03436486795544624, "grad_norm_var": 0.0008980596855599633, "learning_rate": 0.0059755279384363135, "loss": 2.5566, "step": 5777 }, { "crossentropy": 2.7125039100646973, "epoch": 0.31420103863617826, "grad_norm": 0.040217895060777664, "grad_norm_var": 0.0008934444121605471, "learning_rate": 0.0059743066563981385, "loss": 2.7125, "step": 5778 }, { "crossentropy": 2.643442988395691, "epoch": 0.31425541749367847, "grad_norm": 0.03943563252687454, "grad_norm_var": 4.720814814111046e-06, "learning_rate": 0.005973085313934372, "loss": 2.6434, "step": 5779 }, { "crossentropy": 2.797002911567688, "epoch": 0.31430979635117867, "grad_norm": 0.04059315845370293, "grad_norm_var": 5.194244880591762e-06, "learning_rate": 0.005971863911120763, "loss": 2.797, "step": 5780 }, { "crossentropy": 2.616753339767456, "epoch": 0.3143641752086789, "grad_norm": 0.03714247792959213, "grad_norm_var": 4.636345822972044e-06, "learning_rate": 0.00597064244803306, "loss": 2.6168, "step": 5781 }, { "crossentropy": 2.7373324632644653, "epoch": 0.3144185540661791, "grad_norm": 0.03596588596701622, "grad_norm_var": 4.634455059387051e-06, "learning_rate": 0.005969420924747016, "loss": 2.7373, "step": 5782 }, { "crossentropy": 2.7632776498794556, "epoch": 0.3144729329236793, "grad_norm": 0.03576435521245003, "grad_norm_var": 4.136335053248006e-06, "learning_rate": 0.005968199341338392, "loss": 2.7633, "step": 5783 }, { "crossentropy": 2.619343400001526, "epoch": 0.3145273117811795, "grad_norm": 0.03715973347425461, "grad_norm_var": 4.134398456944778e-06, "learning_rate": 0.005966977697882947, "loss": 2.6193, "step": 5784 }, { "crossentropy": 2.618021845817566, "epoch": 0.3145816906386797, "grad_norm": 0.0362231507897377, "grad_norm_var": 4.077330241441153e-06, "learning_rate": 0.0059657559944564465, "loss": 2.618, "step": 5785 }, { "crossentropy": 2.5197471380233765, "epoch": 0.3146360694961799, "grad_norm": 0.0378216952085495, "grad_norm_var": 3.7491401911601253e-06, "learning_rate": 0.005964534231134659, "loss": 2.5197, "step": 5786 }, { "crossentropy": 2.7059974670410156, "epoch": 0.3146904483536801, "grad_norm": 0.06441105902194977, "grad_norm_var": 4.9238691093017074e-05, "learning_rate": 0.005963312407993358, "loss": 2.706, "step": 5787 }, { "crossentropy": 2.634850859642029, "epoch": 0.3147448272111803, "grad_norm": 0.03545838221907616, "grad_norm_var": 4.864852253637086e-05, "learning_rate": 0.005962090525108319, "loss": 2.6349, "step": 5788 }, { "crossentropy": 2.644721031188965, "epoch": 0.3147992060686805, "grad_norm": 0.03368812054395676, "grad_norm_var": 5.048309223894334e-05, "learning_rate": 0.005960868582555324, "loss": 2.6447, "step": 5789 }, { "crossentropy": 2.6961658000946045, "epoch": 0.3148535849261807, "grad_norm": 0.0372488796710968, "grad_norm_var": 5.052804798056273e-05, "learning_rate": 0.005959646580410154, "loss": 2.6962, "step": 5790 }, { "crossentropy": 2.6038039922714233, "epoch": 0.3149079637836809, "grad_norm": 0.038763485848903656, "grad_norm_var": 5.0527753662042245e-05, "learning_rate": 0.005958424518748598, "loss": 2.6038, "step": 5791 }, { "crossentropy": 2.5889497995376587, "epoch": 0.3149623426411811, "grad_norm": 0.03846510872244835, "grad_norm_var": 5.007299244987925e-05, "learning_rate": 0.005957202397646449, "loss": 2.5889, "step": 5792 }, { "crossentropy": 2.5555949211120605, "epoch": 0.3150167214986813, "grad_norm": 0.03808358311653137, "grad_norm_var": 4.8678609451837064e-05, "learning_rate": 0.005955980217179495, "loss": 2.5556, "step": 5793 }, { "crossentropy": 2.740564465522766, "epoch": 0.3150711003561815, "grad_norm": 0.036454878747463226, "grad_norm_var": 4.9029162805363084e-05, "learning_rate": 0.005954757977423544, "loss": 2.7406, "step": 5794 }, { "crossentropy": 2.6015900373458862, "epoch": 0.3151254792136817, "grad_norm": 0.03651946783065796, "grad_norm_var": 4.935919250900796e-05, "learning_rate": 0.005953535678454392, "loss": 2.6016, "step": 5795 }, { "crossentropy": 2.6534409523010254, "epoch": 0.31517985807118193, "grad_norm": 0.03802181035280228, "grad_norm_var": 4.9135442331033986e-05, "learning_rate": 0.0059523133203478445, "loss": 2.6534, "step": 5796 }, { "crossentropy": 2.5252104997634888, "epoch": 0.31523423692868213, "grad_norm": 0.03791377320885658, "grad_norm_var": 4.9025354628309756e-05, "learning_rate": 0.005951090903179713, "loss": 2.5252, "step": 5797 }, { "crossentropy": 2.7011317014694214, "epoch": 0.31528861578618234, "grad_norm": 0.03789593651890755, "grad_norm_var": 4.857446562215728e-05, "learning_rate": 0.005949868427025813, "loss": 2.7011, "step": 5798 }, { "crossentropy": 2.677079200744629, "epoch": 0.31534299464368254, "grad_norm": 0.04074084758758545, "grad_norm_var": 4.81456560263845e-05, "learning_rate": 0.005948645891961957, "loss": 2.6771, "step": 5799 }, { "crossentropy": 2.6595267057418823, "epoch": 0.31539737350118274, "grad_norm": 0.03726687654852867, "grad_norm_var": 4.811930721922147e-05, "learning_rate": 0.005947423298063966, "loss": 2.6595, "step": 5800 }, { "crossentropy": 2.668311595916748, "epoch": 0.31545175235868295, "grad_norm": 0.033843375742435455, "grad_norm_var": 4.9373745271540495e-05, "learning_rate": 0.005946200645407667, "loss": 2.6683, "step": 5801 }, { "crossentropy": 2.6562395095825195, "epoch": 0.31550613121618315, "grad_norm": 0.035012137144804, "grad_norm_var": 5.027565656085901e-05, "learning_rate": 0.0059449779340688845, "loss": 2.6562, "step": 5802 }, { "crossentropy": 2.597918152809143, "epoch": 0.31556051007368335, "grad_norm": 0.03389424458146095, "grad_norm_var": 4.01391722358446e-06, "learning_rate": 0.005943755164123453, "loss": 2.5979, "step": 5803 }, { "crossentropy": 2.620098829269409, "epoch": 0.31561488893118356, "grad_norm": 0.03535056859254837, "grad_norm_var": 4.034352753003641e-06, "learning_rate": 0.005942532335647206, "loss": 2.6201, "step": 5804 }, { "crossentropy": 2.641234278678894, "epoch": 0.31566926778868376, "grad_norm": 0.03683548420667648, "grad_norm_var": 3.3380526094750616e-06, "learning_rate": 0.005941309448715982, "loss": 2.6412, "step": 5805 }, { "crossentropy": 2.646138310432434, "epoch": 0.31572364664618396, "grad_norm": 0.03470737114548683, "grad_norm_var": 3.663994673838653e-06, "learning_rate": 0.005940086503405624, "loss": 2.6461, "step": 5806 }, { "crossentropy": 2.7813256978988647, "epoch": 0.31577802550368417, "grad_norm": 0.03708793595433235, "grad_norm_var": 3.4143349542659074e-06, "learning_rate": 0.005938863499791977, "loss": 2.7813, "step": 5807 }, { "crossentropy": 2.6900718212127686, "epoch": 0.31583240436118437, "grad_norm": 0.036290302872657776, "grad_norm_var": 3.2143018274333537e-06, "learning_rate": 0.005937640437950891, "loss": 2.6901, "step": 5808 }, { "crossentropy": 2.5399869680404663, "epoch": 0.3158867832186846, "grad_norm": 0.03473448008298874, "grad_norm_var": 3.2617345328689664e-06, "learning_rate": 0.005936417317958221, "loss": 2.54, "step": 5809 }, { "crossentropy": 2.5968589782714844, "epoch": 0.3159411620761848, "grad_norm": 0.03897612914443016, "grad_norm_var": 3.6739158345668507e-06, "learning_rate": 0.005935194139889818, "loss": 2.5969, "step": 5810 }, { "crossentropy": 2.6038668155670166, "epoch": 0.315995540933685, "grad_norm": 0.03868532180786133, "grad_norm_var": 3.9530339236158264e-06, "learning_rate": 0.005933970903821549, "loss": 2.6039, "step": 5811 }, { "crossentropy": 2.650942802429199, "epoch": 0.3160499197911852, "grad_norm": 0.036983951926231384, "grad_norm_var": 3.837931688223615e-06, "learning_rate": 0.005932747609829275, "loss": 2.6509, "step": 5812 }, { "crossentropy": 2.6893585920333862, "epoch": 0.3161042986486854, "grad_norm": 0.034572530537843704, "grad_norm_var": 3.96761884154361e-06, "learning_rate": 0.005931524257988864, "loss": 2.6894, "step": 5813 }, { "crossentropy": 2.695220708847046, "epoch": 0.3161586775061856, "grad_norm": 0.0347292497754097, "grad_norm_var": 3.975341880370758e-06, "learning_rate": 0.0059303008483761865, "loss": 2.6952, "step": 5814 }, { "crossentropy": 2.523709774017334, "epoch": 0.3162130563636858, "grad_norm": 0.033838022500276566, "grad_norm_var": 2.803497263214235e-06, "learning_rate": 0.0059290773810671184, "loss": 2.5237, "step": 5815 }, { "crossentropy": 2.4789764881134033, "epoch": 0.316267435221186, "grad_norm": 0.03500887751579285, "grad_norm_var": 2.680680001688971e-06, "learning_rate": 0.005927853856137536, "loss": 2.479, "step": 5816 }, { "crossentropy": 2.4843955039978027, "epoch": 0.3163218140786862, "grad_norm": 0.03271353989839554, "grad_norm_var": 3.0340337184783214e-06, "learning_rate": 0.005926630273663323, "loss": 2.4844, "step": 5817 }, { "crossentropy": 2.5473631620407104, "epoch": 0.3163761929361864, "grad_norm": 0.032953619956970215, "grad_norm_var": 3.4571418949911243e-06, "learning_rate": 0.005925406633720365, "loss": 2.5474, "step": 5818 }, { "crossentropy": 2.652202010154724, "epoch": 0.3164305717936866, "grad_norm": 0.03564818203449249, "grad_norm_var": 3.283221657415271e-06, "learning_rate": 0.00592418293638455, "loss": 2.6522, "step": 5819 }, { "crossentropy": 2.674172282218933, "epoch": 0.3164849506511868, "grad_norm": 0.048480499535799026, "grad_norm_var": 1.3674250275432416e-05, "learning_rate": 0.005922959181731772, "loss": 2.6742, "step": 5820 }, { "crossentropy": 2.7582807540893555, "epoch": 0.316539329508687, "grad_norm": 0.03825487941503525, "grad_norm_var": 1.3884411991673304e-05, "learning_rate": 0.0059217353698379225, "loss": 2.7583, "step": 5821 }, { "crossentropy": 2.6695014238357544, "epoch": 0.3165937083661872, "grad_norm": 0.03271793946623802, "grad_norm_var": 1.4601729657335443e-05, "learning_rate": 0.005920511500778909, "loss": 2.6695, "step": 5822 }, { "crossentropy": 2.585980534553528, "epoch": 0.3166480872236874, "grad_norm": 0.033437781035900116, "grad_norm_var": 1.5077607929451544e-05, "learning_rate": 0.0059192875746306274, "loss": 2.586, "step": 5823 }, { "crossentropy": 2.7084743976593018, "epoch": 0.31670246608118763, "grad_norm": 0.035175520926713943, "grad_norm_var": 1.5130943985491862e-05, "learning_rate": 0.005918063591468989, "loss": 2.7085, "step": 5824 }, { "crossentropy": 2.6634386777877808, "epoch": 0.31675684493868783, "grad_norm": 0.0480194054543972, "grad_norm_var": 2.3819074396778418e-05, "learning_rate": 0.005916839551369901, "loss": 2.6634, "step": 5825 }, { "crossentropy": 2.5352078676223755, "epoch": 0.31681122379618804, "grad_norm": 0.03324142098426819, "grad_norm_var": 2.4277263042347257e-05, "learning_rate": 0.005915615454409281, "loss": 2.5352, "step": 5826 }, { "crossentropy": 2.620214343070984, "epoch": 0.31686560265368824, "grad_norm": 0.035251274704933167, "grad_norm_var": 2.4026890878151713e-05, "learning_rate": 0.005914391300663044, "loss": 2.6202, "step": 5827 }, { "crossentropy": 2.71822726726532, "epoch": 0.31691998151118844, "grad_norm": 0.03446001932024956, "grad_norm_var": 2.4199632178199756e-05, "learning_rate": 0.005913167090207109, "loss": 2.7182, "step": 5828 }, { "crossentropy": 2.7162548303604126, "epoch": 0.31697436036868865, "grad_norm": 0.034147609025239944, "grad_norm_var": 2.4300654384735016e-05, "learning_rate": 0.005911942823117403, "loss": 2.7163, "step": 5829 }, { "crossentropy": 2.626366138458252, "epoch": 0.31702873922618885, "grad_norm": 0.033700596541166306, "grad_norm_var": 2.455888701605342e-05, "learning_rate": 0.005910718499469854, "loss": 2.6264, "step": 5830 }, { "crossentropy": 2.61661958694458, "epoch": 0.31708311808368905, "grad_norm": 0.03298606351017952, "grad_norm_var": 2.4857289349390862e-05, "learning_rate": 0.005909494119340393, "loss": 2.6166, "step": 5831 }, { "crossentropy": 2.6844836473464966, "epoch": 0.31713749694118926, "grad_norm": 0.034069232642650604, "grad_norm_var": 2.5038190764032724e-05, "learning_rate": 0.005908269682804953, "loss": 2.6845, "step": 5832 }, { "crossentropy": 2.7423322200775146, "epoch": 0.31719187579868946, "grad_norm": 0.03477465733885765, "grad_norm_var": 2.4413284598893448e-05, "learning_rate": 0.005907045189939476, "loss": 2.7423, "step": 5833 }, { "crossentropy": 2.6667014360427856, "epoch": 0.31724625465618966, "grad_norm": 0.03943457081913948, "grad_norm_var": 2.4334775920151902e-05, "learning_rate": 0.0059058206408199015, "loss": 2.6667, "step": 5834 }, { "crossentropy": 2.597704768180847, "epoch": 0.31730063351368987, "grad_norm": 0.03327086195349693, "grad_norm_var": 2.4954040874354023e-05, "learning_rate": 0.005904596035522175, "loss": 2.5977, "step": 5835 }, { "crossentropy": 2.601750373840332, "epoch": 0.31735501237119007, "grad_norm": 0.034987639635801315, "grad_norm_var": 1.4489292371986686e-05, "learning_rate": 0.005903371374122245, "loss": 2.6018, "step": 5836 }, { "crossentropy": 2.5969709157943726, "epoch": 0.3174093912286903, "grad_norm": 0.03405488282442093, "grad_norm_var": 1.404659091106952e-05, "learning_rate": 0.005902146656696067, "loss": 2.597, "step": 5837 }, { "crossentropy": 2.651594877243042, "epoch": 0.3174637700861905, "grad_norm": 0.03495970368385315, "grad_norm_var": 1.36089011481035e-05, "learning_rate": 0.005900921883319591, "loss": 2.6516, "step": 5838 }, { "crossentropy": 2.6716628074645996, "epoch": 0.3175181489436907, "grad_norm": 0.03546919673681259, "grad_norm_var": 1.3342597344006187e-05, "learning_rate": 0.005899697054068782, "loss": 2.6717, "step": 5839 }, { "crossentropy": 2.6644469499588013, "epoch": 0.3175725278011909, "grad_norm": 0.043017368763685226, "grad_norm_var": 1.684656612886947e-05, "learning_rate": 0.0058984721690196, "loss": 2.6644, "step": 5840 }, { "crossentropy": 2.620177149772644, "epoch": 0.3176269066586911, "grad_norm": 0.03413846343755722, "grad_norm_var": 6.625690959928023e-06, "learning_rate": 0.005897247228248012, "loss": 2.6202, "step": 5841 }, { "crossentropy": 2.622642755508423, "epoch": 0.3176812855161913, "grad_norm": 0.03304889053106308, "grad_norm_var": 6.6763020892673365e-06, "learning_rate": 0.0058960222318299865, "loss": 2.6226, "step": 5842 }, { "crossentropy": 2.6488394737243652, "epoch": 0.3177356643736915, "grad_norm": 0.033110905438661575, "grad_norm_var": 6.922505300123912e-06, "learning_rate": 0.005894797179841498, "loss": 2.6488, "step": 5843 }, { "crossentropy": 2.682205319404602, "epoch": 0.3177900432311917, "grad_norm": 0.03427500277757645, "grad_norm_var": 6.937396012968109e-06, "learning_rate": 0.005893572072358524, "loss": 2.6822, "step": 5844 }, { "crossentropy": 2.604052424430847, "epoch": 0.3178444220886919, "grad_norm": 0.034834906458854675, "grad_norm_var": 6.891981858482785e-06, "learning_rate": 0.005892346909457041, "loss": 2.6041, "step": 5845 }, { "crossentropy": 2.588803768157959, "epoch": 0.3178988009461921, "grad_norm": 0.04076608270406723, "grad_norm_var": 8.780100427332033e-06, "learning_rate": 0.0058911216912130365, "loss": 2.5888, "step": 5846 }, { "crossentropy": 2.6124991178512573, "epoch": 0.3179531798036923, "grad_norm": 0.04021037742495537, "grad_norm_var": 9.66874770558599e-06, "learning_rate": 0.005889896417702494, "loss": 2.6125, "step": 5847 }, { "crossentropy": 2.622332811355591, "epoch": 0.3180075586611925, "grad_norm": 0.033381715416908264, "grad_norm_var": 9.86624505018522e-06, "learning_rate": 0.005888671089001406, "loss": 2.6223, "step": 5848 }, { "crossentropy": 2.5675060749053955, "epoch": 0.3180619375186927, "grad_norm": 0.03284848481416702, "grad_norm_var": 1.0376472140011251e-05, "learning_rate": 0.005887445705185765, "loss": 2.5675, "step": 5849 }, { "crossentropy": 2.6459691524505615, "epoch": 0.3181163163761929, "grad_norm": 0.03535769134759903, "grad_norm_var": 9.405920576338079e-06, "learning_rate": 0.00588622026633157, "loss": 2.646, "step": 5850 }, { "crossentropy": 2.7177869081497192, "epoch": 0.3181706952336931, "grad_norm": 0.03614792227745056, "grad_norm_var": 9.074568813236677e-06, "learning_rate": 0.005884994772514819, "loss": 2.7178, "step": 5851 }, { "crossentropy": 2.6164695024490356, "epoch": 0.3182250740911933, "grad_norm": 0.03268646448850632, "grad_norm_var": 9.612771754797557e-06, "learning_rate": 0.005883769223811517, "loss": 2.6165, "step": 5852 }, { "crossentropy": 2.627895474433899, "epoch": 0.31827945294869353, "grad_norm": 0.03716234862804413, "grad_norm_var": 9.609562249962047e-06, "learning_rate": 0.005882543620297672, "loss": 2.6279, "step": 5853 }, { "crossentropy": 2.6507232189178467, "epoch": 0.31833383180619373, "grad_norm": 0.03963220492005348, "grad_norm_var": 1.0504482256452433e-05, "learning_rate": 0.005881317962049295, "loss": 2.6507, "step": 5854 }, { "crossentropy": 2.696444869041443, "epoch": 0.31838821066369394, "grad_norm": 0.03691304475069046, "grad_norm_var": 1.0531530122718605e-05, "learning_rate": 0.0058800922491424, "loss": 2.6964, "step": 5855 }, { "crossentropy": 2.7933579683303833, "epoch": 0.31844258952119414, "grad_norm": 0.035026416182518005, "grad_norm_var": 7.147769131277994e-06, "learning_rate": 0.005878866481653002, "loss": 2.7934, "step": 5856 }, { "crossentropy": 2.5962517261505127, "epoch": 0.31849696837869435, "grad_norm": 0.03406558930873871, "grad_norm_var": 7.162266262631702e-06, "learning_rate": 0.0058776406596571275, "loss": 2.5963, "step": 5857 }, { "crossentropy": 2.7141385078430176, "epoch": 0.31855134723619455, "grad_norm": 0.03310195356607437, "grad_norm_var": 7.144451309859003e-06, "learning_rate": 0.005876414783230796, "loss": 2.7141, "step": 5858 }, { "crossentropy": 2.544436454772949, "epoch": 0.31860572609369475, "grad_norm": 0.033006004989147186, "grad_norm_var": 7.1798843886315225e-06, "learning_rate": 0.005875188852450038, "loss": 2.5444, "step": 5859 }, { "crossentropy": 2.493962526321411, "epoch": 0.31866010495119496, "grad_norm": 0.03484491631388664, "grad_norm_var": 7.100372834687301e-06, "learning_rate": 0.00587396286739088, "loss": 2.494, "step": 5860 }, { "crossentropy": 2.5285842418670654, "epoch": 0.31871448380869516, "grad_norm": 0.035469748079776764, "grad_norm_var": 7.058757336516654e-06, "learning_rate": 0.005872736828129363, "loss": 2.5286, "step": 5861 }, { "crossentropy": 2.58154296875, "epoch": 0.31876886266619536, "grad_norm": 0.035736825317144394, "grad_norm_var": 5.2181778965367425e-06, "learning_rate": 0.00587151073474152, "loss": 2.5815, "step": 5862 }, { "crossentropy": 2.6909170150756836, "epoch": 0.31882324152369557, "grad_norm": 0.0366179458796978, "grad_norm_var": 3.6964510053222322e-06, "learning_rate": 0.005870284587303394, "loss": 2.6909, "step": 5863 }, { "crossentropy": 2.7047210931777954, "epoch": 0.31887762038119577, "grad_norm": 0.03336183726787567, "grad_norm_var": 3.701096017837598e-06, "learning_rate": 0.005869058385891028, "loss": 2.7047, "step": 5864 }, { "crossentropy": 2.620656728744507, "epoch": 0.318931999238696, "grad_norm": 0.03784235566854477, "grad_norm_var": 3.7448086305088185e-06, "learning_rate": 0.0058678321305804726, "loss": 2.6207, "step": 5865 }, { "crossentropy": 2.693971633911133, "epoch": 0.3189863780961962, "grad_norm": 0.037995584309101105, "grad_norm_var": 4.152230973908377e-06, "learning_rate": 0.005866605821447779, "loss": 2.694, "step": 5866 }, { "crossentropy": 2.6440600156784058, "epoch": 0.3190407569536964, "grad_norm": 0.03814033418893814, "grad_norm_var": 4.545710473456005e-06, "learning_rate": 0.005865379458568997, "loss": 2.6441, "step": 5867 }, { "crossentropy": 2.690049409866333, "epoch": 0.3190951358111966, "grad_norm": 0.03617512434720993, "grad_norm_var": 3.892889383492332e-06, "learning_rate": 0.005864153042020191, "loss": 2.69, "step": 5868 }, { "crossentropy": 2.6675233840942383, "epoch": 0.3191495146686968, "grad_norm": 0.03466610983014107, "grad_norm_var": 3.8765899146174395e-06, "learning_rate": 0.005862926571877417, "loss": 2.6675, "step": 5869 }, { "crossentropy": 2.7141504287719727, "epoch": 0.319203893526197, "grad_norm": 0.033584848046302795, "grad_norm_var": 3.062005538989378e-06, "learning_rate": 0.005861700048216741, "loss": 2.7142, "step": 5870 }, { "crossentropy": 2.6112765073776245, "epoch": 0.3192582723836972, "grad_norm": 0.03341832756996155, "grad_norm_var": 3.1246279845289123e-06, "learning_rate": 0.005860473471114232, "loss": 2.6113, "step": 5871 }, { "crossentropy": 2.71560275554657, "epoch": 0.3193126512411974, "grad_norm": 0.04515240341424942, "grad_norm_var": 9.31106969930239e-06, "learning_rate": 0.005859246840645962, "loss": 2.7156, "step": 5872 }, { "crossentropy": 2.4972909688949585, "epoch": 0.3193670300986976, "grad_norm": 0.039344120770692825, "grad_norm_var": 9.815103771730558e-06, "learning_rate": 0.005858020156888003, "loss": 2.4973, "step": 5873 }, { "crossentropy": 2.588942766189575, "epoch": 0.3194214089561978, "grad_norm": 0.03567992150783539, "grad_norm_var": 9.181516097897929e-06, "learning_rate": 0.005856793419916433, "loss": 2.5889, "step": 5874 }, { "crossentropy": 2.5555272102355957, "epoch": 0.319475787813698, "grad_norm": 0.03436383232474327, "grad_norm_var": 8.697715172717215e-06, "learning_rate": 0.005855566629807335, "loss": 2.5555, "step": 5875 }, { "crossentropy": 2.6780298948287964, "epoch": 0.3195301666711982, "grad_norm": 0.0350252129137516, "grad_norm_var": 8.662372006892904e-06, "learning_rate": 0.0058543397866367905, "loss": 2.678, "step": 5876 }, { "crossentropy": 2.702044367790222, "epoch": 0.3195845455286984, "grad_norm": 0.04002008214592934, "grad_norm_var": 9.385455860810333e-06, "learning_rate": 0.005853112890480891, "loss": 2.702, "step": 5877 }, { "crossentropy": 2.697144389152527, "epoch": 0.3196389243861986, "grad_norm": 0.040634047240018845, "grad_norm_var": 1.0258528667187773e-05, "learning_rate": 0.005851885941415724, "loss": 2.6971, "step": 5878 }, { "crossentropy": 2.4829633235931396, "epoch": 0.3196933032436988, "grad_norm": 0.0431072860956192, "grad_norm_var": 1.255873469876249e-05, "learning_rate": 0.005850658939517387, "loss": 2.483, "step": 5879 }, { "crossentropy": 2.5999306440353394, "epoch": 0.319747682101199, "grad_norm": 0.03879573941230774, "grad_norm_var": 1.147341393953801e-05, "learning_rate": 0.005849431884861974, "loss": 2.5999, "step": 5880 }, { "crossentropy": 2.611148238182068, "epoch": 0.31980206095869923, "grad_norm": 0.03794286400079727, "grad_norm_var": 1.1475328768504692e-05, "learning_rate": 0.005848204777525587, "loss": 2.6111, "step": 5881 }, { "crossentropy": 2.701396107673645, "epoch": 0.3198564398161995, "grad_norm": 0.03429478034377098, "grad_norm_var": 1.2211558040602957e-05, "learning_rate": 0.005846977617584332, "loss": 2.7014, "step": 5882 }, { "crossentropy": 2.6596932411193848, "epoch": 0.3199108186736997, "grad_norm": 0.03461485728621483, "grad_norm_var": 1.2697508716284085e-05, "learning_rate": 0.005845750405114315, "loss": 2.6597, "step": 5883 }, { "crossentropy": 2.609187364578247, "epoch": 0.3199651975311999, "grad_norm": 0.03474342077970505, "grad_norm_var": 1.3040584793238875e-05, "learning_rate": 0.005844523140191645, "loss": 2.6092, "step": 5884 }, { "crossentropy": 2.7207577228546143, "epoch": 0.3200195763887001, "grad_norm": 0.03505439683794975, "grad_norm_var": 1.2918216324442345e-05, "learning_rate": 0.005843295822892438, "loss": 2.7208, "step": 5885 }, { "crossentropy": 2.7800532579421997, "epoch": 0.3200739552462003, "grad_norm": 0.03368544578552246, "grad_norm_var": 1.2869875682959091e-05, "learning_rate": 0.005842068453292809, "loss": 2.7801, "step": 5886 }, { "crossentropy": 2.6400938034057617, "epoch": 0.3201283341037005, "grad_norm": 0.03381063789129257, "grad_norm_var": 1.26794705823087e-05, "learning_rate": 0.005840841031468881, "loss": 2.6401, "step": 5887 }, { "crossentropy": 2.587724447250366, "epoch": 0.3201827129612007, "grad_norm": 0.035866789519786835, "grad_norm_var": 8.305381434014815e-06, "learning_rate": 0.005839613557496776, "loss": 2.5877, "step": 5888 }, { "crossentropy": 2.636917233467102, "epoch": 0.3202370918187009, "grad_norm": 0.03562585264444351, "grad_norm_var": 7.851892575096106e-06, "learning_rate": 0.00583838603145262, "loss": 2.6369, "step": 5889 }, { "crossentropy": 2.7252609729766846, "epoch": 0.3202914706762011, "grad_norm": 0.03543976694345474, "grad_norm_var": 7.880286012893791e-06, "learning_rate": 0.005837158453412546, "loss": 2.7253, "step": 5890 }, { "crossentropy": 2.6199179887771606, "epoch": 0.3203458495337013, "grad_norm": 0.03499186784029007, "grad_norm_var": 7.731161970208251e-06, "learning_rate": 0.005835930823452682, "loss": 2.6199, "step": 5891 }, { "crossentropy": 2.5467536449432373, "epoch": 0.3204002283912015, "grad_norm": 0.03473544493317604, "grad_norm_var": 7.792551492362873e-06, "learning_rate": 0.00583470314164917, "loss": 2.5468, "step": 5892 }, { "crossentropy": 2.662495732307434, "epoch": 0.32045460724870173, "grad_norm": 0.03371407091617584, "grad_norm_var": 7.284762543970496e-06, "learning_rate": 0.005833475408078146, "loss": 2.6625, "step": 5893 }, { "crossentropy": 2.6053712368011475, "epoch": 0.32050898610620193, "grad_norm": 0.0348396934568882, "grad_norm_var": 5.8540480380540765e-06, "learning_rate": 0.005832247622815756, "loss": 2.6054, "step": 5894 }, { "crossentropy": 2.670014500617981, "epoch": 0.32056336496370214, "grad_norm": 0.03324007987976074, "grad_norm_var": 2.199101958939333e-06, "learning_rate": 0.005831019785938142, "loss": 2.67, "step": 5895 }, { "crossentropy": 2.5960816144943237, "epoch": 0.32061774382120234, "grad_norm": 0.034929946064949036, "grad_norm_var": 1.2216144619662968e-06, "learning_rate": 0.005829791897521458, "loss": 2.5961, "step": 5896 }, { "crossentropy": 2.5805740356445312, "epoch": 0.32067212267870254, "grad_norm": 0.035328980535268784, "grad_norm_var": 5.6919387798407e-07, "learning_rate": 0.005828563957641852, "loss": 2.5806, "step": 5897 }, { "crossentropy": 2.65468430519104, "epoch": 0.32072650153620275, "grad_norm": 0.03579287230968475, "grad_norm_var": 6.320655966960201e-07, "learning_rate": 0.005827335966375484, "loss": 2.6547, "step": 5898 }, { "crossentropy": 2.691343665122986, "epoch": 0.32078088039370295, "grad_norm": 0.03478622063994408, "grad_norm_var": 6.302217528683998e-07, "learning_rate": 0.0058261079237985096, "loss": 2.6913, "step": 5899 }, { "crossentropy": 2.6320375204086304, "epoch": 0.32083525925120315, "grad_norm": 0.036683131009340286, "grad_norm_var": 8.542109590057208e-07, "learning_rate": 0.005824879829987093, "loss": 2.632, "step": 5900 }, { "crossentropy": 2.6481558084487915, "epoch": 0.32088963810870336, "grad_norm": 0.03485109284520149, "grad_norm_var": 8.528210843831522e-07, "learning_rate": 0.005823651685017397, "loss": 2.6482, "step": 5901 }, { "crossentropy": 2.650621175765991, "epoch": 0.32094401696620356, "grad_norm": 0.03311842679977417, "grad_norm_var": 9.643697987819314e-07, "learning_rate": 0.005822423488965591, "loss": 2.6506, "step": 5902 }, { "crossentropy": 2.570771098136902, "epoch": 0.32099839582370376, "grad_norm": 0.03521556779742241, "grad_norm_var": 8.912233680717525e-07, "learning_rate": 0.005821195241907847, "loss": 2.5708, "step": 5903 }, { "crossentropy": 2.6676021814346313, "epoch": 0.32105277468120397, "grad_norm": 0.03290053457021713, "grad_norm_var": 1.0775556914009606e-06, "learning_rate": 0.0058199669439203405, "loss": 2.6676, "step": 5904 }, { "crossentropy": 2.779286503791809, "epoch": 0.32110715353870417, "grad_norm": 0.035524241626262665, "grad_norm_var": 1.0664987102244778e-06, "learning_rate": 0.005818738595079248, "loss": 2.7793, "step": 5905 }, { "crossentropy": 2.668796420097351, "epoch": 0.3211615323962044, "grad_norm": 0.03278622031211853, "grad_norm_var": 1.2645697532508969e-06, "learning_rate": 0.005817510195460752, "loss": 2.6688, "step": 5906 }, { "crossentropy": 2.6161141395568848, "epoch": 0.3212159112537046, "grad_norm": 0.03202424943447113, "grad_norm_var": 1.6559403600873689e-06, "learning_rate": 0.005816281745141037, "loss": 2.6161, "step": 5907 }, { "crossentropy": 2.66618812084198, "epoch": 0.3212702901112048, "grad_norm": 0.033999454230070114, "grad_norm_var": 1.6573116608672422e-06, "learning_rate": 0.005815053244196287, "loss": 2.6662, "step": 5908 }, { "crossentropy": 2.590592384338379, "epoch": 0.321324668968705, "grad_norm": 0.03347426652908325, "grad_norm_var": 1.6815082845742304e-06, "learning_rate": 0.005813824692702695, "loss": 2.5906, "step": 5909 }, { "crossentropy": 2.7208796739578247, "epoch": 0.3213790478262052, "grad_norm": 0.0364149808883667, "grad_norm_var": 1.9408369981393597e-06, "learning_rate": 0.005812596090736454, "loss": 2.7209, "step": 5910 }, { "crossentropy": 2.6969048976898193, "epoch": 0.3214334266837054, "grad_norm": 0.035453878343105316, "grad_norm_var": 1.8924009525050912e-06, "learning_rate": 0.005811367438373762, "loss": 2.6969, "step": 5911 }, { "crossentropy": 2.689799666404724, "epoch": 0.3214878055412056, "grad_norm": 0.034563347697257996, "grad_norm_var": 1.8837077279796703e-06, "learning_rate": 0.005810138735690818, "loss": 2.6898, "step": 5912 }, { "crossentropy": 2.6164169311523438, "epoch": 0.3215421843987058, "grad_norm": 0.03338554874062538, "grad_norm_var": 1.919815321626874e-06, "learning_rate": 0.0058089099827638244, "loss": 2.6164, "step": 5913 }, { "crossentropy": 2.5640249252319336, "epoch": 0.321596563256206, "grad_norm": 0.03359726071357727, "grad_norm_var": 1.823851794918498e-06, "learning_rate": 0.0058076811796689875, "loss": 2.564, "step": 5914 }, { "crossentropy": 2.651815891265869, "epoch": 0.3216509421137062, "grad_norm": 0.03402869775891304, "grad_norm_var": 1.810470870002486e-06, "learning_rate": 0.005806452326482517, "loss": 2.6518, "step": 5915 }, { "crossentropy": 2.5988534688949585, "epoch": 0.3217053209712064, "grad_norm": 0.033863186836242676, "grad_norm_var": 1.3931281666531547e-06, "learning_rate": 0.005805223423280626, "loss": 2.5989, "step": 5916 }, { "crossentropy": 2.6584670543670654, "epoch": 0.3217596998287066, "grad_norm": 0.032883137464523315, "grad_norm_var": 1.4315547086656925e-06, "learning_rate": 0.005803994470139528, "loss": 2.6585, "step": 5917 }, { "crossentropy": 2.675202965736389, "epoch": 0.3218140786862068, "grad_norm": 0.03538832068443298, "grad_norm_var": 1.5012789102909762e-06, "learning_rate": 0.005802765467135442, "loss": 2.6752, "step": 5918 }, { "crossentropy": 2.694966673851013, "epoch": 0.321868457543707, "grad_norm": 0.037017349153757095, "grad_norm_var": 1.973639185451348e-06, "learning_rate": 0.00580153641434459, "loss": 2.695, "step": 5919 }, { "crossentropy": 2.615625262260437, "epoch": 0.3219228364012072, "grad_norm": 0.03495366498827934, "grad_norm_var": 1.8795776891381359e-06, "learning_rate": 0.005800307311843199, "loss": 2.6156, "step": 5920 }, { "crossentropy": 2.66055691242218, "epoch": 0.3219772152587074, "grad_norm": 0.034370921552181244, "grad_norm_var": 1.7798139643596459e-06, "learning_rate": 0.005799078159707494, "loss": 2.6606, "step": 5921 }, { "crossentropy": 2.6966969966888428, "epoch": 0.32203159411620763, "grad_norm": 0.03338681161403656, "grad_norm_var": 1.6841171198206731e-06, "learning_rate": 0.005797848958013706, "loss": 2.6967, "step": 5922 }, { "crossentropy": 2.6356900930404663, "epoch": 0.32208597297370783, "grad_norm": 0.033744554966688156, "grad_norm_var": 1.3470118654314764e-06, "learning_rate": 0.005796619706838069, "loss": 2.6357, "step": 5923 }, { "crossentropy": 2.711057662963867, "epoch": 0.32214035183120804, "grad_norm": 0.034004095941782, "grad_norm_var": 1.3467604663935465e-06, "learning_rate": 0.005795390406256821, "loss": 2.7111, "step": 5924 }, { "crossentropy": 2.6588672399520874, "epoch": 0.32219473068870824, "grad_norm": 0.035208385437726974, "grad_norm_var": 1.3187852614175595e-06, "learning_rate": 0.005794161056346201, "loss": 2.6589, "step": 5925 }, { "crossentropy": 2.573452353477478, "epoch": 0.32224910954620845, "grad_norm": 0.03656088188290596, "grad_norm_var": 1.3570475648777096e-06, "learning_rate": 0.005792931657182453, "loss": 2.5735, "step": 5926 }, { "crossentropy": 2.6938804388046265, "epoch": 0.32230348840370865, "grad_norm": 0.039154935628175735, "grad_norm_var": 2.6712294519938447e-06, "learning_rate": 0.0057917022088418224, "loss": 2.6939, "step": 5927 }, { "crossentropy": 2.575684428215027, "epoch": 0.32235786726120885, "grad_norm": 0.03535230830311775, "grad_norm_var": 2.6897678344922164e-06, "learning_rate": 0.00579047271140056, "loss": 2.5757, "step": 5928 }, { "crossentropy": 2.4841889142990112, "epoch": 0.32241224611870906, "grad_norm": 0.031979821622371674, "grad_norm_var": 3.0795552908690604e-06, "learning_rate": 0.005789243164934918, "loss": 2.4842, "step": 5929 }, { "crossentropy": 2.6593711376190186, "epoch": 0.32246662497620926, "grad_norm": 0.03452172875404358, "grad_norm_var": 2.994776542236619e-06, "learning_rate": 0.005788013569521146, "loss": 2.6594, "step": 5930 }, { "crossentropy": 2.5894575119018555, "epoch": 0.32252100383370946, "grad_norm": 0.0413321889936924, "grad_norm_var": 5.600695434387383e-06, "learning_rate": 0.0057867839252355125, "loss": 2.5895, "step": 5931 }, { "crossentropy": 2.723023533821106, "epoch": 0.32257538269120967, "grad_norm": 0.044062934815883636, "grad_norm_var": 1.0240459252109681e-05, "learning_rate": 0.005785554232154271, "loss": 2.723, "step": 5932 }, { "crossentropy": 2.6396217346191406, "epoch": 0.32262976154870987, "grad_norm": 0.03541778028011322, "grad_norm_var": 9.632524723856596e-06, "learning_rate": 0.005784324490353689, "loss": 2.6396, "step": 5933 }, { "crossentropy": 2.701273560523987, "epoch": 0.3226841404062101, "grad_norm": 0.03334952890872955, "grad_norm_var": 1.0066353992672396e-05, "learning_rate": 0.005783094699910032, "loss": 2.7013, "step": 5934 }, { "crossentropy": 2.681615948677063, "epoch": 0.3227385192637103, "grad_norm": 0.03479772061109543, "grad_norm_var": 1.0043926874262615e-05, "learning_rate": 0.005781864860899574, "loss": 2.6816, "step": 5935 }, { "crossentropy": 2.6704620122909546, "epoch": 0.3227928981212105, "grad_norm": 0.03389088064432144, "grad_norm_var": 1.0229121537063714e-05, "learning_rate": 0.005780634973398584, "loss": 2.6705, "step": 5936 }, { "crossentropy": 2.6257166862487793, "epoch": 0.3228472769787107, "grad_norm": 0.03328767418861389, "grad_norm_var": 1.049384092711893e-05, "learning_rate": 0.0057794050374833416, "loss": 2.6257, "step": 5937 }, { "crossentropy": 2.583918333053589, "epoch": 0.3229016558362109, "grad_norm": 0.035694997757673264, "grad_norm_var": 1.013699822124565e-05, "learning_rate": 0.005778175053230126, "loss": 2.5839, "step": 5938 }, { "crossentropy": 2.587889075279236, "epoch": 0.3229560346937111, "grad_norm": 0.03665278106927872, "grad_norm_var": 9.879236240485587e-06, "learning_rate": 0.005776945020715219, "loss": 2.5879, "step": 5939 }, { "crossentropy": 2.6681071519851685, "epoch": 0.3230104135512113, "grad_norm": 0.035311538726091385, "grad_norm_var": 9.64610514914138e-06, "learning_rate": 0.005775714940014905, "loss": 2.6681, "step": 5940 }, { "crossentropy": 2.7451727390289307, "epoch": 0.3230647924087115, "grad_norm": 0.034105923026800156, "grad_norm_var": 9.843725107379295e-06, "learning_rate": 0.005774484811205473, "loss": 2.7452, "step": 5941 }, { "crossentropy": 2.6971641778945923, "epoch": 0.3231191712662117, "grad_norm": 0.0366097055375576, "grad_norm_var": 9.847739494994676e-06, "learning_rate": 0.0057732546343632184, "loss": 2.6972, "step": 5942 }, { "crossentropy": 2.628546714782715, "epoch": 0.3231735501237119, "grad_norm": 0.034284986555576324, "grad_norm_var": 9.262050884107734e-06, "learning_rate": 0.005772024409564431, "loss": 2.6285, "step": 5943 }, { "crossentropy": 2.7215490341186523, "epoch": 0.3232279289812121, "grad_norm": 0.0359494686126709, "grad_norm_var": 9.259379265401784e-06, "learning_rate": 0.005770794136885408, "loss": 2.7215, "step": 5944 }, { "crossentropy": 2.696851134300232, "epoch": 0.3232823078387123, "grad_norm": 0.03571074828505516, "grad_norm_var": 8.27719528060933e-06, "learning_rate": 0.005769563816402452, "loss": 2.6969, "step": 5945 }, { "crossentropy": 2.6780096292495728, "epoch": 0.3233366866962125, "grad_norm": 0.036859363317489624, "grad_norm_var": 8.177832809025399e-06, "learning_rate": 0.005768333448191867, "loss": 2.678, "step": 5946 }, { "crossentropy": 2.659753680229187, "epoch": 0.3233910655537127, "grad_norm": 0.03673943877220154, "grad_norm_var": 6.281364691332279e-06, "learning_rate": 0.005767103032329955, "loss": 2.6598, "step": 5947 }, { "crossentropy": 2.6890405416488647, "epoch": 0.3234454444112129, "grad_norm": 0.06744846701622009, "grad_norm_var": 6.624050070193338e-05, "learning_rate": 0.00576587256889303, "loss": 2.689, "step": 5948 }, { "crossentropy": 2.65871524810791, "epoch": 0.3234998232687131, "grad_norm": 0.03468187153339386, "grad_norm_var": 6.645480857874192e-05, "learning_rate": 0.005764642057957402, "loss": 2.6587, "step": 5949 }, { "crossentropy": 2.6664414405822754, "epoch": 0.32355420212621333, "grad_norm": 0.036188047379255295, "grad_norm_var": 6.549695658833454e-05, "learning_rate": 0.005763411499599387, "loss": 2.6664, "step": 5950 }, { "crossentropy": 2.659270405769348, "epoch": 0.32360858098371353, "grad_norm": 0.03530414402484894, "grad_norm_var": 6.533805819160312e-05, "learning_rate": 0.005762180893895301, "loss": 2.6593, "step": 5951 }, { "crossentropy": 2.7152791023254395, "epoch": 0.32366295984121374, "grad_norm": 0.034978386014699936, "grad_norm_var": 6.49002497733618e-05, "learning_rate": 0.005760950240921468, "loss": 2.7153, "step": 5952 }, { "crossentropy": 2.6446629762649536, "epoch": 0.32371733869871394, "grad_norm": 0.035214614123106, "grad_norm_var": 6.405315565271551e-05, "learning_rate": 0.0057597195407542105, "loss": 2.6447, "step": 5953 }, { "crossentropy": 2.654831051826477, "epoch": 0.32377171755621414, "grad_norm": 0.03592575341463089, "grad_norm_var": 6.399761305414201e-05, "learning_rate": 0.005758488793469854, "loss": 2.6548, "step": 5954 }, { "crossentropy": 2.6320056915283203, "epoch": 0.32382609641371435, "grad_norm": 0.035378649830818176, "grad_norm_var": 6.426387170544995e-05, "learning_rate": 0.0057572579991447305, "loss": 2.632, "step": 5955 }, { "crossentropy": 2.588613986968994, "epoch": 0.32388047527121455, "grad_norm": 0.03693777695298195, "grad_norm_var": 6.394526860044274e-05, "learning_rate": 0.005756027157855171, "loss": 2.5886, "step": 5956 }, { "crossentropy": 2.70089054107666, "epoch": 0.32393485412871476, "grad_norm": 0.03866999223828316, "grad_norm_var": 6.309361112212061e-05, "learning_rate": 0.005754796269677515, "loss": 2.7009, "step": 5957 }, { "crossentropy": 2.6986536979675293, "epoch": 0.32398923298621496, "grad_norm": 0.03641577064990997, "grad_norm_var": 6.313010423240964e-05, "learning_rate": 0.005753565334688094, "loss": 2.6987, "step": 5958 }, { "crossentropy": 2.6464747190475464, "epoch": 0.32404361184371516, "grad_norm": 0.03544475883245468, "grad_norm_var": 6.2652380483922e-05, "learning_rate": 0.005752334352963256, "loss": 2.6465, "step": 5959 }, { "crossentropy": 2.6950581073760986, "epoch": 0.32409799070121537, "grad_norm": 0.03516346961259842, "grad_norm_var": 6.290488754699675e-05, "learning_rate": 0.005751103324579342, "loss": 2.6951, "step": 5960 }, { "crossentropy": 2.5475140810012817, "epoch": 0.32415236955871557, "grad_norm": 0.0372818224132061, "grad_norm_var": 6.25919004702898e-05, "learning_rate": 0.0057498722496127, "loss": 2.5475, "step": 5961 }, { "crossentropy": 2.5905792713165283, "epoch": 0.3242067484162158, "grad_norm": 0.03871292248368263, "grad_norm_var": 6.251496510972042e-05, "learning_rate": 0.005748641128139681, "loss": 2.5906, "step": 5962 }, { "crossentropy": 2.653325915336609, "epoch": 0.324261127273716, "grad_norm": 0.06534582376480103, "grad_norm_var": 0.00010825967910339604, "learning_rate": 0.005747409960236637, "loss": 2.6533, "step": 5963 }, { "crossentropy": 2.615605354309082, "epoch": 0.3243155061312162, "grad_norm": 0.03662841394543648, "grad_norm_var": 5.4598678221945186e-05, "learning_rate": 0.005746178745979924, "loss": 2.6156, "step": 5964 }, { "crossentropy": 2.6184873580932617, "epoch": 0.3243698849887164, "grad_norm": 0.034627385437488556, "grad_norm_var": 5.46230929504186e-05, "learning_rate": 0.005744947485445901, "loss": 2.6185, "step": 5965 }, { "crossentropy": 2.6157166957855225, "epoch": 0.3244242638462166, "grad_norm": 0.03696582466363907, "grad_norm_var": 5.4471584246603734e-05, "learning_rate": 0.005743716178710928, "loss": 2.6157, "step": 5966 }, { "crossentropy": 2.669695258140564, "epoch": 0.3244786427037168, "grad_norm": 0.0353887714445591, "grad_norm_var": 5.4440910686874104e-05, "learning_rate": 0.005742484825851372, "loss": 2.6697, "step": 5967 }, { "crossentropy": 2.6824105978012085, "epoch": 0.324533021561217, "grad_norm": 0.03478572890162468, "grad_norm_var": 5.452258267361713e-05, "learning_rate": 0.005741253426943599, "loss": 2.6824, "step": 5968 }, { "crossentropy": 2.6710885763168335, "epoch": 0.3245874004187172, "grad_norm": 0.03527954965829849, "grad_norm_var": 5.449824990204118e-05, "learning_rate": 0.00574002198206398, "loss": 2.6711, "step": 5969 }, { "crossentropy": 2.696757197380066, "epoch": 0.3246417792762174, "grad_norm": 0.035200703889131546, "grad_norm_var": 5.473738471574151e-05, "learning_rate": 0.005738790491288888, "loss": 2.6968, "step": 5970 }, { "crossentropy": 2.669173240661621, "epoch": 0.3246961581337176, "grad_norm": 0.034396227449178696, "grad_norm_var": 5.514293796931944e-05, "learning_rate": 0.0057375589546946985, "loss": 2.6692, "step": 5971 }, { "crossentropy": 2.658963680267334, "epoch": 0.3247505369912178, "grad_norm": 0.03417636454105377, "grad_norm_var": 5.599324828360619e-05, "learning_rate": 0.005736327372357789, "loss": 2.659, "step": 5972 }, { "crossentropy": 2.5879955291748047, "epoch": 0.324804915848718, "grad_norm": 0.03497323766350746, "grad_norm_var": 5.640880395951786e-05, "learning_rate": 0.005735095744354542, "loss": 2.588, "step": 5973 }, { "crossentropy": 2.6350507736206055, "epoch": 0.3248592947062182, "grad_norm": 0.03426626697182655, "grad_norm_var": 5.702241063943499e-05, "learning_rate": 0.0057338640707613455, "loss": 2.6351, "step": 5974 }, { "crossentropy": 2.5097910165786743, "epoch": 0.3249136735637184, "grad_norm": 0.03422372043132782, "grad_norm_var": 5.743633165140079e-05, "learning_rate": 0.00573263235165458, "loss": 2.5098, "step": 5975 }, { "crossentropy": 2.701617121696472, "epoch": 0.3249680524212186, "grad_norm": 0.03519512712955475, "grad_norm_var": 5.742721341907697e-05, "learning_rate": 0.0057314005871106416, "loss": 2.7016, "step": 5976 }, { "crossentropy": 2.6175142526626587, "epoch": 0.3250224312787188, "grad_norm": 0.033654335886240005, "grad_norm_var": 5.8278006561572554e-05, "learning_rate": 0.0057301687772059184, "loss": 2.6175, "step": 5977 }, { "crossentropy": 2.6878459453582764, "epoch": 0.32507681013621903, "grad_norm": 0.03603241592645645, "grad_norm_var": 5.8155539660671254e-05, "learning_rate": 0.00572893692201681, "loss": 2.6878, "step": 5978 }, { "crossentropy": 2.7261343002319336, "epoch": 0.32513118899371923, "grad_norm": 0.035234853625297546, "grad_norm_var": 8.0392598261018e-07, "learning_rate": 0.005727705021619712, "loss": 2.7261, "step": 5979 }, { "crossentropy": 2.4147814512252808, "epoch": 0.32518556785121944, "grad_norm": 0.03451735898852348, "grad_norm_var": 6.422053914950191e-07, "learning_rate": 0.005726473076091028, "loss": 2.4148, "step": 5980 }, { "crossentropy": 2.5931124687194824, "epoch": 0.32523994670871964, "grad_norm": 0.035072244703769684, "grad_norm_var": 6.364842747831611e-07, "learning_rate": 0.005725241085507161, "loss": 2.5931, "step": 5981 }, { "crossentropy": 2.6060761213302612, "epoch": 0.32529432556621984, "grad_norm": 0.03768285736441612, "grad_norm_var": 8.603670279128464e-07, "learning_rate": 0.005724009049944518, "loss": 2.6061, "step": 5982 }, { "crossentropy": 2.6230428218841553, "epoch": 0.32534870442372005, "grad_norm": 0.03509325534105301, "grad_norm_var": 8.507031396866779e-07, "learning_rate": 0.0057227769694795086, "loss": 2.623, "step": 5983 }, { "crossentropy": 2.592112898826599, "epoch": 0.32540308328122025, "grad_norm": 0.036366287618875504, "grad_norm_var": 9.645244985310073e-07, "learning_rate": 0.005721544844188544, "loss": 2.5921, "step": 5984 }, { "crossentropy": 2.6845275163650513, "epoch": 0.32545746213872045, "grad_norm": 0.06108928099274635, "grad_norm_var": 4.32668832903576e-05, "learning_rate": 0.005720312674148044, "loss": 2.6845, "step": 5985 }, { "crossentropy": 2.632454752922058, "epoch": 0.32551184099622066, "grad_norm": 0.03398092836141586, "grad_norm_var": 4.360345587807341e-05, "learning_rate": 0.005719080459434421, "loss": 2.6325, "step": 5986 }, { "crossentropy": 2.6491122245788574, "epoch": 0.32556621985372086, "grad_norm": 0.034298207610845566, "grad_norm_var": 4.3633147943433526e-05, "learning_rate": 0.0057178482001241, "loss": 2.6491, "step": 5987 }, { "crossentropy": 2.6965514421463013, "epoch": 0.32562059871122107, "grad_norm": 0.0358261801302433, "grad_norm_var": 4.3266596128615996e-05, "learning_rate": 0.005716615896293501, "loss": 2.6966, "step": 5988 }, { "crossentropy": 2.6662176847457886, "epoch": 0.32567497756872127, "grad_norm": 0.032982658594846725, "grad_norm_var": 4.3977632616573556e-05, "learning_rate": 0.005715383548019053, "loss": 2.6662, "step": 5989 }, { "crossentropy": 2.629021406173706, "epoch": 0.3257293564262215, "grad_norm": 0.033861298114061356, "grad_norm_var": 4.411361094894117e-05, "learning_rate": 0.005714151155377184, "loss": 2.629, "step": 5990 }, { "crossentropy": 2.674821376800537, "epoch": 0.3257837352837217, "grad_norm": 0.03437923640012741, "grad_norm_var": 4.406648298262515e-05, "learning_rate": 0.005712918718444327, "loss": 2.6748, "step": 5991 }, { "crossentropy": 2.6837925910949707, "epoch": 0.3258381141412219, "grad_norm": 0.03732195869088173, "grad_norm_var": 4.3956716213123616e-05, "learning_rate": 0.005711686237296916, "loss": 2.6838, "step": 5992 }, { "crossentropy": 2.771053910255432, "epoch": 0.3258924929987221, "grad_norm": 0.03974311798810959, "grad_norm_var": 4.3791399771158215e-05, "learning_rate": 0.005710453712011386, "loss": 2.7711, "step": 5993 }, { "crossentropy": 2.574345111846924, "epoch": 0.3259468718562223, "grad_norm": 0.035141680389642715, "grad_norm_var": 4.396690437908727e-05, "learning_rate": 0.005709221142664181, "loss": 2.5743, "step": 5994 }, { "crossentropy": 2.668082356452942, "epoch": 0.3260012507137225, "grad_norm": 0.03672036528587341, "grad_norm_var": 4.374788537450195e-05, "learning_rate": 0.005707988529331741, "loss": 2.6681, "step": 5995 }, { "crossentropy": 2.6628708839416504, "epoch": 0.3260556295712227, "grad_norm": 0.03679923340678215, "grad_norm_var": 4.3278482611435375e-05, "learning_rate": 0.005706755872090514, "loss": 2.6629, "step": 5996 }, { "crossentropy": 2.591955542564392, "epoch": 0.3261100084287229, "grad_norm": 0.037996530532836914, "grad_norm_var": 4.295508749831955e-05, "learning_rate": 0.005705523171016945, "loss": 2.592, "step": 5997 }, { "crossentropy": 2.7400381565093994, "epoch": 0.3261643872862231, "grad_norm": 0.03354174271225929, "grad_norm_var": 4.39011844580441e-05, "learning_rate": 0.005704290426187488, "loss": 2.74, "step": 5998 }, { "crossentropy": 2.5827927589416504, "epoch": 0.3262187661437233, "grad_norm": 0.03452390059828758, "grad_norm_var": 4.4081100733658835e-05, "learning_rate": 0.005703057637678595, "loss": 2.5828, "step": 5999 }, { "crossentropy": 2.7242860794067383, "epoch": 0.3262731450012235, "grad_norm": 0.034158188849687576, "grad_norm_var": 4.4619743359193496e-05, "learning_rate": 0.005701824805566722, "loss": 2.7243, "step": 6000 }, { "crossentropy": 2.575577139854431, "epoch": 0.3263275238587237, "grad_norm": 0.034719571471214294, "grad_norm_var": 3.4629630387010096e-06, "learning_rate": 0.005700591929928329, "loss": 2.5756, "step": 6001 }, { "crossentropy": 2.704699993133545, "epoch": 0.3263819027162239, "grad_norm": 0.03535278141498566, "grad_norm_var": 3.325651437663159e-06, "learning_rate": 0.005699359010839878, "loss": 2.7047, "step": 6002 }, { "crossentropy": 2.626834988594055, "epoch": 0.3264362815737241, "grad_norm": 0.0392657071352005, "grad_norm_var": 4.09813553292193e-06, "learning_rate": 0.005698126048377834, "loss": 2.6268, "step": 6003 }, { "crossentropy": 2.693767547607422, "epoch": 0.3264906604312243, "grad_norm": 0.03850903734564781, "grad_norm_var": 4.5677732540937965e-06, "learning_rate": 0.005696893042618662, "loss": 2.6938, "step": 6004 }, { "crossentropy": 2.5541385412216187, "epoch": 0.3265450392887245, "grad_norm": 0.03561219573020935, "grad_norm_var": 3.9635726100881634e-06, "learning_rate": 0.005695659993638834, "loss": 2.5541, "step": 6005 }, { "crossentropy": 2.6583547592163086, "epoch": 0.32659941814622473, "grad_norm": 0.03451971337199211, "grad_norm_var": 3.7938789065429987e-06, "learning_rate": 0.005694426901514821, "loss": 2.6584, "step": 6006 }, { "crossentropy": 2.558905243873596, "epoch": 0.32665379700372493, "grad_norm": 0.032461028546094894, "grad_norm_var": 4.4752221454418794e-06, "learning_rate": 0.0056931937663231, "loss": 2.5589, "step": 6007 }, { "crossentropy": 2.5793451070785522, "epoch": 0.32670817586122514, "grad_norm": 0.03401724249124527, "grad_norm_var": 4.585951781621346e-06, "learning_rate": 0.005691960588140147, "loss": 2.5793, "step": 6008 }, { "crossentropy": 2.691780686378479, "epoch": 0.32676255471872534, "grad_norm": 0.03640296682715416, "grad_norm_var": 3.535008935271327e-06, "learning_rate": 0.005690727367042444, "loss": 2.6918, "step": 6009 }, { "crossentropy": 2.57978618144989, "epoch": 0.32681693357622554, "grad_norm": 0.03963993117213249, "grad_norm_var": 4.51944665802306e-06, "learning_rate": 0.005689494103106474, "loss": 2.5798, "step": 6010 }, { "crossentropy": 2.5725579261779785, "epoch": 0.32687131243372575, "grad_norm": 0.03864569216966629, "grad_norm_var": 4.9642879683730575e-06, "learning_rate": 0.005688260796408722, "loss": 2.5726, "step": 6011 }, { "crossentropy": 2.6729460954666138, "epoch": 0.32692569129122595, "grad_norm": 0.0353349931538105, "grad_norm_var": 4.944270964179984e-06, "learning_rate": 0.005687027447025677, "loss": 2.6729, "step": 6012 }, { "crossentropy": 2.574247121810913, "epoch": 0.32698007014872615, "grad_norm": 0.042479969561100006, "grad_norm_var": 7.442632157032778e-06, "learning_rate": 0.005685794055033829, "loss": 2.5742, "step": 6013 }, { "crossentropy": 2.6581090688705444, "epoch": 0.32703444900622636, "grad_norm": 0.03553325682878494, "grad_norm_var": 6.984908804507269e-06, "learning_rate": 0.005684560620509675, "loss": 2.6581, "step": 6014 }, { "crossentropy": 2.781859874725342, "epoch": 0.32708882786372656, "grad_norm": 0.03488372638821602, "grad_norm_var": 6.906661467035585e-06, "learning_rate": 0.005683327143529708, "loss": 2.7819, "step": 6015 }, { "crossentropy": 2.5767323970794678, "epoch": 0.32714320672122676, "grad_norm": 0.033604152500629425, "grad_norm_var": 7.087463165921365e-06, "learning_rate": 0.00568209362417043, "loss": 2.5767, "step": 6016 }, { "crossentropy": 2.649420380592346, "epoch": 0.32719758557872697, "grad_norm": 0.09046842902898788, "grad_norm_var": 0.00018950126009036148, "learning_rate": 0.005680860062508342, "loss": 2.6494, "step": 6017 }, { "crossentropy": 2.610624313354492, "epoch": 0.32725196443622717, "grad_norm": 0.036556676030159, "grad_norm_var": 0.00018887867490484093, "learning_rate": 0.005679626458619947, "loss": 2.6106, "step": 6018 }, { "crossentropy": 2.642198324203491, "epoch": 0.3273063432937274, "grad_norm": 0.041634537279605865, "grad_norm_var": 0.00018903823184086653, "learning_rate": 0.005678392812581751, "loss": 2.6422, "step": 6019 }, { "crossentropy": 2.6649701595306396, "epoch": 0.3273607221512276, "grad_norm": 0.03370868042111397, "grad_norm_var": 0.00019144487589703758, "learning_rate": 0.005677159124470267, "loss": 2.665, "step": 6020 }, { "crossentropy": 2.5835766792297363, "epoch": 0.3274151010087278, "grad_norm": 0.03296026214957237, "grad_norm_var": 0.00019333653463220006, "learning_rate": 0.005675925394362004, "loss": 2.5836, "step": 6021 }, { "crossentropy": 2.532701015472412, "epoch": 0.327469479866228, "grad_norm": 0.033878277987241745, "grad_norm_var": 0.00019379273742560116, "learning_rate": 0.00567469162233348, "loss": 2.5327, "step": 6022 }, { "crossentropy": 2.614490509033203, "epoch": 0.3275238587237282, "grad_norm": 0.034467704594135284, "grad_norm_var": 0.0001921575757851572, "learning_rate": 0.005673457808461208, "loss": 2.6145, "step": 6023 }, { "crossentropy": 2.599114775657654, "epoch": 0.3275782375812284, "grad_norm": 0.037894848734140396, "grad_norm_var": 0.00019019102929898588, "learning_rate": 0.0056722239528217136, "loss": 2.5991, "step": 6024 }, { "crossentropy": 2.5200129747390747, "epoch": 0.3276326164387286, "grad_norm": 0.03410419449210167, "grad_norm_var": 0.0001915872926598244, "learning_rate": 0.005670990055491514, "loss": 2.52, "step": 6025 }, { "crossentropy": 2.6511775255203247, "epoch": 0.3276869952962288, "grad_norm": 0.03303039073944092, "grad_norm_var": 0.00019440339680318046, "learning_rate": 0.005669756116547138, "loss": 2.6512, "step": 6026 }, { "crossentropy": 2.6055333614349365, "epoch": 0.327741374153729, "grad_norm": 0.039060674607753754, "grad_norm_var": 0.00019437662231244068, "learning_rate": 0.005668522136065113, "loss": 2.6055, "step": 6027 }, { "crossentropy": 2.6344412565231323, "epoch": 0.3277957530112292, "grad_norm": 0.03557950630784035, "grad_norm_var": 0.00019424946113698598, "learning_rate": 0.005667288114121967, "loss": 2.6344, "step": 6028 }, { "crossentropy": 2.7363659143447876, "epoch": 0.3278501318687294, "grad_norm": 0.038145072758197784, "grad_norm_var": 0.00019362370074652295, "learning_rate": 0.005666054050794234, "loss": 2.7364, "step": 6029 }, { "crossentropy": 2.573060631752014, "epoch": 0.3279045107262296, "grad_norm": 0.03930680826306343, "grad_norm_var": 0.00019272192742983025, "learning_rate": 0.005664819946158451, "loss": 2.5731, "step": 6030 }, { "crossentropy": 2.6635583639144897, "epoch": 0.3279588895837299, "grad_norm": 0.03499879315495491, "grad_norm_var": 0.0001926545353950276, "learning_rate": 0.005663585800291153, "loss": 2.6636, "step": 6031 }, { "crossentropy": 2.5503894090652466, "epoch": 0.3280132684412301, "grad_norm": 0.0332789309322834, "grad_norm_var": 0.0001929097577115702, "learning_rate": 0.005662351613268882, "loss": 2.5504, "step": 6032 }, { "crossentropy": 2.644614100456238, "epoch": 0.3280676472987303, "grad_norm": 0.03419103845953941, "grad_norm_var": 7.034612462769365e-06, "learning_rate": 0.005661117385168184, "loss": 2.6446, "step": 6033 }, { "crossentropy": 2.7670748233795166, "epoch": 0.3281220261562305, "grad_norm": 0.042769938707351685, "grad_norm_var": 1.0074445513990939e-05, "learning_rate": 0.005659883116065601, "loss": 2.7671, "step": 6034 }, { "crossentropy": 2.7627705335617065, "epoch": 0.3281764050137307, "grad_norm": 0.03490089252591133, "grad_norm_var": 8.018405578470665e-06, "learning_rate": 0.0056586488060376815, "loss": 2.7628, "step": 6035 }, { "crossentropy": 2.5970120429992676, "epoch": 0.3282307838712309, "grad_norm": 0.03256833553314209, "grad_norm_var": 8.41267711285491e-06, "learning_rate": 0.005657414455160979, "loss": 2.597, "step": 6036 }, { "crossentropy": 2.587480664253235, "epoch": 0.3282851627287311, "grad_norm": 0.032566118985414505, "grad_norm_var": 8.566154980591683e-06, "learning_rate": 0.005656180063512044, "loss": 2.5875, "step": 6037 }, { "crossentropy": 2.615099787712097, "epoch": 0.3283395415862313, "grad_norm": 0.03261581063270569, "grad_norm_var": 8.967594183190557e-06, "learning_rate": 0.005654945631167433, "loss": 2.6151, "step": 6038 }, { "crossentropy": 2.4989736080169678, "epoch": 0.3283939204437315, "grad_norm": 0.033544473350048065, "grad_norm_var": 9.159318694026916e-06, "learning_rate": 0.005653711158203704, "loss": 2.499, "step": 6039 }, { "crossentropy": 2.703913927078247, "epoch": 0.3284482993012317, "grad_norm": 0.03355195000767708, "grad_norm_var": 8.971487921886843e-06, "learning_rate": 0.005652476644697419, "loss": 2.7039, "step": 6040 }, { "crossentropy": 2.735809803009033, "epoch": 0.3285026781587319, "grad_norm": 0.036544766277074814, "grad_norm_var": 8.966575677368284e-06, "learning_rate": 0.005651242090725141, "loss": 2.7358, "step": 6041 }, { "crossentropy": 2.64785099029541, "epoch": 0.3285570570162321, "grad_norm": 0.035885971039533615, "grad_norm_var": 8.56797480944596e-06, "learning_rate": 0.005650007496363434, "loss": 2.6479, "step": 6042 }, { "crossentropy": 2.6323060989379883, "epoch": 0.3286114358737323, "grad_norm": 0.037480127066373825, "grad_norm_var": 7.993609027361938e-06, "learning_rate": 0.00564877286168887, "loss": 2.6323, "step": 6043 }, { "crossentropy": 2.684743881225586, "epoch": 0.3286658147312325, "grad_norm": 0.038520269095897675, "grad_norm_var": 8.567040318457897e-06, "learning_rate": 0.005647538186778018, "loss": 2.6847, "step": 6044 }, { "crossentropy": 2.681432604789734, "epoch": 0.3287201935887327, "grad_norm": 0.0329350121319294, "grad_norm_var": 8.550697545950796e-06, "learning_rate": 0.00564630347170745, "loss": 2.6814, "step": 6045 }, { "crossentropy": 2.6670643091201782, "epoch": 0.3287745724462329, "grad_norm": 0.034125205129384995, "grad_norm_var": 7.497637208661994e-06, "learning_rate": 0.005645068716553744, "loss": 2.6671, "step": 6046 }, { "crossentropy": 2.600866675376892, "epoch": 0.32882895130373313, "grad_norm": 0.03340949863195419, "grad_norm_var": 7.662084843488431e-06, "learning_rate": 0.005643833921393477, "loss": 2.6009, "step": 6047 }, { "crossentropy": 2.637120246887207, "epoch": 0.32888333016123333, "grad_norm": 0.03475078195333481, "grad_norm_var": 7.473362131672247e-06, "learning_rate": 0.005642599086303233, "loss": 2.6371, "step": 6048 }, { "crossentropy": 2.5411750078201294, "epoch": 0.32893770901873354, "grad_norm": 0.08007048815488815, "grad_norm_var": 0.0001339447673024757, "learning_rate": 0.00564136421135959, "loss": 2.5412, "step": 6049 }, { "crossentropy": 2.628675937652588, "epoch": 0.32899208787623374, "grad_norm": 0.03467775881290436, "grad_norm_var": 0.00013277220835293418, "learning_rate": 0.005640129296639137, "loss": 2.6287, "step": 6050 }, { "crossentropy": 2.6448429822921753, "epoch": 0.32904646673373394, "grad_norm": 0.03442122042179108, "grad_norm_var": 0.0001329454128320084, "learning_rate": 0.005638894342218462, "loss": 2.6448, "step": 6051 }, { "crossentropy": 2.594226837158203, "epoch": 0.32910084559123415, "grad_norm": 0.03356967493891716, "grad_norm_var": 0.0001323691055122524, "learning_rate": 0.005637659348174156, "loss": 2.5942, "step": 6052 }, { "crossentropy": 2.7123953104019165, "epoch": 0.32915522444873435, "grad_norm": 0.03369595855474472, "grad_norm_var": 0.00013171815384790437, "learning_rate": 0.005636424314582811, "loss": 2.7124, "step": 6053 }, { "crossentropy": 2.691219925880432, "epoch": 0.32920960330623456, "grad_norm": 0.03615731745958328, "grad_norm_var": 0.00013020166011711847, "learning_rate": 0.005635189241521023, "loss": 2.6912, "step": 6054 }, { "crossentropy": 2.729656934738159, "epoch": 0.32926398216373476, "grad_norm": 0.036593325436115265, "grad_norm_var": 0.0001290897816489125, "learning_rate": 0.005633954129065393, "loss": 2.7297, "step": 6055 }, { "crossentropy": 2.755439043045044, "epoch": 0.32931836102123496, "grad_norm": 0.04155668616294861, "grad_norm_var": 0.00012845456555617764, "learning_rate": 0.005632718977292516, "loss": 2.7554, "step": 6056 }, { "crossentropy": 2.698719620704651, "epoch": 0.32937273987873517, "grad_norm": 0.03651374205946922, "grad_norm_var": 0.00012846229846715033, "learning_rate": 0.005631483786279, "loss": 2.6987, "step": 6057 }, { "crossentropy": 2.61827290058136, "epoch": 0.32942711873623537, "grad_norm": 0.033478546887636185, "grad_norm_var": 0.00012963076548350345, "learning_rate": 0.005630248556101447, "loss": 2.6183, "step": 6058 }, { "crossentropy": 2.609615683555603, "epoch": 0.3294814975937356, "grad_norm": 0.035510819405317307, "grad_norm_var": 0.0001300745717316587, "learning_rate": 0.005629013286836467, "loss": 2.6096, "step": 6059 }, { "crossentropy": 2.66163969039917, "epoch": 0.3295358764512358, "grad_norm": 0.03724338859319687, "grad_norm_var": 0.0001301090326452444, "learning_rate": 0.005627777978560669, "loss": 2.6616, "step": 6060 }, { "crossentropy": 2.691342353820801, "epoch": 0.329590255308736, "grad_norm": 0.036617692559957504, "grad_norm_var": 0.00012844786366099253, "learning_rate": 0.005626542631350666, "loss": 2.6913, "step": 6061 }, { "crossentropy": 2.5362014770507812, "epoch": 0.3296446341662362, "grad_norm": 0.04017839580774307, "grad_norm_var": 0.00012738906512204794, "learning_rate": 0.005625307245283074, "loss": 2.5362, "step": 6062 }, { "crossentropy": 2.693200945854187, "epoch": 0.3296990130237364, "grad_norm": 0.04049365967512131, "grad_norm_var": 0.00012557303395039764, "learning_rate": 0.005624071820434507, "loss": 2.6932, "step": 6063 }, { "crossentropy": 2.6363552808761597, "epoch": 0.3297533918812366, "grad_norm": 0.036078110337257385, "grad_norm_var": 0.00012491421469783127, "learning_rate": 0.00562283635688159, "loss": 2.6364, "step": 6064 }, { "crossentropy": 2.685207962989807, "epoch": 0.3298077707387368, "grad_norm": 0.03392846882343292, "grad_norm_var": 6.403867336193745e-06, "learning_rate": 0.005621600854700941, "loss": 2.6852, "step": 6065 }, { "crossentropy": 2.6095263957977295, "epoch": 0.329862149596237, "grad_norm": 0.032970014959573746, "grad_norm_var": 6.954311658859356e-06, "learning_rate": 0.005620365313969189, "loss": 2.6095, "step": 6066 }, { "crossentropy": 2.685228109359741, "epoch": 0.3299165284537372, "grad_norm": 0.034109894186258316, "grad_norm_var": 7.033706184670213e-06, "learning_rate": 0.005619129734762956, "loss": 2.6852, "step": 6067 }, { "crossentropy": 2.618186593055725, "epoch": 0.3299709073112374, "grad_norm": 0.032895736396312714, "grad_norm_var": 7.295617998838664e-06, "learning_rate": 0.005617894117158876, "loss": 2.6182, "step": 6068 }, { "crossentropy": 2.6429858207702637, "epoch": 0.3300252861687376, "grad_norm": 0.034500602632761, "grad_norm_var": 7.075336021996186e-06, "learning_rate": 0.005616658461233578, "loss": 2.643, "step": 6069 }, { "crossentropy": 2.658658742904663, "epoch": 0.3300796650262378, "grad_norm": 0.03515041619539261, "grad_norm_var": 7.141297129588791e-06, "learning_rate": 0.005615422767063697, "loss": 2.6587, "step": 6070 }, { "crossentropy": 2.634189009666443, "epoch": 0.330134043883738, "grad_norm": 0.03463838994503021, "grad_norm_var": 7.2551445789200674e-06, "learning_rate": 0.005614187034725869, "loss": 2.6342, "step": 6071 }, { "crossentropy": 2.5996090173721313, "epoch": 0.3301884227412382, "grad_norm": 0.03316865861415863, "grad_norm_var": 5.428497284931608e-06, "learning_rate": 0.005612951264296736, "loss": 2.5996, "step": 6072 }, { "crossentropy": 2.6349785327911377, "epoch": 0.3302428015987384, "grad_norm": 0.03397293761372566, "grad_norm_var": 5.477464885047558e-06, "learning_rate": 0.005611715455852936, "loss": 2.635, "step": 6073 }, { "crossentropy": 2.6614818572998047, "epoch": 0.3302971804562386, "grad_norm": 0.032231852412223816, "grad_norm_var": 5.878788210086916e-06, "learning_rate": 0.005610479609471113, "loss": 2.6615, "step": 6074 }, { "crossentropy": 2.6409215927124023, "epoch": 0.33035155931373883, "grad_norm": 0.034299951046705246, "grad_norm_var": 5.925179002294612e-06, "learning_rate": 0.005609243725227914, "loss": 2.6409, "step": 6075 }, { "crossentropy": 2.5947000980377197, "epoch": 0.33040593817123903, "grad_norm": 0.032524462789297104, "grad_norm_var": 6.002879884384598e-06, "learning_rate": 0.005608007803199987, "loss": 2.5947, "step": 6076 }, { "crossentropy": 2.6739503145217896, "epoch": 0.33046031702873924, "grad_norm": 0.037522703409194946, "grad_norm_var": 6.266173320669763e-06, "learning_rate": 0.0056067718434639835, "loss": 2.674, "step": 6077 }, { "crossentropy": 2.6431716680526733, "epoch": 0.33051469588623944, "grad_norm": 0.03871815279126167, "grad_norm_var": 5.374959561761129e-06, "learning_rate": 0.005605535846096556, "loss": 2.6432, "step": 6078 }, { "crossentropy": 2.65591037273407, "epoch": 0.33056907474373964, "grad_norm": 0.03831253573298454, "grad_norm_var": 4.023823922427496e-06, "learning_rate": 0.00560429981117436, "loss": 2.6559, "step": 6079 }, { "crossentropy": 2.6070334911346436, "epoch": 0.33062345360123985, "grad_norm": 0.03383777290582657, "grad_norm_var": 3.922554196421809e-06, "learning_rate": 0.0056030637387740525, "loss": 2.607, "step": 6080 }, { "crossentropy": 2.6358522176742554, "epoch": 0.33067783245874005, "grad_norm": 0.03211352229118347, "grad_norm_var": 4.278573325811052e-06, "learning_rate": 0.005601827628972294, "loss": 2.6359, "step": 6081 }, { "crossentropy": 2.6552945375442505, "epoch": 0.33073221131624025, "grad_norm": 0.035890839993953705, "grad_norm_var": 4.241060760714656e-06, "learning_rate": 0.005600591481845747, "loss": 2.6553, "step": 6082 }, { "crossentropy": 2.6081355810165405, "epoch": 0.33078659017374046, "grad_norm": 0.03627369925379753, "grad_norm_var": 4.387089041094951e-06, "learning_rate": 0.0055993552974710785, "loss": 2.6081, "step": 6083 }, { "crossentropy": 2.5271929502487183, "epoch": 0.33084096903124066, "grad_norm": 0.03541190177202225, "grad_norm_var": 4.1596022911755475e-06, "learning_rate": 0.00559811907592495, "loss": 2.5272, "step": 6084 }, { "crossentropy": 2.4989609718322754, "epoch": 0.33089534788874087, "grad_norm": 0.03452177718281746, "grad_norm_var": 4.158472990988387e-06, "learning_rate": 0.005596882817284036, "loss": 2.499, "step": 6085 }, { "crossentropy": 2.7257237434387207, "epoch": 0.33094972674624107, "grad_norm": 0.042115114629268646, "grad_norm_var": 7.411702401100411e-06, "learning_rate": 0.0055956465216250055, "loss": 2.7257, "step": 6086 }, { "crossentropy": 2.5633946657180786, "epoch": 0.33100410560374127, "grad_norm": 0.03440828621387482, "grad_norm_var": 7.436756500420599e-06, "learning_rate": 0.005594410189024532, "loss": 2.5634, "step": 6087 }, { "crossentropy": 2.704718232154846, "epoch": 0.3310584844612415, "grad_norm": 0.034039340913295746, "grad_norm_var": 7.232904280743818e-06, "learning_rate": 0.0055931738195592945, "loss": 2.7047, "step": 6088 }, { "crossentropy": 2.778470754623413, "epoch": 0.3311128633187417, "grad_norm": 0.035244137048721313, "grad_norm_var": 7.094196789035149e-06, "learning_rate": 0.005591937413305968, "loss": 2.7785, "step": 6089 }, { "crossentropy": 2.6786117553710938, "epoch": 0.3311672421762419, "grad_norm": 0.03655899688601494, "grad_norm_var": 6.398145734567926e-06, "learning_rate": 0.0055907009703412384, "loss": 2.6786, "step": 6090 }, { "crossentropy": 2.6624585390090942, "epoch": 0.3312216210337421, "grad_norm": 0.04203127324581146, "grad_norm_var": 8.652530890131589e-06, "learning_rate": 0.005589464490741781, "loss": 2.6625, "step": 6091 }, { "crossentropy": 2.701390266418457, "epoch": 0.3312759998912423, "grad_norm": 0.041529037058353424, "grad_norm_var": 9.28294072855314e-06, "learning_rate": 0.005588227974584288, "loss": 2.7014, "step": 6092 }, { "crossentropy": 2.6061153411865234, "epoch": 0.3313303787487425, "grad_norm": 0.03936178982257843, "grad_norm_var": 9.675697729040883e-06, "learning_rate": 0.005586991421945444, "loss": 2.6061, "step": 6093 }, { "crossentropy": 2.6728931665420532, "epoch": 0.3313847576062427, "grad_norm": 0.03907616063952446, "grad_norm_var": 9.770591664554479e-06, "learning_rate": 0.00558575483290194, "loss": 2.6729, "step": 6094 }, { "crossentropy": 2.5632489919662476, "epoch": 0.3314391364637429, "grad_norm": 0.0373477041721344, "grad_norm_var": 9.649680987986603e-06, "learning_rate": 0.005584518207530464, "loss": 2.5632, "step": 6095 }, { "crossentropy": 2.626034140586853, "epoch": 0.3314935153212431, "grad_norm": 0.05047351494431496, "grad_norm_var": 2.0242639141127874e-05, "learning_rate": 0.005583281545907718, "loss": 2.626, "step": 6096 }, { "crossentropy": 2.765756368637085, "epoch": 0.3315478941787433, "grad_norm": 0.03476499766111374, "grad_norm_var": 1.863640465786052e-05, "learning_rate": 0.005582044848110392, "loss": 2.7658, "step": 6097 }, { "crossentropy": 2.5972719192504883, "epoch": 0.3316022730362435, "grad_norm": 0.04277973249554634, "grad_norm_var": 1.9604957812193132e-05, "learning_rate": 0.005580808114215188, "loss": 2.5973, "step": 6098 }, { "crossentropy": 2.6733659505844116, "epoch": 0.3316566518937437, "grad_norm": 0.03673034533858299, "grad_norm_var": 1.9482677769601616e-05, "learning_rate": 0.005579571344298806, "loss": 2.6734, "step": 6099 }, { "crossentropy": 2.5805959701538086, "epoch": 0.3317110307512439, "grad_norm": 0.03663231059908867, "grad_norm_var": 1.9069258020071715e-05, "learning_rate": 0.00557833453843795, "loss": 2.5806, "step": 6100 }, { "crossentropy": 2.626294493675232, "epoch": 0.3317654096087441, "grad_norm": 0.0359036959707737, "grad_norm_var": 1.8437010677823233e-05, "learning_rate": 0.005577097696709324, "loss": 2.6263, "step": 6101 }, { "crossentropy": 2.7270724773406982, "epoch": 0.3318197884662443, "grad_norm": 0.039290186017751694, "grad_norm_var": 1.764465511839101e-05, "learning_rate": 0.005575860819189637, "loss": 2.7271, "step": 6102 }, { "crossentropy": 2.643159866333008, "epoch": 0.33187416732374453, "grad_norm": 0.037322960793972015, "grad_norm_var": 1.6581312116083446e-05, "learning_rate": 0.0055746239059556, "loss": 2.6432, "step": 6103 }, { "crossentropy": 2.5879791975021362, "epoch": 0.33192854618124473, "grad_norm": 0.03801898658275604, "grad_norm_var": 1.5101899185265832e-05, "learning_rate": 0.005573386957083924, "loss": 2.588, "step": 6104 }, { "crossentropy": 2.6236976385116577, "epoch": 0.33198292503874494, "grad_norm": 0.034676674753427505, "grad_norm_var": 1.5401782218932233e-05, "learning_rate": 0.0055721499726513225, "loss": 2.6237, "step": 6105 }, { "crossentropy": 2.6478545665740967, "epoch": 0.33203730389624514, "grad_norm": 0.034428514540195465, "grad_norm_var": 1.6352208758145063e-05, "learning_rate": 0.005570912952734513, "loss": 2.6479, "step": 6106 }, { "crossentropy": 2.6499682664871216, "epoch": 0.33209168275374534, "grad_norm": 0.03427187353372574, "grad_norm_var": 1.6744253110363937e-05, "learning_rate": 0.005569675897410216, "loss": 2.65, "step": 6107 }, { "crossentropy": 2.49715793132782, "epoch": 0.33214606161124555, "grad_norm": 0.03551166132092476, "grad_norm_var": 1.6406989731736615e-05, "learning_rate": 0.00556843880675515, "loss": 2.4972, "step": 6108 }, { "crossentropy": 2.6915335655212402, "epoch": 0.33220044046874575, "grad_norm": 0.036102909594774246, "grad_norm_var": 1.644077545917532e-05, "learning_rate": 0.005567201680846038, "loss": 2.6915, "step": 6109 }, { "crossentropy": 2.5545209646224976, "epoch": 0.33225481932624595, "grad_norm": 0.033222004771232605, "grad_norm_var": 1.751500465728828e-05, "learning_rate": 0.005565964519759607, "loss": 2.5545, "step": 6110 }, { "crossentropy": 2.5761473178863525, "epoch": 0.33230919818374616, "grad_norm": 0.0362938791513443, "grad_norm_var": 1.7583665695569626e-05, "learning_rate": 0.005564727323572586, "loss": 2.5761, "step": 6111 }, { "crossentropy": 2.7182403802871704, "epoch": 0.33236357704124636, "grad_norm": 0.0358193963766098, "grad_norm_var": 5.2197291953416655e-06, "learning_rate": 0.0055634900923617015, "loss": 2.7182, "step": 6112 }, { "crossentropy": 2.6448174715042114, "epoch": 0.33241795589874656, "grad_norm": 0.03276924416422844, "grad_norm_var": 5.893268029234268e-06, "learning_rate": 0.005562252826203686, "loss": 2.6448, "step": 6113 }, { "crossentropy": 2.701190233230591, "epoch": 0.33247233475624677, "grad_norm": 0.03540264070034027, "grad_norm_var": 2.8580155288539244e-06, "learning_rate": 0.0055610155251752755, "loss": 2.7012, "step": 6114 }, { "crossentropy": 2.5843392610549927, "epoch": 0.33252671361374697, "grad_norm": 0.038388941437006, "grad_norm_var": 3.241257819282713e-06, "learning_rate": 0.0055597781893532055, "loss": 2.5843, "step": 6115 }, { "crossentropy": 2.574113965034485, "epoch": 0.3325810924712472, "grad_norm": 0.03617943078279495, "grad_norm_var": 3.2085580448148443e-06, "learning_rate": 0.0055585408188142124, "loss": 2.5741, "step": 6116 }, { "crossentropy": 2.6417049169540405, "epoch": 0.3326354713287474, "grad_norm": 0.03461533784866333, "grad_norm_var": 3.303107972714668e-06, "learning_rate": 0.00555730341363504, "loss": 2.6417, "step": 6117 }, { "crossentropy": 2.7446922063827515, "epoch": 0.3326898501862476, "grad_norm": 0.0368175134062767, "grad_norm_var": 2.524560433603678e-06, "learning_rate": 0.00555606597389243, "loss": 2.7447, "step": 6118 }, { "crossentropy": 2.6823076009750366, "epoch": 0.3327442290437478, "grad_norm": 0.033535927534103394, "grad_norm_var": 2.5585599922873954e-06, "learning_rate": 0.005554828499663126, "loss": 2.6823, "step": 6119 }, { "crossentropy": 2.629046678543091, "epoch": 0.332798607901248, "grad_norm": 0.03576023876667023, "grad_norm_var": 2.0821855565196385e-06, "learning_rate": 0.0055535909910238756, "loss": 2.629, "step": 6120 }, { "crossentropy": 2.5810376405715942, "epoch": 0.3328529867587482, "grad_norm": 0.03678376227617264, "grad_norm_var": 2.2021800433683996e-06, "learning_rate": 0.005552353448051427, "loss": 2.581, "step": 6121 }, { "crossentropy": 2.6274741888046265, "epoch": 0.3329073656162484, "grad_norm": 0.03954713046550751, "grad_norm_var": 3.197860715255472e-06, "learning_rate": 0.005551115870822535, "loss": 2.6275, "step": 6122 }, { "crossentropy": 2.659738063812256, "epoch": 0.3329617444737486, "grad_norm": 0.036542605608701706, "grad_norm_var": 3.0911093422499846e-06, "learning_rate": 0.005549878259413947, "loss": 2.6597, "step": 6123 }, { "crossentropy": 2.6283814907073975, "epoch": 0.3330161233312488, "grad_norm": 0.04174322634935379, "grad_norm_var": 5.2529791826202064e-06, "learning_rate": 0.005548640613902425, "loss": 2.6284, "step": 6124 }, { "crossentropy": 2.5529879331588745, "epoch": 0.333070502188749, "grad_norm": 0.03731722757220268, "grad_norm_var": 5.3261393019585676e-06, "learning_rate": 0.005547402934364723, "loss": 2.553, "step": 6125 }, { "crossentropy": 2.5663535594940186, "epoch": 0.3331248810462492, "grad_norm": 0.03273271396756172, "grad_norm_var": 5.541656047541807e-06, "learning_rate": 0.0055461652208776005, "loss": 2.5664, "step": 6126 }, { "crossentropy": 2.6157795190811157, "epoch": 0.3331792599037494, "grad_norm": 0.034763239324092865, "grad_norm_var": 5.682308437863527e-06, "learning_rate": 0.00554492747351782, "loss": 2.6158, "step": 6127 }, { "crossentropy": 2.63973331451416, "epoch": 0.3332336387612496, "grad_norm": 0.03591363877058029, "grad_norm_var": 5.678459094245134e-06, "learning_rate": 0.005543689692362146, "loss": 2.6397, "step": 6128 }, { "crossentropy": 2.6354600191116333, "epoch": 0.3332880176187498, "grad_norm": 0.03373711183667183, "grad_norm_var": 5.297394228291561e-06, "learning_rate": 0.005542451877487345, "loss": 2.6355, "step": 6129 }, { "crossentropy": 2.540237307548523, "epoch": 0.33334239647625, "grad_norm": 0.035135962069034576, "grad_norm_var": 5.331481369126532e-06, "learning_rate": 0.005541214028970183, "loss": 2.5402, "step": 6130 }, { "crossentropy": 2.655646562576294, "epoch": 0.33339677533375023, "grad_norm": 0.03480752184987068, "grad_norm_var": 5.097244349117012e-06, "learning_rate": 0.005539976146887433, "loss": 2.6556, "step": 6131 }, { "crossentropy": 2.6065540313720703, "epoch": 0.33345115419125043, "grad_norm": 0.03379940614104271, "grad_norm_var": 5.3929997121217784e-06, "learning_rate": 0.005538738231315865, "loss": 2.6066, "step": 6132 }, { "crossentropy": 2.6433109045028687, "epoch": 0.33350553304875064, "grad_norm": 0.03586331382393837, "grad_norm_var": 5.2853894644307986e-06, "learning_rate": 0.005537500282332255, "loss": 2.6433, "step": 6133 }, { "crossentropy": 2.712325930595398, "epoch": 0.33355991190625084, "grad_norm": 0.0347517691552639, "grad_norm_var": 5.3062777046954114e-06, "learning_rate": 0.005536262300013377, "loss": 2.7123, "step": 6134 }, { "crossentropy": 2.557325839996338, "epoch": 0.33361429076375104, "grad_norm": 0.036365289241075516, "grad_norm_var": 4.954028251465993e-06, "learning_rate": 0.0055350242844360124, "loss": 2.5573, "step": 6135 }, { "crossentropy": 2.587536931037903, "epoch": 0.33366866962125125, "grad_norm": 0.03386792540550232, "grad_norm_var": 5.231452226226764e-06, "learning_rate": 0.0055337862356769395, "loss": 2.5875, "step": 6136 }, { "crossentropy": 2.705939292907715, "epoch": 0.33372304847875145, "grad_norm": 0.03823487460613251, "grad_norm_var": 5.542857244438386e-06, "learning_rate": 0.005532548153812943, "loss": 2.7059, "step": 6137 }, { "crossentropy": 2.7283307313919067, "epoch": 0.33377742733625165, "grad_norm": 0.03274582698941231, "grad_norm_var": 5.167575193803832e-06, "learning_rate": 0.005531310038920805, "loss": 2.7283, "step": 6138 }, { "crossentropy": 2.649546504020691, "epoch": 0.33383180619375186, "grad_norm": 0.03334615379571915, "grad_norm_var": 5.370372789006e-06, "learning_rate": 0.005530071891077315, "loss": 2.6495, "step": 6139 }, { "crossentropy": 2.7353410720825195, "epoch": 0.33388618505125206, "grad_norm": 0.03547057881951332, "grad_norm_var": 2.457691687727618e-06, "learning_rate": 0.005528833710359261, "loss": 2.7353, "step": 6140 }, { "crossentropy": 2.531428813934326, "epoch": 0.33394056390875226, "grad_norm": 0.04030252620577812, "grad_norm_var": 3.965586614649667e-06, "learning_rate": 0.005527595496843431, "loss": 2.5314, "step": 6141 }, { "crossentropy": 2.67263925075531, "epoch": 0.33399494276625247, "grad_norm": 0.03842367231845856, "grad_norm_var": 4.182211005154754e-06, "learning_rate": 0.005526357250606623, "loss": 2.6726, "step": 6142 }, { "crossentropy": 2.6529457569122314, "epoch": 0.33404932162375267, "grad_norm": 0.03366236388683319, "grad_norm_var": 4.361777980596039e-06, "learning_rate": 0.005525118971725627, "loss": 2.6529, "step": 6143 }, { "crossentropy": 2.5983282327651978, "epoch": 0.3341037004812529, "grad_norm": 0.03628847002983093, "grad_norm_var": 4.396142260115696e-06, "learning_rate": 0.005523880660277242, "loss": 2.5983, "step": 6144 }, { "crossentropy": 2.636562943458557, "epoch": 0.3341580793387531, "grad_norm": 0.03799358755350113, "grad_norm_var": 4.5704659070157826e-06, "learning_rate": 0.0055226423163382676, "loss": 2.6366, "step": 6145 }, { "crossentropy": 2.6490893363952637, "epoch": 0.3342124581962533, "grad_norm": 0.03979258984327316, "grad_norm_var": 5.580987487914987e-06, "learning_rate": 0.005521403939985505, "loss": 2.6491, "step": 6146 }, { "crossentropy": 2.6658259630203247, "epoch": 0.3342668370537535, "grad_norm": 0.03522038087248802, "grad_norm_var": 5.5269749399953124e-06, "learning_rate": 0.005520165531295754, "loss": 2.6658, "step": 6147 }, { "crossentropy": 2.6480830907821655, "epoch": 0.3343212159112537, "grad_norm": 0.032723281532526016, "grad_norm_var": 5.91625552904725e-06, "learning_rate": 0.005518927090345822, "loss": 2.6481, "step": 6148 }, { "crossentropy": 2.661821961402893, "epoch": 0.3343755947687539, "grad_norm": 0.034497812390327454, "grad_norm_var": 6.046898099554218e-06, "learning_rate": 0.005517688617212517, "loss": 2.6618, "step": 6149 }, { "crossentropy": 2.610864996910095, "epoch": 0.3344299736262541, "grad_norm": 0.038653723895549774, "grad_norm_var": 6.424277746008613e-06, "learning_rate": 0.005516450111972645, "loss": 2.6109, "step": 6150 }, { "crossentropy": 2.5315935611724854, "epoch": 0.3344843524837543, "grad_norm": 0.0377299003303051, "grad_norm_var": 6.589056282195934e-06, "learning_rate": 0.0055152115747030194, "loss": 2.5316, "step": 6151 }, { "crossentropy": 2.6072235107421875, "epoch": 0.3345387313412545, "grad_norm": 0.03529016673564911, "grad_norm_var": 6.276162573128537e-06, "learning_rate": 0.005513973005480453, "loss": 2.6072, "step": 6152 }, { "crossentropy": 2.60586678981781, "epoch": 0.3345931101987547, "grad_norm": 0.039123617112636566, "grad_norm_var": 6.5579506249596275e-06, "learning_rate": 0.00551273440438176, "loss": 2.6059, "step": 6153 }, { "crossentropy": 2.6044150590896606, "epoch": 0.3346474890562549, "grad_norm": 0.03667303919792175, "grad_norm_var": 5.6456157377514245e-06, "learning_rate": 0.005511495771483756, "loss": 2.6044, "step": 6154 }, { "crossentropy": 2.6899702548980713, "epoch": 0.3347018679137551, "grad_norm": 0.040750760585069656, "grad_norm_var": 5.885102141076996e-06, "learning_rate": 0.0055102571068632615, "loss": 2.69, "step": 6155 }, { "crossentropy": 2.5246773958206177, "epoch": 0.3347562467712553, "grad_norm": 0.040494661778211594, "grad_norm_var": 6.413192376443731e-06, "learning_rate": 0.005509018410597097, "loss": 2.5247, "step": 6156 }, { "crossentropy": 2.70827853679657, "epoch": 0.3348106256287555, "grad_norm": 0.036028195172548294, "grad_norm_var": 5.873117033984113e-06, "learning_rate": 0.005507779682762086, "loss": 2.7083, "step": 6157 }, { "crossentropy": 2.714151620864868, "epoch": 0.3348650044862557, "grad_norm": 0.033594388514757156, "grad_norm_var": 6.468209340499259e-06, "learning_rate": 0.0055065409234350505, "loss": 2.7142, "step": 6158 }, { "crossentropy": 2.622049331665039, "epoch": 0.3349193833437559, "grad_norm": 0.033694248646497726, "grad_norm_var": 6.455009054901854e-06, "learning_rate": 0.005505302132692822, "loss": 2.622, "step": 6159 }, { "crossentropy": 2.5578540563583374, "epoch": 0.33497376220125613, "grad_norm": 0.034654147922992706, "grad_norm_var": 6.729993548705802e-06, "learning_rate": 0.005504063310612224, "loss": 2.5579, "step": 6160 }, { "crossentropy": 2.4940078258514404, "epoch": 0.33502814105875633, "grad_norm": 0.03731945529580116, "grad_norm_var": 6.640519868159461e-06, "learning_rate": 0.005502824457270091, "loss": 2.494, "step": 6161 }, { "crossentropy": 2.7202882766723633, "epoch": 0.33508251991625654, "grad_norm": 0.04445004463195801, "grad_norm_var": 9.953987622004073e-06, "learning_rate": 0.005501585572743253, "loss": 2.7203, "step": 6162 }, { "crossentropy": 2.6320148706436157, "epoch": 0.33513689877375674, "grad_norm": 0.046772222965955734, "grad_norm_var": 1.565935435355921e-05, "learning_rate": 0.005500346657108544, "loss": 2.632, "step": 6163 }, { "crossentropy": 2.598968505859375, "epoch": 0.33519127763125695, "grad_norm": 0.03752119094133377, "grad_norm_var": 1.3944387950310759e-05, "learning_rate": 0.005499107710442806, "loss": 2.599, "step": 6164 }, { "crossentropy": 2.641426920890808, "epoch": 0.33524565648875715, "grad_norm": 0.03824610635638237, "grad_norm_var": 1.3095700069920036e-05, "learning_rate": 0.005497868732822868, "loss": 2.6414, "step": 6165 }, { "crossentropy": 2.643194079399109, "epoch": 0.33530003534625735, "grad_norm": 0.03473991900682449, "grad_norm_var": 1.3809637585163123e-05, "learning_rate": 0.005496629724325579, "loss": 2.6432, "step": 6166 }, { "crossentropy": 2.6735438108444214, "epoch": 0.33535441420375756, "grad_norm": 0.034135546535253525, "grad_norm_var": 1.4719048434700248e-05, "learning_rate": 0.005495390685027777, "loss": 2.6735, "step": 6167 }, { "crossentropy": 2.567805290222168, "epoch": 0.33540879306125776, "grad_norm": 0.033989984542131424, "grad_norm_var": 1.5245583326832976e-05, "learning_rate": 0.005494151615006306, "loss": 2.5678, "step": 6168 }, { "crossentropy": 2.6217257976531982, "epoch": 0.33546317191875796, "grad_norm": 0.034479282796382904, "grad_norm_var": 1.567294578938468e-05, "learning_rate": 0.005492912514338011, "loss": 2.6217, "step": 6169 }, { "crossentropy": 2.6173903942108154, "epoch": 0.33551755077625817, "grad_norm": 0.03358842805027962, "grad_norm_var": 1.6544583674396537e-05, "learning_rate": 0.0054916733830997445, "loss": 2.6174, "step": 6170 }, { "crossentropy": 2.7299121618270874, "epoch": 0.33557192963375837, "grad_norm": 0.0397336445748806, "grad_norm_var": 1.6121419207394624e-05, "learning_rate": 0.005490434221368351, "loss": 2.7299, "step": 6171 }, { "crossentropy": 2.6238229274749756, "epoch": 0.3356263084912586, "grad_norm": 0.03838251531124115, "grad_norm_var": 1.5441448463749765e-05, "learning_rate": 0.0054891950292206846, "loss": 2.6238, "step": 6172 }, { "crossentropy": 2.643694281578064, "epoch": 0.3356806873487588, "grad_norm": 0.03565897047519684, "grad_norm_var": 1.5495747213070155e-05, "learning_rate": 0.005487955806733598, "loss": 2.6437, "step": 6173 }, { "crossentropy": 2.600029468536377, "epoch": 0.335735066206259, "grad_norm": 0.03261205554008484, "grad_norm_var": 1.599360484215879e-05, "learning_rate": 0.005486716553983951, "loss": 2.6, "step": 6174 }, { "crossentropy": 2.56452476978302, "epoch": 0.3357894450637592, "grad_norm": 0.034221552312374115, "grad_norm_var": 1.5787451045709496e-05, "learning_rate": 0.005485477271048592, "loss": 2.5645, "step": 6175 }, { "crossentropy": 2.687405824661255, "epoch": 0.3358438239212594, "grad_norm": 0.034512490034103394, "grad_norm_var": 1.5831248282468086e-05, "learning_rate": 0.005484237958004388, "loss": 2.6874, "step": 6176 }, { "crossentropy": 2.715131640434265, "epoch": 0.3358982027787596, "grad_norm": 0.03602415323257446, "grad_norm_var": 1.58632734398254e-05, "learning_rate": 0.005482998614928196, "loss": 2.7151, "step": 6177 }, { "crossentropy": 2.6875102519989014, "epoch": 0.3359525816362598, "grad_norm": 0.03478603810071945, "grad_norm_var": 1.1864584747608987e-05, "learning_rate": 0.005481759241896881, "loss": 2.6875, "step": 6178 }, { "crossentropy": 2.7142133712768555, "epoch": 0.33600696049376, "grad_norm": 0.03376521170139313, "grad_norm_var": 4.125534206007806e-06, "learning_rate": 0.005480519838987306, "loss": 2.7142, "step": 6179 }, { "crossentropy": 2.5638551712036133, "epoch": 0.33606133935126026, "grad_norm": 0.03405998274683952, "grad_norm_var": 3.895280097197112e-06, "learning_rate": 0.00547928040627634, "loss": 2.5639, "step": 6180 }, { "crossentropy": 2.66738760471344, "epoch": 0.33611571820876046, "grad_norm": 0.032717905938625336, "grad_norm_var": 3.547910222329899e-06, "learning_rate": 0.005478040943840849, "loss": 2.6674, "step": 6181 }, { "crossentropy": 2.6028621196746826, "epoch": 0.33617009706626066, "grad_norm": 0.03393849357962608, "grad_norm_var": 3.5985313744895758e-06, "learning_rate": 0.005476801451757706, "loss": 2.6029, "step": 6182 }, { "crossentropy": 2.604983687400818, "epoch": 0.33622447592376087, "grad_norm": 0.03347059711813927, "grad_norm_var": 3.684003039065159e-06, "learning_rate": 0.00547556193010378, "loss": 2.605, "step": 6183 }, { "crossentropy": 2.7569068670272827, "epoch": 0.33627885478126107, "grad_norm": 0.0339902825653553, "grad_norm_var": 3.6839729900833377e-06, "learning_rate": 0.005474322378955946, "loss": 2.7569, "step": 6184 }, { "crossentropy": 2.6409990787506104, "epoch": 0.3363332336387613, "grad_norm": 0.034534234553575516, "grad_norm_var": 3.6822049442418345e-06, "learning_rate": 0.005473082798391082, "loss": 2.641, "step": 6185 }, { "crossentropy": 2.636245608329773, "epoch": 0.3363876124962615, "grad_norm": 0.03334638476371765, "grad_norm_var": 3.7233463157741267e-06, "learning_rate": 0.005471843188486062, "loss": 2.6362, "step": 6186 }, { "crossentropy": 2.5701457262039185, "epoch": 0.3364419913537617, "grad_norm": 0.03594595566391945, "grad_norm_var": 2.0953933980195905e-06, "learning_rate": 0.0054706035493177685, "loss": 2.5701, "step": 6187 }, { "crossentropy": 2.5960631370544434, "epoch": 0.3364963702112619, "grad_norm": 0.03752671554684639, "grad_norm_var": 1.697910614139632e-06, "learning_rate": 0.005469363880963082, "loss": 2.5961, "step": 6188 }, { "crossentropy": 2.645151972770691, "epoch": 0.3365507490687621, "grad_norm": 0.03857947885990143, "grad_norm_var": 2.7039361080389082e-06, "learning_rate": 0.005468124183498884, "loss": 2.6452, "step": 6189 }, { "crossentropy": 2.684308886528015, "epoch": 0.3366051279262623, "grad_norm": 0.047059979289770126, "grad_norm_var": 1.1868830193970809e-05, "learning_rate": 0.00546688445700206, "loss": 2.6843, "step": 6190 }, { "crossentropy": 2.6519306898117065, "epoch": 0.3366595067837625, "grad_norm": 0.04584823548793793, "grad_norm_var": 1.8289230442825043e-05, "learning_rate": 0.0054656447015494995, "loss": 2.6519, "step": 6191 }, { "crossentropy": 2.5915513038635254, "epoch": 0.3367138856412627, "grad_norm": 0.03342809900641441, "grad_norm_var": 1.861490226878661e-05, "learning_rate": 0.005464404917218088, "loss": 2.5916, "step": 6192 }, { "crossentropy": 2.6872400045394897, "epoch": 0.3367682644987629, "grad_norm": 0.03688780963420868, "grad_norm_var": 1.86425545789121e-05, "learning_rate": 0.0054631651040847155, "loss": 2.6872, "step": 6193 }, { "crossentropy": 2.513597846031189, "epoch": 0.3368226433562631, "grad_norm": 0.03482755273580551, "grad_norm_var": 1.8634598494499516e-05, "learning_rate": 0.005461925262226276, "loss": 2.5136, "step": 6194 }, { "crossentropy": 2.715298056602478, "epoch": 0.3368770222137633, "grad_norm": 0.0345035083591938, "grad_norm_var": 1.8424514291490168e-05, "learning_rate": 0.005460685391719662, "loss": 2.7153, "step": 6195 }, { "crossentropy": 2.5684804916381836, "epoch": 0.3369314010712635, "grad_norm": 0.03306719288229942, "grad_norm_var": 1.8781516694245846e-05, "learning_rate": 0.0054594454926417705, "loss": 2.5685, "step": 6196 }, { "crossentropy": 2.619940161705017, "epoch": 0.3369857799287637, "grad_norm": 0.0348723866045475, "grad_norm_var": 1.8062865861602436e-05, "learning_rate": 0.005458205565069495, "loss": 2.6199, "step": 6197 }, { "crossentropy": 2.689080238342285, "epoch": 0.3370401587862639, "grad_norm": 0.03320252150297165, "grad_norm_var": 1.8334751119106902e-05, "learning_rate": 0.00545696560907974, "loss": 2.6891, "step": 6198 }, { "crossentropy": 2.462679147720337, "epoch": 0.3370945376437641, "grad_norm": 0.034080155193805695, "grad_norm_var": 1.8126537788460254e-05, "learning_rate": 0.005455725624749404, "loss": 2.4627, "step": 6199 }, { "crossentropy": 2.6039674282073975, "epoch": 0.33714891650126433, "grad_norm": 0.033600471913814545, "grad_norm_var": 1.82590069845961e-05, "learning_rate": 0.005454485612155389, "loss": 2.604, "step": 6200 }, { "crossentropy": 2.7239447832107544, "epoch": 0.33720329535876453, "grad_norm": 0.035163551568984985, "grad_norm_var": 1.8132917806032375e-05, "learning_rate": 0.0054532455713746, "loss": 2.7239, "step": 6201 }, { "crossentropy": 2.6310354471206665, "epoch": 0.33725767421626474, "grad_norm": 0.041307758539915085, "grad_norm_var": 1.888344052755342e-05, "learning_rate": 0.0054520055024839436, "loss": 2.631, "step": 6202 }, { "crossentropy": 2.7116953134536743, "epoch": 0.33731205307376494, "grad_norm": 0.036372773349285126, "grad_norm_var": 1.8842306152648016e-05, "learning_rate": 0.005450765405560327, "loss": 2.7117, "step": 6203 }, { "crossentropy": 2.5442850589752197, "epoch": 0.33736643193126514, "grad_norm": 0.034491363912820816, "grad_norm_var": 1.9162684456501487e-05, "learning_rate": 0.0054495252806806595, "loss": 2.5443, "step": 6204 }, { "crossentropy": 2.6300941705703735, "epoch": 0.33742081078876535, "grad_norm": 0.03659464791417122, "grad_norm_var": 1.891304911218264e-05, "learning_rate": 0.005448285127921855, "loss": 2.6301, "step": 6205 }, { "crossentropy": 2.6263482570648193, "epoch": 0.33747518964626555, "grad_norm": 0.03259461745619774, "grad_norm_var": 1.1781451301452981e-05, "learning_rate": 0.005447044947360823, "loss": 2.6263, "step": 6206 }, { "crossentropy": 2.6390193700790405, "epoch": 0.33752956850376575, "grad_norm": 0.034057099372148514, "grad_norm_var": 4.481205111988145e-06, "learning_rate": 0.00544580473907448, "loss": 2.639, "step": 6207 }, { "crossentropy": 2.6386163234710693, "epoch": 0.33758394736126596, "grad_norm": 0.03422672674059868, "grad_norm_var": 4.359998593104058e-06, "learning_rate": 0.005444564503139744, "loss": 2.6386, "step": 6208 }, { "crossentropy": 2.601063847541809, "epoch": 0.33763832621876616, "grad_norm": 0.035132937133312225, "grad_norm_var": 4.108565247446366e-06, "learning_rate": 0.005443324239633533, "loss": 2.6011, "step": 6209 }, { "crossentropy": 2.7385661602020264, "epoch": 0.33769270507626636, "grad_norm": 0.033138349652290344, "grad_norm_var": 4.298930615825348e-06, "learning_rate": 0.005442083948632763, "loss": 2.7386, "step": 6210 }, { "crossentropy": 2.5401763916015625, "epoch": 0.33774708393376657, "grad_norm": 0.03132970631122589, "grad_norm_var": 5.0435427600005465e-06, "learning_rate": 0.005440843630214361, "loss": 2.5402, "step": 6211 }, { "crossentropy": 2.6023048162460327, "epoch": 0.33780146279126677, "grad_norm": 0.03347540646791458, "grad_norm_var": 4.971780267476871e-06, "learning_rate": 0.005439603284455247, "loss": 2.6023, "step": 6212 }, { "crossentropy": 2.5832271575927734, "epoch": 0.337855841648767, "grad_norm": 0.03656567260622978, "grad_norm_var": 5.211907371482858e-06, "learning_rate": 0.0054383629114323475, "loss": 2.5832, "step": 6213 }, { "crossentropy": 2.6585458517074585, "epoch": 0.3379102205062672, "grad_norm": 0.03678436949849129, "grad_norm_var": 5.294601626840839e-06, "learning_rate": 0.005437122511222589, "loss": 2.6585, "step": 6214 }, { "crossentropy": 2.538653612136841, "epoch": 0.3379645993637674, "grad_norm": 0.03404153510928154, "grad_norm_var": 5.299082449787956e-06, "learning_rate": 0.005435882083902899, "loss": 2.5387, "step": 6215 }, { "crossentropy": 2.6884686946868896, "epoch": 0.3380189782212676, "grad_norm": 0.03817756474018097, "grad_norm_var": 5.797175394667691e-06, "learning_rate": 0.00543464162955021, "loss": 2.6885, "step": 6216 }, { "crossentropy": 2.6394708156585693, "epoch": 0.3380733570787678, "grad_norm": 0.03650524094700813, "grad_norm_var": 5.900322406348958e-06, "learning_rate": 0.005433401148241451, "loss": 2.6395, "step": 6217 }, { "crossentropy": 2.4841649532318115, "epoch": 0.338127735936268, "grad_norm": 0.03310863673686981, "grad_norm_var": 3.5338540623471837e-06, "learning_rate": 0.005432160640053557, "loss": 2.4842, "step": 6218 }, { "crossentropy": 2.725373148918152, "epoch": 0.3381821147937682, "grad_norm": 0.036881256848573685, "grad_norm_var": 3.6575060283518257e-06, "learning_rate": 0.005430920105063464, "loss": 2.7254, "step": 6219 }, { "crossentropy": 2.58778715133667, "epoch": 0.3382364936512684, "grad_norm": 0.035345882177352905, "grad_norm_var": 3.665806093442782e-06, "learning_rate": 0.0054296795433481084, "loss": 2.5878, "step": 6220 }, { "crossentropy": 2.7301807403564453, "epoch": 0.3382908725087686, "grad_norm": 0.0365845151245594, "grad_norm_var": 3.663485792620933e-06, "learning_rate": 0.005428438954984426, "loss": 2.7302, "step": 6221 }, { "crossentropy": 2.6330502033233643, "epoch": 0.3383452513662688, "grad_norm": 0.03257784992456436, "grad_norm_var": 3.668594495390143e-06, "learning_rate": 0.00542719834004936, "loss": 2.6331, "step": 6222 }, { "crossentropy": 2.5816304683685303, "epoch": 0.338399630223769, "grad_norm": 0.03258374333381653, "grad_norm_var": 3.964116931115478e-06, "learning_rate": 0.005425957698619851, "loss": 2.5816, "step": 6223 }, { "crossentropy": 2.7141120433807373, "epoch": 0.3384540090812692, "grad_norm": 0.03689853101968765, "grad_norm_var": 4.213635988875355e-06, "learning_rate": 0.005424717030772843, "loss": 2.7141, "step": 6224 }, { "crossentropy": 2.6185073852539062, "epoch": 0.3385083879387694, "grad_norm": 0.03841317817568779, "grad_norm_var": 4.968025941874619e-06, "learning_rate": 0.0054234763365852805, "loss": 2.6185, "step": 6225 }, { "crossentropy": 2.646007776260376, "epoch": 0.3385627667962696, "grad_norm": 0.03616418316960335, "grad_norm_var": 4.728377585882145e-06, "learning_rate": 0.00542223561613411, "loss": 2.646, "step": 6226 }, { "crossentropy": 2.539257764816284, "epoch": 0.3386171456537698, "grad_norm": 0.03548242151737213, "grad_norm_var": 3.585806197291726e-06, "learning_rate": 0.0054209948694962816, "loss": 2.5393, "step": 6227 }, { "crossentropy": 2.636887788772583, "epoch": 0.33867152451127003, "grad_norm": 0.0352102592587471, "grad_norm_var": 3.2826104859542033e-06, "learning_rate": 0.005419754096748742, "loss": 2.6369, "step": 6228 }, { "crossentropy": 2.729543447494507, "epoch": 0.33872590336877023, "grad_norm": 0.03536508232355118, "grad_norm_var": 3.235372316902294e-06, "learning_rate": 0.005418513297968446, "loss": 2.7295, "step": 6229 }, { "crossentropy": 2.6179327964782715, "epoch": 0.33878028222627043, "grad_norm": 0.0341496579349041, "grad_norm_var": 3.2646763398257983e-06, "learning_rate": 0.0054172724732323456, "loss": 2.6179, "step": 6230 }, { "crossentropy": 2.5869566202163696, "epoch": 0.33883466108377064, "grad_norm": 0.03515695780515671, "grad_norm_var": 3.1302743523559106e-06, "learning_rate": 0.005416031622617396, "loss": 2.587, "step": 6231 }, { "crossentropy": 2.556138515472412, "epoch": 0.33888903994127084, "grad_norm": 0.03485225886106491, "grad_norm_var": 2.6509791528718517e-06, "learning_rate": 0.005414790746200551, "loss": 2.5561, "step": 6232 }, { "crossentropy": 2.6501110792160034, "epoch": 0.33894341879877105, "grad_norm": 0.033759359270334244, "grad_norm_var": 2.6919365542071396e-06, "learning_rate": 0.005413549844058772, "loss": 2.6501, "step": 6233 }, { "crossentropy": 2.648969292640686, "epoch": 0.33899779765627125, "grad_norm": 0.03462415933609009, "grad_norm_var": 2.4213000070806567e-06, "learning_rate": 0.005412308916269017, "loss": 2.649, "step": 6234 }, { "crossentropy": 2.663204073905945, "epoch": 0.33905217651377145, "grad_norm": 0.03533781319856644, "grad_norm_var": 2.2351222953099006e-06, "learning_rate": 0.005411067962908247, "loss": 2.6632, "step": 6235 }, { "crossentropy": 2.619852662086487, "epoch": 0.33910655537127166, "grad_norm": 0.035247206687927246, "grad_norm_var": 2.23324072128091e-06, "learning_rate": 0.005409826984053425, "loss": 2.6199, "step": 6236 }, { "crossentropy": 2.679559111595154, "epoch": 0.33916093422877186, "grad_norm": 0.03300255537033081, "grad_norm_var": 2.350240454904867e-06, "learning_rate": 0.005408585979781515, "loss": 2.6796, "step": 6237 }, { "crossentropy": 2.5891324281692505, "epoch": 0.33921531308627206, "grad_norm": 0.03271834924817085, "grad_norm_var": 2.307474950602404e-06, "learning_rate": 0.005407344950169486, "loss": 2.5891, "step": 6238 }, { "crossentropy": 2.6766217947006226, "epoch": 0.33926969194377227, "grad_norm": 0.03283219039440155, "grad_norm_var": 2.2334326122331117e-06, "learning_rate": 0.005406103895294299, "loss": 2.6766, "step": 6239 }, { "crossentropy": 2.7227485179901123, "epoch": 0.33932407080127247, "grad_norm": 0.03324878588318825, "grad_norm_var": 2.1181844817188174e-06, "learning_rate": 0.005404862815232931, "loss": 2.7227, "step": 6240 }, { "crossentropy": 2.6817508935928345, "epoch": 0.3393784496587727, "grad_norm": 0.04171029105782509, "grad_norm_var": 4.419975340156507e-06, "learning_rate": 0.005403621710062347, "loss": 2.6818, "step": 6241 }, { "crossentropy": 2.5132232904434204, "epoch": 0.3394328285162729, "grad_norm": 0.034482721239328384, "grad_norm_var": 4.319726090577097e-06, "learning_rate": 0.0054023805798595215, "loss": 2.5132, "step": 6242 }, { "crossentropy": 2.5856438875198364, "epoch": 0.3394872073737731, "grad_norm": 0.03442646563053131, "grad_norm_var": 4.296679802839644e-06, "learning_rate": 0.005401139424701427, "loss": 2.5856, "step": 6243 }, { "crossentropy": 2.6423423290252686, "epoch": 0.3395415862312733, "grad_norm": 0.03518027067184448, "grad_norm_var": 4.294926689982215e-06, "learning_rate": 0.005399898244665039, "loss": 2.6423, "step": 6244 }, { "crossentropy": 2.589959502220154, "epoch": 0.3395959650887735, "grad_norm": 0.033461470156908035, "grad_norm_var": 4.366786446847494e-06, "learning_rate": 0.005398657039827335, "loss": 2.59, "step": 6245 }, { "crossentropy": 2.557827949523926, "epoch": 0.3396503439462737, "grad_norm": 0.03202330693602562, "grad_norm_var": 4.787513667230993e-06, "learning_rate": 0.005397415810265294, "loss": 2.5578, "step": 6246 }, { "crossentropy": 2.5971927642822266, "epoch": 0.3397047228037739, "grad_norm": 0.03658868372440338, "grad_norm_var": 5.0402742301604185e-06, "learning_rate": 0.005396174556055892, "loss": 2.5972, "step": 6247 }, { "crossentropy": 2.565650463104248, "epoch": 0.3397591016612741, "grad_norm": 0.03449694439768791, "grad_norm_var": 5.0359056517584416e-06, "learning_rate": 0.005394933277276117, "loss": 2.5657, "step": 6248 }, { "crossentropy": 2.6178929805755615, "epoch": 0.3398134805187743, "grad_norm": 0.034406986087560654, "grad_norm_var": 4.992009377915996e-06, "learning_rate": 0.005393691974002947, "loss": 2.6179, "step": 6249 }, { "crossentropy": 2.4960216283798218, "epoch": 0.3398678593762745, "grad_norm": 0.03648756071925163, "grad_norm_var": 5.212105959801795e-06, "learning_rate": 0.005392450646313367, "loss": 2.496, "step": 6250 }, { "crossentropy": 2.67986798286438, "epoch": 0.3399222382337747, "grad_norm": 0.036165863275527954, "grad_norm_var": 5.322262740534561e-06, "learning_rate": 0.005391209294284365, "loss": 2.6799, "step": 6251 }, { "crossentropy": 2.5608288049697876, "epoch": 0.3399766170912749, "grad_norm": 0.03442614898085594, "grad_norm_var": 5.313246687222809e-06, "learning_rate": 0.005389967917992926, "loss": 2.5608, "step": 6252 }, { "crossentropy": 2.7253834009170532, "epoch": 0.3400309959487751, "grad_norm": 0.03554077446460724, "grad_norm_var": 5.131741502871027e-06, "learning_rate": 0.00538872651751604, "loss": 2.7254, "step": 6253 }, { "crossentropy": 2.6791815757751465, "epoch": 0.3400853748062753, "grad_norm": 0.03681093826889992, "grad_norm_var": 4.995021498666967e-06, "learning_rate": 0.0053874850929307, "loss": 2.6792, "step": 6254 }, { "crossentropy": 2.761000156402588, "epoch": 0.3401397536637755, "grad_norm": 0.035023100674152374, "grad_norm_var": 4.619964525170763e-06, "learning_rate": 0.005386243644313895, "loss": 2.761, "step": 6255 }, { "crossentropy": 2.6069586277008057, "epoch": 0.3401941325212757, "grad_norm": 0.037218645215034485, "grad_norm_var": 4.5297894004105974e-06, "learning_rate": 0.005385002171742619, "loss": 2.607, "step": 6256 }, { "crossentropy": 2.733469605445862, "epoch": 0.34024851137877593, "grad_norm": 0.038704633712768555, "grad_norm_var": 2.6168874914087962e-06, "learning_rate": 0.005383760675293866, "loss": 2.7335, "step": 6257 }, { "crossentropy": 2.659242868423462, "epoch": 0.34030289023627613, "grad_norm": 0.035174474120140076, "grad_norm_var": 2.567699090102032e-06, "learning_rate": 0.005382519155044635, "loss": 2.6592, "step": 6258 }, { "crossentropy": 2.602709650993347, "epoch": 0.34035726909377634, "grad_norm": 0.03543902561068535, "grad_norm_var": 2.502569400317928e-06, "learning_rate": 0.005381277611071923, "loss": 2.6027, "step": 6259 }, { "crossentropy": 2.5252926349639893, "epoch": 0.34041164795127654, "grad_norm": 0.03214036300778389, "grad_norm_var": 3.18816493723491e-06, "learning_rate": 0.005380036043452727, "loss": 2.5253, "step": 6260 }, { "crossentropy": 2.7810319662094116, "epoch": 0.34046602680877674, "grad_norm": 0.03382959961891174, "grad_norm_var": 3.108512686157925e-06, "learning_rate": 0.005378794452264053, "loss": 2.781, "step": 6261 }, { "crossentropy": 2.639557719230652, "epoch": 0.34052040566627695, "grad_norm": 0.0338694266974926, "grad_norm_var": 2.5199352510401713e-06, "learning_rate": 0.005377552837582898, "loss": 2.6396, "step": 6262 }, { "crossentropy": 2.690836191177368, "epoch": 0.34057478452377715, "grad_norm": 0.035438764840364456, "grad_norm_var": 2.419591610239707e-06, "learning_rate": 0.005376311199486268, "loss": 2.6908, "step": 6263 }, { "crossentropy": 2.6713058948516846, "epoch": 0.34062916338127736, "grad_norm": 0.0335998460650444, "grad_norm_var": 2.5687370251064695e-06, "learning_rate": 0.005375069538051168, "loss": 2.6713, "step": 6264 }, { "crossentropy": 2.6524150371551514, "epoch": 0.34068354223877756, "grad_norm": 0.03442826867103577, "grad_norm_var": 2.5663241557310457e-06, "learning_rate": 0.005373827853354606, "loss": 2.6524, "step": 6265 }, { "crossentropy": 2.6419607400894165, "epoch": 0.34073792109627776, "grad_norm": 0.03329997509717941, "grad_norm_var": 2.683291404165002e-06, "learning_rate": 0.00537258614547359, "loss": 2.642, "step": 6266 }, { "crossentropy": 2.5690544843673706, "epoch": 0.34079229995377797, "grad_norm": 0.03708086162805557, "grad_norm_var": 2.8693902635462676e-06, "learning_rate": 0.005371344414485126, "loss": 2.5691, "step": 6267 }, { "crossentropy": 2.693264365196228, "epoch": 0.34084667881127817, "grad_norm": 0.037709083408117294, "grad_norm_var": 3.236409890880476e-06, "learning_rate": 0.005370102660466229, "loss": 2.6933, "step": 6268 }, { "crossentropy": 2.5252554416656494, "epoch": 0.3409010576687784, "grad_norm": 0.03459388390183449, "grad_norm_var": 3.2660559998231565e-06, "learning_rate": 0.005368860883493911, "loss": 2.5253, "step": 6269 }, { "crossentropy": 2.6453771591186523, "epoch": 0.3409554365262786, "grad_norm": 0.03189515322446823, "grad_norm_var": 3.768050482066733e-06, "learning_rate": 0.005367619083645184, "loss": 2.6454, "step": 6270 }, { "crossentropy": 2.662338614463806, "epoch": 0.3410098153837788, "grad_norm": 0.03247198462486267, "grad_norm_var": 4.1551582148626386e-06, "learning_rate": 0.005366377260997064, "loss": 2.6623, "step": 6271 }, { "crossentropy": 2.668835997581482, "epoch": 0.341064194241279, "grad_norm": 0.03318934887647629, "grad_norm_var": 3.8736241794769255e-06, "learning_rate": 0.005365135415626569, "loss": 2.6688, "step": 6272 }, { "crossentropy": 2.672358512878418, "epoch": 0.3411185730987792, "grad_norm": 0.03366298973560333, "grad_norm_var": 2.6721535126688714e-06, "learning_rate": 0.005363893547610714, "loss": 2.6724, "step": 6273 }, { "crossentropy": 2.6346828937530518, "epoch": 0.3411729519562794, "grad_norm": 0.03309048339724541, "grad_norm_var": 2.6836396642017255e-06, "learning_rate": 0.005362651657026521, "loss": 2.6347, "step": 6274 }, { "crossentropy": 2.5568262338638306, "epoch": 0.3412273308137796, "grad_norm": 0.03577928617596626, "grad_norm_var": 2.751230456535652e-06, "learning_rate": 0.00536140974395101, "loss": 2.5568, "step": 6275 }, { "crossentropy": 2.635976791381836, "epoch": 0.3412817096712798, "grad_norm": 0.03926927596330643, "grad_norm_var": 4.036415219590047e-06, "learning_rate": 0.005360167808461205, "loss": 2.636, "step": 6276 }, { "crossentropy": 2.6217901706695557, "epoch": 0.34133608852878, "grad_norm": 0.03998081013560295, "grad_norm_var": 5.789481554861695e-06, "learning_rate": 0.005358925850634128, "loss": 2.6218, "step": 6277 }, { "crossentropy": 2.691897749900818, "epoch": 0.3413904673862802, "grad_norm": 0.03753655031323433, "grad_norm_var": 6.096750204303593e-06, "learning_rate": 0.005357683870546802, "loss": 2.6919, "step": 6278 }, { "crossentropy": 2.568649172782898, "epoch": 0.3414448462437804, "grad_norm": 0.03501078486442566, "grad_norm_var": 6.093954704838866e-06, "learning_rate": 0.005356441868276259, "loss": 2.5686, "step": 6279 }, { "crossentropy": 2.649872899055481, "epoch": 0.3414992251012806, "grad_norm": 0.038053013384342194, "grad_norm_var": 6.405591282018785e-06, "learning_rate": 0.005355199843899523, "loss": 2.6499, "step": 6280 }, { "crossentropy": 2.6426457166671753, "epoch": 0.3415536039587808, "grad_norm": 0.03537061810493469, "grad_norm_var": 6.3338797943818245e-06, "learning_rate": 0.005353957797493623, "loss": 2.6426, "step": 6281 }, { "crossentropy": 2.69470751285553, "epoch": 0.341607982816281, "grad_norm": 0.034126166254282, "grad_norm_var": 6.1342302358296495e-06, "learning_rate": 0.00535271572913559, "loss": 2.6947, "step": 6282 }, { "crossentropy": 2.7199103832244873, "epoch": 0.3416623616737812, "grad_norm": 0.03574886545538902, "grad_norm_var": 5.973463617324944e-06, "learning_rate": 0.005351473638902458, "loss": 2.7199, "step": 6283 }, { "crossentropy": 2.7049522399902344, "epoch": 0.3417167405312814, "grad_norm": 0.03778413310647011, "grad_norm_var": 5.996241144142165e-06, "learning_rate": 0.005350231526871258, "loss": 2.705, "step": 6284 }, { "crossentropy": 2.5485153198242188, "epoch": 0.34177111938878163, "grad_norm": 0.03592504560947418, "grad_norm_var": 5.951009464236154e-06, "learning_rate": 0.005348989393119024, "loss": 2.5485, "step": 6285 }, { "crossentropy": 2.603990316390991, "epoch": 0.34182549824628183, "grad_norm": 0.03447183594107628, "grad_norm_var": 5.108285256588454e-06, "learning_rate": 0.005347747237722793, "loss": 2.604, "step": 6286 }, { "crossentropy": 2.704578399658203, "epoch": 0.34187987710378204, "grad_norm": 0.037123970687389374, "grad_norm_var": 4.448108637844506e-06, "learning_rate": 0.0053465050607596045, "loss": 2.7046, "step": 6287 }, { "crossentropy": 2.6464537382125854, "epoch": 0.34193425596128224, "grad_norm": 0.03881312906742096, "grad_norm_var": 4.3114861832669235e-06, "learning_rate": 0.0053452628623064935, "loss": 2.6465, "step": 6288 }, { "crossentropy": 2.6407843828201294, "epoch": 0.34198863481878244, "grad_norm": 0.035124026238918304, "grad_norm_var": 3.919668474050564e-06, "learning_rate": 0.005344020642440502, "loss": 2.6408, "step": 6289 }, { "crossentropy": 2.4890018701553345, "epoch": 0.34204301367628265, "grad_norm": 0.03281817212700844, "grad_norm_var": 4.046299103147716e-06, "learning_rate": 0.005342778401238673, "loss": 2.489, "step": 6290 }, { "crossentropy": 2.657824754714966, "epoch": 0.34209739253378285, "grad_norm": 0.03357107564806938, "grad_norm_var": 4.543674299990289e-06, "learning_rate": 0.005341536138778045, "loss": 2.6578, "step": 6291 }, { "crossentropy": 2.758597254753113, "epoch": 0.34215177139128305, "grad_norm": 0.07516251504421234, "grad_norm_var": 9.929591374295069e-05, "learning_rate": 0.005340293855135664, "loss": 2.7586, "step": 6292 }, { "crossentropy": 2.6956065893173218, "epoch": 0.34220615024878326, "grad_norm": 0.034502387046813965, "grad_norm_var": 0.00010011840414637878, "learning_rate": 0.005339051550388575, "loss": 2.6956, "step": 6293 }, { "crossentropy": 2.5572229623794556, "epoch": 0.34226052910628346, "grad_norm": 0.034211400896310806, "grad_norm_var": 0.00010110198623979507, "learning_rate": 0.005337809224613827, "loss": 2.5572, "step": 6294 }, { "crossentropy": 2.533817172050476, "epoch": 0.34231490796378367, "grad_norm": 0.035235945135354996, "grad_norm_var": 0.00010101575758725649, "learning_rate": 0.005336566877888463, "loss": 2.5338, "step": 6295 }, { "crossentropy": 2.6208852529525757, "epoch": 0.34236928682128387, "grad_norm": 0.03968662768602371, "grad_norm_var": 0.00010119352234511473, "learning_rate": 0.005335324510289539, "loss": 2.6209, "step": 6296 }, { "crossentropy": 2.5997471809387207, "epoch": 0.3424236656787841, "grad_norm": 0.036660801619291306, "grad_norm_var": 0.00010082722152673402, "learning_rate": 0.0053340821218940985, "loss": 2.5997, "step": 6297 }, { "crossentropy": 2.6490572690963745, "epoch": 0.3424780445362843, "grad_norm": 0.04070473089814186, "grad_norm_var": 9.997155844162538e-05, "learning_rate": 0.005332839712779196, "loss": 2.6491, "step": 6298 }, { "crossentropy": 2.5701544284820557, "epoch": 0.3425324233937845, "grad_norm": 0.0396343469619751, "grad_norm_var": 9.943983989388431e-05, "learning_rate": 0.005331597283021886, "loss": 2.5702, "step": 6299 }, { "crossentropy": 2.594103217124939, "epoch": 0.3425868022512847, "grad_norm": 0.04160527512431145, "grad_norm_var": 9.981477516087406e-05, "learning_rate": 0.005330354832699221, "loss": 2.5941, "step": 6300 }, { "crossentropy": 2.6492435932159424, "epoch": 0.3426411811087849, "grad_norm": 0.03744182363152504, "grad_norm_var": 9.932087784629325e-05, "learning_rate": 0.0053291123618882585, "loss": 2.6492, "step": 6301 }, { "crossentropy": 2.584641695022583, "epoch": 0.3426955599662851, "grad_norm": 0.038939427584409714, "grad_norm_var": 9.776795224748775e-05, "learning_rate": 0.005327869870666052, "loss": 2.5846, "step": 6302 }, { "crossentropy": 2.622670531272888, "epoch": 0.3427499388237853, "grad_norm": 0.0355202853679657, "grad_norm_var": 9.842652920202065e-05, "learning_rate": 0.005326627359109665, "loss": 2.6227, "step": 6303 }, { "crossentropy": 2.6336077451705933, "epoch": 0.3428043176812855, "grad_norm": 0.0359121672809124, "grad_norm_var": 9.916093471518673e-05, "learning_rate": 0.005325384827296152, "loss": 2.6336, "step": 6304 }, { "crossentropy": 2.675908327102661, "epoch": 0.3428586965387857, "grad_norm": 0.03298276290297508, "grad_norm_var": 0.00010060282689076193, "learning_rate": 0.005324142275302575, "loss": 2.6759, "step": 6305 }, { "crossentropy": 2.6568044424057007, "epoch": 0.3429130753962859, "grad_norm": 0.03752836585044861, "grad_norm_var": 9.808395117665846e-05, "learning_rate": 0.005322899703205996, "loss": 2.6568, "step": 6306 }, { "crossentropy": 2.6630868911743164, "epoch": 0.3429674542537861, "grad_norm": 0.033722031861543655, "grad_norm_var": 9.796943760873195e-05, "learning_rate": 0.005321657111083479, "loss": 2.6631, "step": 6307 }, { "crossentropy": 2.5489965677261353, "epoch": 0.3430218331112863, "grad_norm": 0.03314213827252388, "grad_norm_var": 7.626846130892075e-06, "learning_rate": 0.005320414499012088, "loss": 2.549, "step": 6308 }, { "crossentropy": 2.586071252822876, "epoch": 0.3430762119687865, "grad_norm": 0.03449340537190437, "grad_norm_var": 7.629500192470514e-06, "learning_rate": 0.005319171867068888, "loss": 2.5861, "step": 6309 }, { "crossentropy": 2.7006384134292603, "epoch": 0.3431305908262867, "grad_norm": 0.03360641002655029, "grad_norm_var": 7.854236925389167e-06, "learning_rate": 0.005317929215330946, "loss": 2.7006, "step": 6310 }, { "crossentropy": 2.7884998321533203, "epoch": 0.3431849696837869, "grad_norm": 0.03527083247900009, "grad_norm_var": 7.847614212005971e-06, "learning_rate": 0.005316686543875331, "loss": 2.7885, "step": 6311 }, { "crossentropy": 2.6137681007385254, "epoch": 0.3432393485412871, "grad_norm": 0.03523360937833786, "grad_norm_var": 7.3007474665709445e-06, "learning_rate": 0.0053154438527791125, "loss": 2.6138, "step": 6312 }, { "crossentropy": 2.7108055353164673, "epoch": 0.34329372739878733, "grad_norm": 0.03335310518741608, "grad_norm_var": 7.869486902313018e-06, "learning_rate": 0.005314201142119361, "loss": 2.7108, "step": 6313 }, { "crossentropy": 2.5865330696105957, "epoch": 0.34334810625628753, "grad_norm": 0.03239727392792702, "grad_norm_var": 7.185572121001735e-06, "learning_rate": 0.005312958411973145, "loss": 2.5865, "step": 6314 }, { "crossentropy": 2.68932569026947, "epoch": 0.34340248511378774, "grad_norm": 0.03305034339427948, "grad_norm_var": 6.418192051973505e-06, "learning_rate": 0.005311715662417541, "loss": 2.6893, "step": 6315 }, { "crossentropy": 2.7402154207229614, "epoch": 0.34345686397128794, "grad_norm": 0.034858010709285736, "grad_norm_var": 3.557314760649355e-06, "learning_rate": 0.005310472893529623, "loss": 2.7402, "step": 6316 }, { "crossentropy": 2.592866897583008, "epoch": 0.34351124282878814, "grad_norm": 0.03304949775338173, "grad_norm_var": 3.2397956901168238e-06, "learning_rate": 0.005309230105386464, "loss": 2.5929, "step": 6317 }, { "crossentropy": 2.582471013069153, "epoch": 0.34356562168628835, "grad_norm": 0.033140696585178375, "grad_norm_var": 1.9601756340719675e-06, "learning_rate": 0.005307987298065145, "loss": 2.5825, "step": 6318 }, { "crossentropy": 2.5335845947265625, "epoch": 0.34362000054378855, "grad_norm": 0.03638672083616257, "grad_norm_var": 2.1591806663607064e-06, "learning_rate": 0.005306744471642739, "loss": 2.5336, "step": 6319 }, { "crossentropy": 2.6128231287002563, "epoch": 0.34367437940128875, "grad_norm": 0.037036292254924774, "grad_norm_var": 2.48609721103521e-06, "learning_rate": 0.005305501626196327, "loss": 2.6128, "step": 6320 }, { "crossentropy": 2.719606876373291, "epoch": 0.34372875825878896, "grad_norm": 0.03715410828590393, "grad_norm_var": 2.8252901618119307e-06, "learning_rate": 0.005304258761802989, "loss": 2.7196, "step": 6321 }, { "crossentropy": 2.696593999862671, "epoch": 0.34378313711628916, "grad_norm": 0.039557602256536484, "grad_norm_var": 3.877961358080458e-06, "learning_rate": 0.005303015878539807, "loss": 2.6966, "step": 6322 }, { "crossentropy": 2.672334671020508, "epoch": 0.34383751597378936, "grad_norm": 0.03577951341867447, "grad_norm_var": 3.869929342414403e-06, "learning_rate": 0.0053017729764838605, "loss": 2.6723, "step": 6323 }, { "crossentropy": 2.6869431734085083, "epoch": 0.34389189483128957, "grad_norm": 0.033656761050224304, "grad_norm_var": 3.7696822107478417e-06, "learning_rate": 0.005300530055712238, "loss": 2.6869, "step": 6324 }, { "crossentropy": 2.6741591691970825, "epoch": 0.34394627368878977, "grad_norm": 0.03133409470319748, "grad_norm_var": 4.554890096403602e-06, "learning_rate": 0.005299287116302021, "loss": 2.6742, "step": 6325 }, { "crossentropy": 2.646126389503479, "epoch": 0.34400065254629, "grad_norm": 0.03371172398328781, "grad_norm_var": 4.54052136104003e-06, "learning_rate": 0.005298044158330295, "loss": 2.6461, "step": 6326 }, { "crossentropy": 2.653021216392517, "epoch": 0.3440550314037902, "grad_norm": 0.03488219156861305, "grad_norm_var": 4.519637331599352e-06, "learning_rate": 0.005296801181874148, "loss": 2.653, "step": 6327 }, { "crossentropy": 2.7963463068008423, "epoch": 0.3441094102612904, "grad_norm": 0.034663695842027664, "grad_norm_var": 4.496452046245415e-06, "learning_rate": 0.00529555818701067, "loss": 2.7963, "step": 6328 }, { "crossentropy": 2.7301141023635864, "epoch": 0.34416378911879064, "grad_norm": 0.03314569219946861, "grad_norm_var": 4.5343352440718744e-06, "learning_rate": 0.005294315173816948, "loss": 2.7301, "step": 6329 }, { "crossentropy": 2.54302716255188, "epoch": 0.34421816797629085, "grad_norm": 0.03676500543951988, "grad_norm_var": 4.4364307374457346e-06, "learning_rate": 0.005293072142370073, "loss": 2.543, "step": 6330 }, { "crossentropy": 2.670109272003174, "epoch": 0.34427254683379105, "grad_norm": 0.04666129872202873, "grad_norm_var": 1.2684182959387704e-05, "learning_rate": 0.005291829092747137, "loss": 2.6701, "step": 6331 }, { "crossentropy": 2.608635187149048, "epoch": 0.34432692569129125, "grad_norm": 0.04607896879315376, "grad_norm_var": 1.9239321862636143e-05, "learning_rate": 0.0052905860250252335, "loss": 2.6086, "step": 6332 }, { "crossentropy": 2.699987292289734, "epoch": 0.34438130454879146, "grad_norm": 0.03581221401691437, "grad_norm_var": 1.8468258627957144e-05, "learning_rate": 0.005289342939281456, "loss": 2.7, "step": 6333 }, { "crossentropy": 2.703381061553955, "epoch": 0.34443568340629166, "grad_norm": 0.03938805311918259, "grad_norm_var": 1.8017394164872244e-05, "learning_rate": 0.005288099835592897, "loss": 2.7034, "step": 6334 }, { "crossentropy": 2.5388102531433105, "epoch": 0.34449006226379186, "grad_norm": 0.03466750681400299, "grad_norm_var": 1.8342905968381632e-05, "learning_rate": 0.0052868567140366566, "loss": 2.5388, "step": 6335 }, { "crossentropy": 2.7248517274856567, "epoch": 0.34454444112129207, "grad_norm": 0.0342341773211956, "grad_norm_var": 1.8780267268482763e-05, "learning_rate": 0.005285613574689829, "loss": 2.7249, "step": 6336 }, { "crossentropy": 2.587124466896057, "epoch": 0.34459881997879227, "grad_norm": 0.03262670338153839, "grad_norm_var": 1.9798269795593543e-05, "learning_rate": 0.005284370417629515, "loss": 2.5871, "step": 6337 }, { "crossentropy": 2.5233747959136963, "epoch": 0.3446531988362925, "grad_norm": 0.035049546509981155, "grad_norm_var": 1.919171031726785e-05, "learning_rate": 0.005283127242932811, "loss": 2.5234, "step": 6338 }, { "crossentropy": 2.578234553337097, "epoch": 0.3447075776937927, "grad_norm": 0.03568151593208313, "grad_norm_var": 1.9197198105741266e-05, "learning_rate": 0.005281884050676822, "loss": 2.5782, "step": 6339 }, { "crossentropy": 2.676087260246277, "epoch": 0.3447619565512929, "grad_norm": 0.034246888011693954, "grad_norm_var": 1.9022987608758624e-05, "learning_rate": 0.0052806408409386465, "loss": 2.6761, "step": 6340 }, { "crossentropy": 2.67140793800354, "epoch": 0.3448163354087931, "grad_norm": 0.03280877694487572, "grad_norm_var": 1.8205231520949245e-05, "learning_rate": 0.005279397613795385, "loss": 2.6714, "step": 6341 }, { "crossentropy": 2.6942672729492188, "epoch": 0.3448707142662933, "grad_norm": 0.034268055111169815, "grad_norm_var": 1.8034327101907183e-05, "learning_rate": 0.005278154369324146, "loss": 2.6943, "step": 6342 }, { "crossentropy": 2.5488098859786987, "epoch": 0.3449250931237935, "grad_norm": 0.033443234860897064, "grad_norm_var": 1.843792328027685e-05, "learning_rate": 0.005276911107602033, "loss": 2.5488, "step": 6343 }, { "crossentropy": 2.6343178749084473, "epoch": 0.3449794719812937, "grad_norm": 0.03461970388889313, "grad_norm_var": 1.844718070464392e-05, "learning_rate": 0.005275667828706151, "loss": 2.6343, "step": 6344 }, { "crossentropy": 2.583106756210327, "epoch": 0.3450338508387939, "grad_norm": 0.03665890917181969, "grad_norm_var": 1.7779167716907145e-05, "learning_rate": 0.005274424532713606, "loss": 2.5831, "step": 6345 }, { "crossentropy": 2.676064968109131, "epoch": 0.3450882296962941, "grad_norm": 0.03387581184506416, "grad_norm_var": 1.817497334630023e-05, "learning_rate": 0.005273181219701508, "loss": 2.6761, "step": 6346 }, { "crossentropy": 2.6413357257843018, "epoch": 0.3451426085537943, "grad_norm": 0.0349026620388031, "grad_norm_var": 1.05054378471664e-05, "learning_rate": 0.005271937889746965, "loss": 2.6413, "step": 6347 }, { "crossentropy": 2.6577497720718384, "epoch": 0.3451969874112945, "grad_norm": 0.03449372202157974, "grad_norm_var": 2.5877498171057605e-06, "learning_rate": 0.005270694542927088, "loss": 2.6577, "step": 6348 }, { "crossentropy": 2.6460959911346436, "epoch": 0.3452513662687947, "grad_norm": 0.0338062047958374, "grad_norm_var": 2.568143181225629e-06, "learning_rate": 0.005269451179318988, "loss": 2.6461, "step": 6349 }, { "crossentropy": 2.702826738357544, "epoch": 0.3453057451262949, "grad_norm": 0.03301626071333885, "grad_norm_var": 1.1000325590202383e-06, "learning_rate": 0.0052682077989997765, "loss": 2.7028, "step": 6350 }, { "crossentropy": 2.651555895805359, "epoch": 0.3453601239837951, "grad_norm": 0.032819025218486786, "grad_norm_var": 1.2168440000737136e-06, "learning_rate": 0.005266964402046569, "loss": 2.6516, "step": 6351 }, { "crossentropy": 2.6096019744873047, "epoch": 0.3454145028412953, "grad_norm": 0.036022283136844635, "grad_norm_var": 1.434492731523275e-06, "learning_rate": 0.0052657209885364755, "loss": 2.6096, "step": 6352 }, { "crossentropy": 2.640178918838501, "epoch": 0.3454688816987955, "grad_norm": 0.03814580291509628, "grad_norm_var": 2.128114627000741e-06, "learning_rate": 0.005264477558546616, "loss": 2.6402, "step": 6353 }, { "crossentropy": 2.593633770942688, "epoch": 0.34552326055629573, "grad_norm": 0.0349893718957901, "grad_norm_var": 2.1248636777049246e-06, "learning_rate": 0.005263234112154104, "loss": 2.5936, "step": 6354 }, { "crossentropy": 2.7139720916748047, "epoch": 0.34557763941379593, "grad_norm": 0.03270726278424263, "grad_norm_var": 2.2537695985854734e-06, "learning_rate": 0.005261990649436058, "loss": 2.714, "step": 6355 }, { "crossentropy": 2.7701550722122192, "epoch": 0.34563201827129614, "grad_norm": 0.03368322551250458, "grad_norm_var": 2.2871254350943782e-06, "learning_rate": 0.005260747170469596, "loss": 2.7702, "step": 6356 }, { "crossentropy": 2.718484401702881, "epoch": 0.34568639712879634, "grad_norm": 0.032830435782670975, "grad_norm_var": 2.282584760743278e-06, "learning_rate": 0.0052595036753318395, "loss": 2.7185, "step": 6357 }, { "crossentropy": 2.605335831642151, "epoch": 0.34574077598629654, "grad_norm": 0.03329506888985634, "grad_norm_var": 2.357914055959679e-06, "learning_rate": 0.005258260164099905, "loss": 2.6053, "step": 6358 }, { "crossentropy": 2.6068389415740967, "epoch": 0.34579515484379675, "grad_norm": 0.032575491815805435, "grad_norm_var": 2.5077826842693936e-06, "learning_rate": 0.005257016636850916, "loss": 2.6068, "step": 6359 }, { "crossentropy": 2.737680673599243, "epoch": 0.34584953370129695, "grad_norm": 0.035203952342271805, "grad_norm_var": 2.555768388552241e-06, "learning_rate": 0.005255773093661995, "loss": 2.7377, "step": 6360 }, { "crossentropy": 2.68540358543396, "epoch": 0.34590391255879716, "grad_norm": 0.03255115821957588, "grad_norm_var": 2.3261134899752362e-06, "learning_rate": 0.005254529534610265, "loss": 2.6854, "step": 6361 }, { "crossentropy": 2.6156437397003174, "epoch": 0.34595829141629736, "grad_norm": 0.033316753804683685, "grad_norm_var": 2.359180308842262e-06, "learning_rate": 0.005253285959772851, "loss": 2.6156, "step": 6362 }, { "crossentropy": 2.6447877883911133, "epoch": 0.34601267027379756, "grad_norm": 0.03263307735323906, "grad_norm_var": 2.4147468232054805e-06, "learning_rate": 0.005252042369226877, "loss": 2.6448, "step": 6363 }, { "crossentropy": 2.6836286783218384, "epoch": 0.34606704913129777, "grad_norm": 0.03451584652066231, "grad_norm_var": 2.4165861782618826e-06, "learning_rate": 0.005250798763049473, "loss": 2.6836, "step": 6364 }, { "crossentropy": 2.638587474822998, "epoch": 0.34612142798879797, "grad_norm": 0.036385953426361084, "grad_norm_var": 2.8064758374699365e-06, "learning_rate": 0.00524955514131776, "loss": 2.6386, "step": 6365 }, { "crossentropy": 2.668974995613098, "epoch": 0.3461758068462982, "grad_norm": 0.03593349829316139, "grad_norm_var": 2.9389301637894675e-06, "learning_rate": 0.005248311504108871, "loss": 2.669, "step": 6366 }, { "crossentropy": 2.7014797925949097, "epoch": 0.3462301857037984, "grad_norm": 0.03406647592782974, "grad_norm_var": 2.8022519367962643e-06, "learning_rate": 0.005247067851499935, "loss": 2.7015, "step": 6367 }, { "crossentropy": 2.713961958885193, "epoch": 0.3462845645612986, "grad_norm": 0.033297523856163025, "grad_norm_var": 2.641827715948025e-06, "learning_rate": 0.0052458241835680805, "loss": 2.714, "step": 6368 }, { "crossentropy": 2.697994828224182, "epoch": 0.3463389434187988, "grad_norm": 0.03553274646401405, "grad_norm_var": 1.670554112243964e-06, "learning_rate": 0.005244580500390439, "loss": 2.698, "step": 6369 }, { "crossentropy": 2.5230207443237305, "epoch": 0.346393322276299, "grad_norm": 0.03568815439939499, "grad_norm_var": 1.7960611346084603e-06, "learning_rate": 0.005243336802044144, "loss": 2.523, "step": 6370 }, { "crossentropy": 2.665856957435608, "epoch": 0.3464477011337992, "grad_norm": 0.03566104546189308, "grad_norm_var": 1.8269022992306132e-06, "learning_rate": 0.005242093088606327, "loss": 2.6659, "step": 6371 }, { "crossentropy": 2.5938585996627808, "epoch": 0.3465020799912994, "grad_norm": 0.03446751460433006, "grad_norm_var": 1.8114999615546796e-06, "learning_rate": 0.005240849360154121, "loss": 2.5939, "step": 6372 }, { "crossentropy": 2.67265522480011, "epoch": 0.3465564588487996, "grad_norm": 0.03257846459746361, "grad_norm_var": 1.8630648326346694e-06, "learning_rate": 0.005239605616764663, "loss": 2.6727, "step": 6373 }, { "crossentropy": 2.4701290130615234, "epoch": 0.3466108377062998, "grad_norm": 0.03918725252151489, "grad_norm_var": 3.2973084280368364e-06, "learning_rate": 0.005238361858515088, "loss": 2.4701, "step": 6374 }, { "crossentropy": 2.6447633504867554, "epoch": 0.3466652165638, "grad_norm": 0.03734052553772926, "grad_norm_var": 3.430360572894096e-06, "learning_rate": 0.005237118085482534, "loss": 2.6448, "step": 6375 }, { "crossentropy": 2.643069624900818, "epoch": 0.3467195954213002, "grad_norm": 0.04951715096831322, "grad_norm_var": 1.681943752996351e-05, "learning_rate": 0.0052358742977441355, "loss": 2.6431, "step": 6376 }, { "crossentropy": 2.7444132566452026, "epoch": 0.3467739742788004, "grad_norm": 0.035106465220451355, "grad_norm_var": 1.6123333404317353e-05, "learning_rate": 0.005234630495377034, "loss": 2.7444, "step": 6377 }, { "crossentropy": 2.732664465904236, "epoch": 0.3468283531363006, "grad_norm": 0.03422035276889801, "grad_norm_var": 1.585689673897535e-05, "learning_rate": 0.005233386678458368, "loss": 2.7327, "step": 6378 }, { "crossentropy": 2.705752372741699, "epoch": 0.3468827319938008, "grad_norm": 0.041584037244319916, "grad_norm_var": 1.6836235351969097e-05, "learning_rate": 0.0052321428470652786, "loss": 2.7058, "step": 6379 }, { "crossentropy": 2.602966785430908, "epoch": 0.346937110851301, "grad_norm": 0.03253812715411186, "grad_norm_var": 1.7621758434617728e-05, "learning_rate": 0.005230899001274904, "loss": 2.603, "step": 6380 }, { "crossentropy": 2.6459360122680664, "epoch": 0.3469914897088012, "grad_norm": 0.03536345809698105, "grad_norm_var": 1.769502660358783e-05, "learning_rate": 0.00522965514116439, "loss": 2.6459, "step": 6381 }, { "crossentropy": 2.6472620964050293, "epoch": 0.34704586856630143, "grad_norm": 0.035197239369153976, "grad_norm_var": 1.7772755681427544e-05, "learning_rate": 0.005228411266810879, "loss": 2.6473, "step": 6382 }, { "crossentropy": 2.6501951217651367, "epoch": 0.34710024742380163, "grad_norm": 0.035067394375801086, "grad_norm_var": 1.753273517913767e-05, "learning_rate": 0.005227167378291512, "loss": 2.6502, "step": 6383 }, { "crossentropy": 2.6534210443496704, "epoch": 0.34715462628130184, "grad_norm": 0.03786313906311989, "grad_norm_var": 1.6948908774754892e-05, "learning_rate": 0.0052259234756834375, "loss": 2.6534, "step": 6384 }, { "crossentropy": 2.6624401807785034, "epoch": 0.34720900513880204, "grad_norm": 0.037981871515512466, "grad_norm_var": 1.694848657545437e-05, "learning_rate": 0.005224679559063798, "loss": 2.6624, "step": 6385 }, { "crossentropy": 2.619805335998535, "epoch": 0.34726338399630224, "grad_norm": 0.03713787719607353, "grad_norm_var": 1.6858134999828828e-05, "learning_rate": 0.005223435628509745, "loss": 2.6198, "step": 6386 }, { "crossentropy": 2.6495882272720337, "epoch": 0.34731776285380245, "grad_norm": 0.035630058497190475, "grad_norm_var": 1.686342023737575e-05, "learning_rate": 0.00522219168409842, "loss": 2.6496, "step": 6387 }, { "crossentropy": 2.513556957244873, "epoch": 0.34737214171130265, "grad_norm": 0.03437186777591705, "grad_norm_var": 1.6895316897421312e-05, "learning_rate": 0.005220947725906975, "loss": 2.5136, "step": 6388 }, { "crossentropy": 2.705263614654541, "epoch": 0.34742652056880285, "grad_norm": 0.033256784081459045, "grad_norm_var": 1.6531610739219334e-05, "learning_rate": 0.005219703754012558, "loss": 2.7053, "step": 6389 }, { "crossentropy": 2.542525053024292, "epoch": 0.34748089942630306, "grad_norm": 0.033818319439888, "grad_norm_var": 1.673896605558107e-05, "learning_rate": 0.005218459768492318, "loss": 2.5425, "step": 6390 }, { "crossentropy": 2.6196380853652954, "epoch": 0.34753527828380326, "grad_norm": 0.035666029900312424, "grad_norm_var": 1.6754385048377747e-05, "learning_rate": 0.005217215769423408, "loss": 2.6196, "step": 6391 }, { "crossentropy": 2.5037602186203003, "epoch": 0.34758965714130347, "grad_norm": 0.03586186468601227, "grad_norm_var": 4.7446067690905145e-06, "learning_rate": 0.005215971756882978, "loss": 2.5038, "step": 6392 }, { "crossentropy": 2.6580259799957275, "epoch": 0.34764403599880367, "grad_norm": 0.03554563224315643, "grad_norm_var": 4.723864580545983e-06, "learning_rate": 0.005214727730948181, "loss": 2.658, "step": 6393 }, { "crossentropy": 2.680630326271057, "epoch": 0.3476984148563039, "grad_norm": 0.038497257977724075, "grad_norm_var": 5.0267542932152616e-06, "learning_rate": 0.0052134836916961705, "loss": 2.6806, "step": 6394 }, { "crossentropy": 2.6090710163116455, "epoch": 0.3477527937138041, "grad_norm": 0.035627804696559906, "grad_norm_var": 2.778678890437656e-06, "learning_rate": 0.005212239639204099, "loss": 2.6091, "step": 6395 }, { "crossentropy": 2.5699360370635986, "epoch": 0.3478071725713043, "grad_norm": 0.03356076031923294, "grad_norm_var": 2.428044030849894e-06, "learning_rate": 0.005210995573549127, "loss": 2.5699, "step": 6396 }, { "crossentropy": 2.556227922439575, "epoch": 0.3478615514288045, "grad_norm": 0.034224219620227814, "grad_norm_var": 2.5531354370291725e-06, "learning_rate": 0.005209751494808401, "loss": 2.5562, "step": 6397 }, { "crossentropy": 2.657642960548401, "epoch": 0.3479159302863047, "grad_norm": 0.03628244251012802, "grad_norm_var": 2.5711021751399313e-06, "learning_rate": 0.005208507403059086, "loss": 2.6576, "step": 6398 }, { "crossentropy": 2.5976544618606567, "epoch": 0.3479703091438049, "grad_norm": 0.034047335386276245, "grad_norm_var": 2.715316893617259e-06, "learning_rate": 0.005207263298378335, "loss": 2.5977, "step": 6399 }, { "crossentropy": 2.672219395637512, "epoch": 0.3480246880013051, "grad_norm": 0.03512353450059891, "grad_norm_var": 2.3525492815751683e-06, "learning_rate": 0.005206019180843309, "loss": 2.6722, "step": 6400 }, { "crossentropy": 2.701328754425049, "epoch": 0.3480790668588053, "grad_norm": 0.03492894023656845, "grad_norm_var": 1.8900480191391215e-06, "learning_rate": 0.005204775050531164, "loss": 2.7013, "step": 6401 }, { "crossentropy": 2.667362332344055, "epoch": 0.3481334457163055, "grad_norm": 0.03715703263878822, "grad_norm_var": 1.8949596297927289e-06, "learning_rate": 0.005203530907519062, "loss": 2.6674, "step": 6402 }, { "crossentropy": 2.6956844329833984, "epoch": 0.3481878245738057, "grad_norm": 0.03275734558701515, "grad_norm_var": 2.255587928330869e-06, "learning_rate": 0.005202286751884163, "loss": 2.6957, "step": 6403 }, { "crossentropy": 2.6516642570495605, "epoch": 0.3482422034313059, "grad_norm": 0.03523667901754379, "grad_norm_var": 2.2246622576253355e-06, "learning_rate": 0.005201042583703627, "loss": 2.6517, "step": 6404 }, { "crossentropy": 2.7088972330093384, "epoch": 0.3482965822888061, "grad_norm": 0.03381863236427307, "grad_norm_var": 2.106348361329465e-06, "learning_rate": 0.005199798403054619, "loss": 2.7089, "step": 6405 }, { "crossentropy": 2.624071717262268, "epoch": 0.3483509611463063, "grad_norm": 0.03469139337539673, "grad_norm_var": 2.0007597776317e-06, "learning_rate": 0.005198554210014299, "loss": 2.6241, "step": 6406 }, { "crossentropy": 2.654021382331848, "epoch": 0.3484053400038065, "grad_norm": 0.033389173448085785, "grad_norm_var": 2.1800024566452522e-06, "learning_rate": 0.005197310004659834, "loss": 2.654, "step": 6407 }, { "crossentropy": 2.656287431716919, "epoch": 0.3484597188613067, "grad_norm": 0.031790610402822495, "grad_norm_var": 2.7735445373357205e-06, "learning_rate": 0.005196065787068383, "loss": 2.6563, "step": 6408 }, { "crossentropy": 2.692238211631775, "epoch": 0.3485140977188069, "grad_norm": 0.0335826501250267, "grad_norm_var": 2.8172379726409956e-06, "learning_rate": 0.005194821557317118, "loss": 2.6922, "step": 6409 }, { "crossentropy": 2.65359628200531, "epoch": 0.34856847657630713, "grad_norm": 0.03362421318888664, "grad_norm_var": 1.8145083171954939e-06, "learning_rate": 0.0051935773154832, "loss": 2.6536, "step": 6410 }, { "crossentropy": 2.6422934532165527, "epoch": 0.34862285543380733, "grad_norm": 0.03258538991212845, "grad_norm_var": 1.880833039504213e-06, "learning_rate": 0.005192333061643798, "loss": 2.6423, "step": 6411 }, { "crossentropy": 2.595698833465576, "epoch": 0.34867723429130754, "grad_norm": 0.03586878627538681, "grad_norm_var": 2.0247385953110764e-06, "learning_rate": 0.0051910887958760785, "loss": 2.5957, "step": 6412 }, { "crossentropy": 2.6442084312438965, "epoch": 0.34873161314880774, "grad_norm": 0.036589380353689194, "grad_norm_var": 2.3443869313572244e-06, "learning_rate": 0.00518984451825721, "loss": 2.6442, "step": 6413 }, { "crossentropy": 2.572330951690674, "epoch": 0.34878599200630794, "grad_norm": 0.0330301895737648, "grad_norm_var": 2.2182633749598737e-06, "learning_rate": 0.005188600228864362, "loss": 2.5723, "step": 6414 }, { "crossentropy": 2.650056481361389, "epoch": 0.34884037086380815, "grad_norm": 0.03218561038374901, "grad_norm_var": 2.4886301811408623e-06, "learning_rate": 0.005187355927774702, "loss": 2.6501, "step": 6415 }, { "crossentropy": 2.577713131904602, "epoch": 0.34889474972130835, "grad_norm": 0.032887063920497894, "grad_norm_var": 2.5101848880824027e-06, "learning_rate": 0.005186111615065404, "loss": 2.5777, "step": 6416 }, { "crossentropy": 2.6094141006469727, "epoch": 0.34894912857880855, "grad_norm": 0.03492395579814911, "grad_norm_var": 2.509574187638715e-06, "learning_rate": 0.005184867290813636, "loss": 2.6094, "step": 6417 }, { "crossentropy": 2.6583038568496704, "epoch": 0.34900350743630876, "grad_norm": 0.03355332091450691, "grad_norm_var": 1.80785409748437e-06, "learning_rate": 0.005183622955096572, "loss": 2.6583, "step": 6418 }, { "crossentropy": 2.678598165512085, "epoch": 0.34905788629380896, "grad_norm": 0.03490245342254639, "grad_norm_var": 1.8023383730871843e-06, "learning_rate": 0.005182378607991383, "loss": 2.6786, "step": 6419 }, { "crossentropy": 2.6487957239151, "epoch": 0.34911226515130916, "grad_norm": 0.035417936742305756, "grad_norm_var": 1.8363042501851403e-06, "learning_rate": 0.005181134249575244, "loss": 2.6488, "step": 6420 }, { "crossentropy": 2.6658473014831543, "epoch": 0.34916664400880937, "grad_norm": 0.03501307964324951, "grad_norm_var": 1.9081274820720655e-06, "learning_rate": 0.005179889879925326, "loss": 2.6658, "step": 6421 }, { "crossentropy": 2.6316975355148315, "epoch": 0.34922102286630957, "grad_norm": 0.035638149827718735, "grad_norm_var": 2.051148929314156e-06, "learning_rate": 0.005178645499118804, "loss": 2.6317, "step": 6422 }, { "crossentropy": 2.6476601362228394, "epoch": 0.3492754017238098, "grad_norm": 0.03548983111977577, "grad_norm_var": 2.1386718469513024e-06, "learning_rate": 0.005177401107232858, "loss": 2.6477, "step": 6423 }, { "crossentropy": 2.663588762283325, "epoch": 0.34932978058131, "grad_norm": 0.03313049301505089, "grad_norm_var": 1.821747898978337e-06, "learning_rate": 0.005176156704344658, "loss": 2.6636, "step": 6424 }, { "crossentropy": 2.683190703392029, "epoch": 0.3493841594388102, "grad_norm": 0.03547769412398338, "grad_norm_var": 1.8709041997501396e-06, "learning_rate": 0.005174912290531384, "loss": 2.6832, "step": 6425 }, { "crossentropy": 2.7534677982330322, "epoch": 0.3494385382963104, "grad_norm": 0.0362093485891819, "grad_norm_var": 2.0229613967109873e-06, "learning_rate": 0.005173667865870214, "loss": 2.7535, "step": 6426 }, { "crossentropy": 2.640864610671997, "epoch": 0.3494929171538106, "grad_norm": 0.03427167609333992, "grad_norm_var": 1.7575217024824613e-06, "learning_rate": 0.0051724234304383245, "loss": 2.6409, "step": 6427 }, { "crossentropy": 2.6746950149536133, "epoch": 0.3495472960113108, "grad_norm": 0.035486359149217606, "grad_norm_var": 1.7051183279342501e-06, "learning_rate": 0.0051711789843128944, "loss": 2.6747, "step": 6428 }, { "crossentropy": 2.7404472827911377, "epoch": 0.349601674868811, "grad_norm": 0.035608548671007156, "grad_norm_var": 1.5100365838895586e-06, "learning_rate": 0.005169934527571103, "loss": 2.7404, "step": 6429 }, { "crossentropy": 2.606720209121704, "epoch": 0.3496560537263112, "grad_norm": 0.03647603094577789, "grad_norm_var": 1.5416559827524194e-06, "learning_rate": 0.005168690060290131, "loss": 2.6067, "step": 6430 }, { "crossentropy": 2.6972299814224243, "epoch": 0.3497104325838114, "grad_norm": 0.035563863813877106, "grad_norm_var": 1.080949894801247e-06, "learning_rate": 0.005167445582547159, "loss": 2.6972, "step": 6431 }, { "crossentropy": 2.591805100440979, "epoch": 0.3497648114413116, "grad_norm": 0.03396262228488922, "grad_norm_var": 8.497936218857319e-07, "learning_rate": 0.005166201094419367, "loss": 2.5918, "step": 6432 }, { "crossentropy": 2.6650612354278564, "epoch": 0.3498191902988118, "grad_norm": 0.03778228908777237, "grad_norm_var": 1.3046361081786931e-06, "learning_rate": 0.005164956595983939, "loss": 2.6651, "step": 6433 }, { "crossentropy": 2.685198426246643, "epoch": 0.349873569156312, "grad_norm": 0.03998502716422081, "grad_norm_var": 2.435932185035292e-06, "learning_rate": 0.005163712087318056, "loss": 2.6852, "step": 6434 }, { "crossentropy": 2.656173825263977, "epoch": 0.3499279480138122, "grad_norm": 0.034111298620700836, "grad_norm_var": 2.5540107861996355e-06, "learning_rate": 0.005162467568498903, "loss": 2.6562, "step": 6435 }, { "crossentropy": 2.5833820104599, "epoch": 0.3499823268713124, "grad_norm": 0.03788362070918083, "grad_norm_var": 2.8736326469674484e-06, "learning_rate": 0.005161223039603662, "loss": 2.5834, "step": 6436 }, { "crossentropy": 2.674870491027832, "epoch": 0.3500367057288126, "grad_norm": 0.03556929528713226, "grad_norm_var": 2.837900235835624e-06, "learning_rate": 0.005159978500709518, "loss": 2.6749, "step": 6437 }, { "crossentropy": 2.5970462560653687, "epoch": 0.35009108458631283, "grad_norm": 0.03708896413445473, "grad_norm_var": 2.9400055972526412e-06, "learning_rate": 0.005158733951893658, "loss": 2.597, "step": 6438 }, { "crossentropy": 2.675311803817749, "epoch": 0.35014546344381303, "grad_norm": 0.03881612792611122, "grad_norm_var": 3.4580087221356784e-06, "learning_rate": 0.005157489393233265, "loss": 2.6753, "step": 6439 }, { "crossentropy": 2.6418964862823486, "epoch": 0.35019984230131324, "grad_norm": 0.03197339549660683, "grad_norm_var": 3.998118728441164e-06, "learning_rate": 0.0051562448248055285, "loss": 2.6419, "step": 6440 }, { "crossentropy": 2.6470649242401123, "epoch": 0.35025422115881344, "grad_norm": 0.03684336692094803, "grad_norm_var": 4.016549505537516e-06, "learning_rate": 0.005155000246687631, "loss": 2.6471, "step": 6441 }, { "crossentropy": 2.6283005475997925, "epoch": 0.35030860001631364, "grad_norm": 0.03560489043593407, "grad_norm_var": 4.030732580235542e-06, "learning_rate": 0.005153755658956766, "loss": 2.6283, "step": 6442 }, { "crossentropy": 2.628275156021118, "epoch": 0.35036297887381385, "grad_norm": 0.03420918062329292, "grad_norm_var": 4.045913394066549e-06, "learning_rate": 0.005152511061690114, "loss": 2.6283, "step": 6443 }, { "crossentropy": 2.721205472946167, "epoch": 0.35041735773131405, "grad_norm": 0.03387857973575592, "grad_norm_var": 4.3305101860215136e-06, "learning_rate": 0.005151266454964871, "loss": 2.7212, "step": 6444 }, { "crossentropy": 2.6313356161117554, "epoch": 0.35047173658881425, "grad_norm": 0.0325155146420002, "grad_norm_var": 5.073304327300049e-06, "learning_rate": 0.005150021838858223, "loss": 2.6313, "step": 6445 }, { "crossentropy": 2.584873676300049, "epoch": 0.35052611544631446, "grad_norm": 0.03387816622853279, "grad_norm_var": 5.249343392472496e-06, "learning_rate": 0.005148777213447361, "loss": 2.5849, "step": 6446 }, { "crossentropy": 2.5735867023468018, "epoch": 0.35058049430381466, "grad_norm": 0.03396202623844147, "grad_norm_var": 5.418312747299849e-06, "learning_rate": 0.005147532578809474, "loss": 2.5736, "step": 6447 }, { "crossentropy": 2.64728581905365, "epoch": 0.35063487316131486, "grad_norm": 0.032976455986499786, "grad_norm_var": 5.681772466729386e-06, "learning_rate": 0.005146287935021754, "loss": 2.6473, "step": 6448 }, { "crossentropy": 2.568931460380554, "epoch": 0.35068925201881507, "grad_norm": 0.035298820585012436, "grad_norm_var": 5.292438853007897e-06, "learning_rate": 0.005145043282161394, "loss": 2.5689, "step": 6449 }, { "crossentropy": 2.575730800628662, "epoch": 0.35074363087631527, "grad_norm": 0.03338218852877617, "grad_norm_var": 3.881389449839067e-06, "learning_rate": 0.005143798620305583, "loss": 2.5757, "step": 6450 }, { "crossentropy": 2.5805574655532837, "epoch": 0.3507980097338155, "grad_norm": 0.0326182059943676, "grad_norm_var": 4.172658336963099e-06, "learning_rate": 0.005142553949531518, "loss": 2.5806, "step": 6451 }, { "crossentropy": 2.473240852355957, "epoch": 0.3508523885913157, "grad_norm": 0.034245844930410385, "grad_norm_var": 3.49494636208558e-06, "learning_rate": 0.005141309269916388, "loss": 2.4732, "step": 6452 }, { "crossentropy": 2.6383097171783447, "epoch": 0.3509067674488159, "grad_norm": 0.03540848568081856, "grad_norm_var": 3.4747893753015364e-06, "learning_rate": 0.005140064581537389, "loss": 2.6383, "step": 6453 }, { "crossentropy": 2.637661933898926, "epoch": 0.3509611463063161, "grad_norm": 0.036409661173820496, "grad_norm_var": 3.2731018363124064e-06, "learning_rate": 0.005138819884471715, "loss": 2.6377, "step": 6454 }, { "crossentropy": 2.606851816177368, "epoch": 0.3510155251638163, "grad_norm": 0.03451480716466904, "grad_norm_var": 1.9548463124812826e-06, "learning_rate": 0.005137575178796563, "loss": 2.6069, "step": 6455 }, { "crossentropy": 2.574346423149109, "epoch": 0.3510699040213165, "grad_norm": 0.032806746661663055, "grad_norm_var": 1.7472368098407515e-06, "learning_rate": 0.005136330464589125, "loss": 2.5743, "step": 6456 }, { "crossentropy": 2.604078531265259, "epoch": 0.3511242828788167, "grad_norm": 0.03462420776486397, "grad_norm_var": 1.2979082039915138e-06, "learning_rate": 0.0051350857419265985, "loss": 2.6041, "step": 6457 }, { "crossentropy": 2.5623779296875, "epoch": 0.3511786617363169, "grad_norm": 0.03266851603984833, "grad_norm_var": 1.2655675482655648e-06, "learning_rate": 0.005133841010886181, "loss": 2.5624, "step": 6458 }, { "crossentropy": 2.6212435960769653, "epoch": 0.3512330405938171, "grad_norm": 0.03428233787417412, "grad_norm_var": 1.2683098242676747e-06, "learning_rate": 0.005132596271545068, "loss": 2.6212, "step": 6459 }, { "crossentropy": 2.686821937561035, "epoch": 0.3512874194513173, "grad_norm": 0.033019088208675385, "grad_norm_var": 1.3246027655385827e-06, "learning_rate": 0.00513135152398046, "loss": 2.6868, "step": 6460 }, { "crossentropy": 2.5337352752685547, "epoch": 0.3513417983088175, "grad_norm": 0.033961523324251175, "grad_norm_var": 1.1858127517235782e-06, "learning_rate": 0.00513010676826955, "loss": 2.5337, "step": 6461 }, { "crossentropy": 2.637859344482422, "epoch": 0.3513961771663177, "grad_norm": 0.03482348471879959, "grad_norm_var": 1.2258585380231502e-06, "learning_rate": 0.005128862004489542, "loss": 2.6379, "step": 6462 }, { "crossentropy": 2.5082510709762573, "epoch": 0.3514505560238179, "grad_norm": 0.03197295218706131, "grad_norm_var": 1.4998209414712286e-06, "learning_rate": 0.005127617232717631, "loss": 2.5083, "step": 6463 }, { "crossentropy": 2.5938029289245605, "epoch": 0.3515049348813181, "grad_norm": 0.03321436792612076, "grad_norm_var": 1.4728463043639044e-06, "learning_rate": 0.005126372453031019, "loss": 2.5938, "step": 6464 }, { "crossentropy": 2.6534589529037476, "epoch": 0.3515593137388183, "grad_norm": 0.03439687564969063, "grad_norm_var": 1.3618672144953086e-06, "learning_rate": 0.005125127665506905, "loss": 2.6535, "step": 6465 }, { "crossentropy": 2.6048210859298706, "epoch": 0.35161369259631853, "grad_norm": 0.03497154638171196, "grad_norm_var": 1.4106857518930704e-06, "learning_rate": 0.005123882870222489, "loss": 2.6048, "step": 6466 }, { "crossentropy": 2.7050055265426636, "epoch": 0.35166807145381873, "grad_norm": 0.03320508077740669, "grad_norm_var": 1.3243868181224416e-06, "learning_rate": 0.005122638067254975, "loss": 2.705, "step": 6467 }, { "crossentropy": 2.697184205055237, "epoch": 0.35172245031131893, "grad_norm": 0.03380396217107773, "grad_norm_var": 1.3240411489723673e-06, "learning_rate": 0.00512139325668156, "loss": 2.6972, "step": 6468 }, { "crossentropy": 2.6421819925308228, "epoch": 0.35177682916881914, "grad_norm": 0.033371955156326294, "grad_norm_var": 1.2022201653879202e-06, "learning_rate": 0.005120148438579449, "loss": 2.6422, "step": 6469 }, { "crossentropy": 2.6516149044036865, "epoch": 0.35183120802631934, "grad_norm": 0.03481043502688408, "grad_norm_var": 8.222271144099509e-07, "learning_rate": 0.0051189036130258455, "loss": 2.6516, "step": 6470 }, { "crossentropy": 2.7447270154953003, "epoch": 0.35188558688381955, "grad_norm": 0.032425396144390106, "grad_norm_var": 8.898118135646759e-07, "learning_rate": 0.005117658780097949, "loss": 2.7447, "step": 6471 }, { "crossentropy": 2.5684280395507812, "epoch": 0.35193996574131975, "grad_norm": 0.034184809774160385, "grad_norm_var": 8.540389478913409e-07, "learning_rate": 0.005116413939872966, "loss": 2.5684, "step": 6472 }, { "crossentropy": 2.5954543352127075, "epoch": 0.35199434459881995, "grad_norm": 0.03675279766321182, "grad_norm_var": 1.3900038595484587e-06, "learning_rate": 0.005115169092428098, "loss": 2.5955, "step": 6473 }, { "crossentropy": 2.591223359107971, "epoch": 0.35204872345632016, "grad_norm": 0.03910667449235916, "grad_norm_var": 2.9521863109555417e-06, "learning_rate": 0.005113924237840551, "loss": 2.5912, "step": 6474 }, { "crossentropy": 2.708398938179016, "epoch": 0.35210310231382036, "grad_norm": 0.03726661577820778, "grad_norm_var": 3.5141308834843803e-06, "learning_rate": 0.0051126793761875275, "loss": 2.7084, "step": 6475 }, { "crossentropy": 2.6418017148971558, "epoch": 0.35215748117132056, "grad_norm": 0.04225492849946022, "grad_norm_var": 7.076598298574877e-06, "learning_rate": 0.005111434507546236, "loss": 2.6418, "step": 6476 }, { "crossentropy": 2.5674597024917603, "epoch": 0.35221186002882077, "grad_norm": 0.04055587947368622, "grad_norm_var": 8.852603410588502e-06, "learning_rate": 0.00511018963199388, "loss": 2.5675, "step": 6477 }, { "crossentropy": 2.5111364126205444, "epoch": 0.352266238886321, "grad_norm": 0.033902671188116074, "grad_norm_var": 8.981886457493094e-06, "learning_rate": 0.005108944749607664, "loss": 2.5111, "step": 6478 }, { "crossentropy": 2.723883271217346, "epoch": 0.35232061774382123, "grad_norm": 0.03488393872976303, "grad_norm_var": 8.186281954572494e-06, "learning_rate": 0.005107699860464799, "loss": 2.7239, "step": 6479 }, { "crossentropy": 2.6577574014663696, "epoch": 0.35237499660132143, "grad_norm": 0.03611969202756882, "grad_norm_var": 7.801614204705486e-06, "learning_rate": 0.005106454964642486, "loss": 2.6578, "step": 6480 }, { "crossentropy": 2.5101640224456787, "epoch": 0.35242937545882164, "grad_norm": 0.040019888430833817, "grad_norm_var": 8.762650222297422e-06, "learning_rate": 0.005105210062217937, "loss": 2.5102, "step": 6481 }, { "crossentropy": 2.6581937074661255, "epoch": 0.35248375431632184, "grad_norm": 0.036392584443092346, "grad_norm_var": 8.674619944197027e-06, "learning_rate": 0.0051039651532683574, "loss": 2.6582, "step": 6482 }, { "crossentropy": 2.622445583343506, "epoch": 0.35253813317382204, "grad_norm": 0.03363034874200821, "grad_norm_var": 8.516609833053348e-06, "learning_rate": 0.005102720237870957, "loss": 2.6224, "step": 6483 }, { "crossentropy": 2.566347360610962, "epoch": 0.35259251203132225, "grad_norm": 0.033848702907562256, "grad_norm_var": 8.50233618564696e-06, "learning_rate": 0.005101475316102943, "loss": 2.5663, "step": 6484 }, { "crossentropy": 2.5239473581314087, "epoch": 0.35264689088882245, "grad_norm": 0.060031697154045105, "grad_norm_var": 4.2798323393789896e-05, "learning_rate": 0.005100230388041523, "loss": 2.5239, "step": 6485 }, { "crossentropy": 2.70095157623291, "epoch": 0.35270126974632265, "grad_norm": 0.03247898072004318, "grad_norm_var": 4.409434012812476e-05, "learning_rate": 0.005098985453763909, "loss": 2.701, "step": 6486 }, { "crossentropy": 2.56040620803833, "epoch": 0.35275564860382286, "grad_norm": 0.03345419466495514, "grad_norm_var": 4.343133710330519e-05, "learning_rate": 0.005097740513347309, "loss": 2.5604, "step": 6487 }, { "crossentropy": 2.588103413581848, "epoch": 0.35281002746132306, "grad_norm": 0.03882868215441704, "grad_norm_var": 4.253745337747244e-05, "learning_rate": 0.005096495566868935, "loss": 2.5881, "step": 6488 }, { "crossentropy": 2.5536924600601196, "epoch": 0.35286440631882326, "grad_norm": 0.03825506195425987, "grad_norm_var": 4.240955394734583e-05, "learning_rate": 0.005095250614405993, "loss": 2.5537, "step": 6489 }, { "crossentropy": 2.579498291015625, "epoch": 0.35291878517632347, "grad_norm": 0.03343378007411957, "grad_norm_var": 4.372710530953783e-05, "learning_rate": 0.005094005656035699, "loss": 2.5795, "step": 6490 }, { "crossentropy": 2.7565003633499146, "epoch": 0.35297316403382367, "grad_norm": 0.046794187277555466, "grad_norm_var": 4.8678662959498324e-05, "learning_rate": 0.005092760691835261, "loss": 2.7565, "step": 6491 }, { "crossentropy": 2.6670867204666138, "epoch": 0.3530275428913239, "grad_norm": 0.034353457391262054, "grad_norm_var": 4.855140948822143e-05, "learning_rate": 0.0050915157218818915, "loss": 2.6671, "step": 6492 }, { "crossentropy": 2.66700279712677, "epoch": 0.3530819217488241, "grad_norm": 0.034406132996082306, "grad_norm_var": 4.876730527268672e-05, "learning_rate": 0.005090270746252802, "loss": 2.667, "step": 6493 }, { "crossentropy": 2.5991402864456177, "epoch": 0.3531363006063243, "grad_norm": 0.03394218534231186, "grad_norm_var": 4.8748175513841736e-05, "learning_rate": 0.005089025765025207, "loss": 2.5991, "step": 6494 }, { "crossentropy": 2.608368992805481, "epoch": 0.3531906794638245, "grad_norm": 0.03550195321440697, "grad_norm_var": 4.85519796740251e-05, "learning_rate": 0.005087780778276314, "loss": 2.6084, "step": 6495 }, { "crossentropy": 2.4922953844070435, "epoch": 0.3532450583213247, "grad_norm": 0.04175557941198349, "grad_norm_var": 4.942989586316637e-05, "learning_rate": 0.0050865357860833404, "loss": 2.4923, "step": 6496 }, { "crossentropy": 2.59377658367157, "epoch": 0.3532994371788249, "grad_norm": 0.037531327456235886, "grad_norm_var": 4.912864330714791e-05, "learning_rate": 0.0050852907885234975, "loss": 2.5938, "step": 6497 }, { "crossentropy": 2.7097429037094116, "epoch": 0.3533538160363251, "grad_norm": 0.037753913551568985, "grad_norm_var": 4.899083668185219e-05, "learning_rate": 0.005084045785674, "loss": 2.7097, "step": 6498 }, { "crossentropy": 2.715487480163574, "epoch": 0.3534081948938253, "grad_norm": 0.036140408366918564, "grad_norm_var": 4.7964030736845296e-05, "learning_rate": 0.005082800777612061, "loss": 2.7155, "step": 6499 }, { "crossentropy": 2.640347480773926, "epoch": 0.3534625737513255, "grad_norm": 0.03421895578503609, "grad_norm_var": 4.776608706438419e-05, "learning_rate": 0.005081555764414895, "loss": 2.6403, "step": 6500 }, { "crossentropy": 2.7133854627609253, "epoch": 0.3535169526088257, "grad_norm": 0.03861735388636589, "grad_norm_var": 1.3678185380313195e-05, "learning_rate": 0.0050803107461597176, "loss": 2.7134, "step": 6501 }, { "crossentropy": 2.638417959213257, "epoch": 0.3535713314663259, "grad_norm": 0.033358339220285416, "grad_norm_var": 1.3229659241591658e-05, "learning_rate": 0.0050790657229237415, "loss": 2.6384, "step": 6502 }, { "crossentropy": 2.6020495891571045, "epoch": 0.3536257103238261, "grad_norm": 0.032759469002485275, "grad_norm_var": 1.35671155197955e-05, "learning_rate": 0.0050778206947841836, "loss": 2.602, "step": 6503 }, { "crossentropy": 2.6949952840805054, "epoch": 0.3536800891813263, "grad_norm": 0.035926032811403275, "grad_norm_var": 1.3280762938034224e-05, "learning_rate": 0.005076575661818257, "loss": 2.695, "step": 6504 }, { "crossentropy": 2.5923869609832764, "epoch": 0.3537344680388265, "grad_norm": 0.03554653003811836, "grad_norm_var": 1.3122339458214392e-05, "learning_rate": 0.005075330624103182, "loss": 2.5924, "step": 6505 }, { "crossentropy": 2.646175742149353, "epoch": 0.3537888468963267, "grad_norm": 0.034578971564769745, "grad_norm_var": 1.275482662381333e-05, "learning_rate": 0.005074085581716168, "loss": 2.6462, "step": 6506 }, { "crossentropy": 2.7494856119155884, "epoch": 0.35384322575382693, "grad_norm": 0.032893646508455276, "grad_norm_var": 5.65765725243818e-06, "learning_rate": 0.0050728405347344385, "loss": 2.7495, "step": 6507 }, { "crossentropy": 2.5621702671051025, "epoch": 0.35389760461132713, "grad_norm": 0.03489464893937111, "grad_norm_var": 5.587437641509327e-06, "learning_rate": 0.005071595483235206, "loss": 2.5622, "step": 6508 }, { "crossentropy": 2.605971574783325, "epoch": 0.35395198346882734, "grad_norm": 0.03494039550423622, "grad_norm_var": 5.519228563704142e-06, "learning_rate": 0.005070350427295689, "loss": 2.606, "step": 6509 }, { "crossentropy": 2.5803005695343018, "epoch": 0.35400636232632754, "grad_norm": 0.03443346172571182, "grad_norm_var": 5.422610169776519e-06, "learning_rate": 0.005069105366993103, "loss": 2.5803, "step": 6510 }, { "crossentropy": 2.579184889793396, "epoch": 0.35406074118382774, "grad_norm": 0.036122921854257584, "grad_norm_var": 5.432118903394139e-06, "learning_rate": 0.005067860302404668, "loss": 2.5792, "step": 6511 }, { "crossentropy": 2.6378676891326904, "epoch": 0.35411512004132795, "grad_norm": 0.0332055389881134, "grad_norm_var": 3.117052118236224e-06, "learning_rate": 0.0050666152336076, "loss": 2.6379, "step": 6512 }, { "crossentropy": 2.60445773601532, "epoch": 0.35416949889882815, "grad_norm": 0.034417182207107544, "grad_norm_var": 2.747941961108123e-06, "learning_rate": 0.005065370160679115, "loss": 2.6045, "step": 6513 }, { "crossentropy": 2.5815486907958984, "epoch": 0.35422387775632835, "grad_norm": 0.033128220587968826, "grad_norm_var": 2.379345558402533e-06, "learning_rate": 0.005064125083696436, "loss": 2.5815, "step": 6514 }, { "crossentropy": 2.664783477783203, "epoch": 0.35427825661382856, "grad_norm": 0.033164724707603455, "grad_norm_var": 2.3608261571799715e-06, "learning_rate": 0.005062880002736778, "loss": 2.6648, "step": 6515 }, { "crossentropy": 2.531296730041504, "epoch": 0.35433263547132876, "grad_norm": 0.0333583727478981, "grad_norm_var": 2.4408422611282843e-06, "learning_rate": 0.005061634917877362, "loss": 2.5313, "step": 6516 }, { "crossentropy": 2.517230987548828, "epoch": 0.35438701432882896, "grad_norm": 0.03326179087162018, "grad_norm_var": 1.2641746674992825e-06, "learning_rate": 0.005060389829195404, "loss": 2.5172, "step": 6517 }, { "crossentropy": 2.6513285636901855, "epoch": 0.35444139318632917, "grad_norm": 0.034930068999528885, "grad_norm_var": 1.2580338352884944e-06, "learning_rate": 0.005059144736768127, "loss": 2.6513, "step": 6518 }, { "crossentropy": 2.6974867582321167, "epoch": 0.35449577204382937, "grad_norm": 0.033633407205343246, "grad_norm_var": 1.1352751115379159e-06, "learning_rate": 0.005057899640672749, "loss": 2.6975, "step": 6519 }, { "crossentropy": 2.682446599006653, "epoch": 0.3545501509013296, "grad_norm": 0.03480488806962967, "grad_norm_var": 9.673647750098324e-07, "learning_rate": 0.005056654540986487, "loss": 2.6824, "step": 6520 }, { "crossentropy": 2.7113088369369507, "epoch": 0.3546045297588298, "grad_norm": 0.03384097293019295, "grad_norm_var": 8.445926374923973e-07, "learning_rate": 0.005055409437786564, "loss": 2.7113, "step": 6521 }, { "crossentropy": 2.7852203845977783, "epoch": 0.35465890861633, "grad_norm": 0.036519430577754974, "grad_norm_var": 1.2037032678859083e-06, "learning_rate": 0.005054164331150199, "loss": 2.7852, "step": 6522 }, { "crossentropy": 2.6674846410751343, "epoch": 0.3547132874738302, "grad_norm": 0.035224370658397675, "grad_norm_var": 1.1304622588086006e-06, "learning_rate": 0.005052919221154615, "loss": 2.6675, "step": 6523 }, { "crossentropy": 2.588507652282715, "epoch": 0.3547676663313304, "grad_norm": 0.04893108829855919, "grad_norm_var": 1.443084028852435e-05, "learning_rate": 0.005051674107877027, "loss": 2.5885, "step": 6524 }, { "crossentropy": 2.6318631172180176, "epoch": 0.3548220451888306, "grad_norm": 0.038247283548116684, "grad_norm_var": 1.498009103669534e-05, "learning_rate": 0.005050428991394663, "loss": 2.6319, "step": 6525 }, { "crossentropy": 2.6742093563079834, "epoch": 0.3548764240463308, "grad_norm": 0.03776612505316734, "grad_norm_var": 1.5221893496134009e-05, "learning_rate": 0.005049183871784736, "loss": 2.6742, "step": 6526 }, { "crossentropy": 2.5761505365371704, "epoch": 0.354930802903831, "grad_norm": 0.039461709558963776, "grad_norm_var": 1.6124792677573754e-05, "learning_rate": 0.005047938749124472, "loss": 2.5762, "step": 6527 }, { "crossentropy": 2.6287765502929688, "epoch": 0.3549851817613312, "grad_norm": 0.037928979843854904, "grad_norm_var": 1.584214415025394e-05, "learning_rate": 0.005046693623491092, "loss": 2.6288, "step": 6528 }, { "crossentropy": 2.69384229183197, "epoch": 0.3550395606188314, "grad_norm": 0.03744065761566162, "grad_norm_var": 1.5709422776529387e-05, "learning_rate": 0.005045448494961818, "loss": 2.6938, "step": 6529 }, { "crossentropy": 2.6512553691864014, "epoch": 0.3550939394763316, "grad_norm": 0.03648485615849495, "grad_norm_var": 1.497052113140033e-05, "learning_rate": 0.00504420336361387, "loss": 2.6513, "step": 6530 }, { "crossentropy": 2.7059741020202637, "epoch": 0.3551483183338318, "grad_norm": 0.03903662785887718, "grad_norm_var": 1.446534868415657e-05, "learning_rate": 0.00504295822952447, "loss": 2.706, "step": 6531 }, { "crossentropy": 2.519323468208313, "epoch": 0.355202697191332, "grad_norm": 0.03328598663210869, "grad_norm_var": 1.4500142012875231e-05, "learning_rate": 0.005041713092770841, "loss": 2.5193, "step": 6532 }, { "crossentropy": 2.6739258766174316, "epoch": 0.3552570760488322, "grad_norm": 0.035219188779592514, "grad_norm_var": 1.3783585850803542e-05, "learning_rate": 0.005040467953430205, "loss": 2.6739, "step": 6533 }, { "crossentropy": 2.5795743465423584, "epoch": 0.3553114549063324, "grad_norm": 0.033330097794532776, "grad_norm_var": 1.4395232467319617e-05, "learning_rate": 0.005039222811579784, "loss": 2.5796, "step": 6534 }, { "crossentropy": 2.686585783958435, "epoch": 0.35536583376383263, "grad_norm": 0.033266447484493256, "grad_norm_var": 1.4565787251675083e-05, "learning_rate": 0.005037977667296801, "loss": 2.6866, "step": 6535 }, { "crossentropy": 2.6083569526672363, "epoch": 0.35542021262133283, "grad_norm": 0.032667405903339386, "grad_norm_var": 1.5455364900183952e-05, "learning_rate": 0.005036732520658479, "loss": 2.6084, "step": 6536 }, { "crossentropy": 2.4870325326919556, "epoch": 0.35547459147883304, "grad_norm": 0.05241341516375542, "grad_norm_var": 2.9709351368480074e-05, "learning_rate": 0.005035487371742039, "loss": 2.487, "step": 6537 }, { "crossentropy": 2.5764962434768677, "epoch": 0.35552897033633324, "grad_norm": 0.03348414972424507, "grad_norm_var": 3.0864715547634695e-05, "learning_rate": 0.005034242220624706, "loss": 2.5765, "step": 6538 }, { "crossentropy": 2.5565152168273926, "epoch": 0.35558334919383344, "grad_norm": 0.03791145235300064, "grad_norm_var": 3.0406896237527894e-05, "learning_rate": 0.005032997067383702, "loss": 2.5565, "step": 6539 }, { "crossentropy": 2.6018823385238647, "epoch": 0.35563772805133365, "grad_norm": 0.03336310386657715, "grad_norm_var": 2.2718639631584028e-05, "learning_rate": 0.005031751912096253, "loss": 2.6019, "step": 6540 }, { "crossentropy": 2.557567834854126, "epoch": 0.35569210690883385, "grad_norm": 0.03184296935796738, "grad_norm_var": 2.418006714459663e-05, "learning_rate": 0.005030506754839577, "loss": 2.5576, "step": 6541 }, { "crossentropy": 2.6605266332626343, "epoch": 0.35574648576633405, "grad_norm": 0.03427058458328247, "grad_norm_var": 2.437994581379974e-05, "learning_rate": 0.005029261595690901, "loss": 2.6605, "step": 6542 }, { "crossentropy": 2.508273482322693, "epoch": 0.35580086462383426, "grad_norm": 0.035306964069604874, "grad_norm_var": 2.3728373382808413e-05, "learning_rate": 0.005028016434727449, "loss": 2.5083, "step": 6543 }, { "crossentropy": 2.57875394821167, "epoch": 0.35585524348133446, "grad_norm": 0.03436167538166046, "grad_norm_var": 2.3643471333098463e-05, "learning_rate": 0.005026771272026445, "loss": 2.5788, "step": 6544 }, { "crossentropy": 2.6724905967712402, "epoch": 0.35590962233883466, "grad_norm": 0.03327414020895958, "grad_norm_var": 2.384776753622415e-05, "learning_rate": 0.005025526107665109, "loss": 2.6725, "step": 6545 }, { "crossentropy": 2.7241241931915283, "epoch": 0.35596400119633487, "grad_norm": 0.03588416799902916, "grad_norm_var": 2.379904435795277e-05, "learning_rate": 0.00502428094172067, "loss": 2.7241, "step": 6546 }, { "crossentropy": 2.5978375673294067, "epoch": 0.35601838005383507, "grad_norm": 0.036014288663864136, "grad_norm_var": 2.2967897940664948e-05, "learning_rate": 0.005023035774270348, "loss": 2.5978, "step": 6547 }, { "crossentropy": 2.6511244773864746, "epoch": 0.3560727589113353, "grad_norm": 0.034549858421087265, "grad_norm_var": 2.2716795965036355e-05, "learning_rate": 0.005021790605391369, "loss": 2.6511, "step": 6548 }, { "crossentropy": 2.5965991020202637, "epoch": 0.3561271377688355, "grad_norm": 0.036138471215963364, "grad_norm_var": 2.274162983607926e-05, "learning_rate": 0.005020545435160957, "loss": 2.5966, "step": 6549 }, { "crossentropy": 2.542773723602295, "epoch": 0.3561815166263357, "grad_norm": 0.03445844724774361, "grad_norm_var": 2.249400408379116e-05, "learning_rate": 0.005019300263656336, "loss": 2.5428, "step": 6550 }, { "crossentropy": 2.6086199283599854, "epoch": 0.3562358954838359, "grad_norm": 0.030955467373132706, "grad_norm_var": 2.3539274499155952e-05, "learning_rate": 0.005018055090954733, "loss": 2.6086, "step": 6551 }, { "crossentropy": 2.683761239051819, "epoch": 0.3562902743413361, "grad_norm": 0.03509246185421944, "grad_norm_var": 2.3013236466060127e-05, "learning_rate": 0.0050168099171333655, "loss": 2.6838, "step": 6552 }, { "crossentropy": 2.628469228744507, "epoch": 0.3563446531988363, "grad_norm": 0.03690740838646889, "grad_norm_var": 3.243339578515997e-06, "learning_rate": 0.005015564742269467, "loss": 2.6285, "step": 6553 }, { "crossentropy": 2.5095112323760986, "epoch": 0.3563990320563365, "grad_norm": 0.035318732261657715, "grad_norm_var": 3.177449842078385e-06, "learning_rate": 0.005014319566440256, "loss": 2.5095, "step": 6554 }, { "crossentropy": 2.6549521684646606, "epoch": 0.3564534109138367, "grad_norm": 0.03395407646894455, "grad_norm_var": 2.4765746548508284e-06, "learning_rate": 0.005013074389722958, "loss": 2.655, "step": 6555 }, { "crossentropy": 2.608888864517212, "epoch": 0.3565077897713369, "grad_norm": 0.03507011756300926, "grad_norm_var": 2.404303211794856e-06, "learning_rate": 0.0050118292121948, "loss": 2.6089, "step": 6556 }, { "crossentropy": 2.5102561712265015, "epoch": 0.3565621686288371, "grad_norm": 0.034953244030475616, "grad_norm_var": 1.8707547384394507e-06, "learning_rate": 0.005010584033933006, "loss": 2.5103, "step": 6557 }, { "crossentropy": 2.64435350894928, "epoch": 0.3566165474863373, "grad_norm": 0.0334828719496727, "grad_norm_var": 1.9632361214714142e-06, "learning_rate": 0.005009338855014798, "loss": 2.6444, "step": 6558 }, { "crossentropy": 2.636600375175476, "epoch": 0.3566709263438375, "grad_norm": 0.034834060817956924, "grad_norm_var": 1.9410007716359757e-06, "learning_rate": 0.005008093675517403, "loss": 2.6366, "step": 6559 }, { "crossentropy": 2.5874850749969482, "epoch": 0.3567253052013377, "grad_norm": 0.03676118329167366, "grad_norm_var": 2.191621886128645e-06, "learning_rate": 0.005006848495518047, "loss": 2.5875, "step": 6560 }, { "crossentropy": 2.528991460800171, "epoch": 0.3567796840588379, "grad_norm": 0.03249054029583931, "grad_norm_var": 2.394964455629198e-06, "learning_rate": 0.005005603315093953, "loss": 2.529, "step": 6561 }, { "crossentropy": 2.704303503036499, "epoch": 0.3568340629163381, "grad_norm": 0.03786451742053032, "grad_norm_var": 2.92526755290608e-06, "learning_rate": 0.005004358134322346, "loss": 2.7043, "step": 6562 }, { "crossentropy": 2.571007013320923, "epoch": 0.3568884417738383, "grad_norm": 0.03464889153838158, "grad_norm_var": 2.843999196573225e-06, "learning_rate": 0.005003112953280452, "loss": 2.571, "step": 6563 }, { "crossentropy": 2.6538007259368896, "epoch": 0.35694282063133853, "grad_norm": 0.03407004103064537, "grad_norm_var": 2.877111577928582e-06, "learning_rate": 0.005001867772045497, "loss": 2.6538, "step": 6564 }, { "crossentropy": 2.7015455961227417, "epoch": 0.35699719948883873, "grad_norm": 0.03599414974451065, "grad_norm_var": 2.8528985293309213e-06, "learning_rate": 0.005000622590694704, "loss": 2.7015, "step": 6565 }, { "crossentropy": 2.628798723220825, "epoch": 0.35705157834633894, "grad_norm": 0.03866453841328621, "grad_norm_var": 3.76508152706839e-06, "learning_rate": 0.004999377409305298, "loss": 2.6288, "step": 6566 }, { "crossentropy": 2.628261923789978, "epoch": 0.35710595720383914, "grad_norm": 0.03384804725646973, "grad_norm_var": 2.7025290584715976e-06, "learning_rate": 0.004998132227954503, "loss": 2.6283, "step": 6567 }, { "crossentropy": 2.6028599739074707, "epoch": 0.35716033606133935, "grad_norm": 0.03159419074654579, "grad_norm_var": 3.539564058533534e-06, "learning_rate": 0.004996887046719547, "loss": 2.6029, "step": 6568 }, { "crossentropy": 2.6912965774536133, "epoch": 0.35721471491883955, "grad_norm": 0.03461635485291481, "grad_norm_var": 3.2936763282177815e-06, "learning_rate": 0.004995641865677655, "loss": 2.6913, "step": 6569 }, { "crossentropy": 2.6874629259109497, "epoch": 0.35726909377633975, "grad_norm": 0.034040749073028564, "grad_norm_var": 3.321906075990505e-06, "learning_rate": 0.004994396684906048, "loss": 2.6875, "step": 6570 }, { "crossentropy": 2.6709922552108765, "epoch": 0.35732347263383996, "grad_norm": 0.03470863401889801, "grad_norm_var": 3.271833829600797e-06, "learning_rate": 0.004993151504481953, "loss": 2.671, "step": 6571 }, { "crossentropy": 2.625143527984619, "epoch": 0.35737785149134016, "grad_norm": 0.032430872321128845, "grad_norm_var": 3.630652204216801e-06, "learning_rate": 0.004991906324482597, "loss": 2.6251, "step": 6572 }, { "crossentropy": 2.6959110498428345, "epoch": 0.35743223034884036, "grad_norm": 0.032530251890420914, "grad_norm_var": 3.911788415679e-06, "learning_rate": 0.004990661144985203, "loss": 2.6959, "step": 6573 }, { "crossentropy": 2.615722179412842, "epoch": 0.35748660920634057, "grad_norm": 0.033795908093452454, "grad_norm_var": 3.873947113465735e-06, "learning_rate": 0.004989415966066996, "loss": 2.6157, "step": 6574 }, { "crossentropy": 2.627171516418457, "epoch": 0.35754098806384077, "grad_norm": 0.039219971746206284, "grad_norm_var": 5.238929371174465e-06, "learning_rate": 0.004988170787805201, "loss": 2.6272, "step": 6575 }, { "crossentropy": 2.693425178527832, "epoch": 0.357595366921341, "grad_norm": 0.035503558814525604, "grad_norm_var": 5.013941323159186e-06, "learning_rate": 0.004986925610277041, "loss": 2.6934, "step": 6576 }, { "crossentropy": 2.615676164627075, "epoch": 0.3576497457788412, "grad_norm": 0.033386196941137314, "grad_norm_var": 4.794093833483612e-06, "learning_rate": 0.004985680433559746, "loss": 2.6157, "step": 6577 }, { "crossentropy": 2.5633933544158936, "epoch": 0.3577041246363414, "grad_norm": 0.03319932147860527, "grad_norm_var": 4.252680922903611e-06, "learning_rate": 0.004984435257730534, "loss": 2.5634, "step": 6578 }, { "crossentropy": 2.5354808568954468, "epoch": 0.3577585034938416, "grad_norm": 0.03384294733405113, "grad_norm_var": 4.2789681019713675e-06, "learning_rate": 0.004983190082866633, "loss": 2.5355, "step": 6579 }, { "crossentropy": 2.6466814279556274, "epoch": 0.3578128823513418, "grad_norm": 0.033202677965164185, "grad_norm_var": 4.371705839984844e-06, "learning_rate": 0.00498194490904527, "loss": 2.6467, "step": 6580 }, { "crossentropy": 2.6207467317581177, "epoch": 0.357867261208842, "grad_norm": 0.032078780233860016, "grad_norm_var": 4.5034334259721105e-06, "learning_rate": 0.004980699736343665, "loss": 2.6207, "step": 6581 }, { "crossentropy": 2.509933829307556, "epoch": 0.3579216400663422, "grad_norm": 0.031530849635601044, "grad_norm_var": 3.405621285522351e-06, "learning_rate": 0.004979454564839043, "loss": 2.5099, "step": 6582 }, { "crossentropy": 2.6131175756454468, "epoch": 0.3579760189238424, "grad_norm": 0.03354950621724129, "grad_norm_var": 3.4061178938446976e-06, "learning_rate": 0.004978209394608631, "loss": 2.6131, "step": 6583 }, { "crossentropy": 2.6310046911239624, "epoch": 0.3580303977813426, "grad_norm": 0.036187104880809784, "grad_norm_var": 3.4337954293472393e-06, "learning_rate": 0.004976964225729652, "loss": 2.631, "step": 6584 }, { "crossentropy": 2.6078070402145386, "epoch": 0.3580847766388428, "grad_norm": 0.0357840433716774, "grad_norm_var": 3.616691027354587e-06, "learning_rate": 0.004975719058279331, "loss": 2.6078, "step": 6585 }, { "crossentropy": 2.734888195991516, "epoch": 0.358139155496343, "grad_norm": 0.03536843881011009, "grad_norm_var": 3.723108503980663e-06, "learning_rate": 0.004974473892334892, "loss": 2.7349, "step": 6586 }, { "crossentropy": 2.65253484249115, "epoch": 0.3581935343538432, "grad_norm": 0.032720427960157394, "grad_norm_var": 3.8207371239586115e-06, "learning_rate": 0.004973228727973555, "loss": 2.6525, "step": 6587 }, { "crossentropy": 2.6045433282852173, "epoch": 0.3582479132113434, "grad_norm": 0.03343115374445915, "grad_norm_var": 3.6712384810468878e-06, "learning_rate": 0.004971983565272553, "loss": 2.6045, "step": 6588 }, { "crossentropy": 2.5607701539993286, "epoch": 0.3583022920688436, "grad_norm": 0.03455571085214615, "grad_norm_var": 3.5082537369388507e-06, "learning_rate": 0.004970738404309099, "loss": 2.5608, "step": 6589 }, { "crossentropy": 2.6615257263183594, "epoch": 0.3583566709263438, "grad_norm": 0.035174861550331116, "grad_norm_var": 3.551002240324147e-06, "learning_rate": 0.004969493245160423, "loss": 2.6615, "step": 6590 }, { "crossentropy": 2.682238817214966, "epoch": 0.358411049783844, "grad_norm": 0.03423059359192848, "grad_norm_var": 1.8311777168822373e-06, "learning_rate": 0.004968248087903749, "loss": 2.6822, "step": 6591 }, { "crossentropy": 2.690288782119751, "epoch": 0.35846542864134423, "grad_norm": 0.033278025686740875, "grad_norm_var": 1.6898699169089668e-06, "learning_rate": 0.004967002932616298, "loss": 2.6903, "step": 6592 }, { "crossentropy": 2.6258450746536255, "epoch": 0.35851980749884443, "grad_norm": 0.03499852865934372, "grad_norm_var": 1.7537047992607334e-06, "learning_rate": 0.004965757779375294, "loss": 2.6258, "step": 6593 }, { "crossentropy": 2.6449437141418457, "epoch": 0.35857418635634464, "grad_norm": 0.03630112484097481, "grad_norm_var": 2.0463004152227253e-06, "learning_rate": 0.004964512628257961, "loss": 2.6449, "step": 6594 }, { "crossentropy": 2.571548104286194, "epoch": 0.35862856521384484, "grad_norm": 0.03562459722161293, "grad_norm_var": 2.1742044098576582e-06, "learning_rate": 0.004963267479341523, "loss": 2.5715, "step": 6595 }, { "crossentropy": 2.658941149711609, "epoch": 0.35868294407134504, "grad_norm": 0.03264658525586128, "grad_norm_var": 2.2712623896257536e-06, "learning_rate": 0.004962022332703199, "loss": 2.6589, "step": 6596 }, { "crossentropy": 2.651099920272827, "epoch": 0.35873732292884525, "grad_norm": 0.03412926569581032, "grad_norm_var": 1.949657281429852e-06, "learning_rate": 0.004960777188420217, "loss": 2.6511, "step": 6597 }, { "crossentropy": 2.64865779876709, "epoch": 0.35879170178634545, "grad_norm": 0.0333808958530426, "grad_norm_var": 1.4695413475091304e-06, "learning_rate": 0.004959532046569795, "loss": 2.6487, "step": 6598 }, { "crossentropy": 2.5498634576797485, "epoch": 0.35884608064384566, "grad_norm": 0.03306916356086731, "grad_norm_var": 1.5422785737628787e-06, "learning_rate": 0.004958286907229158, "loss": 2.5499, "step": 6599 }, { "crossentropy": 2.5947755575180054, "epoch": 0.35890045950134586, "grad_norm": 0.03537115082144737, "grad_norm_var": 1.3927311861643954e-06, "learning_rate": 0.00495704177047553, "loss": 2.5948, "step": 6600 }, { "crossentropy": 2.6560516357421875, "epoch": 0.35895483835884606, "grad_norm": 0.035554371774196625, "grad_norm_var": 1.3530026133490649e-06, "learning_rate": 0.0049557966363861315, "loss": 2.6561, "step": 6601 }, { "crossentropy": 2.674581289291382, "epoch": 0.35900921721634627, "grad_norm": 0.03215906023979187, "grad_norm_var": 1.5672343615156144e-06, "learning_rate": 0.004954551505038184, "loss": 2.6746, "step": 6602 }, { "crossentropy": 2.60366153717041, "epoch": 0.35906359607384647, "grad_norm": 0.03456873446702957, "grad_norm_var": 1.4249706541277196e-06, "learning_rate": 0.0049533063765089085, "loss": 2.6037, "step": 6603 }, { "crossentropy": 2.6917725801467896, "epoch": 0.3591179749313467, "grad_norm": 0.04287702217698097, "grad_norm_var": 5.932905045551104e-06, "learning_rate": 0.004952061250875528, "loss": 2.6918, "step": 6604 }, { "crossentropy": 2.5762388706207275, "epoch": 0.3591723537888469, "grad_norm": 0.04526923969388008, "grad_norm_var": 1.2657711006600764e-05, "learning_rate": 0.004950816128215265, "loss": 2.5762, "step": 6605 }, { "crossentropy": 2.662997245788574, "epoch": 0.3592267326463471, "grad_norm": 0.037290237843990326, "grad_norm_var": 1.2834519197424399e-05, "learning_rate": 0.004949571008605339, "loss": 2.663, "step": 6606 }, { "crossentropy": 2.539774179458618, "epoch": 0.3592811115038473, "grad_norm": 0.034680675715208054, "grad_norm_var": 1.276069266186707e-05, "learning_rate": 0.004948325892122973, "loss": 2.5398, "step": 6607 }, { "crossentropy": 2.6477832794189453, "epoch": 0.3593354903613475, "grad_norm": 0.03569720685482025, "grad_norm_var": 1.234527067495809e-05, "learning_rate": 0.004947080778845387, "loss": 2.6478, "step": 6608 }, { "crossentropy": 2.696065664291382, "epoch": 0.3593898692188477, "grad_norm": 0.03319177031517029, "grad_norm_var": 1.2754683466399917e-05, "learning_rate": 0.004945835668849801, "loss": 2.6961, "step": 6609 }, { "crossentropy": 2.661018967628479, "epoch": 0.3594442480763479, "grad_norm": 0.03265088424086571, "grad_norm_var": 1.3313471708268725e-05, "learning_rate": 0.004944590562213435, "loss": 2.661, "step": 6610 }, { "crossentropy": 2.6847355365753174, "epoch": 0.3594986269338481, "grad_norm": 0.03276168555021286, "grad_norm_var": 1.3782014489710909e-05, "learning_rate": 0.004943345459013514, "loss": 2.6847, "step": 6611 }, { "crossentropy": 2.563360571861267, "epoch": 0.3595530057913483, "grad_norm": 0.03309253603219986, "grad_norm_var": 1.3634821170168075e-05, "learning_rate": 0.0049421003593272525, "loss": 2.5634, "step": 6612 }, { "crossentropy": 2.5921376943588257, "epoch": 0.3596073846488485, "grad_norm": 0.035719435662031174, "grad_norm_var": 1.3532130975902704e-05, "learning_rate": 0.004940855263231873, "loss": 2.5921, "step": 6613 }, { "crossentropy": 2.6231689453125, "epoch": 0.3596617635063487, "grad_norm": 0.033148761838674545, "grad_norm_var": 1.3599799476957514e-05, "learning_rate": 0.004939610170804596, "loss": 2.6232, "step": 6614 }, { "crossentropy": 2.6442254781723022, "epoch": 0.3597161423638489, "grad_norm": 0.03391653671860695, "grad_norm_var": 1.337637527182208e-05, "learning_rate": 0.004938365082122638, "loss": 2.6442, "step": 6615 }, { "crossentropy": 2.659797787666321, "epoch": 0.3597705212213491, "grad_norm": 0.039885420352220535, "grad_norm_var": 1.4574391857746551e-05, "learning_rate": 0.004937119997263224, "loss": 2.6598, "step": 6616 }, { "crossentropy": 2.485016703605652, "epoch": 0.3598249000788493, "grad_norm": 0.03416324406862259, "grad_norm_var": 1.4737004124165444e-05, "learning_rate": 0.004935874916303564, "loss": 2.485, "step": 6617 }, { "crossentropy": 2.571410059928894, "epoch": 0.3598792789363495, "grad_norm": 0.0334848128259182, "grad_norm_var": 1.4222343174848048e-05, "learning_rate": 0.004934629839320885, "loss": 2.5714, "step": 6618 }, { "crossentropy": 2.636289119720459, "epoch": 0.3599336577938497, "grad_norm": 0.03289440646767616, "grad_norm_var": 1.4666820232168513e-05, "learning_rate": 0.0049333847663924025, "loss": 2.6363, "step": 6619 }, { "crossentropy": 2.5855122804641724, "epoch": 0.35998803665134993, "grad_norm": 0.03361913934350014, "grad_norm_var": 1.1127658363530949e-05, "learning_rate": 0.004932139697595333, "loss": 2.5855, "step": 6620 }, { "crossentropy": 2.570526361465454, "epoch": 0.36004241550885013, "grad_norm": 0.03285502642393112, "grad_norm_var": 3.913423636129192e-06, "learning_rate": 0.004930894633006897, "loss": 2.5705, "step": 6621 }, { "crossentropy": 2.7298752069473267, "epoch": 0.36009679436635034, "grad_norm": 0.035173069685697556, "grad_norm_var": 3.353904362214897e-06, "learning_rate": 0.004929649572704312, "loss": 2.7299, "step": 6622 }, { "crossentropy": 2.523353099822998, "epoch": 0.36015117322385054, "grad_norm": 0.04569249600172043, "grad_norm_var": 1.166276796681394e-05, "learning_rate": 0.004928404516764794, "loss": 2.5234, "step": 6623 }, { "crossentropy": 2.625874876976013, "epoch": 0.36020555208135074, "grad_norm": 0.03283541649580002, "grad_norm_var": 1.1859624573024897e-05, "learning_rate": 0.004927159465265562, "loss": 2.6259, "step": 6624 }, { "crossentropy": 2.582860589027405, "epoch": 0.36025993093885095, "grad_norm": 0.033358171582221985, "grad_norm_var": 1.1828052279584608e-05, "learning_rate": 0.004925914418283832, "loss": 2.5829, "step": 6625 }, { "crossentropy": 2.662347197532654, "epoch": 0.36031430979635115, "grad_norm": 0.032416149973869324, "grad_norm_var": 1.1895728912096152e-05, "learning_rate": 0.004924669375896819, "loss": 2.6623, "step": 6626 }, { "crossentropy": 2.6514004468917847, "epoch": 0.3603686886538514, "grad_norm": 0.03484119847416878, "grad_norm_var": 1.163175219195672e-05, "learning_rate": 0.004923424338181745, "loss": 2.6514, "step": 6627 }, { "crossentropy": 2.5991016626358032, "epoch": 0.3604230675113516, "grad_norm": 0.03968791663646698, "grad_norm_var": 1.2832667073302678e-05, "learning_rate": 0.004922179305215818, "loss": 2.5991, "step": 6628 }, { "crossentropy": 2.592136859893799, "epoch": 0.3604774463688518, "grad_norm": 0.04476151242852211, "grad_norm_var": 1.8531838613382715e-05, "learning_rate": 0.00492093427707626, "loss": 2.5921, "step": 6629 }, { "crossentropy": 2.7407249212265015, "epoch": 0.360531825226352, "grad_norm": 0.03972489386796951, "grad_norm_var": 1.8913687217358747e-05, "learning_rate": 0.004919689253840284, "loss": 2.7407, "step": 6630 }, { "crossentropy": 2.530860185623169, "epoch": 0.3605862040838522, "grad_norm": 0.033881526440382004, "grad_norm_var": 1.892445503715312e-05, "learning_rate": 0.004918444235585105, "loss": 2.5309, "step": 6631 }, { "crossentropy": 2.6003860235214233, "epoch": 0.36064058294135243, "grad_norm": 0.032585471868515015, "grad_norm_var": 1.8672441900170234e-05, "learning_rate": 0.0049171992223879385, "loss": 2.6004, "step": 6632 }, { "crossentropy": 2.609729528427124, "epoch": 0.36069496179885263, "grad_norm": 0.03470706567168236, "grad_norm_var": 1.8575986594233465e-05, "learning_rate": 0.004915954214326001, "loss": 2.6097, "step": 6633 }, { "crossentropy": 2.6677465438842773, "epoch": 0.36074934065635283, "grad_norm": 0.03599308803677559, "grad_norm_var": 1.8200807046858438e-05, "learning_rate": 0.004914709211476503, "loss": 2.6677, "step": 6634 }, { "crossentropy": 2.671694278717041, "epoch": 0.36080371951385304, "grad_norm": 0.03853067755699158, "grad_norm_var": 1.789813869068618e-05, "learning_rate": 0.004913464213916661, "loss": 2.6717, "step": 6635 }, { "crossentropy": 2.5967458486557007, "epoch": 0.36085809837135324, "grad_norm": 0.035290513187646866, "grad_norm_var": 1.74772131165401e-05, "learning_rate": 0.004912219221723687, "loss": 2.5967, "step": 6636 }, { "crossentropy": 2.6321219205856323, "epoch": 0.36091247722885345, "grad_norm": 0.03362635523080826, "grad_norm_var": 1.7150241657773863e-05, "learning_rate": 0.004910974234974794, "loss": 2.6321, "step": 6637 }, { "crossentropy": 2.6062527894973755, "epoch": 0.36096685608635365, "grad_norm": 0.03418833017349243, "grad_norm_var": 1.737773252411696e-05, "learning_rate": 0.0049097292537471976, "loss": 2.6063, "step": 6638 }, { "crossentropy": 2.6315170526504517, "epoch": 0.36102123494385385, "grad_norm": 0.03839605301618576, "grad_norm_var": 1.1647846183657912e-05, "learning_rate": 0.0049084842781181096, "loss": 2.6315, "step": 6639 }, { "crossentropy": 2.656668186187744, "epoch": 0.36107561380135406, "grad_norm": 0.03853747993707657, "grad_norm_var": 1.132985156625313e-05, "learning_rate": 0.004907239308164739, "loss": 2.6567, "step": 6640 }, { "crossentropy": 2.587744355201721, "epoch": 0.36112999265885426, "grad_norm": 0.03335036709904671, "grad_norm_var": 1.1332898839221522e-05, "learning_rate": 0.004905994343964303, "loss": 2.5877, "step": 6641 }, { "crossentropy": 2.7143590450286865, "epoch": 0.36118437151635446, "grad_norm": 0.03728863224387169, "grad_norm_var": 1.030494382448117e-05, "learning_rate": 0.004904749385594008, "loss": 2.7144, "step": 6642 }, { "crossentropy": 2.631029486656189, "epoch": 0.36123875037385467, "grad_norm": 0.03756843879818916, "grad_norm_var": 1.0135000301528945e-05, "learning_rate": 0.004903504433131066, "loss": 2.631, "step": 6643 }, { "crossentropy": 2.5739927291870117, "epoch": 0.36129312923135487, "grad_norm": 0.03532024100422859, "grad_norm_var": 9.62067822430305e-06, "learning_rate": 0.004902259486652692, "loss": 2.574, "step": 6644 }, { "crossentropy": 2.579662561416626, "epoch": 0.3613475080888551, "grad_norm": 0.033283013850450516, "grad_norm_var": 5.187604310504373e-06, "learning_rate": 0.0049010145462360915, "loss": 2.5797, "step": 6645 }, { "crossentropy": 2.606187105178833, "epoch": 0.3614018869463553, "grad_norm": 0.034039631485939026, "grad_norm_var": 4.207527327619066e-06, "learning_rate": 0.004899769611958477, "loss": 2.6062, "step": 6646 }, { "crossentropy": 2.5748919248580933, "epoch": 0.3614562658038555, "grad_norm": 0.03202790021896362, "grad_norm_var": 4.800450751140985e-06, "learning_rate": 0.004898524683897059, "loss": 2.5749, "step": 6647 }, { "crossentropy": 2.5904674530029297, "epoch": 0.3615106446613557, "grad_norm": 0.033998217433691025, "grad_norm_var": 4.414652096855429e-06, "learning_rate": 0.004897279762129044, "loss": 2.5905, "step": 6648 }, { "crossentropy": 2.6840124130249023, "epoch": 0.3615650235188559, "grad_norm": 0.03187178075313568, "grad_norm_var": 5.173033896282754e-06, "learning_rate": 0.004896034846731643, "loss": 2.684, "step": 6649 }, { "crossentropy": 2.618815064430237, "epoch": 0.3616194023763561, "grad_norm": 0.03159428760409355, "grad_norm_var": 5.921281385120232e-06, "learning_rate": 0.004894789937782064, "loss": 2.6188, "step": 6650 }, { "crossentropy": 2.6609957218170166, "epoch": 0.3616737812338563, "grad_norm": 0.03746117278933525, "grad_norm_var": 5.47959698923798e-06, "learning_rate": 0.004893545035357515, "loss": 2.661, "step": 6651 }, { "crossentropy": 2.5882084369659424, "epoch": 0.3617281600913565, "grad_norm": 0.03455359488725662, "grad_norm_var": 5.471743227653906e-06, "learning_rate": 0.004892300139535202, "loss": 2.5882, "step": 6652 }, { "crossentropy": 2.6074434518814087, "epoch": 0.3617825389488567, "grad_norm": 0.033969175070524216, "grad_norm_var": 5.424569320672661e-06, "learning_rate": 0.004891055250392337, "loss": 2.6074, "step": 6653 }, { "crossentropy": 2.641579508781433, "epoch": 0.3618369178063569, "grad_norm": 0.033021681010723114, "grad_norm_var": 5.6110864058243774e-06, "learning_rate": 0.004889810368006121, "loss": 2.6416, "step": 6654 }, { "crossentropy": 2.6463546752929688, "epoch": 0.3618912966638571, "grad_norm": 0.03575959429144859, "grad_norm_var": 4.77001783034392e-06, "learning_rate": 0.004888565492453765, "loss": 2.6464, "step": 6655 }, { "crossentropy": 2.548184394836426, "epoch": 0.3619456755213573, "grad_norm": 0.03782173618674278, "grad_norm_var": 4.426541993212257e-06, "learning_rate": 0.004887320623812473, "loss": 2.5482, "step": 6656 }, { "crossentropy": 2.5770461559295654, "epoch": 0.3620000543788575, "grad_norm": 0.03548594191670418, "grad_norm_var": 4.367693025563296e-06, "learning_rate": 0.00488607576215945, "loss": 2.577, "step": 6657 }, { "crossentropy": 2.6143229007720947, "epoch": 0.3620544332363577, "grad_norm": 0.03382496163249016, "grad_norm_var": 3.918121662164058e-06, "learning_rate": 0.004884830907571904, "loss": 2.6143, "step": 6658 }, { "crossentropy": 2.673859477043152, "epoch": 0.3621088120938579, "grad_norm": 0.033738873898983, "grad_norm_var": 3.2552266877499775e-06, "learning_rate": 0.004883586060127035, "loss": 2.6739, "step": 6659 }, { "crossentropy": 2.633709669113159, "epoch": 0.3621631909513581, "grad_norm": 0.0318206250667572, "grad_norm_var": 3.5146376875932447e-06, "learning_rate": 0.004882341219902051, "loss": 2.6337, "step": 6660 }, { "crossentropy": 2.634897232055664, "epoch": 0.36221756980885833, "grad_norm": 0.03358263149857521, "grad_norm_var": 3.490925861706663e-06, "learning_rate": 0.0048810963869741555, "loss": 2.6349, "step": 6661 }, { "crossentropy": 2.572015881538391, "epoch": 0.36227194866635853, "grad_norm": 0.03284156695008278, "grad_norm_var": 3.580013800711518e-06, "learning_rate": 0.004879851561420552, "loss": 2.572, "step": 6662 }, { "crossentropy": 2.7058310508728027, "epoch": 0.36232632752385874, "grad_norm": 0.03770112246274948, "grad_norm_var": 4.129457296462672e-06, "learning_rate": 0.0048786067433184395, "loss": 2.7058, "step": 6663 }, { "crossentropy": 2.5335354804992676, "epoch": 0.36238070638135894, "grad_norm": 0.04130527004599571, "grad_norm_var": 7.157463977412274e-06, "learning_rate": 0.004877361932745027, "loss": 2.5335, "step": 6664 }, { "crossentropy": 2.6210349798202515, "epoch": 0.36243508523885914, "grad_norm": 0.03412942215800285, "grad_norm_var": 6.602964418374729e-06, "learning_rate": 0.004876117129777511, "loss": 2.621, "step": 6665 }, { "crossentropy": 2.675740122795105, "epoch": 0.36248946409635935, "grad_norm": 0.03206195682287216, "grad_norm_var": 6.409678535986239e-06, "learning_rate": 0.004874872334493095, "loss": 2.6757, "step": 6666 }, { "crossentropy": 2.656353235244751, "epoch": 0.36254384295385955, "grad_norm": 0.03263869136571884, "grad_norm_var": 6.243671554819786e-06, "learning_rate": 0.004873627546968982, "loss": 2.6564, "step": 6667 }, { "crossentropy": 2.6137537956237793, "epoch": 0.36259822181135976, "grad_norm": 0.03499807417392731, "grad_norm_var": 6.250836070439911e-06, "learning_rate": 0.00487238276728237, "loss": 2.6138, "step": 6668 }, { "crossentropy": 2.65796959400177, "epoch": 0.36265260066885996, "grad_norm": 0.03351058438420296, "grad_norm_var": 6.306761027205452e-06, "learning_rate": 0.004871137995510461, "loss": 2.658, "step": 6669 }, { "crossentropy": 2.5408682823181152, "epoch": 0.36270697952636016, "grad_norm": 0.0316341370344162, "grad_norm_var": 6.72652103346118e-06, "learning_rate": 0.00486989323173045, "loss": 2.5409, "step": 6670 }, { "crossentropy": 2.7087600231170654, "epoch": 0.36276135838386037, "grad_norm": 0.03451322764158249, "grad_norm_var": 6.623170556598288e-06, "learning_rate": 0.004868648476019541, "loss": 2.7088, "step": 6671 }, { "crossentropy": 2.632684588432312, "epoch": 0.36281573724136057, "grad_norm": 0.03386517986655235, "grad_norm_var": 5.836317561763495e-06, "learning_rate": 0.004867403728454933, "loss": 2.6327, "step": 6672 }, { "crossentropy": 2.64142906665802, "epoch": 0.3628701160988608, "grad_norm": 0.03576217219233513, "grad_norm_var": 5.887407573134292e-06, "learning_rate": 0.00486615898911382, "loss": 2.6414, "step": 6673 }, { "crossentropy": 2.5716105699539185, "epoch": 0.362924494956361, "grad_norm": 0.0343061126768589, "grad_norm_var": 5.8748957284228406e-06, "learning_rate": 0.004864914258073403, "loss": 2.5716, "step": 6674 }, { "crossentropy": 2.5826761722564697, "epoch": 0.3629788738138612, "grad_norm": 0.034761108458042145, "grad_norm_var": 5.867050873285123e-06, "learning_rate": 0.004863669535410877, "loss": 2.5827, "step": 6675 }, { "crossentropy": 2.6797678470611572, "epoch": 0.3630332526713614, "grad_norm": 0.036429811269044876, "grad_norm_var": 5.646847570422477e-06, "learning_rate": 0.004862424821203438, "loss": 2.6798, "step": 6676 }, { "crossentropy": 2.5517624616622925, "epoch": 0.3630876315288616, "grad_norm": 0.03361174836754799, "grad_norm_var": 5.642843858143174e-06, "learning_rate": 0.0048611801155282845, "loss": 2.5518, "step": 6677 }, { "crossentropy": 2.6227495670318604, "epoch": 0.3631420103863618, "grad_norm": 0.03193735331296921, "grad_norm_var": 5.909486791777497e-06, "learning_rate": 0.004859935418462612, "loss": 2.6227, "step": 6678 }, { "crossentropy": 2.5938332080841064, "epoch": 0.363196389243862, "grad_norm": 0.034300290048122406, "grad_norm_var": 5.213853774732602e-06, "learning_rate": 0.004858690730083614, "loss": 2.5938, "step": 6679 }, { "crossentropy": 2.562049150466919, "epoch": 0.3632507681013622, "grad_norm": 0.03493858501315117, "grad_norm_var": 1.8517642855236937e-06, "learning_rate": 0.004857446050468482, "loss": 2.562, "step": 6680 }, { "crossentropy": 2.6664308309555054, "epoch": 0.3633051469588624, "grad_norm": 0.03711479902267456, "grad_norm_var": 2.4752758230920173e-06, "learning_rate": 0.004856201379694418, "loss": 2.6664, "step": 6681 }, { "crossentropy": 2.608373761177063, "epoch": 0.3633595258163626, "grad_norm": 0.04046887159347534, "grad_norm_var": 4.553138566445493e-06, "learning_rate": 0.004854956717838607, "loss": 2.6084, "step": 6682 }, { "crossentropy": 2.604282855987549, "epoch": 0.3634139046738628, "grad_norm": 0.03442903235554695, "grad_norm_var": 4.267517603587928e-06, "learning_rate": 0.004853712064978247, "loss": 2.6043, "step": 6683 }, { "crossentropy": 2.4932360649108887, "epoch": 0.363468283531363, "grad_norm": 0.03580979257822037, "grad_norm_var": 4.331616215047518e-06, "learning_rate": 0.004852467421190527, "loss": 2.4932, "step": 6684 }, { "crossentropy": 2.608092427253723, "epoch": 0.3635226623888632, "grad_norm": 0.03290456905961037, "grad_norm_var": 4.461750787361033e-06, "learning_rate": 0.0048512227865526395, "loss": 2.6081, "step": 6685 }, { "crossentropy": 2.69973361492157, "epoch": 0.3635770412463634, "grad_norm": 0.036327581852674484, "grad_norm_var": 3.857870203799867e-06, "learning_rate": 0.004849978161141778, "loss": 2.6997, "step": 6686 }, { "crossentropy": 2.5586782693862915, "epoch": 0.3636314201038636, "grad_norm": 0.03668643906712532, "grad_norm_var": 3.9851930950571735e-06, "learning_rate": 0.004848733545035129, "loss": 2.5587, "step": 6687 }, { "crossentropy": 2.5276739597320557, "epoch": 0.3636857989613638, "grad_norm": 0.03343548998236656, "grad_norm_var": 4.0748308557711966e-06, "learning_rate": 0.004847488938309885, "loss": 2.5277, "step": 6688 }, { "crossentropy": 2.58281409740448, "epoch": 0.36374017781886403, "grad_norm": 0.03411255404353142, "grad_norm_var": 4.121585678999034e-06, "learning_rate": 0.004846244341043237, "loss": 2.5828, "step": 6689 }, { "crossentropy": 2.611216425895691, "epoch": 0.36379455667636423, "grad_norm": 0.03303644061088562, "grad_norm_var": 4.3564631224817775e-06, "learning_rate": 0.00484499975331237, "loss": 2.6112, "step": 6690 }, { "crossentropy": 2.644786238670349, "epoch": 0.36384893553386444, "grad_norm": 0.03312360495328903, "grad_norm_var": 4.580364538630567e-06, "learning_rate": 0.004843755175194472, "loss": 2.6448, "step": 6691 }, { "crossentropy": 2.606232166290283, "epoch": 0.36390331439136464, "grad_norm": 0.032818105071783066, "grad_norm_var": 4.666978649881838e-06, "learning_rate": 0.004842510606766736, "loss": 2.6062, "step": 6692 }, { "crossentropy": 2.6726995706558228, "epoch": 0.36395769324886484, "grad_norm": 0.03321095183491707, "grad_norm_var": 4.734690742628072e-06, "learning_rate": 0.004841266048106342, "loss": 2.6727, "step": 6693 }, { "crossentropy": 2.598579525947571, "epoch": 0.36401207210636505, "grad_norm": 0.03220078721642494, "grad_norm_var": 4.64318905493861e-06, "learning_rate": 0.004840021499290481, "loss": 2.5986, "step": 6694 }, { "crossentropy": 2.436154842376709, "epoch": 0.36406645096386525, "grad_norm": 0.03364355489611626, "grad_norm_var": 4.703601941921133e-06, "learning_rate": 0.004838776960396339, "loss": 2.4362, "step": 6695 }, { "crossentropy": 2.633998394012451, "epoch": 0.36412082982136545, "grad_norm": 0.032784562557935715, "grad_norm_var": 4.908215539004407e-06, "learning_rate": 0.004837532431501098, "loss": 2.634, "step": 6696 }, { "crossentropy": 2.606085777282715, "epoch": 0.36417520867886566, "grad_norm": 0.033963270485401154, "grad_norm_var": 4.433039000995414e-06, "learning_rate": 0.0048362879126819455, "loss": 2.6061, "step": 6697 }, { "crossentropy": 2.644289493560791, "epoch": 0.36422958753636586, "grad_norm": 0.0338851660490036, "grad_norm_var": 1.7354454402780545e-06, "learning_rate": 0.004835043404016062, "loss": 2.6443, "step": 6698 }, { "crossentropy": 2.5919809341430664, "epoch": 0.36428396639386607, "grad_norm": 0.03421016409993172, "grad_norm_var": 1.7229496986366383e-06, "learning_rate": 0.004833798905580633, "loss": 2.592, "step": 6699 }, { "crossentropy": 2.606292486190796, "epoch": 0.36433834525136627, "grad_norm": 0.0319613441824913, "grad_norm_var": 1.66072401067089e-06, "learning_rate": 0.004832554417452843, "loss": 2.6063, "step": 6700 }, { "crossentropy": 2.7168833017349243, "epoch": 0.3643927241088665, "grad_norm": 0.03305654227733612, "grad_norm_var": 1.6471835998566053e-06, "learning_rate": 0.0048313099397098705, "loss": 2.7169, "step": 6701 }, { "crossentropy": 2.5860854387283325, "epoch": 0.3644471029663667, "grad_norm": 0.03349432349205017, "grad_norm_var": 1.138724392213813e-06, "learning_rate": 0.004830065472428897, "loss": 2.5861, "step": 6702 }, { "crossentropy": 2.7339630126953125, "epoch": 0.3645014818238669, "grad_norm": 0.03343992680311203, "grad_norm_var": 4.079647208626395e-07, "learning_rate": 0.0048288210156871075, "loss": 2.734, "step": 6703 }, { "crossentropy": 2.638368606567383, "epoch": 0.3645558606813671, "grad_norm": 0.03347347676753998, "grad_norm_var": 4.0887512230177124e-07, "learning_rate": 0.004827576569561676, "loss": 2.6384, "step": 6704 }, { "crossentropy": 2.611401677131653, "epoch": 0.3646102395388673, "grad_norm": 0.03428013622760773, "grad_norm_var": 4.2932427649725973e-07, "learning_rate": 0.004826332134129786, "loss": 2.6114, "step": 6705 }, { "crossentropy": 2.563820004463196, "epoch": 0.3646646183963675, "grad_norm": 0.03624198958277702, "grad_norm_var": 9.647126492669129e-07, "learning_rate": 0.004825087709468616, "loss": 2.5638, "step": 6706 }, { "crossentropy": 2.591017007827759, "epoch": 0.3647189972538677, "grad_norm": 0.03702574968338013, "grad_norm_var": 1.7274472624691113e-06, "learning_rate": 0.004823843295655343, "loss": 2.591, "step": 6707 }, { "crossentropy": 2.681854724884033, "epoch": 0.3647733761113679, "grad_norm": 0.04056796804070473, "grad_norm_var": 4.538296869702601e-06, "learning_rate": 0.004822598892767142, "loss": 2.6819, "step": 6708 }, { "crossentropy": 2.7004224061965942, "epoch": 0.3648277549688681, "grad_norm": 0.03591274097561836, "grad_norm_var": 4.632830979384566e-06, "learning_rate": 0.004821354500881196, "loss": 2.7004, "step": 6709 }, { "crossentropy": 2.5421870946884155, "epoch": 0.3648821338263683, "grad_norm": 0.03544840216636658, "grad_norm_var": 4.3467162173125205e-06, "learning_rate": 0.004820110120074676, "loss": 2.5422, "step": 6710 }, { "crossentropy": 2.5939546823501587, "epoch": 0.3649365126838685, "grad_norm": 0.03255339339375496, "grad_norm_var": 4.55810444871992e-06, "learning_rate": 0.004818865750424759, "loss": 2.594, "step": 6711 }, { "crossentropy": 2.6474549770355225, "epoch": 0.3649908915413687, "grad_norm": 0.03345372527837753, "grad_norm_var": 4.431368177573667e-06, "learning_rate": 0.0048176213920086185, "loss": 2.6475, "step": 6712 }, { "crossentropy": 2.638548970222473, "epoch": 0.3650452703988689, "grad_norm": 0.03435014560818672, "grad_norm_var": 4.4099145711349225e-06, "learning_rate": 0.0048163770449034275, "loss": 2.6385, "step": 6713 }, { "crossentropy": 2.591763734817505, "epoch": 0.3650996492563691, "grad_norm": 0.03207111731171608, "grad_norm_var": 4.784786281990654e-06, "learning_rate": 0.004815132709186365, "loss": 2.5918, "step": 6714 }, { "crossentropy": 2.6828819513320923, "epoch": 0.3651540281138693, "grad_norm": 0.03568669408559799, "grad_norm_var": 4.86963082767707e-06, "learning_rate": 0.004813888384934597, "loss": 2.6829, "step": 6715 }, { "crossentropy": 2.7277785539627075, "epoch": 0.3652084069713695, "grad_norm": 0.03478330746293068, "grad_norm_var": 4.388216958818522e-06, "learning_rate": 0.004812644072225297, "loss": 2.7278, "step": 6716 }, { "crossentropy": 2.6478054523468018, "epoch": 0.36526278582886973, "grad_norm": 0.032820284366607666, "grad_norm_var": 4.444735554823218e-06, "learning_rate": 0.00481139977113564, "loss": 2.6478, "step": 6717 }, { "crossentropy": 2.648166060447693, "epoch": 0.36531716468636993, "grad_norm": 0.032988421618938446, "grad_norm_var": 4.54375939379862e-06, "learning_rate": 0.0048101554817427915, "loss": 2.6482, "step": 6718 }, { "crossentropy": 2.654443383216858, "epoch": 0.36537154354387014, "grad_norm": 0.03464266657829285, "grad_norm_var": 4.433126355715631e-06, "learning_rate": 0.004808911204123922, "loss": 2.6544, "step": 6719 }, { "crossentropy": 2.582178473472595, "epoch": 0.36542592240137034, "grad_norm": 0.03270164504647255, "grad_norm_var": 4.603658257217499e-06, "learning_rate": 0.004807666938356203, "loss": 2.5822, "step": 6720 }, { "crossentropy": 2.600250005722046, "epoch": 0.36548030125887054, "grad_norm": 0.03653162717819214, "grad_norm_var": 4.788280033792466e-06, "learning_rate": 0.0048064226845168, "loss": 2.6003, "step": 6721 }, { "crossentropy": 2.611547350883484, "epoch": 0.36553468011637075, "grad_norm": 0.033942487090826035, "grad_norm_var": 4.695424470268124e-06, "learning_rate": 0.004805178442682883, "loss": 2.6115, "step": 6722 }, { "crossentropy": 2.687898278236389, "epoch": 0.36558905897387095, "grad_norm": 0.035245101898908615, "grad_norm_var": 4.3455752482480955e-06, "learning_rate": 0.004803934212931617, "loss": 2.6879, "step": 6723 }, { "crossentropy": 2.6186643838882446, "epoch": 0.36564343783137115, "grad_norm": 0.034483980387449265, "grad_norm_var": 1.8228572130869643e-06, "learning_rate": 0.004802689995340167, "loss": 2.6187, "step": 6724 }, { "crossentropy": 2.653476119041443, "epoch": 0.36569781668887136, "grad_norm": 0.03551167994737625, "grad_norm_var": 1.742711324718288e-06, "learning_rate": 0.004801445789985702, "loss": 2.6535, "step": 6725 }, { "crossentropy": 2.565632939338684, "epoch": 0.36575219554637156, "grad_norm": 0.03460415452718735, "grad_norm_var": 1.6468336524716465e-06, "learning_rate": 0.0048002015969453825, "loss": 2.5656, "step": 6726 }, { "crossentropy": 2.585612416267395, "epoch": 0.36580657440387176, "grad_norm": 0.032220177352428436, "grad_norm_var": 1.7246264299529297e-06, "learning_rate": 0.004798957416296373, "loss": 2.5856, "step": 6727 }, { "crossentropy": 2.597899079322815, "epoch": 0.36586095326137197, "grad_norm": 0.03268483281135559, "grad_norm_var": 1.8306330243170536e-06, "learning_rate": 0.004797713248115839, "loss": 2.5979, "step": 6728 }, { "crossentropy": 2.5495474338531494, "epoch": 0.36591533211887217, "grad_norm": 0.0352974496781826, "grad_norm_var": 1.920933103194353e-06, "learning_rate": 0.00479646909248094, "loss": 2.5495, "step": 6729 }, { "crossentropy": 2.716167449951172, "epoch": 0.3659697109763724, "grad_norm": 0.03840694576501846, "grad_norm_var": 2.6833953077951463e-06, "learning_rate": 0.004795224949468836, "loss": 2.7162, "step": 6730 }, { "crossentropy": 2.6360461711883545, "epoch": 0.3660240898338726, "grad_norm": 0.03893667086958885, "grad_norm_var": 3.842837402263874e-06, "learning_rate": 0.004793980819156692, "loss": 2.636, "step": 6731 }, { "crossentropy": 2.6595640182495117, "epoch": 0.3660784686913728, "grad_norm": 0.03450227901339531, "grad_norm_var": 3.846060392363679e-06, "learning_rate": 0.004792736701621665, "loss": 2.6596, "step": 6732 }, { "crossentropy": 2.655771493911743, "epoch": 0.366132847548873, "grad_norm": 0.03396899998188019, "grad_norm_var": 3.637563844885392e-06, "learning_rate": 0.004791492596940915, "loss": 2.6558, "step": 6733 }, { "crossentropy": 2.605555534362793, "epoch": 0.3661872264063732, "grad_norm": 0.03708365187048912, "grad_norm_var": 3.70103489863415e-06, "learning_rate": 0.0047902485051916, "loss": 2.6056, "step": 6734 }, { "crossentropy": 2.650282382965088, "epoch": 0.3662416052638734, "grad_norm": 0.03387274220585823, "grad_norm_var": 3.7796705953311096e-06, "learning_rate": 0.004789004426450875, "loss": 2.6503, "step": 6735 }, { "crossentropy": 2.574069023132324, "epoch": 0.3662959841213736, "grad_norm": 0.03276989236474037, "grad_norm_var": 3.7590506633932685e-06, "learning_rate": 0.0047877603607958995, "loss": 2.5741, "step": 6736 }, { "crossentropy": 2.6930612325668335, "epoch": 0.3663503629788738, "grad_norm": 0.03311091661453247, "grad_norm_var": 3.7935989084913446e-06, "learning_rate": 0.004786516308303831, "loss": 2.6931, "step": 6737 }, { "crossentropy": 2.5989595651626587, "epoch": 0.366404741836374, "grad_norm": 0.031574051827192307, "grad_norm_var": 4.411867751165308e-06, "learning_rate": 0.00478527226905182, "loss": 2.599, "step": 6738 }, { "crossentropy": 2.6314443349838257, "epoch": 0.3664591206938742, "grad_norm": 0.03265194222331047, "grad_norm_var": 4.623655300629801e-06, "learning_rate": 0.004784028243117023, "loss": 2.6314, "step": 6739 }, { "crossentropy": 2.5692538022994995, "epoch": 0.3665134995513744, "grad_norm": 0.03558990731835365, "grad_norm_var": 4.700680998502975e-06, "learning_rate": 0.004782784230576593, "loss": 2.5693, "step": 6740 }, { "crossentropy": 2.467837333679199, "epoch": 0.3665678784088746, "grad_norm": 0.03157592937350273, "grad_norm_var": 5.16370711462095e-06, "learning_rate": 0.004781540231507682, "loss": 2.4678, "step": 6741 }, { "crossentropy": 2.5244529247283936, "epoch": 0.3666222572663748, "grad_norm": 0.03216193988919258, "grad_norm_var": 5.4384701239547245e-06, "learning_rate": 0.004780296245987443, "loss": 2.5245, "step": 6742 }, { "crossentropy": 2.6169224977493286, "epoch": 0.366676636123875, "grad_norm": 0.03479667752981186, "grad_norm_var": 5.190229910205917e-06, "learning_rate": 0.004779052274093026, "loss": 2.6169, "step": 6743 }, { "crossentropy": 2.5911999940872192, "epoch": 0.3667310149813752, "grad_norm": 0.0336202010512352, "grad_norm_var": 5.042034511410811e-06, "learning_rate": 0.00477780831590158, "loss": 2.5912, "step": 6744 }, { "crossentropy": 2.6144031286239624, "epoch": 0.36678539383887543, "grad_norm": 0.03180990740656853, "grad_norm_var": 5.370955369398724e-06, "learning_rate": 0.004776564371490257, "loss": 2.6144, "step": 6745 }, { "crossentropy": 2.5180580615997314, "epoch": 0.36683977269637563, "grad_norm": 0.03342298045754433, "grad_norm_var": 4.095943362423387e-06, "learning_rate": 0.004775320440936202, "loss": 2.5181, "step": 6746 }, { "crossentropy": 2.592197299003601, "epoch": 0.36689415155387584, "grad_norm": 0.03429805859923363, "grad_norm_var": 2.2888771745034618e-06, "learning_rate": 0.004774076524316563, "loss": 2.5922, "step": 6747 }, { "crossentropy": 2.5418344736099243, "epoch": 0.36694853041137604, "grad_norm": 0.032877352088689804, "grad_norm_var": 2.2477200158389776e-06, "learning_rate": 0.004772832621708488, "loss": 2.5418, "step": 6748 }, { "crossentropy": 2.5449397563934326, "epoch": 0.36700290926887624, "grad_norm": 0.034103959798812866, "grad_norm_var": 2.2582143197315275e-06, "learning_rate": 0.0047715887331891225, "loss": 2.5449, "step": 6749 }, { "crossentropy": 2.7722790241241455, "epoch": 0.36705728812637645, "grad_norm": 0.03536194562911987, "grad_norm_var": 1.611060489682623e-06, "learning_rate": 0.004770344858835611, "loss": 2.7723, "step": 6750 }, { "crossentropy": 2.5406148433685303, "epoch": 0.36711166698387665, "grad_norm": 0.0344536155462265, "grad_norm_var": 1.6726428464857815e-06, "learning_rate": 0.0047691009987250975, "loss": 2.5406, "step": 6751 }, { "crossentropy": 2.7005953788757324, "epoch": 0.36716604584137685, "grad_norm": 0.03468100354075432, "grad_norm_var": 1.7438689079154098e-06, "learning_rate": 0.0047678571529347225, "loss": 2.7006, "step": 6752 }, { "crossentropy": 2.6145612001419067, "epoch": 0.36722042469887706, "grad_norm": 0.0341707207262516, "grad_norm_var": 1.7582893778737724e-06, "learning_rate": 0.004766613321541634, "loss": 2.6146, "step": 6753 }, { "crossentropy": 2.5284963846206665, "epoch": 0.36727480355637726, "grad_norm": 0.033286262303590775, "grad_norm_var": 1.4854231665074765e-06, "learning_rate": 0.004765369504622967, "loss": 2.5285, "step": 6754 }, { "crossentropy": 2.5799057483673096, "epoch": 0.36732918241387746, "grad_norm": 0.032556165009737015, "grad_norm_var": 1.4991110528596193e-06, "learning_rate": 0.004764125702255865, "loss": 2.5799, "step": 6755 }, { "crossentropy": 2.6210325956344604, "epoch": 0.36738356127137767, "grad_norm": 0.03654763475060463, "grad_norm_var": 1.801232927866175e-06, "learning_rate": 0.004762881914517468, "loss": 2.621, "step": 6756 }, { "crossentropy": 2.5375423431396484, "epoch": 0.36743794012887787, "grad_norm": 0.033613238483667374, "grad_norm_var": 1.4747598236830005e-06, "learning_rate": 0.004761638141484913, "loss": 2.5375, "step": 6757 }, { "crossentropy": 2.6772271394729614, "epoch": 0.3674923189863781, "grad_norm": 0.03475054353475571, "grad_norm_var": 1.307447643711379e-06, "learning_rate": 0.004760394383235337, "loss": 2.6772, "step": 6758 }, { "crossentropy": 2.526787042617798, "epoch": 0.3675466978438783, "grad_norm": 0.032905347645282745, "grad_norm_var": 1.3356347652139963e-06, "learning_rate": 0.0047591506398458805, "loss": 2.5268, "step": 6759 }, { "crossentropy": 2.6259323358535767, "epoch": 0.3676010767013785, "grad_norm": 0.03434544801712036, "grad_norm_var": 1.3410960712387695e-06, "learning_rate": 0.004757906911393674, "loss": 2.6259, "step": 6760 }, { "crossentropy": 2.5176162719726562, "epoch": 0.3676554555588787, "grad_norm": 0.034645311534404755, "grad_norm_var": 1.03486925748051e-06, "learning_rate": 0.004756663197955857, "loss": 2.5176, "step": 6761 }, { "crossentropy": 2.5728917121887207, "epoch": 0.3677098344163789, "grad_norm": 0.03316938877105713, "grad_norm_var": 1.0626667904856112e-06, "learning_rate": 0.004755419499609562, "loss": 2.5729, "step": 6762 }, { "crossentropy": 2.6691747903823853, "epoch": 0.3677642132738791, "grad_norm": 0.03266345337033272, "grad_norm_var": 1.1887574809223146e-06, "learning_rate": 0.00475417581643192, "loss": 2.6692, "step": 6763 }, { "crossentropy": 2.649107813835144, "epoch": 0.3678185921313793, "grad_norm": 0.033423397690057755, "grad_norm_var": 1.1250593716538581e-06, "learning_rate": 0.004752932148500067, "loss": 2.6491, "step": 6764 }, { "crossentropy": 2.6268540620803833, "epoch": 0.3678729709888795, "grad_norm": 0.0343010388314724, "grad_norm_var": 1.1291060827607497e-06, "learning_rate": 0.00475168849589113, "loss": 2.6269, "step": 6765 }, { "crossentropy": 2.596426010131836, "epoch": 0.3679273498463797, "grad_norm": 0.03659804165363312, "grad_norm_var": 1.4400597879081374e-06, "learning_rate": 0.00475044485868224, "loss": 2.5964, "step": 6766 }, { "crossentropy": 2.608629822731018, "epoch": 0.3679817287038799, "grad_norm": 0.037169136106967926, "grad_norm_var": 2.017416606000094e-06, "learning_rate": 0.00474920123695053, "loss": 2.6086, "step": 6767 }, { "crossentropy": 2.5028247833251953, "epoch": 0.3680361075613801, "grad_norm": 0.033135730773210526, "grad_norm_var": 2.088494273776107e-06, "learning_rate": 0.0047479576307731235, "loss": 2.5028, "step": 6768 }, { "crossentropy": 2.634881615638733, "epoch": 0.3680904864188803, "grad_norm": 0.033394306898117065, "grad_norm_var": 2.129724638566301e-06, "learning_rate": 0.004746714040227149, "loss": 2.6349, "step": 6769 }, { "crossentropy": 2.4912238121032715, "epoch": 0.3681448652763805, "grad_norm": 0.04146441072225571, "grad_norm_var": 5.360901852699956e-06, "learning_rate": 0.004745470465389735, "loss": 2.4912, "step": 6770 }, { "crossentropy": 2.5497745275497437, "epoch": 0.3681992441338807, "grad_norm": 0.0324181467294693, "grad_norm_var": 5.4009491112355186e-06, "learning_rate": 0.004744226906338006, "loss": 2.5498, "step": 6771 }, { "crossentropy": 2.6204123497009277, "epoch": 0.3682536229913809, "grad_norm": 0.03321170434355736, "grad_norm_var": 5.256444877875059e-06, "learning_rate": 0.004742983363149085, "loss": 2.6204, "step": 6772 }, { "crossentropy": 2.677572250366211, "epoch": 0.36830800184888113, "grad_norm": 0.033691223710775375, "grad_norm_var": 5.248118694013935e-06, "learning_rate": 0.0047417398359000966, "loss": 2.6776, "step": 6773 }, { "crossentropy": 2.471442699432373, "epoch": 0.36836238070638133, "grad_norm": 0.037747353315353394, "grad_norm_var": 5.927349116528294e-06, "learning_rate": 0.004740496324668162, "loss": 2.4714, "step": 6774 }, { "crossentropy": 2.6769216060638428, "epoch": 0.36841675956388154, "grad_norm": 0.03469390422105789, "grad_norm_var": 5.712965221314296e-06, "learning_rate": 0.004739252829530403, "loss": 2.6769, "step": 6775 }, { "crossentropy": 2.629647731781006, "epoch": 0.3684711384213818, "grad_norm": 0.031839191913604736, "grad_norm_var": 6.242239501184447e-06, "learning_rate": 0.004738009350563943, "loss": 2.6296, "step": 6776 }, { "crossentropy": 2.560894727706909, "epoch": 0.368525517278882, "grad_norm": 0.03308216854929924, "grad_norm_var": 6.3850629433596815e-06, "learning_rate": 0.004736765887845896, "loss": 2.5609, "step": 6777 }, { "crossentropy": 2.7480698823928833, "epoch": 0.3685798961363822, "grad_norm": 0.03150106966495514, "grad_norm_var": 6.855039148368804e-06, "learning_rate": 0.004735522441453386, "loss": 2.7481, "step": 6778 }, { "crossentropy": 2.63126802444458, "epoch": 0.3686342749938824, "grad_norm": 0.03213558346033096, "grad_norm_var": 6.9943882218571896e-06, "learning_rate": 0.0047342790114635255, "loss": 2.6313, "step": 6779 }, { "crossentropy": 2.6248435974121094, "epoch": 0.3686886538513826, "grad_norm": 0.03460557386279106, "grad_norm_var": 6.9336467860799575e-06, "learning_rate": 0.0047330355979534324, "loss": 2.6248, "step": 6780 }, { "crossentropy": 2.6173882484436035, "epoch": 0.3687430327088828, "grad_norm": 0.03292083740234375, "grad_norm_var": 7.077687760659255e-06, "learning_rate": 0.004731792201000225, "loss": 2.6174, "step": 6781 }, { "crossentropy": 2.5050517320632935, "epoch": 0.368797411566383, "grad_norm": 0.032980408519506454, "grad_norm_var": 6.811549505937135e-06, "learning_rate": 0.004730548820681013, "loss": 2.5051, "step": 6782 }, { "crossentropy": 2.792978525161743, "epoch": 0.3688517904238832, "grad_norm": 0.03565731272101402, "grad_norm_var": 6.34065743967954e-06, "learning_rate": 0.004729305457072913, "loss": 2.793, "step": 6783 }, { "crossentropy": 2.5826278924942017, "epoch": 0.3689061692813834, "grad_norm": 0.03840330243110657, "grad_norm_var": 7.446827866043153e-06, "learning_rate": 0.004728062110253037, "loss": 2.5826, "step": 6784 }, { "crossentropy": 2.5574578046798706, "epoch": 0.3689605481388836, "grad_norm": 0.036092083901166916, "grad_norm_var": 7.554643162204349e-06, "learning_rate": 0.004726818780298493, "loss": 2.5575, "step": 6785 }, { "crossentropy": 2.5860824584960938, "epoch": 0.36901492699638383, "grad_norm": 0.03395512327551842, "grad_norm_var": 4.133747201480927e-06, "learning_rate": 0.0047255754672863946, "loss": 2.5861, "step": 6786 }, { "crossentropy": 2.54526150226593, "epoch": 0.36906930585388403, "grad_norm": 0.03212911635637283, "grad_norm_var": 4.202180846202043e-06, "learning_rate": 0.004724332171293851, "loss": 2.5453, "step": 6787 }, { "crossentropy": 2.6347540616989136, "epoch": 0.36912368471138424, "grad_norm": 0.032316144555807114, "grad_norm_var": 4.351257120767625e-06, "learning_rate": 0.004723088892397968, "loss": 2.6348, "step": 6788 }, { "crossentropy": 2.6100391149520874, "epoch": 0.36917806356888444, "grad_norm": 0.03376857191324234, "grad_norm_var": 4.34860748914258e-06, "learning_rate": 0.004721845630675853, "loss": 2.61, "step": 6789 }, { "crossentropy": 2.6113500595092773, "epoch": 0.36923244242638464, "grad_norm": 0.040656182914972305, "grad_norm_var": 6.335001868719668e-06, "learning_rate": 0.004720602386204616, "loss": 2.6114, "step": 6790 }, { "crossentropy": 2.4975342750549316, "epoch": 0.36928682128388485, "grad_norm": 0.03184468299150467, "grad_norm_var": 6.643745080227535e-06, "learning_rate": 0.004719359159061355, "loss": 2.4975, "step": 6791 }, { "crossentropy": 2.593051552772522, "epoch": 0.36934120014138505, "grad_norm": 0.03305472061038017, "grad_norm_var": 6.387027257043615e-06, "learning_rate": 0.004718115949323179, "loss": 2.5931, "step": 6792 }, { "crossentropy": 2.5557512044906616, "epoch": 0.36939557899888525, "grad_norm": 0.03705621510744095, "grad_norm_var": 6.851234405020705e-06, "learning_rate": 0.004716872757067189, "loss": 2.5558, "step": 6793 }, { "crossentropy": 2.574479341506958, "epoch": 0.36944995785638546, "grad_norm": 0.03133572265505791, "grad_norm_var": 6.915030679500017e-06, "learning_rate": 0.004715629582370486, "loss": 2.5745, "step": 6794 }, { "crossentropy": 2.4714549779891968, "epoch": 0.36950433671388566, "grad_norm": 0.03233972191810608, "grad_norm_var": 6.858533304820019e-06, "learning_rate": 0.004714386425310173, "loss": 2.4715, "step": 6795 }, { "crossentropy": 2.5358290672302246, "epoch": 0.36955871557138587, "grad_norm": 0.03196796402335167, "grad_norm_var": 7.192819868397035e-06, "learning_rate": 0.0047131432859633445, "loss": 2.5358, "step": 6796 }, { "crossentropy": 2.624803900718689, "epoch": 0.36961309442888607, "grad_norm": 0.03329925984144211, "grad_norm_var": 7.139504736468292e-06, "learning_rate": 0.004711900164407103, "loss": 2.6248, "step": 6797 }, { "crossentropy": 2.5503634214401245, "epoch": 0.36966747328638627, "grad_norm": 0.03401023894548416, "grad_norm_var": 7.041273767168186e-06, "learning_rate": 0.0047106570607185465, "loss": 2.5504, "step": 6798 }, { "crossentropy": 2.613956093788147, "epoch": 0.3697218521438865, "grad_norm": 0.03252200782299042, "grad_norm_var": 7.064374363428769e-06, "learning_rate": 0.0047094139749747676, "loss": 2.614, "step": 6799 }, { "crossentropy": 2.702030897140503, "epoch": 0.3697762310013867, "grad_norm": 0.03510161489248276, "grad_norm_var": 5.827916507458672e-06, "learning_rate": 0.0047081709072528625, "loss": 2.702, "step": 6800 }, { "crossentropy": 2.686751365661621, "epoch": 0.3698306098588869, "grad_norm": 0.03321519121527672, "grad_norm_var": 5.481555996138196e-06, "learning_rate": 0.0047069278576299285, "loss": 2.6868, "step": 6801 }, { "crossentropy": 2.587224841117859, "epoch": 0.3698849887163871, "grad_norm": 0.03470302000641823, "grad_norm_var": 5.545867130266925e-06, "learning_rate": 0.0047056848261830524, "loss": 2.5872, "step": 6802 }, { "crossentropy": 2.649297595024109, "epoch": 0.3699393675738873, "grad_norm": 0.033712051808834076, "grad_norm_var": 5.369336873570623e-06, "learning_rate": 0.0047044418129893305, "loss": 2.6493, "step": 6803 }, { "crossentropy": 2.514124870300293, "epoch": 0.3699937464313875, "grad_norm": 0.03242066502571106, "grad_norm_var": 5.349250569690785e-06, "learning_rate": 0.004703198818125852, "loss": 2.5141, "step": 6804 }, { "crossentropy": 2.648989200592041, "epoch": 0.3700481252888877, "grad_norm": 0.03405076265335083, "grad_norm_var": 5.352556315385328e-06, "learning_rate": 0.004701955841669706, "loss": 2.649, "step": 6805 }, { "crossentropy": 2.6683677434921265, "epoch": 0.3701025041463879, "grad_norm": 0.034494392573833466, "grad_norm_var": 2.1178486524987713e-06, "learning_rate": 0.004700712883697981, "loss": 2.6684, "step": 6806 }, { "crossentropy": 2.7232112884521484, "epoch": 0.3701568830038881, "grad_norm": 0.03504328429698944, "grad_norm_var": 2.074566394129745e-06, "learning_rate": 0.004699469944287763, "loss": 2.7232, "step": 6807 }, { "crossentropy": 2.603705048561096, "epoch": 0.3702112618613883, "grad_norm": 0.039122164249420166, "grad_norm_var": 3.897556312879719e-06, "learning_rate": 0.004698227023516139, "loss": 2.6037, "step": 6808 }, { "crossentropy": 2.5488226413726807, "epoch": 0.3702656407188885, "grad_norm": 0.03621017932891846, "grad_norm_var": 3.600316473498233e-06, "learning_rate": 0.004696984121460195, "loss": 2.5488, "step": 6809 }, { "crossentropy": 2.543026328086853, "epoch": 0.3703200195763887, "grad_norm": 0.033851075917482376, "grad_norm_var": 3.1116770460117015e-06, "learning_rate": 0.004695741238197012, "loss": 2.543, "step": 6810 }, { "crossentropy": 2.6307069063186646, "epoch": 0.3703743984338889, "grad_norm": 0.038785941898822784, "grad_norm_var": 4.170930980595955e-06, "learning_rate": 0.004694498373803673, "loss": 2.6307, "step": 6811 }, { "crossentropy": 2.581025242805481, "epoch": 0.3704287772913891, "grad_norm": 0.03339388594031334, "grad_norm_var": 3.8105532696045924e-06, "learning_rate": 0.004693255528357263, "loss": 2.581, "step": 6812 }, { "crossentropy": 2.544097065925598, "epoch": 0.3704831561488893, "grad_norm": 0.03351883590221405, "grad_norm_var": 3.7748707718274674e-06, "learning_rate": 0.0046920127019348555, "loss": 2.5441, "step": 6813 }, { "crossentropy": 2.6292413473129272, "epoch": 0.37053753500638953, "grad_norm": 0.0339222326874733, "grad_norm_var": 3.782682453915692e-06, "learning_rate": 0.004690769894613535, "loss": 2.6292, "step": 6814 }, { "crossentropy": 2.6140187978744507, "epoch": 0.37059191386388973, "grad_norm": 0.03513220697641373, "grad_norm_var": 3.4751425381886655e-06, "learning_rate": 0.004689527106470378, "loss": 2.614, "step": 6815 }, { "crossentropy": 2.5294185876846313, "epoch": 0.37064629272138994, "grad_norm": 0.035886891186237335, "grad_norm_var": 3.546065454488693e-06, "learning_rate": 0.004688284337582459, "loss": 2.5294, "step": 6816 }, { "crossentropy": 2.592798590660095, "epoch": 0.37070067157889014, "grad_norm": 0.03531103953719139, "grad_norm_var": 3.366156789948634e-06, "learning_rate": 0.004687041588026855, "loss": 2.5928, "step": 6817 }, { "crossentropy": 2.655365467071533, "epoch": 0.37075505043639034, "grad_norm": 0.03582523390650749, "grad_norm_var": 3.404557961722334e-06, "learning_rate": 0.004685798857880641, "loss": 2.6554, "step": 6818 }, { "crossentropy": 2.6766417026519775, "epoch": 0.37080942929389055, "grad_norm": 0.03597521781921387, "grad_norm_var": 3.32319205396762e-06, "learning_rate": 0.004684556147220888, "loss": 2.6766, "step": 6819 }, { "crossentropy": 2.6586458683013916, "epoch": 0.37086380815139075, "grad_norm": 0.03346773236989975, "grad_norm_var": 3.00592746902024e-06, "learning_rate": 0.00468331345612467, "loss": 2.6586, "step": 6820 }, { "crossentropy": 2.5912870168685913, "epoch": 0.37091818700889095, "grad_norm": 0.03547755256295204, "grad_norm_var": 2.9051253508303894e-06, "learning_rate": 0.0046820707846690545, "loss": 2.5913, "step": 6821 }, { "crossentropy": 2.650320053100586, "epoch": 0.37097256586639116, "grad_norm": 0.03812267631292343, "grad_norm_var": 3.319491743718148e-06, "learning_rate": 0.004680828132931113, "loss": 2.6503, "step": 6822 }, { "crossentropy": 2.575860857963562, "epoch": 0.37102694472389136, "grad_norm": 0.03219075873494148, "grad_norm_var": 4.026621987541594e-06, "learning_rate": 0.004679585500987914, "loss": 2.5759, "step": 6823 }, { "crossentropy": 2.6361019611358643, "epoch": 0.37108132358139156, "grad_norm": 0.031520161777734756, "grad_norm_var": 3.852663836842867e-06, "learning_rate": 0.004678342888916522, "loss": 2.6361, "step": 6824 }, { "crossentropy": 2.629850387573242, "epoch": 0.37113570243889177, "grad_norm": 0.03731808066368103, "grad_norm_var": 4.121149935774835e-06, "learning_rate": 0.004677100296794004, "loss": 2.6299, "step": 6825 }, { "crossentropy": 2.5652142763137817, "epoch": 0.37119008129639197, "grad_norm": 0.03865313529968262, "grad_norm_var": 4.83878312970328e-06, "learning_rate": 0.004675857724697427, "loss": 2.5652, "step": 6826 }, { "crossentropy": 2.614423394203186, "epoch": 0.3712444601538922, "grad_norm": 0.03549834340810776, "grad_norm_var": 3.978076197886741e-06, "learning_rate": 0.00467461517270385, "loss": 2.6144, "step": 6827 }, { "crossentropy": 2.64071786403656, "epoch": 0.3712988390113924, "grad_norm": 0.033113013952970505, "grad_norm_var": 4.045996544708534e-06, "learning_rate": 0.004673372640890335, "loss": 2.6407, "step": 6828 }, { "crossentropy": 2.5843549966812134, "epoch": 0.3713532178688926, "grad_norm": 0.03353477641940117, "grad_norm_var": 4.042740403851532e-06, "learning_rate": 0.004672130129333949, "loss": 2.5844, "step": 6829 }, { "crossentropy": 2.549157977104187, "epoch": 0.3714075967263928, "grad_norm": 0.03300704434514046, "grad_norm_var": 4.23384121095415e-06, "learning_rate": 0.004670887638111742, "loss": 2.5492, "step": 6830 }, { "crossentropy": 2.607932209968567, "epoch": 0.371461975583893, "grad_norm": 0.03644789755344391, "grad_norm_var": 4.364852478099442e-06, "learning_rate": 0.004669645167300778, "loss": 2.6079, "step": 6831 }, { "crossentropy": 2.628021478652954, "epoch": 0.3715163544413932, "grad_norm": 0.034358639270067215, "grad_norm_var": 4.347292688362293e-06, "learning_rate": 0.004668402716978115, "loss": 2.628, "step": 6832 }, { "crossentropy": 2.494847297668457, "epoch": 0.3715707332988934, "grad_norm": 0.034507621079683304, "grad_norm_var": 4.353119546137741e-06, "learning_rate": 0.0046671602872208045, "loss": 2.4948, "step": 6833 }, { "crossentropy": 2.6647586822509766, "epoch": 0.3716251121563936, "grad_norm": 0.03555355966091156, "grad_norm_var": 4.325616377068992e-06, "learning_rate": 0.004665917878105903, "loss": 2.6648, "step": 6834 }, { "crossentropy": 2.6823010444641113, "epoch": 0.3716794910138938, "grad_norm": 0.03716425597667694, "grad_norm_var": 4.581012459730962e-06, "learning_rate": 0.004664675489710462, "loss": 2.6823, "step": 6835 }, { "crossentropy": 2.6449499130249023, "epoch": 0.371733869871394, "grad_norm": 0.03223668783903122, "grad_norm_var": 4.926570408602995e-06, "learning_rate": 0.004663433122111536, "loss": 2.6449, "step": 6836 }, { "crossentropy": 2.7258177995681763, "epoch": 0.3717882487288942, "grad_norm": 0.03258466720581055, "grad_norm_var": 5.234180648699902e-06, "learning_rate": 0.004662190775386175, "loss": 2.7258, "step": 6837 }, { "crossentropy": 2.573553204536438, "epoch": 0.3718426275863944, "grad_norm": 0.03245155140757561, "grad_norm_var": 4.6851181806544885e-06, "learning_rate": 0.004660948449611425, "loss": 2.5736, "step": 6838 }, { "crossentropy": 2.62484610080719, "epoch": 0.3718970064438946, "grad_norm": 0.03341878578066826, "grad_norm_var": 4.420295662883415e-06, "learning_rate": 0.004659706144864335, "loss": 2.6248, "step": 6839 }, { "crossentropy": 2.5830905437469482, "epoch": 0.3719513853013948, "grad_norm": 0.03294894099235535, "grad_norm_var": 3.987735266889186e-06, "learning_rate": 0.004658463861221957, "loss": 2.5831, "step": 6840 }, { "crossentropy": 2.673567533493042, "epoch": 0.372005764158895, "grad_norm": 0.03393254056572914, "grad_norm_var": 3.4544918513577477e-06, "learning_rate": 0.004657221598761328, "loss": 2.6736, "step": 6841 }, { "crossentropy": 2.522533416748047, "epoch": 0.37206014301639523, "grad_norm": 0.034682124853134155, "grad_norm_var": 2.155437808022224e-06, "learning_rate": 0.004655979357559498, "loss": 2.5225, "step": 6842 }, { "crossentropy": 2.67974054813385, "epoch": 0.37211452187389543, "grad_norm": 0.03349510580301285, "grad_norm_var": 2.0300891849530285e-06, "learning_rate": 0.004654737137693508, "loss": 2.6797, "step": 6843 }, { "crossentropy": 2.627432942390442, "epoch": 0.37216890073139564, "grad_norm": 0.03387425094842911, "grad_norm_var": 1.979849370271382e-06, "learning_rate": 0.004653494939240397, "loss": 2.6274, "step": 6844 }, { "crossentropy": 2.63699209690094, "epoch": 0.37222327958889584, "grad_norm": 0.03504583612084389, "grad_norm_var": 2.0263260505798756e-06, "learning_rate": 0.004652252762277206, "loss": 2.637, "step": 6845 }, { "crossentropy": 2.652876377105713, "epoch": 0.37227765844639604, "grad_norm": 0.03462455794215202, "grad_norm_var": 1.952655748110356e-06, "learning_rate": 0.0046510106068809775, "loss": 2.6529, "step": 6846 }, { "crossentropy": 2.519674301147461, "epoch": 0.37233203730389625, "grad_norm": 0.0325254388153553, "grad_norm_var": 1.742774874786505e-06, "learning_rate": 0.004649768473128744, "loss": 2.5197, "step": 6847 }, { "crossentropy": 2.5126724243164062, "epoch": 0.37238641616139645, "grad_norm": 0.03180897980928421, "grad_norm_var": 2.0145001977333574e-06, "learning_rate": 0.004648526361097543, "loss": 2.5127, "step": 6848 }, { "crossentropy": 2.684652805328369, "epoch": 0.37244079501889665, "grad_norm": 0.035685475915670395, "grad_norm_var": 2.2118001460416067e-06, "learning_rate": 0.0046472842708644106, "loss": 2.6847, "step": 6849 }, { "crossentropy": 2.702501058578491, "epoch": 0.37249517387639686, "grad_norm": 0.03524074703454971, "grad_norm_var": 2.1479913225041477e-06, "learning_rate": 0.004646042202506377, "loss": 2.7025, "step": 6850 }, { "crossentropy": 2.620719313621521, "epoch": 0.37254955273389706, "grad_norm": 0.03442474454641342, "grad_norm_var": 1.4091950198787927e-06, "learning_rate": 0.004644800156100479, "loss": 2.6207, "step": 6851 }, { "crossentropy": 2.67257559299469, "epoch": 0.37260393159139726, "grad_norm": 0.033137138932943344, "grad_norm_var": 1.2858328677025263e-06, "learning_rate": 0.004643558131723741, "loss": 2.6726, "step": 6852 }, { "crossentropy": 2.7213964462280273, "epoch": 0.37265831044889747, "grad_norm": 0.033639490604400635, "grad_norm_var": 1.192524632408672e-06, "learning_rate": 0.004642316129453197, "loss": 2.7214, "step": 6853 }, { "crossentropy": 2.7219626903533936, "epoch": 0.37271268930639767, "grad_norm": 0.037615370005369186, "grad_norm_var": 1.9248294984073693e-06, "learning_rate": 0.004641074149365874, "loss": 2.722, "step": 6854 }, { "crossentropy": 2.6498066186904907, "epoch": 0.3727670681638979, "grad_norm": 0.0327078215777874, "grad_norm_var": 2.0239567944678e-06, "learning_rate": 0.004639832191538796, "loss": 2.6498, "step": 6855 }, { "crossentropy": 2.6945000886917114, "epoch": 0.3728214470213981, "grad_norm": 0.03292641416192055, "grad_norm_var": 2.0274061139732268e-06, "learning_rate": 0.004638590256048989, "loss": 2.6945, "step": 6856 }, { "crossentropy": 2.5587170124053955, "epoch": 0.3728758258788983, "grad_norm": 0.03248139098286629, "grad_norm_var": 2.1885926803428986e-06, "learning_rate": 0.00463734834297348, "loss": 2.5587, "step": 6857 }, { "crossentropy": 2.592982769012451, "epoch": 0.3729302047363985, "grad_norm": 0.03361603990197182, "grad_norm_var": 2.1619097070655254e-06, "learning_rate": 0.004636106452389287, "loss": 2.593, "step": 6858 }, { "crossentropy": 2.7114057540893555, "epoch": 0.3729845835938987, "grad_norm": 0.03433462977409363, "grad_norm_var": 2.1574974456151086e-06, "learning_rate": 0.004634864584373433, "loss": 2.7114, "step": 6859 }, { "crossentropy": 2.5746891498565674, "epoch": 0.3730389624513989, "grad_norm": 0.03283214569091797, "grad_norm_var": 2.2401372725112945e-06, "learning_rate": 0.004633622739002937, "loss": 2.5747, "step": 6860 }, { "crossentropy": 2.6210535764694214, "epoch": 0.3730933413088991, "grad_norm": 0.03480450063943863, "grad_norm_var": 2.2074018440167036e-06, "learning_rate": 0.004632380916354816, "loss": 2.6211, "step": 6861 }, { "crossentropy": 2.715172052383423, "epoch": 0.3731477201663993, "grad_norm": 0.03408408910036087, "grad_norm_var": 2.1734670466009205e-06, "learning_rate": 0.0046311391165060915, "loss": 2.7152, "step": 6862 }, { "crossentropy": 2.598938822746277, "epoch": 0.3732020990238995, "grad_norm": 0.034433234483003616, "grad_norm_var": 2.0598112494290647e-06, "learning_rate": 0.004629897339533771, "loss": 2.5989, "step": 6863 }, { "crossentropy": 2.550197720527649, "epoch": 0.3732564778813997, "grad_norm": 0.03435320407152176, "grad_norm_var": 1.7259485108812866e-06, "learning_rate": 0.004628655585514874, "loss": 2.5502, "step": 6864 }, { "crossentropy": 2.661066174507141, "epoch": 0.3733108567388999, "grad_norm": 0.03273957967758179, "grad_norm_var": 1.6631774657391643e-06, "learning_rate": 0.004627413854526412, "loss": 2.6611, "step": 6865 }, { "crossentropy": 2.498164653778076, "epoch": 0.3733652355964001, "grad_norm": 0.03491383418440819, "grad_norm_var": 1.614059995087384e-06, "learning_rate": 0.004626172146645394, "loss": 2.4982, "step": 6866 }, { "crossentropy": 2.7033965587615967, "epoch": 0.3734196144539003, "grad_norm": 0.03636680170893669, "grad_norm_var": 1.9752456169918587e-06, "learning_rate": 0.004624930461948831, "loss": 2.7034, "step": 6867 }, { "crossentropy": 2.568367838859558, "epoch": 0.3734739933114005, "grad_norm": 0.03351152688264847, "grad_norm_var": 1.9378581351731094e-06, "learning_rate": 0.0046236888005137325, "loss": 2.5684, "step": 6868 }, { "crossentropy": 2.6939666271209717, "epoch": 0.3735283721689007, "grad_norm": 0.034151382744312286, "grad_norm_var": 1.9238278851638665e-06, "learning_rate": 0.004622447162417102, "loss": 2.694, "step": 6869 }, { "crossentropy": 2.6463048458099365, "epoch": 0.3735827510264009, "grad_norm": 0.031582992523908615, "grad_norm_var": 1.384376326202021e-06, "learning_rate": 0.004621205547735948, "loss": 2.6463, "step": 6870 }, { "crossentropy": 2.5769033432006836, "epoch": 0.37363712988390113, "grad_norm": 0.032207340002059937, "grad_norm_var": 1.4689078923911408e-06, "learning_rate": 0.004619963956547273, "loss": 2.5769, "step": 6871 }, { "crossentropy": 2.681683897972107, "epoch": 0.37369150874140133, "grad_norm": 0.03300370275974274, "grad_norm_var": 1.461219727982623e-06, "learning_rate": 0.004618722388928078, "loss": 2.6817, "step": 6872 }, { "crossentropy": 2.5714510679244995, "epoch": 0.37374588759890154, "grad_norm": 0.03435623273253441, "grad_norm_var": 1.3729017720472599e-06, "learning_rate": 0.004617480844955367, "loss": 2.5715, "step": 6873 }, { "crossentropy": 2.516779661178589, "epoch": 0.37380026645640174, "grad_norm": 0.03386949002742767, "grad_norm_var": 1.3696624207030933e-06, "learning_rate": 0.004616239324706135, "loss": 2.5168, "step": 6874 }, { "crossentropy": 2.635097861289978, "epoch": 0.37385464531390195, "grad_norm": 0.033579517155885696, "grad_norm_var": 1.356158208478305e-06, "learning_rate": 0.004614997828257382, "loss": 2.6351, "step": 6875 }, { "crossentropy": 2.4984724521636963, "epoch": 0.37390902417140215, "grad_norm": 0.03659676015377045, "grad_norm_var": 1.7564422963791312e-06, "learning_rate": 0.004613756355686107, "loss": 2.4985, "step": 6876 }, { "crossentropy": 2.7442214488983154, "epoch": 0.37396340302890235, "grad_norm": 0.03418000787496567, "grad_norm_var": 1.7167134944503135e-06, "learning_rate": 0.0046125149070693015, "loss": 2.7442, "step": 6877 }, { "crossentropy": 2.5882253646850586, "epoch": 0.37401778188640256, "grad_norm": 0.03275490552186966, "grad_norm_var": 1.811452685757109e-06, "learning_rate": 0.004611273482483959, "loss": 2.5882, "step": 6878 }, { "crossentropy": 2.661208748817444, "epoch": 0.37407216074390276, "grad_norm": 0.032164134085178375, "grad_norm_var": 1.975716905583807e-06, "learning_rate": 0.004610032082007075, "loss": 2.6612, "step": 6879 }, { "crossentropy": 2.6349788904190063, "epoch": 0.37412653960140296, "grad_norm": 0.032717932015657425, "grad_norm_var": 2.0158449471507913e-06, "learning_rate": 0.004608790705715637, "loss": 2.635, "step": 6880 }, { "crossentropy": 2.6331214904785156, "epoch": 0.37418091845890317, "grad_norm": 0.03304273262619972, "grad_norm_var": 1.9840411255984842e-06, "learning_rate": 0.004607549353686634, "loss": 2.6331, "step": 6881 }, { "crossentropy": 2.6312522888183594, "epoch": 0.37423529731640337, "grad_norm": 0.034553732722997665, "grad_norm_var": 1.933262939475024e-06, "learning_rate": 0.004606308025997054, "loss": 2.6313, "step": 6882 }, { "crossentropy": 2.576886534690857, "epoch": 0.3742896761739036, "grad_norm": 0.0336679145693779, "grad_norm_var": 1.416246525466162e-06, "learning_rate": 0.004605066722723884, "loss": 2.5769, "step": 6883 }, { "crossentropy": 2.6079224348068237, "epoch": 0.3743440550314038, "grad_norm": 0.034169752150774, "grad_norm_var": 1.4446643879413463e-06, "learning_rate": 0.004603825443944107, "loss": 2.6079, "step": 6884 }, { "crossentropy": 2.6928824186325073, "epoch": 0.374398433888904, "grad_norm": 0.03551049530506134, "grad_norm_var": 1.671374991877618e-06, "learning_rate": 0.004602584189734708, "loss": 2.6929, "step": 6885 }, { "crossentropy": 2.6138700246810913, "epoch": 0.3744528127464042, "grad_norm": 0.034422311931848526, "grad_norm_var": 1.4031807766249141e-06, "learning_rate": 0.004601342960172665, "loss": 2.6139, "step": 6886 }, { "crossentropy": 2.6482112407684326, "epoch": 0.3745071916039044, "grad_norm": 0.03178899735212326, "grad_norm_var": 1.5029453589743628e-06, "learning_rate": 0.004600101755334963, "loss": 2.6482, "step": 6887 }, { "crossentropy": 2.6465556621551514, "epoch": 0.3745615704614046, "grad_norm": 0.03226882964372635, "grad_norm_var": 1.612140909750849e-06, "learning_rate": 0.004598860575298575, "loss": 2.6466, "step": 6888 }, { "crossentropy": 2.6309144496917725, "epoch": 0.3746159493189048, "grad_norm": 0.031993065029382706, "grad_norm_var": 1.7631429919816194e-06, "learning_rate": 0.004597619420140479, "loss": 2.6309, "step": 6889 }, { "crossentropy": 2.633644461631775, "epoch": 0.374670328176405, "grad_norm": 0.03257712721824646, "grad_norm_var": 1.8176533345523952e-06, "learning_rate": 0.004596378289937654, "loss": 2.6336, "step": 6890 }, { "crossentropy": 2.5247581005096436, "epoch": 0.3747247070339052, "grad_norm": 0.04107093811035156, "grad_norm_var": 5.40540197541961e-06, "learning_rate": 0.004595137184767069, "loss": 2.5248, "step": 6891 }, { "crossentropy": 2.7153096199035645, "epoch": 0.3747790858914054, "grad_norm": 0.03258439898490906, "grad_norm_var": 5.0049744368341195e-06, "learning_rate": 0.0045938961047057, "loss": 2.7153, "step": 6892 }, { "crossentropy": 2.7455207109451294, "epoch": 0.3748334647489056, "grad_norm": 0.03384025767445564, "grad_norm_var": 4.991201177881263e-06, "learning_rate": 0.004592655049830516, "loss": 2.7455, "step": 6893 }, { "crossentropy": 2.5772762298583984, "epoch": 0.3748878436064058, "grad_norm": 0.03512035310268402, "grad_norm_var": 5.044262508974104e-06, "learning_rate": 0.004591414020218485, "loss": 2.5773, "step": 6894 }, { "crossentropy": 2.6550785303115845, "epoch": 0.374942222463906, "grad_norm": 0.05536867305636406, "grad_norm_var": 3.350215733081118e-05, "learning_rate": 0.004590173015946575, "loss": 2.6551, "step": 6895 }, { "crossentropy": 2.587509870529175, "epoch": 0.3749966013214062, "grad_norm": 0.035237379372119904, "grad_norm_var": 3.303365047283007e-05, "learning_rate": 0.004588932037091755, "loss": 2.5875, "step": 6896 }, { "crossentropy": 2.6342554092407227, "epoch": 0.3750509801789064, "grad_norm": 0.0414583683013916, "grad_norm_var": 3.4757735856767955e-05, "learning_rate": 0.0045876910837309845, "loss": 2.6343, "step": 6897 }, { "crossentropy": 2.5468982458114624, "epoch": 0.3751053590364066, "grad_norm": 0.039431676268577576, "grad_norm_var": 3.5319175193024704e-05, "learning_rate": 0.004586450155941229, "loss": 2.5469, "step": 6898 }, { "crossentropy": 2.556040644645691, "epoch": 0.37515973789390683, "grad_norm": 0.04596007242798805, "grad_norm_var": 4.0478529727266725e-05, "learning_rate": 0.004585209253799451, "loss": 2.556, "step": 6899 }, { "crossentropy": 2.638131618499756, "epoch": 0.37521411675140703, "grad_norm": 0.03495378792285919, "grad_norm_var": 4.021583597262942e-05, "learning_rate": 0.004583968377382605, "loss": 2.6381, "step": 6900 }, { "crossentropy": 2.617143392562866, "epoch": 0.37526849560890724, "grad_norm": 0.03208140656352043, "grad_norm_var": 4.167711277491178e-05, "learning_rate": 0.004582727526767656, "loss": 2.6171, "step": 6901 }, { "crossentropy": 2.6315243244171143, "epoch": 0.37532287446640744, "grad_norm": 0.03351117670536041, "grad_norm_var": 4.202815925438968e-05, "learning_rate": 0.0045814867020315545, "loss": 2.6315, "step": 6902 }, { "crossentropy": 2.537369966506958, "epoch": 0.37537725332390764, "grad_norm": 0.03806416317820549, "grad_norm_var": 4.0273266961777614e-05, "learning_rate": 0.004580245903251258, "loss": 2.5374, "step": 6903 }, { "crossentropy": 2.6023839712142944, "epoch": 0.37543163218140785, "grad_norm": 0.03575458005070686, "grad_norm_var": 3.8731482630816916e-05, "learning_rate": 0.00457900513050372, "loss": 2.6024, "step": 6904 }, { "crossentropy": 2.6570494174957275, "epoch": 0.37548601103890805, "grad_norm": 0.0347304493188858, "grad_norm_var": 3.721250805832979e-05, "learning_rate": 0.004577764383865891, "loss": 2.657, "step": 6905 }, { "crossentropy": 2.7263503074645996, "epoch": 0.37554038989640826, "grad_norm": 0.035272445529699326, "grad_norm_var": 3.585820302415086e-05, "learning_rate": 0.00457652366341472, "loss": 2.7264, "step": 6906 }, { "crossentropy": 2.628393292427063, "epoch": 0.37559476875390846, "grad_norm": 0.0346849299967289, "grad_norm_var": 3.5602772153804874e-05, "learning_rate": 0.004575282969227158, "loss": 2.6284, "step": 6907 }, { "crossentropy": 2.681221604347229, "epoch": 0.37564914761140866, "grad_norm": 0.032545462250709534, "grad_norm_var": 3.562775518255063e-05, "learning_rate": 0.0045740423013801495, "loss": 2.6812, "step": 6908 }, { "crossentropy": 2.6640467643737793, "epoch": 0.37570352646890887, "grad_norm": 0.03297996148467064, "grad_norm_var": 3.607957758217465e-05, "learning_rate": 0.004572801659950641, "loss": 2.664, "step": 6909 }, { "crossentropy": 2.6511340141296387, "epoch": 0.37575790532640907, "grad_norm": 0.03245021402835846, "grad_norm_var": 3.730907157695566e-05, "learning_rate": 0.0045715610450155755, "loss": 2.6511, "step": 6910 }, { "crossentropy": 2.638853669166565, "epoch": 0.3758122841839093, "grad_norm": 0.03288457915186882, "grad_norm_var": 1.4303474448129627e-05, "learning_rate": 0.004570320456651893, "loss": 2.6389, "step": 6911 }, { "crossentropy": 2.577846050262451, "epoch": 0.3758666630414095, "grad_norm": 0.03359691798686981, "grad_norm_var": 1.458380254550154e-05, "learning_rate": 0.004569079894936536, "loss": 2.5778, "step": 6912 }, { "crossentropy": 2.624995708465576, "epoch": 0.3759210418989097, "grad_norm": 0.03302377834916115, "grad_norm_var": 1.2495238025991319e-05, "learning_rate": 0.004567839359946444, "loss": 2.625, "step": 6913 }, { "crossentropy": 2.631064534187317, "epoch": 0.3759754207564099, "grad_norm": 0.032187849283218384, "grad_norm_var": 1.161073556079726e-05, "learning_rate": 0.00456659885175855, "loss": 2.6311, "step": 6914 }, { "crossentropy": 2.644753575325012, "epoch": 0.3760297996139101, "grad_norm": 0.03348783031105995, "grad_norm_var": 2.5540617705376832e-06, "learning_rate": 0.004565358370449792, "loss": 2.6448, "step": 6915 }, { "crossentropy": 2.6286966800689697, "epoch": 0.3760841784714103, "grad_norm": 0.033404476940631866, "grad_norm_var": 2.483939409515743e-06, "learning_rate": 0.004564117916097102, "loss": 2.6287, "step": 6916 }, { "crossentropy": 2.635506749153137, "epoch": 0.3761385573289105, "grad_norm": 0.032868336886167526, "grad_norm_var": 2.3432379814100045e-06, "learning_rate": 0.004562877488777412, "loss": 2.6355, "step": 6917 }, { "crossentropy": 2.59977388381958, "epoch": 0.3761929361864107, "grad_norm": 0.033027008175849915, "grad_norm_var": 2.379145485592998e-06, "learning_rate": 0.0045616370885676535, "loss": 2.5998, "step": 6918 }, { "crossentropy": 2.6908459663391113, "epoch": 0.3762473150439109, "grad_norm": 0.033036839216947556, "grad_norm_var": 1.1072870606823027e-06, "learning_rate": 0.004560396715544754, "loss": 2.6908, "step": 6919 }, { "crossentropy": 2.612638831138611, "epoch": 0.3763016939014111, "grad_norm": 0.032443661242723465, "grad_norm_var": 7.953507547633331e-07, "learning_rate": 0.00455915636978564, "loss": 2.6126, "step": 6920 }, { "crossentropy": 2.6811047792434692, "epoch": 0.3763560727589113, "grad_norm": 0.03359280899167061, "grad_norm_var": 6.576000829925285e-07, "learning_rate": 0.004557916051367238, "loss": 2.6811, "step": 6921 }, { "crossentropy": 2.5070955753326416, "epoch": 0.3764104516164115, "grad_norm": 0.03761481121182442, "grad_norm_var": 1.6421699931999193e-06, "learning_rate": 0.004556675760366468, "loss": 2.5071, "step": 6922 }, { "crossentropy": 2.6574548482894897, "epoch": 0.3764648304739117, "grad_norm": 0.04222344234585762, "grad_norm_var": 6.521362617904295e-06, "learning_rate": 0.004555435496860256, "loss": 2.6575, "step": 6923 }, { "crossentropy": 2.5158212184906006, "epoch": 0.376519209331412, "grad_norm": 0.039428774267435074, "grad_norm_var": 8.298648595759089e-06, "learning_rate": 0.00455419526092552, "loss": 2.5158, "step": 6924 }, { "crossentropy": 2.5114893913269043, "epoch": 0.3765735881889122, "grad_norm": 0.032331690192222595, "grad_norm_var": 8.436049376449549e-06, "learning_rate": 0.004552955052639178, "loss": 2.5115, "step": 6925 }, { "crossentropy": 2.6515084505081177, "epoch": 0.3766279670464124, "grad_norm": 0.03253001719713211, "grad_norm_var": 8.417560930531105e-06, "learning_rate": 0.004551714872078145, "loss": 2.6515, "step": 6926 }, { "crossentropy": 2.628528118133545, "epoch": 0.3766823459039126, "grad_norm": 0.03512834757566452, "grad_norm_var": 8.329655840032199e-06, "learning_rate": 0.004550474719319342, "loss": 2.6285, "step": 6927 }, { "crossentropy": 2.673776388168335, "epoch": 0.3767367247614128, "grad_norm": 0.032210104167461395, "grad_norm_var": 8.592884729426095e-06, "learning_rate": 0.004549234594439674, "loss": 2.6738, "step": 6928 }, { "crossentropy": 2.7276341915130615, "epoch": 0.376791103618913, "grad_norm": 0.0315626785159111, "grad_norm_var": 8.971767050724767e-06, "learning_rate": 0.004547994497516058, "loss": 2.7276, "step": 6929 }, { "crossentropy": 2.6413886547088623, "epoch": 0.3768454824764132, "grad_norm": 0.03388175368309021, "grad_norm_var": 8.698359517433687e-06, "learning_rate": 0.004546754428625402, "loss": 2.6414, "step": 6930 }, { "crossentropy": 2.690036416053772, "epoch": 0.3768998613339134, "grad_norm": 0.040456682443618774, "grad_norm_var": 1.0980606036961546e-05, "learning_rate": 0.004545514387844612, "loss": 2.69, "step": 6931 }, { "crossentropy": 2.5975106954574585, "epoch": 0.3769542401914136, "grad_norm": 0.033992789685726166, "grad_norm_var": 1.0897960566467041e-05, "learning_rate": 0.004544274375250598, "loss": 2.5975, "step": 6932 }, { "crossentropy": 2.5721734762191772, "epoch": 0.3770086190489138, "grad_norm": 0.03353332728147507, "grad_norm_var": 1.0756933129283756e-05, "learning_rate": 0.00454303439092026, "loss": 2.5722, "step": 6933 }, { "crossentropy": 2.62636137008667, "epoch": 0.377062997906414, "grad_norm": 0.03412744030356407, "grad_norm_var": 1.057069082007796e-05, "learning_rate": 0.004541794434930505, "loss": 2.6264, "step": 6934 }, { "crossentropy": 2.6021119356155396, "epoch": 0.3771173767639142, "grad_norm": 0.03299225866794586, "grad_norm_var": 1.058177655187768e-05, "learning_rate": 0.004540554507358231, "loss": 2.6021, "step": 6935 }, { "crossentropy": 2.624491810798645, "epoch": 0.3771717556214144, "grad_norm": 0.032305579632520676, "grad_norm_var": 1.0627789509820342e-05, "learning_rate": 0.004539314608280339, "loss": 2.6245, "step": 6936 }, { "crossentropy": 2.536036729812622, "epoch": 0.3772261344789146, "grad_norm": 0.031146951019763947, "grad_norm_var": 1.1418035970068608e-05, "learning_rate": 0.0045380747377737245, "loss": 2.536, "step": 6937 }, { "crossentropy": 2.66250479221344, "epoch": 0.3772805133364148, "grad_norm": 0.0330842025578022, "grad_norm_var": 1.0950221695100495e-05, "learning_rate": 0.004536834895915286, "loss": 2.6625, "step": 6938 }, { "crossentropy": 2.6567113399505615, "epoch": 0.37733489219391503, "grad_norm": 0.03403828665614128, "grad_norm_var": 6.635937290672538e-06, "learning_rate": 0.004535595082781912, "loss": 2.6567, "step": 6939 }, { "crossentropy": 2.645573854446411, "epoch": 0.37738927105141523, "grad_norm": 0.03315941244363785, "grad_norm_var": 4.489240367224154e-06, "learning_rate": 0.004534355298450501, "loss": 2.6456, "step": 6940 }, { "crossentropy": 2.679767608642578, "epoch": 0.37744364990891544, "grad_norm": 0.03120722435414791, "grad_norm_var": 4.747942218870125e-06, "learning_rate": 0.00453311554299794, "loss": 2.6798, "step": 6941 }, { "crossentropy": 2.617802143096924, "epoch": 0.37749802876641564, "grad_norm": 0.032254382967948914, "grad_norm_var": 4.786861858741562e-06, "learning_rate": 0.004531875816501117, "loss": 2.6178, "step": 6942 }, { "crossentropy": 2.574678897857666, "epoch": 0.37755240762391584, "grad_norm": 0.053163010627031326, "grad_norm_var": 2.9168540846631372e-05, "learning_rate": 0.00453063611903692, "loss": 2.5747, "step": 6943 }, { "crossentropy": 2.4912678003311157, "epoch": 0.37760678648141605, "grad_norm": 0.03354280814528465, "grad_norm_var": 2.886025156085743e-05, "learning_rate": 0.004529396450682232, "loss": 2.4913, "step": 6944 }, { "crossentropy": 2.524744153022766, "epoch": 0.37766116533891625, "grad_norm": 0.033524200320243835, "grad_norm_var": 2.8292480587766227e-05, "learning_rate": 0.004528156811513938, "loss": 2.5247, "step": 6945 }, { "crossentropy": 2.699738025665283, "epoch": 0.37771554419641645, "grad_norm": 0.03507990017533302, "grad_norm_var": 2.8239401168838917e-05, "learning_rate": 0.00452691720160892, "loss": 2.6997, "step": 6946 }, { "crossentropy": 2.7574610710144043, "epoch": 0.37776992305391666, "grad_norm": 0.04001689329743385, "grad_norm_var": 2.7922752825190846e-05, "learning_rate": 0.004525677621044054, "loss": 2.7575, "step": 6947 }, { "crossentropy": 2.5500237941741943, "epoch": 0.37782430191141686, "grad_norm": 0.034985315054655075, "grad_norm_var": 2.7874449155544295e-05, "learning_rate": 0.00452443806989622, "loss": 2.55, "step": 6948 }, { "crossentropy": 2.6569098234176636, "epoch": 0.37787868076891706, "grad_norm": 0.03437776863574982, "grad_norm_var": 2.7766820553091564e-05, "learning_rate": 0.004523198548242296, "loss": 2.6569, "step": 6949 }, { "crossentropy": 2.6338900327682495, "epoch": 0.37793305962641727, "grad_norm": 0.03480396419763565, "grad_norm_var": 2.7722324106285477e-05, "learning_rate": 0.004521959056159151, "loss": 2.6339, "step": 6950 }, { "crossentropy": 2.5830849409103394, "epoch": 0.37798743848391747, "grad_norm": 0.0353623628616333, "grad_norm_var": 2.7445214711483316e-05, "learning_rate": 0.00452071959372366, "loss": 2.5831, "step": 6951 }, { "crossentropy": 2.628873586654663, "epoch": 0.3780418173414177, "grad_norm": 0.03563372790813446, "grad_norm_var": 2.6884924367305073e-05, "learning_rate": 0.004519480161012695, "loss": 2.6289, "step": 6952 }, { "crossentropy": 2.661346197128296, "epoch": 0.3780961961989179, "grad_norm": 0.034485600888729095, "grad_norm_var": 2.571669402826857e-05, "learning_rate": 0.00451824075810312, "loss": 2.6613, "step": 6953 }, { "crossentropy": 2.6942830085754395, "epoch": 0.3781505750564181, "grad_norm": 0.03541189059615135, "grad_norm_var": 2.5291616296426695e-05, "learning_rate": 0.004517001385071803, "loss": 2.6943, "step": 6954 }, { "crossentropy": 2.513518810272217, "epoch": 0.3782049539139183, "grad_norm": 0.035071149468421936, "grad_norm_var": 2.513076778277213e-05, "learning_rate": 0.004515762041995613, "loss": 2.5135, "step": 6955 }, { "crossentropy": 2.662582516670227, "epoch": 0.3782593327714185, "grad_norm": 0.03290211036801338, "grad_norm_var": 2.5223951398516232e-05, "learning_rate": 0.004514522728951408, "loss": 2.6626, "step": 6956 }, { "crossentropy": 2.559512495994568, "epoch": 0.3783137116289187, "grad_norm": 0.033027198165655136, "grad_norm_var": 2.4331301007988562e-05, "learning_rate": 0.004513283446016052, "loss": 2.5595, "step": 6957 }, { "crossentropy": 2.5697726011276245, "epoch": 0.3783680904864189, "grad_norm": 0.03296785056591034, "grad_norm_var": 2.402081685893102e-05, "learning_rate": 0.004512044193266402, "loss": 2.5698, "step": 6958 }, { "crossentropy": 2.5924710035324097, "epoch": 0.3784224693439191, "grad_norm": 0.03716569021344185, "grad_norm_var": 3.187970985899883e-06, "learning_rate": 0.004510804970779316, "loss": 2.5925, "step": 6959 }, { "crossentropy": 2.586261749267578, "epoch": 0.3784768482014193, "grad_norm": 0.03361183777451515, "grad_norm_var": 3.1758011897880614e-06, "learning_rate": 0.004509565778631651, "loss": 2.5863, "step": 6960 }, { "crossentropy": 2.5339781045913696, "epoch": 0.3785312270589195, "grad_norm": 0.03148694336414337, "grad_norm_var": 3.809382713546722e-06, "learning_rate": 0.004508326616900257, "loss": 2.534, "step": 6961 }, { "crossentropy": 2.59506356716156, "epoch": 0.3785856059164197, "grad_norm": 0.033478252589702606, "grad_norm_var": 3.904469295677206e-06, "learning_rate": 0.004507087485661988, "loss": 2.5951, "step": 6962 }, { "crossentropy": 2.667472243309021, "epoch": 0.3786399847739199, "grad_norm": 0.033284712582826614, "grad_norm_var": 1.941456202420438e-06, "learning_rate": 0.004505848384993696, "loss": 2.6675, "step": 6963 }, { "crossentropy": 2.6604918241500854, "epoch": 0.3786943636314201, "grad_norm": 0.03479645028710365, "grad_norm_var": 1.9252576177445148e-06, "learning_rate": 0.004504609314972224, "loss": 2.6605, "step": 6964 }, { "crossentropy": 2.6556025743484497, "epoch": 0.3787487424889203, "grad_norm": 0.03632771596312523, "grad_norm_var": 2.198272873683386e-06, "learning_rate": 0.00450337027567442, "loss": 2.6556, "step": 6965 }, { "crossentropy": 2.5849186182022095, "epoch": 0.3788031213464205, "grad_norm": 0.0352323018014431, "grad_norm_var": 2.2348903895703816e-06, "learning_rate": 0.004502131267177132, "loss": 2.5849, "step": 6966 }, { "crossentropy": 2.6183582544326782, "epoch": 0.3788575002039207, "grad_norm": 0.03332718461751938, "grad_norm_var": 2.230003018300654e-06, "learning_rate": 0.004500892289557196, "loss": 2.6184, "step": 6967 }, { "crossentropy": 2.7078369855880737, "epoch": 0.37891187906142093, "grad_norm": 0.03351789340376854, "grad_norm_var": 2.1231486047543677e-06, "learning_rate": 0.004499653342891455, "loss": 2.7078, "step": 6968 }, { "crossentropy": 2.6138793230056763, "epoch": 0.37896625791892113, "grad_norm": 0.03309283033013344, "grad_norm_var": 2.178522232938682e-06, "learning_rate": 0.004498414427256748, "loss": 2.6139, "step": 6969 }, { "crossentropy": 2.6890827417373657, "epoch": 0.37902063677642134, "grad_norm": 0.03190698102116585, "grad_norm_var": 2.306994236773938e-06, "learning_rate": 0.0044971755427299104, "loss": 2.6891, "step": 6970 }, { "crossentropy": 2.6249366998672485, "epoch": 0.37907501563392154, "grad_norm": 0.0338292233645916, "grad_norm_var": 2.1970129536879692e-06, "learning_rate": 0.004495936689387778, "loss": 2.6249, "step": 6971 }, { "crossentropy": 2.6556971073150635, "epoch": 0.37912939449142175, "grad_norm": 0.03465308994054794, "grad_norm_var": 2.1913359316454045e-06, "learning_rate": 0.004494697867307179, "loss": 2.6557, "step": 6972 }, { "crossentropy": 2.672300934791565, "epoch": 0.37918377334892195, "grad_norm": 0.0351150780916214, "grad_norm_var": 2.2328867275656084e-06, "learning_rate": 0.004493459076564949, "loss": 2.6723, "step": 6973 }, { "crossentropy": 2.6110715866088867, "epoch": 0.37923815220642215, "grad_norm": 0.03145729750394821, "grad_norm_var": 2.580786921500128e-06, "learning_rate": 0.0044922203172379155, "loss": 2.6111, "step": 6974 }, { "crossentropy": 2.665884256362915, "epoch": 0.37929253106392236, "grad_norm": 0.03858337551355362, "grad_norm_var": 3.3250740706045404e-06, "learning_rate": 0.004490981589402904, "loss": 2.6659, "step": 6975 }, { "crossentropy": 2.532299757003784, "epoch": 0.37934690992142256, "grad_norm": 0.037259139120578766, "grad_norm_var": 3.9768163156650434e-06, "learning_rate": 0.004489742893136738, "loss": 2.5323, "step": 6976 }, { "crossentropy": 2.617344379425049, "epoch": 0.37940128877892276, "grad_norm": 0.03274141624569893, "grad_norm_var": 3.619826482415857e-06, "learning_rate": 0.004488504228516246, "loss": 2.6173, "step": 6977 }, { "crossentropy": 2.723613977432251, "epoch": 0.37945566763642297, "grad_norm": 0.03353457525372505, "grad_norm_var": 3.6139461705497754e-06, "learning_rate": 0.004487265595618241, "loss": 2.7236, "step": 6978 }, { "crossentropy": 2.6639970541000366, "epoch": 0.37951004649392317, "grad_norm": 0.03577952831983566, "grad_norm_var": 3.668151329604535e-06, "learning_rate": 0.004486026994519547, "loss": 2.664, "step": 6979 }, { "crossentropy": 2.6295628547668457, "epoch": 0.3795644253514234, "grad_norm": 0.03379034996032715, "grad_norm_var": 3.684556035517724e-06, "learning_rate": 0.004484788425296981, "loss": 2.6296, "step": 6980 }, { "crossentropy": 2.7010074853897095, "epoch": 0.3796188042089236, "grad_norm": 0.035438500344753265, "grad_norm_var": 3.5035535449006703e-06, "learning_rate": 0.0044835498880273555, "loss": 2.701, "step": 6981 }, { "crossentropy": 2.615264892578125, "epoch": 0.3796731830664238, "grad_norm": 0.03507588058710098, "grad_norm_var": 3.4862365369497477e-06, "learning_rate": 0.004482311382787485, "loss": 2.6153, "step": 6982 }, { "crossentropy": 2.5961878299713135, "epoch": 0.379727561923924, "grad_norm": 0.03364201635122299, "grad_norm_var": 3.4508018272277435e-06, "learning_rate": 0.004481072909654178, "loss": 2.5962, "step": 6983 }, { "crossentropy": 2.6364879608154297, "epoch": 0.3797819407814242, "grad_norm": 0.03287631273269653, "grad_norm_var": 3.5467327651930147e-06, "learning_rate": 0.004479834468704247, "loss": 2.6365, "step": 6984 }, { "crossentropy": 2.616451144218445, "epoch": 0.3798363196389244, "grad_norm": 0.034930743277072906, "grad_norm_var": 3.4624037702629673e-06, "learning_rate": 0.004478596060014497, "loss": 2.6165, "step": 6985 }, { "crossentropy": 2.456866145133972, "epoch": 0.3798906984964246, "grad_norm": 0.03601861745119095, "grad_norm_var": 3.1449670018149702e-06, "learning_rate": 0.0044773576836617335, "loss": 2.4569, "step": 6986 }, { "crossentropy": 2.641992449760437, "epoch": 0.3799450773539248, "grad_norm": 0.03316342830657959, "grad_norm_var": 3.2473387242583363e-06, "learning_rate": 0.0044761193397227575, "loss": 2.642, "step": 6987 }, { "crossentropy": 2.625508427619934, "epoch": 0.379999456211425, "grad_norm": 0.031841348856687546, "grad_norm_var": 3.732316450964721e-06, "learning_rate": 0.004474881028274375, "loss": 2.6255, "step": 6988 }, { "crossentropy": 2.633925199508667, "epoch": 0.3800538350689252, "grad_norm": 0.03345071151852608, "grad_norm_var": 3.758517849216288e-06, "learning_rate": 0.004473642749393379, "loss": 2.6339, "step": 6989 }, { "crossentropy": 2.486574649810791, "epoch": 0.3801082139264254, "grad_norm": 0.03365590050816536, "grad_norm_var": 3.212953591205138e-06, "learning_rate": 0.004472404503156569, "loss": 2.4866, "step": 6990 }, { "crossentropy": 2.5175933837890625, "epoch": 0.3801625927839256, "grad_norm": 0.03359433263540268, "grad_norm_var": 2.0432582578841856e-06, "learning_rate": 0.0044711662896407415, "loss": 2.5176, "step": 6991 }, { "crossentropy": 2.6543173789978027, "epoch": 0.3802169716414258, "grad_norm": 0.03359765186905861, "grad_norm_var": 1.3752726930491473e-06, "learning_rate": 0.004469928108922687, "loss": 2.6543, "step": 6992 }, { "crossentropy": 2.5934923887252808, "epoch": 0.380271350498926, "grad_norm": 0.032357554882764816, "grad_norm_var": 1.4461194721417566e-06, "learning_rate": 0.004468689961079194, "loss": 2.5935, "step": 6993 }, { "crossentropy": 2.572413921356201, "epoch": 0.3803257293564262, "grad_norm": 0.03605131804943085, "grad_norm_var": 1.712083023951092e-06, "learning_rate": 0.004467451846187059, "loss": 2.5724, "step": 6994 }, { "crossentropy": 2.731646776199341, "epoch": 0.3803801082139264, "grad_norm": 0.034743133932352066, "grad_norm_var": 1.5442277290886783e-06, "learning_rate": 0.004466213764323061, "loss": 2.7316, "step": 6995 }, { "crossentropy": 2.669731020927429, "epoch": 0.38043448707142663, "grad_norm": 0.03443548083305359, "grad_norm_var": 1.5509816022993747e-06, "learning_rate": 0.0044649757155639895, "loss": 2.6697, "step": 6996 }, { "crossentropy": 2.5003111362457275, "epoch": 0.38048886592892683, "grad_norm": 0.0333324670791626, "grad_norm_var": 1.4395755333629356e-06, "learning_rate": 0.004463737699986624, "loss": 2.5003, "step": 6997 }, { "crossentropy": 2.640631914138794, "epoch": 0.38054324478642704, "grad_norm": 0.03448641300201416, "grad_norm_var": 1.370675695068722e-06, "learning_rate": 0.0044624997176677455, "loss": 2.6406, "step": 6998 }, { "crossentropy": 2.623401641845703, "epoch": 0.38059762364392724, "grad_norm": 0.03622681647539139, "grad_norm_var": 1.7041328165450879e-06, "learning_rate": 0.0044612617686841365, "loss": 2.6234, "step": 6999 }, { "crossentropy": 2.668256998062134, "epoch": 0.38065200250142744, "grad_norm": 0.03662954643368721, "grad_norm_var": 1.9983871228478866e-06, "learning_rate": 0.004460023853112567, "loss": 2.6683, "step": 7000 }, { "crossentropy": 2.510186553001404, "epoch": 0.38070638135892765, "grad_norm": 0.03639693558216095, "grad_norm_var": 2.259526597522763e-06, "learning_rate": 0.004458785971029816, "loss": 2.5102, "step": 7001 }, { "crossentropy": 2.6865888833999634, "epoch": 0.38076076021642785, "grad_norm": 0.032521940767765045, "grad_norm_var": 2.256870676999439e-06, "learning_rate": 0.004457548122512656, "loss": 2.6866, "step": 7002 }, { "crossentropy": 2.6232829093933105, "epoch": 0.38081513907392806, "grad_norm": 0.033009324222803116, "grad_norm_var": 2.278735360524096e-06, "learning_rate": 0.004456310307637854, "loss": 2.6233, "step": 7003 }, { "crossentropy": 2.6778870820999146, "epoch": 0.38086951793142826, "grad_norm": 0.034105084836483, "grad_norm_var": 1.9034970679245842e-06, "learning_rate": 0.004455072526482179, "loss": 2.6779, "step": 7004 }, { "crossentropy": 2.5446271896362305, "epoch": 0.38092389678892846, "grad_norm": 0.03236072137951851, "grad_norm_var": 2.0993151986183877e-06, "learning_rate": 0.0044538347791224005, "loss": 2.5446, "step": 7005 }, { "crossentropy": 2.5937145948410034, "epoch": 0.38097827564642867, "grad_norm": 0.03283444046974182, "grad_norm_var": 2.2031694110654274e-06, "learning_rate": 0.0044525970656352784, "loss": 2.5937, "step": 7006 }, { "crossentropy": 2.62966525554657, "epoch": 0.38103265450392887, "grad_norm": 0.033630698919296265, "grad_norm_var": 2.200471913726733e-06, "learning_rate": 0.004451359386097575, "loss": 2.6297, "step": 7007 }, { "crossentropy": 2.591723680496216, "epoch": 0.3810870333614291, "grad_norm": 0.03317273035645485, "grad_norm_var": 2.244182209350474e-06, "learning_rate": 0.004450121740586053, "loss": 2.5917, "step": 7008 }, { "crossentropy": 2.6154216527938843, "epoch": 0.3811414122189293, "grad_norm": 0.0334017314016819, "grad_norm_var": 2.063692772167706e-06, "learning_rate": 0.004448884129177467, "loss": 2.6154, "step": 7009 }, { "crossentropy": 2.708479166030884, "epoch": 0.3811957910764295, "grad_norm": 0.033508412539958954, "grad_norm_var": 1.8430848328548447e-06, "learning_rate": 0.004447646551948574, "loss": 2.7085, "step": 7010 }, { "crossentropy": 2.6265326738357544, "epoch": 0.3812501699339297, "grad_norm": 0.032052092254161835, "grad_norm_var": 2.04689867613807e-06, "learning_rate": 0.0044464090089761255, "loss": 2.6265, "step": 7011 }, { "crossentropy": 2.676725149154663, "epoch": 0.3813045487914299, "grad_norm": 0.04161527007818222, "grad_norm_var": 5.7990132244005335e-06, "learning_rate": 0.004445171500336875, "loss": 2.6767, "step": 7012 }, { "crossentropy": 2.6563546657562256, "epoch": 0.3813589276489301, "grad_norm": 0.03461470454931259, "grad_norm_var": 5.731178892574793e-06, "learning_rate": 0.004443934026107572, "loss": 2.6564, "step": 7013 }, { "crossentropy": 2.6963744163513184, "epoch": 0.3814133065064303, "grad_norm": 0.03591805696487427, "grad_norm_var": 5.8737834502761136e-06, "learning_rate": 0.00444269658636496, "loss": 2.6964, "step": 7014 }, { "crossentropy": 2.658565402030945, "epoch": 0.3814676853639305, "grad_norm": 0.033301353454589844, "grad_norm_var": 5.735077893834148e-06, "learning_rate": 0.004441459181185786, "loss": 2.6586, "step": 7015 }, { "crossentropy": 2.6374346017837524, "epoch": 0.3815220642214307, "grad_norm": 0.034964676946401596, "grad_norm_var": 5.394984215677188e-06, "learning_rate": 0.004440221810646796, "loss": 2.6374, "step": 7016 }, { "crossentropy": 2.6639679670333862, "epoch": 0.3815764430789309, "grad_norm": 0.03764984384179115, "grad_norm_var": 5.857929709516633e-06, "learning_rate": 0.004438984474824725, "loss": 2.664, "step": 7017 }, { "crossentropy": 2.650571584701538, "epoch": 0.3816308219364311, "grad_norm": 0.03535594046115875, "grad_norm_var": 5.691313364898874e-06, "learning_rate": 0.004437747173796314, "loss": 2.6506, "step": 7018 }, { "crossentropy": 2.646599054336548, "epoch": 0.3816852007939313, "grad_norm": 0.03215388208627701, "grad_norm_var": 5.9034752129024625e-06, "learning_rate": 0.0044365099076383, "loss": 2.6466, "step": 7019 }, { "crossentropy": 2.688328504562378, "epoch": 0.3817395796514315, "grad_norm": 0.032966770231723785, "grad_norm_var": 6.031494272708675e-06, "learning_rate": 0.004435272676427415, "loss": 2.6883, "step": 7020 }, { "crossentropy": 2.6356160640716553, "epoch": 0.3817939585089317, "grad_norm": 0.03172624111175537, "grad_norm_var": 6.224420613022928e-06, "learning_rate": 0.004434035480240392, "loss": 2.6356, "step": 7021 }, { "crossentropy": 2.515416383743286, "epoch": 0.3818483373664319, "grad_norm": 0.03209586441516876, "grad_norm_var": 6.40324906995444e-06, "learning_rate": 0.004432798319153963, "loss": 2.5154, "step": 7022 }, { "crossentropy": 2.547760844230652, "epoch": 0.3819027162239321, "grad_norm": 0.033293914049863815, "grad_norm_var": 6.43850756465904e-06, "learning_rate": 0.004431561193244852, "loss": 2.5478, "step": 7023 }, { "crossentropy": 2.598618745803833, "epoch": 0.38195709508143233, "grad_norm": 0.03321466222405434, "grad_norm_var": 6.43266739546357e-06, "learning_rate": 0.004430324102589786, "loss": 2.5986, "step": 7024 }, { "crossentropy": 2.683617353439331, "epoch": 0.38201147393893253, "grad_norm": 0.03182750195264816, "grad_norm_var": 6.763418709966838e-06, "learning_rate": 0.004429087047265488, "loss": 2.6836, "step": 7025 }, { "crossentropy": 2.6278682947158813, "epoch": 0.38206585279643274, "grad_norm": 0.034006617963314056, "grad_norm_var": 6.7368973842516404e-06, "learning_rate": 0.004427850027348678, "loss": 2.6279, "step": 7026 }, { "crossentropy": 2.645410180091858, "epoch": 0.38212023165393294, "grad_norm": 0.03210112079977989, "grad_norm_var": 6.723187285906316e-06, "learning_rate": 0.004426613042916078, "loss": 2.6454, "step": 7027 }, { "crossentropy": 2.672945499420166, "epoch": 0.38217461051143314, "grad_norm": 0.03660496696829796, "grad_norm_var": 3.3220003545016603e-06, "learning_rate": 0.004425376094044401, "loss": 2.6729, "step": 7028 }, { "crossentropy": 2.64665150642395, "epoch": 0.38222898936893335, "grad_norm": 0.03373456001281738, "grad_norm_var": 3.282114619046252e-06, "learning_rate": 0.004424139180810363, "loss": 2.6467, "step": 7029 }, { "crossentropy": 2.637297749519348, "epoch": 0.38228336822643355, "grad_norm": 0.036294300109148026, "grad_norm_var": 3.3968523535802056e-06, "learning_rate": 0.0044229023032906776, "loss": 2.6373, "step": 7030 }, { "crossentropy": 2.6527371406555176, "epoch": 0.38233774708393375, "grad_norm": 0.03253830224275589, "grad_norm_var": 3.4871050681320806e-06, "learning_rate": 0.004421665461562051, "loss": 2.6527, "step": 7031 }, { "crossentropy": 2.694248914718628, "epoch": 0.38239212594143396, "grad_norm": 0.034530047327280045, "grad_norm_var": 3.430436813966521e-06, "learning_rate": 0.004420428655701194, "loss": 2.6942, "step": 7032 }, { "crossentropy": 2.631836414337158, "epoch": 0.38244650479893416, "grad_norm": 0.03518746420741081, "grad_norm_var": 2.5309476235435904e-06, "learning_rate": 0.004419191885784813, "loss": 2.6318, "step": 7033 }, { "crossentropy": 2.5225380659103394, "epoch": 0.38250088365643437, "grad_norm": 0.03375685587525368, "grad_norm_var": 2.3168067866154163e-06, "learning_rate": 0.004417955151889608, "loss": 2.5225, "step": 7034 }, { "crossentropy": 2.668726325035095, "epoch": 0.38255526251393457, "grad_norm": 0.0341627299785614, "grad_norm_var": 2.207917203271107e-06, "learning_rate": 0.004416718454092283, "loss": 2.6687, "step": 7035 }, { "crossentropy": 2.6241369247436523, "epoch": 0.38260964137143477, "grad_norm": 0.034464213997125626, "grad_norm_var": 2.2161186089127034e-06, "learning_rate": 0.004415481792469536, "loss": 2.6241, "step": 7036 }, { "crossentropy": 2.649320363998413, "epoch": 0.382664020228935, "grad_norm": 0.03181532770395279, "grad_norm_var": 2.192917968845259e-06, "learning_rate": 0.004414245167098061, "loss": 2.6493, "step": 7037 }, { "crossentropy": 2.6688283681869507, "epoch": 0.3827183990864352, "grad_norm": 0.032853864133358, "grad_norm_var": 2.0639972422413383e-06, "learning_rate": 0.004413008578054558, "loss": 2.6688, "step": 7038 }, { "crossentropy": 2.534313201904297, "epoch": 0.3827727779439354, "grad_norm": 0.03472975268959999, "grad_norm_var": 2.1009098424191902e-06, "learning_rate": 0.0044117720254157125, "loss": 2.5343, "step": 7039 }, { "crossentropy": 2.6285316944122314, "epoch": 0.3828271568014356, "grad_norm": 0.03355195373296738, "grad_norm_var": 2.078822852448712e-06, "learning_rate": 0.004410535509258219, "loss": 2.6285, "step": 7040 }, { "crossentropy": 2.6409634351730347, "epoch": 0.3828815356589358, "grad_norm": 0.03349825367331505, "grad_norm_var": 1.7949494576257584e-06, "learning_rate": 0.004409299029658764, "loss": 2.641, "step": 7041 }, { "crossentropy": 2.6300971508026123, "epoch": 0.382935914516436, "grad_norm": 0.0358811616897583, "grad_norm_var": 2.018873608751875e-06, "learning_rate": 0.004408062586694032, "loss": 2.6301, "step": 7042 }, { "crossentropy": 2.6768280267715454, "epoch": 0.3829902933739362, "grad_norm": 0.036757130175828934, "grad_norm_var": 2.1287991955627636e-06, "learning_rate": 0.004406826180440706, "loss": 2.6768, "step": 7043 }, { "crossentropy": 2.638733386993408, "epoch": 0.3830446722314364, "grad_norm": 0.033670518547296524, "grad_norm_var": 1.8033144850178202e-06, "learning_rate": 0.004405589810975468, "loss": 2.6387, "step": 7044 }, { "crossentropy": 2.7048381567001343, "epoch": 0.3830990510889366, "grad_norm": 0.034490201622247696, "grad_norm_var": 1.7906816445328415e-06, "learning_rate": 0.004404353478374996, "loss": 2.7048, "step": 7045 }, { "crossentropy": 2.59085214138031, "epoch": 0.3831534299464368, "grad_norm": 0.03336942195892334, "grad_norm_var": 1.532557786266114e-06, "learning_rate": 0.0044031171827159645, "loss": 2.5909, "step": 7046 }, { "crossentropy": 2.6404120922088623, "epoch": 0.383207808803937, "grad_norm": 0.03156350553035736, "grad_norm_var": 1.7921407813906512e-06, "learning_rate": 0.004401880924075051, "loss": 2.6404, "step": 7047 }, { "crossentropy": 2.6636900901794434, "epoch": 0.3832621876614372, "grad_norm": 0.034854426980018616, "grad_norm_var": 1.820878660927644e-06, "learning_rate": 0.0044006447025289235, "loss": 2.6637, "step": 7048 }, { "crossentropy": 2.5086764097213745, "epoch": 0.3833165665189374, "grad_norm": 0.03485869988799095, "grad_norm_var": 1.777243656556765e-06, "learning_rate": 0.004399408518154253, "loss": 2.5087, "step": 7049 }, { "crossentropy": 2.6176739931106567, "epoch": 0.3833709453764376, "grad_norm": 0.0359545461833477, "grad_norm_var": 2.002769777742497e-06, "learning_rate": 0.0043981723710277065, "loss": 2.6177, "step": 7050 }, { "crossentropy": 2.714088201522827, "epoch": 0.3834253242339378, "grad_norm": 0.03456767275929451, "grad_norm_var": 2.0134502854082223e-06, "learning_rate": 0.004396936261225949, "loss": 2.7141, "step": 7051 }, { "crossentropy": 2.5972750186920166, "epoch": 0.38347970309143803, "grad_norm": 0.04046489670872688, "grad_norm_var": 4.4913268504586636e-06, "learning_rate": 0.004395700188825642, "loss": 2.5973, "step": 7052 }, { "crossentropy": 2.5154178142547607, "epoch": 0.38353408194893823, "grad_norm": 0.03189970180392265, "grad_norm_var": 4.460949864753862e-06, "learning_rate": 0.004394464153903445, "loss": 2.5154, "step": 7053 }, { "crossentropy": 2.598962187767029, "epoch": 0.38358846080643844, "grad_norm": 0.03441100940108299, "grad_norm_var": 4.258192779983625e-06, "learning_rate": 0.004393228156536017, "loss": 2.599, "step": 7054 }, { "crossentropy": 2.562290072441101, "epoch": 0.38364283966393864, "grad_norm": 0.035751018673181534, "grad_norm_var": 4.33319357140919e-06, "learning_rate": 0.004391992196800014, "loss": 2.5623, "step": 7055 }, { "crossentropy": 2.613268733024597, "epoch": 0.38369721852143884, "grad_norm": 0.03203043341636658, "grad_norm_var": 4.7151492098925754e-06, "learning_rate": 0.004390756274772087, "loss": 2.6133, "step": 7056 }, { "crossentropy": 2.6390374898910522, "epoch": 0.38375159737893905, "grad_norm": 0.03275510296225548, "grad_norm_var": 4.8614518695463385e-06, "learning_rate": 0.004389520390528888, "loss": 2.639, "step": 7057 }, { "crossentropy": 2.5212284326553345, "epoch": 0.38380597623643925, "grad_norm": 0.03543825447559357, "grad_norm_var": 4.796871071746587e-06, "learning_rate": 0.004388284544147066, "loss": 2.5212, "step": 7058 }, { "crossentropy": 2.63576602935791, "epoch": 0.38386035509393945, "grad_norm": 0.03616945073008537, "grad_norm_var": 4.645690799354116e-06, "learning_rate": 0.004387048735703265, "loss": 2.6358, "step": 7059 }, { "crossentropy": 2.583741784095764, "epoch": 0.38391473395143966, "grad_norm": 0.033039312809705734, "grad_norm_var": 4.741710909803888e-06, "learning_rate": 0.00438581296527413, "loss": 2.5837, "step": 7060 }, { "crossentropy": 2.6729841232299805, "epoch": 0.38396911280893986, "grad_norm": 0.03344032168388367, "grad_norm_var": 4.808627890338461e-06, "learning_rate": 0.0043845772329363045, "loss": 2.673, "step": 7061 }, { "crossentropy": 2.6009620428085327, "epoch": 0.38402349166644006, "grad_norm": 0.03514525294303894, "grad_norm_var": 4.759225867040362e-06, "learning_rate": 0.004383341538766424, "loss": 2.601, "step": 7062 }, { "crossentropy": 2.514360547065735, "epoch": 0.38407787052394027, "grad_norm": 0.0351550430059433, "grad_norm_var": 4.148934238725385e-06, "learning_rate": 0.004382105882841124, "loss": 2.5144, "step": 7063 }, { "crossentropy": 2.59349524974823, "epoch": 0.38413224938144047, "grad_norm": 0.03745848685503006, "grad_norm_var": 4.610420007753607e-06, "learning_rate": 0.004380870265237045, "loss": 2.5935, "step": 7064 }, { "crossentropy": 2.6723580360412598, "epoch": 0.3841866282389407, "grad_norm": 0.03394784778356552, "grad_norm_var": 4.668345623612807e-06, "learning_rate": 0.004379634686030812, "loss": 2.6724, "step": 7065 }, { "crossentropy": 2.6646991968154907, "epoch": 0.3842410070964409, "grad_norm": 0.03550204634666443, "grad_norm_var": 4.6146088623538285e-06, "learning_rate": 0.00437839914529906, "loss": 2.6647, "step": 7066 }, { "crossentropy": 2.6615296602249146, "epoch": 0.3842953859539411, "grad_norm": 0.03551195189356804, "grad_norm_var": 4.638129323475404e-06, "learning_rate": 0.004377163643118412, "loss": 2.6615, "step": 7067 }, { "crossentropy": 2.625943899154663, "epoch": 0.3843497648114413, "grad_norm": 0.03209425136446953, "grad_norm_var": 2.7869348939573804e-06, "learning_rate": 0.004375928179565494, "loss": 2.6259, "step": 7068 }, { "crossentropy": 2.639293909072876, "epoch": 0.3844041436689415, "grad_norm": 0.033298060297966, "grad_norm_var": 2.450553132556504e-06, "learning_rate": 0.004374692754716929, "loss": 2.6393, "step": 7069 }, { "crossentropy": 2.474992871284485, "epoch": 0.3844585225264417, "grad_norm": 0.034476228058338165, "grad_norm_var": 2.450508265220558e-06, "learning_rate": 0.004373457368649335, "loss": 2.475, "step": 7070 }, { "crossentropy": 2.6622159481048584, "epoch": 0.3845129013839419, "grad_norm": 0.03367038816213608, "grad_norm_var": 2.360373502522696e-06, "learning_rate": 0.0043722220214393315, "loss": 2.6622, "step": 7071 }, { "crossentropy": 2.6154139041900635, "epoch": 0.3845672802414421, "grad_norm": 0.032239530235528946, "grad_norm_var": 2.299252285884827e-06, "learning_rate": 0.004370986713163535, "loss": 2.6154, "step": 7072 }, { "crossentropy": 2.6539982557296753, "epoch": 0.38462165909894236, "grad_norm": 0.03596346452832222, "grad_norm_var": 2.2672443216203523e-06, "learning_rate": 0.004369751443898554, "loss": 2.654, "step": 7073 }, { "crossentropy": 2.6710504293441772, "epoch": 0.38467603795644256, "grad_norm": 0.03976825997233391, "grad_norm_var": 3.96089788657668e-06, "learning_rate": 0.004368516213721, "loss": 2.6711, "step": 7074 }, { "crossentropy": 2.5646997690200806, "epoch": 0.38473041681394277, "grad_norm": 0.07864514738321304, "grad_norm_var": 0.0001244499508882511, "learning_rate": 0.004367281022707485, "loss": 2.5647, "step": 7075 }, { "crossentropy": 2.641209602355957, "epoch": 0.38478479567144297, "grad_norm": 0.03773203492164612, "grad_norm_var": 0.0001230604682300064, "learning_rate": 0.004366045870934608, "loss": 2.6412, "step": 7076 }, { "crossentropy": 2.6116585731506348, "epoch": 0.3848391745289432, "grad_norm": 0.03491605445742607, "grad_norm_var": 0.00012234799462098692, "learning_rate": 0.004364810758478976, "loss": 2.6117, "step": 7077 }, { "crossentropy": 2.6804709434509277, "epoch": 0.3848935533864434, "grad_norm": 0.0345575213432312, "grad_norm_var": 0.00012258116727949426, "learning_rate": 0.004363575685417189, "loss": 2.6805, "step": 7078 }, { "crossentropy": 2.7117080688476562, "epoch": 0.3849479322439436, "grad_norm": 0.03553780913352966, "grad_norm_var": 0.0001224549026777655, "learning_rate": 0.004362340651825844, "loss": 2.7117, "step": 7079 }, { "crossentropy": 2.6237664222717285, "epoch": 0.3850023111014438, "grad_norm": 0.05960868299007416, "grad_norm_var": 0.00015201492602295099, "learning_rate": 0.00436110565778154, "loss": 2.6238, "step": 7080 }, { "crossentropy": 2.6296533346176147, "epoch": 0.385056689958944, "grad_norm": 0.033365555107593536, "grad_norm_var": 0.0001524451961989357, "learning_rate": 0.0043598707033608635, "loss": 2.6297, "step": 7081 }, { "crossentropy": 2.680406332015991, "epoch": 0.3851110688164442, "grad_norm": 0.0626603290438652, "grad_norm_var": 0.00018522363102259538, "learning_rate": 0.00435863578864041, "loss": 2.6804, "step": 7082 }, { "crossentropy": 2.491774559020996, "epoch": 0.3851654476739444, "grad_norm": 0.04075275734066963, "grad_norm_var": 0.00018319072268824024, "learning_rate": 0.004357400913696769, "loss": 2.4918, "step": 7083 }, { "crossentropy": 2.732062578201294, "epoch": 0.3852198265314446, "grad_norm": 0.035345371812582016, "grad_norm_var": 0.0001799018174704364, "learning_rate": 0.004356166078606523, "loss": 2.7321, "step": 7084 }, { "crossentropy": 2.6527098417282104, "epoch": 0.3852742053889448, "grad_norm": 0.03512656316161156, "grad_norm_var": 0.0001781334348888871, "learning_rate": 0.0043549312834462554, "loss": 2.6527, "step": 7085 }, { "crossentropy": 2.5593308210372925, "epoch": 0.385328584246445, "grad_norm": 0.03492377698421478, "grad_norm_var": 0.00017772545886644557, "learning_rate": 0.004353696528292551, "loss": 2.5593, "step": 7086 }, { "crossentropy": 2.6010807752609253, "epoch": 0.3853829631039452, "grad_norm": 0.035802632570266724, "grad_norm_var": 0.0001757692098598273, "learning_rate": 0.004352461813221983, "loss": 2.6011, "step": 7087 }, { "crossentropy": 2.506141424179077, "epoch": 0.3854373419614454, "grad_norm": 0.03315255045890808, "grad_norm_var": 0.00017467156663363478, "learning_rate": 0.00435122713831113, "loss": 2.5061, "step": 7088 }, { "crossentropy": 2.647080421447754, "epoch": 0.3854917208189456, "grad_norm": 0.034756850451231, "grad_norm_var": 0.00017569208748063314, "learning_rate": 0.004349992503636566, "loss": 2.6471, "step": 7089 }, { "crossentropy": 2.7192881107330322, "epoch": 0.3855460996764458, "grad_norm": 0.03941518813371658, "grad_norm_var": 0.00017578920511638103, "learning_rate": 0.004348757909274861, "loss": 2.7193, "step": 7090 }, { "crossentropy": 2.5674989223480225, "epoch": 0.385600478533946, "grad_norm": 0.04279180243611336, "grad_norm_var": 7.92470687796291e-05, "learning_rate": 0.0043475233553025815, "loss": 2.5675, "step": 7091 }, { "crossentropy": 2.6644551753997803, "epoch": 0.3856548573914462, "grad_norm": 0.03440070524811745, "grad_norm_var": 8.06828134082985e-05, "learning_rate": 0.004346288841796297, "loss": 2.6645, "step": 7092 }, { "crossentropy": 2.62967050075531, "epoch": 0.38570923624894643, "grad_norm": 0.0329587459564209, "grad_norm_var": 8.20388545674379e-05, "learning_rate": 0.004345054368832569, "loss": 2.6297, "step": 7093 }, { "crossentropy": 2.6749264001846313, "epoch": 0.38576361510644663, "grad_norm": 0.034600768238306046, "grad_norm_var": 8.201293809109623e-05, "learning_rate": 0.004343819936487958, "loss": 2.6749, "step": 7094 }, { "crossentropy": 2.6319026947021484, "epoch": 0.38581799396394684, "grad_norm": 0.03313090652227402, "grad_norm_var": 8.351017016322056e-05, "learning_rate": 0.004342585544839023, "loss": 2.6319, "step": 7095 }, { "crossentropy": 2.554766297340393, "epoch": 0.38587237282144704, "grad_norm": 0.03413207456469536, "grad_norm_var": 5.381480985599263e-05, "learning_rate": 0.004341351193962318, "loss": 2.5548, "step": 7096 }, { "crossentropy": 2.473252296447754, "epoch": 0.38592675167894724, "grad_norm": 0.03457092493772507, "grad_norm_var": 5.326810008004686e-05, "learning_rate": 0.004340116883934401, "loss": 2.4733, "step": 7097 }, { "crossentropy": 2.736277937889099, "epoch": 0.38598113053644745, "grad_norm": 0.033585187047719955, "grad_norm_var": 8.206535963137194e-06, "learning_rate": 0.004338882614831816, "loss": 2.7363, "step": 7098 }, { "crossentropy": 2.6371370553970337, "epoch": 0.38603550939394765, "grad_norm": 0.03467344865202904, "grad_norm_var": 6.331955979840299e-06, "learning_rate": 0.004337648386731117, "loss": 2.6371, "step": 7099 }, { "crossentropy": 2.5551236867904663, "epoch": 0.38608988825144785, "grad_norm": 0.03225192800164223, "grad_norm_var": 6.874401064385117e-06, "learning_rate": 0.004336414199708848, "loss": 2.5551, "step": 7100 }, { "crossentropy": 2.551913022994995, "epoch": 0.38614426710894806, "grad_norm": 0.03392721712589264, "grad_norm_var": 6.946802966460025e-06, "learning_rate": 0.004335180053841551, "loss": 2.5519, "step": 7101 }, { "crossentropy": 2.6005383729934692, "epoch": 0.38619864596644826, "grad_norm": 0.03486229106783867, "grad_norm_var": 6.9471900305280325e-06, "learning_rate": 0.004333945949205766, "loss": 2.6005, "step": 7102 }, { "crossentropy": 2.6390209197998047, "epoch": 0.38625302482394847, "grad_norm": 0.03371907025575638, "grad_norm_var": 6.978405573016393e-06, "learning_rate": 0.004332711885878034, "loss": 2.639, "step": 7103 }, { "crossentropy": 2.640999674797058, "epoch": 0.38630740368144867, "grad_norm": 0.03365618735551834, "grad_norm_var": 6.88308567996735e-06, "learning_rate": 0.004331477863934888, "loss": 2.641, "step": 7104 }, { "crossentropy": 2.6302921772003174, "epoch": 0.3863617825389489, "grad_norm": 0.03936991095542908, "grad_norm_var": 8.162220660852769e-06, "learning_rate": 0.004330243883452863, "loss": 2.6303, "step": 7105 }, { "crossentropy": 2.6637710332870483, "epoch": 0.3864161613964491, "grad_norm": 0.03243954852223396, "grad_norm_var": 7.215896290614585e-06, "learning_rate": 0.004329009944508486, "loss": 2.6638, "step": 7106 }, { "crossentropy": 2.6143163442611694, "epoch": 0.3864705402539493, "grad_norm": 0.034739624708890915, "grad_norm_var": 2.572018240173425e-06, "learning_rate": 0.0043277760471782875, "loss": 2.6143, "step": 7107 }, { "crossentropy": 2.6002583503723145, "epoch": 0.3865249191114495, "grad_norm": 0.03696276247501373, "grad_norm_var": 3.054713536940895e-06, "learning_rate": 0.004326542191538793, "loss": 2.6003, "step": 7108 }, { "crossentropy": 2.6017857789993286, "epoch": 0.3865792979689497, "grad_norm": 0.03896518424153328, "grad_norm_var": 4.196318522952693e-06, "learning_rate": 0.004325308377666521, "loss": 2.6018, "step": 7109 }, { "crossentropy": 2.691232442855835, "epoch": 0.3866336768264499, "grad_norm": 0.03536079078912735, "grad_norm_var": 4.219913589434409e-06, "learning_rate": 0.004324074605637996, "loss": 2.6912, "step": 7110 }, { "crossentropy": 2.591148018836975, "epoch": 0.3866880556839501, "grad_norm": 0.0335664302110672, "grad_norm_var": 4.136488572446112e-06, "learning_rate": 0.004322840875529734, "loss": 2.5911, "step": 7111 }, { "crossentropy": 2.582284688949585, "epoch": 0.3867424345414503, "grad_norm": 0.03301333636045456, "grad_norm_var": 4.314180785829937e-06, "learning_rate": 0.00432160718741825, "loss": 2.5823, "step": 7112 }, { "crossentropy": 2.571834683418274, "epoch": 0.3867968133989505, "grad_norm": 0.03321748971939087, "grad_norm_var": 4.457191606619705e-06, "learning_rate": 0.004320373541380053, "loss": 2.5718, "step": 7113 }, { "crossentropy": 2.405762553215027, "epoch": 0.3868511922564507, "grad_norm": 0.03456009551882744, "grad_norm_var": 4.378909696394796e-06, "learning_rate": 0.00431913993749166, "loss": 2.4058, "step": 7114 }, { "crossentropy": 2.6742924451828003, "epoch": 0.3869055711139509, "grad_norm": 0.03423769772052765, "grad_norm_var": 4.392629566897973e-06, "learning_rate": 0.00431790637582957, "loss": 2.6743, "step": 7115 }, { "crossentropy": 2.6361985206604004, "epoch": 0.3869599499714511, "grad_norm": 0.033973000943660736, "grad_norm_var": 4.0210116257363125e-06, "learning_rate": 0.004316672856470291, "loss": 2.6362, "step": 7116 }, { "crossentropy": 2.6427403688430786, "epoch": 0.3870143288289513, "grad_norm": 0.032996222376823425, "grad_norm_var": 4.18174495650647e-06, "learning_rate": 0.004315439379490326, "loss": 2.6427, "step": 7117 }, { "crossentropy": 2.6215745210647583, "epoch": 0.3870687076864515, "grad_norm": 0.03182405233383179, "grad_norm_var": 4.7040631770631685e-06, "learning_rate": 0.004314205944966171, "loss": 2.6216, "step": 7118 }, { "crossentropy": 2.6676418781280518, "epoch": 0.3871230865439517, "grad_norm": 0.03367168456315994, "grad_norm_var": 4.709374984112937e-06, "learning_rate": 0.004312972552974326, "loss": 2.6676, "step": 7119 }, { "crossentropy": 2.5840030908584595, "epoch": 0.3871774654014519, "grad_norm": 0.034686893224716187, "grad_norm_var": 4.655050539688806e-06, "learning_rate": 0.0043117392035912794, "loss": 2.584, "step": 7120 }, { "crossentropy": 2.758972644805908, "epoch": 0.38723184425895213, "grad_norm": 0.03363553807139397, "grad_norm_var": 3.0625170368294683e-06, "learning_rate": 0.004310505896893527, "loss": 2.759, "step": 7121 }, { "crossentropy": 2.590553879737854, "epoch": 0.38728622311645233, "grad_norm": 0.031779758632183075, "grad_norm_var": 3.2481709131939226e-06, "learning_rate": 0.004309272632957557, "loss": 2.5906, "step": 7122 }, { "crossentropy": 2.6660869121551514, "epoch": 0.38734060197395254, "grad_norm": 0.032795704901218414, "grad_norm_var": 3.3443295810855456e-06, "learning_rate": 0.004308039411859854, "loss": 2.6661, "step": 7123 }, { "crossentropy": 2.668628454208374, "epoch": 0.38739498083145274, "grad_norm": 0.03303154557943344, "grad_norm_var": 2.7981056405024587e-06, "learning_rate": 0.004306806233676901, "loss": 2.6686, "step": 7124 }, { "crossentropy": 2.6528286933898926, "epoch": 0.38744935968895294, "grad_norm": 0.03319406881928444, "grad_norm_var": 9.299880272816673e-07, "learning_rate": 0.00430557309848518, "loss": 2.6528, "step": 7125 }, { "crossentropy": 2.6915714740753174, "epoch": 0.38750373854645315, "grad_norm": 0.033324915915727615, "grad_norm_var": 6.761944860813236e-07, "learning_rate": 0.004304340006361167, "loss": 2.6916, "step": 7126 }, { "crossentropy": 2.593032121658325, "epoch": 0.38755811740395335, "grad_norm": 0.036667365580797195, "grad_norm_var": 1.369032955102154e-06, "learning_rate": 0.004303106957381338, "loss": 2.593, "step": 7127 }, { "crossentropy": 2.5955711603164673, "epoch": 0.38761249626145355, "grad_norm": 0.03623059019446373, "grad_norm_var": 1.7908528765206896e-06, "learning_rate": 0.004301873951622168, "loss": 2.5956, "step": 7128 }, { "crossentropy": 2.6994060277938843, "epoch": 0.38766687511895376, "grad_norm": 0.03352085128426552, "grad_norm_var": 1.7755038510702236e-06, "learning_rate": 0.004300640989160122, "loss": 2.6994, "step": 7129 }, { "crossentropy": 2.653250575065613, "epoch": 0.38772125397645396, "grad_norm": 0.032018374651670456, "grad_norm_var": 1.907490414076949e-06, "learning_rate": 0.004299408070071671, "loss": 2.6533, "step": 7130 }, { "crossentropy": 2.6532740592956543, "epoch": 0.38777563283395416, "grad_norm": 0.0333150252699852, "grad_norm_var": 1.8821564785294227e-06, "learning_rate": 0.004298175194433279, "loss": 2.6533, "step": 7131 }, { "crossentropy": 2.601436495780945, "epoch": 0.38783001169145437, "grad_norm": 0.0317755751311779, "grad_norm_var": 2.0575526416763876e-06, "learning_rate": 0.004296942362321406, "loss": 2.6014, "step": 7132 }, { "crossentropy": 2.6525423526763916, "epoch": 0.38788439054895457, "grad_norm": 0.03404660150408745, "grad_norm_var": 2.069362710284348e-06, "learning_rate": 0.004295709573812514, "loss": 2.6525, "step": 7133 }, { "crossentropy": 2.590882182121277, "epoch": 0.3879387694064548, "grad_norm": 0.032865092158317566, "grad_norm_var": 1.9086442960160208e-06, "learning_rate": 0.004294476828983056, "loss": 2.5909, "step": 7134 }, { "crossentropy": 2.630556583404541, "epoch": 0.387993148263955, "grad_norm": 0.034396544098854065, "grad_norm_var": 1.954695915265016e-06, "learning_rate": 0.004293244127909487, "loss": 2.6306, "step": 7135 }, { "crossentropy": 2.5474365949630737, "epoch": 0.3880475271214552, "grad_norm": 0.033585354685783386, "grad_norm_var": 1.8680020092290764e-06, "learning_rate": 0.00429201147066826, "loss": 2.5474, "step": 7136 }, { "crossentropy": 2.59438157081604, "epoch": 0.3881019059789554, "grad_norm": 0.035003915429115295, "grad_norm_var": 2.0076738010003464e-06, "learning_rate": 0.004290778857335819, "loss": 2.5944, "step": 7137 }, { "crossentropy": 2.5023598670959473, "epoch": 0.3881562848364556, "grad_norm": 0.034135427325963974, "grad_norm_var": 1.7837354836531978e-06, "learning_rate": 0.004289546287988614, "loss": 2.5024, "step": 7138 }, { "crossentropy": 2.64477276802063, "epoch": 0.3882106636939558, "grad_norm": 0.03418787568807602, "grad_norm_var": 1.7288097843251003e-06, "learning_rate": 0.004288313762703086, "loss": 2.6448, "step": 7139 }, { "crossentropy": 2.590141177177429, "epoch": 0.388265042551456, "grad_norm": 0.03323132544755936, "grad_norm_var": 1.7100037644295885e-06, "learning_rate": 0.0042870812815556745, "loss": 2.5901, "step": 7140 }, { "crossentropy": 2.557005524635315, "epoch": 0.3883194214089562, "grad_norm": 0.03195803239941597, "grad_norm_var": 1.91254970727846e-06, "learning_rate": 0.004285848844622815, "loss": 2.557, "step": 7141 }, { "crossentropy": 2.6393706798553467, "epoch": 0.3883738002664564, "grad_norm": 0.0327187217772007, "grad_norm_var": 1.9712023654323725e-06, "learning_rate": 0.004284616451980948, "loss": 2.6394, "step": 7142 }, { "crossentropy": 2.588369131088257, "epoch": 0.3884281791239566, "grad_norm": 0.03372333198785782, "grad_norm_var": 1.3593113636687267e-06, "learning_rate": 0.0042833841037064994, "loss": 2.5884, "step": 7143 }, { "crossentropy": 2.4847522974014282, "epoch": 0.3884825579814568, "grad_norm": 0.034422196447849274, "grad_norm_var": 9.160461622312668e-07, "learning_rate": 0.004282151799875901, "loss": 2.4848, "step": 7144 }, { "crossentropy": 2.59353506565094, "epoch": 0.388536936838957, "grad_norm": 0.0341198667883873, "grad_norm_var": 9.456075351389759e-07, "learning_rate": 0.00428091954056558, "loss": 2.5935, "step": 7145 }, { "crossentropy": 2.630220890045166, "epoch": 0.3885913156964572, "grad_norm": 0.035164713859558105, "grad_norm_var": 9.55787979589097e-07, "learning_rate": 0.004279687325851957, "loss": 2.6302, "step": 7146 }, { "crossentropy": 2.5992329120635986, "epoch": 0.3886456945539574, "grad_norm": 0.032933905720710754, "grad_norm_var": 9.826810209260032e-07, "learning_rate": 0.004278455155811457, "loss": 2.5992, "step": 7147 }, { "crossentropy": 2.5865819454193115, "epoch": 0.3887000734114576, "grad_norm": 0.0380161888897419, "grad_norm_var": 1.8639245819957253e-06, "learning_rate": 0.0042772230305204925, "loss": 2.5866, "step": 7148 }, { "crossentropy": 2.785888671875, "epoch": 0.38875445226895783, "grad_norm": 0.03442524001002312, "grad_norm_var": 1.8736313544787455e-06, "learning_rate": 0.0042759909500554825, "loss": 2.7859, "step": 7149 }, { "crossentropy": 2.651283621788025, "epoch": 0.38880883112645803, "grad_norm": 0.040139440447092056, "grad_norm_var": 4.026314330220678e-06, "learning_rate": 0.004274758914492841, "loss": 2.6513, "step": 7150 }, { "crossentropy": 2.51248836517334, "epoch": 0.38886320998395824, "grad_norm": 0.03180085867643356, "grad_norm_var": 4.486724380053699e-06, "learning_rate": 0.004273526923908973, "loss": 2.5125, "step": 7151 }, { "crossentropy": 2.5231800079345703, "epoch": 0.38891758884145844, "grad_norm": 0.03471693769097328, "grad_norm_var": 4.45170331872796e-06, "learning_rate": 0.004272294978380288, "loss": 2.5232, "step": 7152 }, { "crossentropy": 2.6825281381607056, "epoch": 0.38897196769895864, "grad_norm": 0.03412589803338051, "grad_norm_var": 4.431365964136883e-06, "learning_rate": 0.004271063077983191, "loss": 2.6825, "step": 7153 }, { "crossentropy": 2.7500576972961426, "epoch": 0.38902634655645885, "grad_norm": 0.03301848843693733, "grad_norm_var": 4.543340629533053e-06, "learning_rate": 0.004269831222794083, "loss": 2.7501, "step": 7154 }, { "crossentropy": 2.5672802925109863, "epoch": 0.38908072541395905, "grad_norm": 0.03300797566771507, "grad_norm_var": 4.6470367469773784e-06, "learning_rate": 0.00426859941288936, "loss": 2.5673, "step": 7155 }, { "crossentropy": 2.5950924158096313, "epoch": 0.38913510427145925, "grad_norm": 0.03339338302612305, "grad_norm_var": 4.627310986045757e-06, "learning_rate": 0.004267367648345422, "loss": 2.5951, "step": 7156 }, { "crossentropy": 2.7409547567367554, "epoch": 0.38918948312895946, "grad_norm": 0.03355221077799797, "grad_norm_var": 4.303157083042757e-06, "learning_rate": 0.0042661359292386555, "loss": 2.741, "step": 7157 }, { "crossentropy": 2.640709400177002, "epoch": 0.38924386198645966, "grad_norm": 0.03242472931742668, "grad_norm_var": 4.371717970079863e-06, "learning_rate": 0.004264904255645457, "loss": 2.6407, "step": 7158 }, { "crossentropy": 2.566935181617737, "epoch": 0.38929824084395986, "grad_norm": 0.03368443250656128, "grad_norm_var": 4.374863576661556e-06, "learning_rate": 0.004263672627642212, "loss": 2.5669, "step": 7159 }, { "crossentropy": 2.703294038772583, "epoch": 0.38935261970146007, "grad_norm": 0.036195941269397736, "grad_norm_var": 4.598233671823602e-06, "learning_rate": 0.004262441045305303, "loss": 2.7033, "step": 7160 }, { "crossentropy": 2.6251602172851562, "epoch": 0.38940699855896027, "grad_norm": 0.032601840794086456, "grad_norm_var": 4.803009527242809e-06, "learning_rate": 0.004261209508711113, "loss": 2.6252, "step": 7161 }, { "crossentropy": 2.8009527921676636, "epoch": 0.3894613774164605, "grad_norm": 0.03570690006017685, "grad_norm_var": 4.882076690583027e-06, "learning_rate": 0.004259978017936021, "loss": 2.801, "step": 7162 }, { "crossentropy": 2.6561988592147827, "epoch": 0.3895157562739607, "grad_norm": 0.03555640950798988, "grad_norm_var": 4.813605271765618e-06, "learning_rate": 0.004258746573056401, "loss": 2.6562, "step": 7163 }, { "crossentropy": 2.611141562461853, "epoch": 0.3895701351314609, "grad_norm": 0.0352705679833889, "grad_norm_var": 4.005935343152448e-06, "learning_rate": 0.004257515174148629, "loss": 2.6111, "step": 7164 }, { "crossentropy": 2.508777379989624, "epoch": 0.3896245139889611, "grad_norm": 0.031303368508815765, "grad_norm_var": 4.584299769283945e-06, "learning_rate": 0.004256283821289073, "loss": 2.5088, "step": 7165 }, { "crossentropy": 2.6034003496170044, "epoch": 0.3896788928464613, "grad_norm": 0.0325983390212059, "grad_norm_var": 2.1225446783475673e-06, "learning_rate": 0.004255052514554101, "loss": 2.6034, "step": 7166 }, { "crossentropy": 2.5600250959396362, "epoch": 0.3897332717039615, "grad_norm": 0.03208748623728752, "grad_norm_var": 2.055677249286567e-06, "learning_rate": 0.004253821254020078, "loss": 2.56, "step": 7167 }, { "crossentropy": 2.4677518606185913, "epoch": 0.3897876505614617, "grad_norm": 0.03428742662072182, "grad_norm_var": 2.0091298418136705e-06, "learning_rate": 0.004252590039763363, "loss": 2.4678, "step": 7168 }, { "crossentropy": 2.6292258501052856, "epoch": 0.3898420294189619, "grad_norm": 0.033547647297382355, "grad_norm_var": 1.995338126892001e-06, "learning_rate": 0.004251358871860319, "loss": 2.6292, "step": 7169 }, { "crossentropy": 2.6777000427246094, "epoch": 0.3898964082764621, "grad_norm": 0.03989015519618988, "grad_norm_var": 4.3772965388872225e-06, "learning_rate": 0.0042501277503873, "loss": 2.6777, "step": 7170 }, { "crossentropy": 2.6718562841415405, "epoch": 0.3899507871339623, "grad_norm": 0.03350543603301048, "grad_norm_var": 4.322367585645782e-06, "learning_rate": 0.004248896675420658, "loss": 2.6719, "step": 7171 }, { "crossentropy": 2.54836642742157, "epoch": 0.3900051659914625, "grad_norm": 0.03147092089056969, "grad_norm_var": 4.734585309199013e-06, "learning_rate": 0.004247665647036745, "loss": 2.5484, "step": 7172 }, { "crossentropy": 2.734197974205017, "epoch": 0.3900595448489627, "grad_norm": 0.03275899216532707, "grad_norm_var": 4.819179292483988e-06, "learning_rate": 0.004246434665311907, "loss": 2.7342, "step": 7173 }, { "crossentropy": 2.6251672506332397, "epoch": 0.3901139237064629, "grad_norm": 0.034114401787519455, "grad_norm_var": 4.6583452725743974e-06, "learning_rate": 0.004245203730322487, "loss": 2.6252, "step": 7174 }, { "crossentropy": 2.544222831726074, "epoch": 0.3901683025639631, "grad_norm": 0.03640102967619896, "grad_norm_var": 4.992150160635216e-06, "learning_rate": 0.004243972842144829, "loss": 2.5442, "step": 7175 }, { "crossentropy": 2.521244168281555, "epoch": 0.3902226814214633, "grad_norm": 0.0337899774312973, "grad_norm_var": 4.715595274354794e-06, "learning_rate": 0.004242742000855271, "loss": 2.5212, "step": 7176 }, { "crossentropy": 2.65655255317688, "epoch": 0.39027706027896353, "grad_norm": 0.03464288264513016, "grad_norm_var": 4.580314465594109e-06, "learning_rate": 0.004241511206530146, "loss": 2.6566, "step": 7177 }, { "crossentropy": 2.636605739593506, "epoch": 0.39033143913646373, "grad_norm": 0.03505774587392807, "grad_norm_var": 4.47477387711618e-06, "learning_rate": 0.0042402804592457915, "loss": 2.6366, "step": 7178 }, { "crossentropy": 2.5821805000305176, "epoch": 0.39038581799396394, "grad_norm": 0.034699466079473495, "grad_norm_var": 4.359138719702872e-06, "learning_rate": 0.0042390497590785335, "loss": 2.5822, "step": 7179 }, { "crossentropy": 2.5047167539596558, "epoch": 0.39044019685146414, "grad_norm": 0.0354200154542923, "grad_norm_var": 4.384076645950567e-06, "learning_rate": 0.004237819106104699, "loss": 2.5047, "step": 7180 }, { "crossentropy": 2.551488518714905, "epoch": 0.39049457570896434, "grad_norm": 0.036227572709321976, "grad_norm_var": 4.0644193338443615e-06, "learning_rate": 0.0042365885004006145, "loss": 2.5515, "step": 7181 }, { "crossentropy": 2.661024570465088, "epoch": 0.39054895456646455, "grad_norm": 0.03490617871284485, "grad_norm_var": 3.840995967586448e-06, "learning_rate": 0.004235357942042599, "loss": 2.661, "step": 7182 }, { "crossentropy": 2.629849433898926, "epoch": 0.39060333342396475, "grad_norm": 0.03175431117415428, "grad_norm_var": 3.957347274252155e-06, "learning_rate": 0.00423412743110697, "loss": 2.6298, "step": 7183 }, { "crossentropy": 2.5508006811141968, "epoch": 0.39065771228146495, "grad_norm": 0.03351752460002899, "grad_norm_var": 4.019257655474284e-06, "learning_rate": 0.0042328969676700455, "loss": 2.5508, "step": 7184 }, { "crossentropy": 2.670607328414917, "epoch": 0.39071209113896516, "grad_norm": 0.041928354650735855, "grad_norm_var": 7.36549281171537e-06, "learning_rate": 0.004231666551808134, "loss": 2.6706, "step": 7185 }, { "crossentropy": 2.678494691848755, "epoch": 0.39076646999646536, "grad_norm": 0.034828927367925644, "grad_norm_var": 5.670052795745534e-06, "learning_rate": 0.004230436183597548, "loss": 2.6785, "step": 7186 }, { "crossentropy": 2.676873803138733, "epoch": 0.39082084885396556, "grad_norm": 0.032415520399808884, "grad_norm_var": 5.916293147504442e-06, "learning_rate": 0.004229205863114593, "loss": 2.6769, "step": 7187 }, { "crossentropy": 2.6505956649780273, "epoch": 0.39087522771146577, "grad_norm": 0.03298989310860634, "grad_norm_var": 5.422541155547523e-06, "learning_rate": 0.00422797559043557, "loss": 2.6506, "step": 7188 }, { "crossentropy": 2.5359541177749634, "epoch": 0.39092960656896597, "grad_norm": 0.032819245010614395, "grad_norm_var": 5.407047627014929e-06, "learning_rate": 0.0042267453656367835, "loss": 2.536, "step": 7189 }, { "crossentropy": 2.5758813619613647, "epoch": 0.3909839854264662, "grad_norm": 0.034658607095479965, "grad_norm_var": 5.381646489299498e-06, "learning_rate": 0.004225515188794527, "loss": 2.5759, "step": 7190 }, { "crossentropy": 2.64075767993927, "epoch": 0.3910383642839664, "grad_norm": 0.035536497831344604, "grad_norm_var": 5.238456716709303e-06, "learning_rate": 0.0042242850599850945, "loss": 2.6408, "step": 7191 }, { "crossentropy": 2.590362071990967, "epoch": 0.3910927431414666, "grad_norm": 0.03209620341658592, "grad_norm_var": 5.623174707895112e-06, "learning_rate": 0.004223054979284782, "loss": 2.5904, "step": 7192 }, { "crossentropy": 2.596778154373169, "epoch": 0.3911471219989668, "grad_norm": 0.03286232054233551, "grad_norm_var": 5.809644661386448e-06, "learning_rate": 0.004221824946769875, "loss": 2.5968, "step": 7193 }, { "crossentropy": 2.580025792121887, "epoch": 0.391201500856467, "grad_norm": 0.03387931361794472, "grad_norm_var": 5.806037597745559e-06, "learning_rate": 0.004220594962516659, "loss": 2.58, "step": 7194 }, { "crossentropy": 2.662761688232422, "epoch": 0.3912558797139672, "grad_norm": 0.03182568773627281, "grad_norm_var": 6.210805255866108e-06, "learning_rate": 0.004219365026601417, "loss": 2.6628, "step": 7195 }, { "crossentropy": 2.6971594095230103, "epoch": 0.3913102585714674, "grad_norm": 0.056160591542720795, "grad_norm_var": 3.638979435486711e-05, "learning_rate": 0.004218135139100427, "loss": 2.6972, "step": 7196 }, { "crossentropy": 2.7054762840270996, "epoch": 0.3913646374289676, "grad_norm": 0.034094955772161484, "grad_norm_var": 3.647439194892441e-05, "learning_rate": 0.004216905300089968, "loss": 2.7055, "step": 7197 }, { "crossentropy": 2.677808403968811, "epoch": 0.3914190162864678, "grad_norm": 0.03304009884595871, "grad_norm_var": 3.681294350102119e-05, "learning_rate": 0.0042156755096463125, "loss": 2.6778, "step": 7198 }, { "crossentropy": 2.6129082441329956, "epoch": 0.391473395143968, "grad_norm": 0.03430958464741707, "grad_norm_var": 3.602135123278675e-05, "learning_rate": 0.0042144457678457295, "loss": 2.6129, "step": 7199 }, { "crossentropy": 2.6510647535324097, "epoch": 0.3915277740014682, "grad_norm": 0.03666429966688156, "grad_norm_var": 3.583563592172289e-05, "learning_rate": 0.004213216074764488, "loss": 2.6511, "step": 7200 }, { "crossentropy": 2.6541035175323486, "epoch": 0.3915821528589684, "grad_norm": 0.03262607380747795, "grad_norm_var": 3.343437097149954e-05, "learning_rate": 0.004211986430478854, "loss": 2.6541, "step": 7201 }, { "crossentropy": 2.613571524620056, "epoch": 0.3916365317164686, "grad_norm": 0.035174328833818436, "grad_norm_var": 3.343162367262364e-05, "learning_rate": 0.0042107568350650834, "loss": 2.6136, "step": 7202 }, { "crossentropy": 2.660470485687256, "epoch": 0.3916909105739688, "grad_norm": 0.03439110144972801, "grad_norm_var": 3.297579065262285e-05, "learning_rate": 0.004209527288599441, "loss": 2.6605, "step": 7203 }, { "crossentropy": 2.665339708328247, "epoch": 0.391745289431469, "grad_norm": 0.04942181333899498, "grad_norm_var": 4.5018866659664716e-05, "learning_rate": 0.004208297791158178, "loss": 2.6653, "step": 7204 }, { "crossentropy": 2.57725989818573, "epoch": 0.3917996682889692, "grad_norm": 0.04050106182694435, "grad_norm_var": 4.5221207195697495e-05, "learning_rate": 0.004207068342817547, "loss": 2.5773, "step": 7205 }, { "crossentropy": 2.5502114295959473, "epoch": 0.39185404714646943, "grad_norm": 0.03811701387166977, "grad_norm_var": 4.502618834013571e-05, "learning_rate": 0.0042058389436538, "loss": 2.5502, "step": 7206 }, { "crossentropy": 2.727333426475525, "epoch": 0.39190842600396963, "grad_norm": 0.03513481095433235, "grad_norm_var": 4.5110307038552454e-05, "learning_rate": 0.004204609593743179, "loss": 2.7273, "step": 7207 }, { "crossentropy": 2.59795081615448, "epoch": 0.39196280486146984, "grad_norm": 0.032206591218709946, "grad_norm_var": 4.504045722801686e-05, "learning_rate": 0.004203380293161931, "loss": 2.598, "step": 7208 }, { "crossentropy": 2.587461471557617, "epoch": 0.39201718371897004, "grad_norm": 0.031471844762563705, "grad_norm_var": 4.590998068168331e-05, "learning_rate": 0.004202151041986296, "loss": 2.5875, "step": 7209 }, { "crossentropy": 2.7199875116348267, "epoch": 0.39207156257647024, "grad_norm": 0.03446371480822563, "grad_norm_var": 4.570267826292292e-05, "learning_rate": 0.004200921840292507, "loss": 2.72, "step": 7210 }, { "crossentropy": 2.622052311897278, "epoch": 0.39212594143397045, "grad_norm": 0.0357908234000206, "grad_norm_var": 4.4028926701204264e-05, "learning_rate": 0.0041996926881568, "loss": 2.6221, "step": 7211 }, { "crossentropy": 2.633600950241089, "epoch": 0.39218032029147065, "grad_norm": 0.03533703461289406, "grad_norm_var": 1.8203537694549735e-05, "learning_rate": 0.004198463585655411, "loss": 2.6336, "step": 7212 }, { "crossentropy": 2.6631215810775757, "epoch": 0.39223469914897086, "grad_norm": 0.03664457052946091, "grad_norm_var": 1.803135904881823e-05, "learning_rate": 0.004197234532864558, "loss": 2.6631, "step": 7213 }, { "crossentropy": 2.558022975921631, "epoch": 0.39228907800647106, "grad_norm": 0.032516252249479294, "grad_norm_var": 1.8252169270675107e-05, "learning_rate": 0.004196005529860474, "loss": 2.558, "step": 7214 }, { "crossentropy": 2.6519672870635986, "epoch": 0.39234345686397126, "grad_norm": 0.0318121463060379, "grad_norm_var": 1.9179308981323065e-05, "learning_rate": 0.004194776576719376, "loss": 2.652, "step": 7215 }, { "crossentropy": 2.713363528251648, "epoch": 0.39239783572147147, "grad_norm": 0.038745325058698654, "grad_norm_var": 1.969892373998375e-05, "learning_rate": 0.004193547673517484, "loss": 2.7134, "step": 7216 }, { "crossentropy": 2.5728343725204468, "epoch": 0.39245221457897167, "grad_norm": 0.03430391103029251, "grad_norm_var": 1.91430905667172e-05, "learning_rate": 0.0041923188203310145, "loss": 2.5728, "step": 7217 }, { "crossentropy": 2.6505320072174072, "epoch": 0.3925065934364719, "grad_norm": 0.03949834033846855, "grad_norm_var": 1.9834464094461934e-05, "learning_rate": 0.004191090017236177, "loss": 2.6505, "step": 7218 }, { "crossentropy": 2.588518500328064, "epoch": 0.3925609722939721, "grad_norm": 0.04134727269411087, "grad_norm_var": 2.111396784384966e-05, "learning_rate": 0.004189861264309183, "loss": 2.5885, "step": 7219 }, { "crossentropy": 2.6007615327835083, "epoch": 0.3926153511514723, "grad_norm": 0.03271673619747162, "grad_norm_var": 1.0235007995390507e-05, "learning_rate": 0.004188632561626239, "loss": 2.6008, "step": 7220 }, { "crossentropy": 2.634702682495117, "epoch": 0.3926697300089725, "grad_norm": 0.035429488867521286, "grad_norm_var": 8.570993540736998e-06, "learning_rate": 0.0041874039092635465, "loss": 2.6347, "step": 7221 }, { "crossentropy": 2.542793035507202, "epoch": 0.39272410886647274, "grad_norm": 0.032507237046957016, "grad_norm_var": 8.465201617714978e-06, "learning_rate": 0.004186175307297305, "loss": 2.5428, "step": 7222 }, { "crossentropy": 2.6070873737335205, "epoch": 0.39277848772397295, "grad_norm": 0.03184117004275322, "grad_norm_var": 9.081975172718175e-06, "learning_rate": 0.004184946755803715, "loss": 2.6071, "step": 7223 }, { "crossentropy": 2.6536176204681396, "epoch": 0.39283286658147315, "grad_norm": 0.03454164043068886, "grad_norm_var": 8.61858211578595e-06, "learning_rate": 0.004183718254858965, "loss": 2.6536, "step": 7224 }, { "crossentropy": 2.728971838951111, "epoch": 0.39288724543897335, "grad_norm": 0.03742397949099541, "grad_norm_var": 8.084031723185753e-06, "learning_rate": 0.004182489804539248, "loss": 2.729, "step": 7225 }, { "crossentropy": 2.735833525657654, "epoch": 0.39294162429647356, "grad_norm": 0.034650035202503204, "grad_norm_var": 8.065240064294468e-06, "learning_rate": 0.004181261404920752, "loss": 2.7358, "step": 7226 }, { "crossentropy": 2.5827267169952393, "epoch": 0.39299600315397376, "grad_norm": 0.03252756968140602, "grad_norm_var": 8.525554415955704e-06, "learning_rate": 0.0041800330560796605, "loss": 2.5827, "step": 7227 }, { "crossentropy": 2.6934814453125, "epoch": 0.39305038201147396, "grad_norm": 0.03460024297237396, "grad_norm_var": 8.537687491019276e-06, "learning_rate": 0.004178804758092154, "loss": 2.6935, "step": 7228 }, { "crossentropy": 2.610898971557617, "epoch": 0.39310476086897417, "grad_norm": 0.03482287377119064, "grad_norm_var": 8.362432905569852e-06, "learning_rate": 0.00417757651103441, "loss": 2.6109, "step": 7229 }, { "crossentropy": 2.5478549003601074, "epoch": 0.39315913972647437, "grad_norm": 0.03336312994360924, "grad_norm_var": 8.131852085389337e-06, "learning_rate": 0.004176348314982604, "loss": 2.5479, "step": 7230 }, { "crossentropy": 2.5269471406936646, "epoch": 0.3932135185839746, "grad_norm": 0.032107189297676086, "grad_norm_var": 8.011563215691055e-06, "learning_rate": 0.004175120170012909, "loss": 2.5269, "step": 7231 }, { "crossentropy": 2.6168601512908936, "epoch": 0.3932678974414748, "grad_norm": 0.03283102810382843, "grad_norm_var": 7.2652858058649034e-06, "learning_rate": 0.0041738920762014915, "loss": 2.6169, "step": 7232 }, { "crossentropy": 2.5787405967712402, "epoch": 0.393322276298975, "grad_norm": 0.033891282975673676, "grad_norm_var": 7.295352564694786e-06, "learning_rate": 0.004172664033624516, "loss": 2.5787, "step": 7233 }, { "crossentropy": 2.6222426891326904, "epoch": 0.3933766551564752, "grad_norm": 0.030525721609592438, "grad_norm_var": 6.504297494187792e-06, "learning_rate": 0.004171436042358149, "loss": 2.6222, "step": 7234 }, { "crossentropy": 2.513115167617798, "epoch": 0.3934310340139754, "grad_norm": 0.0333317406475544, "grad_norm_var": 2.742790362303173e-06, "learning_rate": 0.004170208102478543, "loss": 2.5131, "step": 7235 }, { "crossentropy": 2.5431056022644043, "epoch": 0.3934854128714756, "grad_norm": 0.03258267417550087, "grad_norm_var": 2.7591557068944056e-06, "learning_rate": 0.004168980214061857, "loss": 2.5431, "step": 7236 }, { "crossentropy": 2.5638668537139893, "epoch": 0.3935397917289758, "grad_norm": 0.033411819487810135, "grad_norm_var": 2.51094374705037e-06, "learning_rate": 0.004167752377184246, "loss": 2.5639, "step": 7237 }, { "crossentropy": 2.652267098426819, "epoch": 0.393594170586476, "grad_norm": 0.034595053642988205, "grad_norm_var": 2.525124934046827e-06, "learning_rate": 0.004166524591921854, "loss": 2.6523, "step": 7238 }, { "crossentropy": 2.708650588989258, "epoch": 0.3936485494439762, "grad_norm": 0.031465236097574234, "grad_norm_var": 2.620386393317684e-06, "learning_rate": 0.00416529685835083, "loss": 2.7087, "step": 7239 }, { "crossentropy": 2.6247990131378174, "epoch": 0.3937029283014764, "grad_norm": 0.033418867737054825, "grad_norm_var": 2.5495185325879444e-06, "learning_rate": 0.004164069176547318, "loss": 2.6248, "step": 7240 }, { "crossentropy": 2.561039090156555, "epoch": 0.3937573071589766, "grad_norm": 0.03349975124001503, "grad_norm_var": 1.4440792902539167e-06, "learning_rate": 0.0041628415465874554, "loss": 2.561, "step": 7241 }, { "crossentropy": 2.6052980422973633, "epoch": 0.3938116860164768, "grad_norm": 0.032455991953611374, "grad_norm_var": 1.3285076546296533e-06, "learning_rate": 0.004161613968547381, "loss": 2.6053, "step": 7242 }, { "crossentropy": 2.61228609085083, "epoch": 0.393866064873977, "grad_norm": 0.032416168600320816, "grad_norm_var": 1.3376282157471128e-06, "learning_rate": 0.004160386442503225, "loss": 2.6123, "step": 7243 }, { "crossentropy": 2.6373146772384644, "epoch": 0.3939204437314772, "grad_norm": 0.032698314636945724, "grad_norm_var": 1.1788068534722302e-06, "learning_rate": 0.0041591589685311195, "loss": 2.6373, "step": 7244 }, { "crossentropy": 2.7336188554763794, "epoch": 0.3939748225889774, "grad_norm": 0.034304551780223846, "grad_norm_var": 1.067101028719095e-06, "learning_rate": 0.004157931546707193, "loss": 2.7336, "step": 7245 }, { "crossentropy": 2.639420747756958, "epoch": 0.39402920144647763, "grad_norm": 0.03246709331870079, "grad_norm_var": 1.065672727813769e-06, "learning_rate": 0.004156704177107563, "loss": 2.6394, "step": 7246 }, { "crossentropy": 2.6765079498291016, "epoch": 0.39408358030397783, "grad_norm": 0.03198125958442688, "grad_norm_var": 1.0795585034306086e-06, "learning_rate": 0.0041554768598083556, "loss": 2.6765, "step": 7247 }, { "crossentropy": 2.562193751335144, "epoch": 0.39413795916147804, "grad_norm": 0.03356211632490158, "grad_norm_var": 1.1094298891775021e-06, "learning_rate": 0.004154249594885686, "loss": 2.5622, "step": 7248 }, { "crossentropy": 2.6123127937316895, "epoch": 0.39419233801897824, "grad_norm": 0.03388185799121857, "grad_norm_var": 1.108206039519312e-06, "learning_rate": 0.004153022382415669, "loss": 2.6123, "step": 7249 }, { "crossentropy": 2.666040301322937, "epoch": 0.39424671687647844, "grad_norm": 0.03646741062402725, "grad_norm_var": 1.423907511807707e-06, "learning_rate": 0.004151795222474412, "loss": 2.666, "step": 7250 }, { "crossentropy": 2.7594269514083862, "epoch": 0.39430109573397865, "grad_norm": 0.03476641699671745, "grad_norm_var": 1.5617322673864078e-06, "learning_rate": 0.004150568115138028, "loss": 2.7594, "step": 7251 }, { "crossentropy": 2.596239686012268, "epoch": 0.39435547459147885, "grad_norm": 0.03546274080872536, "grad_norm_var": 1.776506093157694e-06, "learning_rate": 0.004149341060482614, "loss": 2.5962, "step": 7252 }, { "crossentropy": 2.601229667663574, "epoch": 0.39440985344897905, "grad_norm": 0.03465043380856514, "grad_norm_var": 1.8490070239249659e-06, "learning_rate": 0.004148114058584275, "loss": 2.6012, "step": 7253 }, { "crossentropy": 2.655938506126404, "epoch": 0.39446423230647926, "grad_norm": 0.03234579786658287, "grad_norm_var": 1.8760323040706641e-06, "learning_rate": 0.004146887109519109, "loss": 2.6559, "step": 7254 }, { "crossentropy": 2.5678796768188477, "epoch": 0.39451861116397946, "grad_norm": 0.03415796905755997, "grad_norm_var": 1.602164877835081e-06, "learning_rate": 0.00414566021336321, "loss": 2.5679, "step": 7255 }, { "crossentropy": 2.6296874284744263, "epoch": 0.39457299002147966, "grad_norm": 0.034264594316482544, "grad_norm_var": 1.6198412033415743e-06, "learning_rate": 0.004144433370192668, "loss": 2.6297, "step": 7256 }, { "crossentropy": 2.5581483840942383, "epoch": 0.39462736887897987, "grad_norm": 0.03210734575986862, "grad_norm_var": 1.7803100191763713e-06, "learning_rate": 0.004143206580083567, "loss": 2.5581, "step": 7257 }, { "crossentropy": 2.6392709016799927, "epoch": 0.39468174773648007, "grad_norm": 0.03287193551659584, "grad_norm_var": 1.7263253438276154e-06, "learning_rate": 0.004141979843111997, "loss": 2.6393, "step": 7258 }, { "crossentropy": 2.6861352920532227, "epoch": 0.3947361265939803, "grad_norm": 0.0336947999894619, "grad_norm_var": 1.618093736217389e-06, "learning_rate": 0.004140753159354039, "loss": 2.6861, "step": 7259 }, { "crossentropy": 2.6091872453689575, "epoch": 0.3947905054514805, "grad_norm": 0.03166767954826355, "grad_norm_var": 1.8262937678314415e-06, "learning_rate": 0.004139526528885768, "loss": 2.6092, "step": 7260 }, { "crossentropy": 2.6079283952713013, "epoch": 0.3948448843089807, "grad_norm": 0.03150033578276634, "grad_norm_var": 2.078972202218301e-06, "learning_rate": 0.004138299951783258, "loss": 2.6079, "step": 7261 }, { "crossentropy": 2.6134568452835083, "epoch": 0.3948992631664809, "grad_norm": 0.03193483501672745, "grad_norm_var": 2.169315204646136e-06, "learning_rate": 0.004137073428122585, "loss": 2.6135, "step": 7262 }, { "crossentropy": 2.6467727422714233, "epoch": 0.3949536420239811, "grad_norm": 0.03342204540967941, "grad_norm_var": 2.0154935246637895e-06, "learning_rate": 0.00413584695797981, "loss": 2.6468, "step": 7263 }, { "crossentropy": 2.4914413690567017, "epoch": 0.3950080208814813, "grad_norm": 0.03274223580956459, "grad_norm_var": 2.0558969459810267e-06, "learning_rate": 0.004134620541431002, "loss": 2.4914, "step": 7264 }, { "crossentropy": 2.6688467264175415, "epoch": 0.3950623997389815, "grad_norm": 0.03161340206861496, "grad_norm_var": 2.260854317874076e-06, "learning_rate": 0.004133394178552224, "loss": 2.6688, "step": 7265 }, { "crossentropy": 2.6410913467407227, "epoch": 0.3951167785964817, "grad_norm": 0.03370334208011627, "grad_norm_var": 1.591072656174225e-06, "learning_rate": 0.004132167869419528, "loss": 2.6411, "step": 7266 }, { "crossentropy": 2.6480231285095215, "epoch": 0.3951711574539819, "grad_norm": 0.03534957766532898, "grad_norm_var": 1.7355529819008837e-06, "learning_rate": 0.00413094161410897, "loss": 2.648, "step": 7267 }, { "crossentropy": 2.561453938484192, "epoch": 0.3952255363114821, "grad_norm": 0.032781150192022324, "grad_norm_var": 1.38241313347668e-06, "learning_rate": 0.004129715412696607, "loss": 2.5615, "step": 7268 }, { "crossentropy": 2.6026153564453125, "epoch": 0.3952799151689823, "grad_norm": 0.034800466150045395, "grad_norm_var": 1.4158262156263938e-06, "learning_rate": 0.00412848926525848, "loss": 2.6026, "step": 7269 }, { "crossentropy": 2.5729472637176514, "epoch": 0.3953342940264825, "grad_norm": 0.03199145942926407, "grad_norm_var": 1.4574086717490664e-06, "learning_rate": 0.004127263171870639, "loss": 2.5729, "step": 7270 }, { "crossentropy": 2.513132333755493, "epoch": 0.3953886728839827, "grad_norm": 0.03308260813355446, "grad_norm_var": 1.3690576975153648e-06, "learning_rate": 0.004126037132609121, "loss": 2.5131, "step": 7271 }, { "crossentropy": 2.58709979057312, "epoch": 0.3954430517414829, "grad_norm": 0.03233187273144722, "grad_norm_var": 1.269034776427841e-06, "learning_rate": 0.0041248111475499635, "loss": 2.5871, "step": 7272 }, { "crossentropy": 2.6125173568725586, "epoch": 0.3954974305989831, "grad_norm": 0.03383124619722366, "grad_norm_var": 1.2841432432054988e-06, "learning_rate": 0.004123585216769206, "loss": 2.6125, "step": 7273 }, { "crossentropy": 2.583184242248535, "epoch": 0.3955518094564833, "grad_norm": 0.03360896557569504, "grad_norm_var": 1.3096917882232084e-06, "learning_rate": 0.004122359340342873, "loss": 2.5832, "step": 7274 }, { "crossentropy": 2.6436681747436523, "epoch": 0.39560618831398353, "grad_norm": 0.03530711680650711, "grad_norm_var": 1.620776953649659e-06, "learning_rate": 0.004121133518346996, "loss": 2.6437, "step": 7275 }, { "crossentropy": 2.578293561935425, "epoch": 0.39566056717148373, "grad_norm": 0.03214094415307045, "grad_norm_var": 1.5441239369533173e-06, "learning_rate": 0.004119907750857601, "loss": 2.5783, "step": 7276 }, { "crossentropy": 2.538345694541931, "epoch": 0.39571494602898394, "grad_norm": 0.03308985009789467, "grad_norm_var": 1.3558343984517695e-06, "learning_rate": 0.004118682037950705, "loss": 2.5383, "step": 7277 }, { "crossentropy": 2.570748805999756, "epoch": 0.39576932488648414, "grad_norm": 0.03220917657017708, "grad_norm_var": 1.3130458136593563e-06, "learning_rate": 0.004117456379702327, "loss": 2.5707, "step": 7278 }, { "crossentropy": 2.63286554813385, "epoch": 0.39582370374398435, "grad_norm": 0.03191855549812317, "grad_norm_var": 1.4199051920643173e-06, "learning_rate": 0.0041162307761884835, "loss": 2.6329, "step": 7279 }, { "crossentropy": 2.54499614238739, "epoch": 0.39587808260148455, "grad_norm": 0.03350042924284935, "grad_norm_var": 1.4139676184502884e-06, "learning_rate": 0.004115005227485182, "loss": 2.545, "step": 7280 }, { "crossentropy": 2.53055477142334, "epoch": 0.39593246145898475, "grad_norm": 0.03603010252118111, "grad_norm_var": 1.6966189241610432e-06, "learning_rate": 0.004113779733668431, "loss": 2.5306, "step": 7281 }, { "crossentropy": 2.6517696380615234, "epoch": 0.39598684031648496, "grad_norm": 0.03242642432451248, "grad_norm_var": 1.7604677161807298e-06, "learning_rate": 0.004112554294814236, "loss": 2.6518, "step": 7282 }, { "crossentropy": 2.64858341217041, "epoch": 0.39604121917398516, "grad_norm": 0.03244892135262489, "grad_norm_var": 1.5323220323675926e-06, "learning_rate": 0.004111328910998594, "loss": 2.6486, "step": 7283 }, { "crossentropy": 2.6420098543167114, "epoch": 0.39609559803148536, "grad_norm": 0.03164884075522423, "grad_norm_var": 1.678514566782798e-06, "learning_rate": 0.004110103582297508, "loss": 2.642, "step": 7284 }, { "crossentropy": 2.5518659353256226, "epoch": 0.39614997688898557, "grad_norm": 0.031544070690870285, "grad_norm_var": 1.6237660675875085e-06, "learning_rate": 0.0041088783087869655, "loss": 2.5519, "step": 7285 }, { "crossentropy": 2.6403815746307373, "epoch": 0.39620435574648577, "grad_norm": 0.037566713988780975, "grad_norm_var": 2.858089142924319e-06, "learning_rate": 0.00410765309054296, "loss": 2.6404, "step": 7286 }, { "crossentropy": 2.665748357772827, "epoch": 0.396258734603986, "grad_norm": 0.03525060787796974, "grad_norm_var": 3.09107485387423e-06, "learning_rate": 0.0041064279276414786, "loss": 2.6657, "step": 7287 }, { "crossentropy": 2.7336026430130005, "epoch": 0.3963131134614862, "grad_norm": 0.03270037844777107, "grad_norm_var": 3.045686981922637e-06, "learning_rate": 0.004105202820158503, "loss": 2.7336, "step": 7288 }, { "crossentropy": 2.595562696456909, "epoch": 0.3963674923189864, "grad_norm": 0.03148144483566284, "grad_norm_var": 3.2717753748847297e-06, "learning_rate": 0.004103977768170013, "loss": 2.5956, "step": 7289 }, { "crossentropy": 2.6588497161865234, "epoch": 0.3964218711764866, "grad_norm": 0.030867865309119225, "grad_norm_var": 3.6301135999366872e-06, "learning_rate": 0.004102752771751989, "loss": 2.6588, "step": 7290 }, { "crossentropy": 2.532741665840149, "epoch": 0.3964762500339868, "grad_norm": 0.03480476886034012, "grad_norm_var": 3.500278353786102e-06, "learning_rate": 0.004101527830980401, "loss": 2.5327, "step": 7291 }, { "crossentropy": 2.538351535797119, "epoch": 0.396530628891487, "grad_norm": 0.03579925373196602, "grad_norm_var": 3.868040099325393e-06, "learning_rate": 0.004100302945931219, "loss": 2.5384, "step": 7292 }, { "crossentropy": 2.6331748962402344, "epoch": 0.3965850077489872, "grad_norm": 0.03397996723651886, "grad_norm_var": 3.889002921687738e-06, "learning_rate": 0.00409907811668041, "loss": 2.6332, "step": 7293 }, { "crossentropy": 2.5392532348632812, "epoch": 0.3966393866064874, "grad_norm": 0.033667732030153275, "grad_norm_var": 3.7930843015333e-06, "learning_rate": 0.004097853343303934, "loss": 2.5393, "step": 7294 }, { "crossentropy": 2.675153374671936, "epoch": 0.3966937654639876, "grad_norm": 0.03302035480737686, "grad_norm_var": 3.639973734565838e-06, "learning_rate": 0.004096628625877755, "loss": 2.6752, "step": 7295 }, { "crossentropy": 2.5953599214553833, "epoch": 0.3967481443214878, "grad_norm": 0.04691494628787041, "grad_norm_var": 1.480508528164553e-05, "learning_rate": 0.004095403964477827, "loss": 2.5954, "step": 7296 }, { "crossentropy": 2.500859260559082, "epoch": 0.396802523178988, "grad_norm": 0.03275228664278984, "grad_norm_var": 1.4757403216571771e-05, "learning_rate": 0.004094179359180099, "loss": 2.5009, "step": 7297 }, { "crossentropy": 2.5162878036499023, "epoch": 0.3968569020364882, "grad_norm": 0.03288537263870239, "grad_norm_var": 1.4663281807041984e-05, "learning_rate": 0.004092954810060525, "loss": 2.5163, "step": 7298 }, { "crossentropy": 2.598161458969116, "epoch": 0.3969112808939884, "grad_norm": 0.03313835337758064, "grad_norm_var": 1.4531255323398324e-05, "learning_rate": 0.004091730317195048, "loss": 2.5982, "step": 7299 }, { "crossentropy": 2.613680601119995, "epoch": 0.3969656597514886, "grad_norm": 0.03190222010016441, "grad_norm_var": 1.4447342080263708e-05, "learning_rate": 0.004090505880659607, "loss": 2.6137, "step": 7300 }, { "crossentropy": 2.5296868085861206, "epoch": 0.3970200386089888, "grad_norm": 0.04853387549519539, "grad_norm_var": 2.63192981041964e-05, "learning_rate": 0.004089281500530146, "loss": 2.5297, "step": 7301 }, { "crossentropy": 2.5495036840438843, "epoch": 0.397074417466489, "grad_norm": 0.03083644062280655, "grad_norm_var": 2.7142397530168514e-05, "learning_rate": 0.004088057176882597, "loss": 2.5495, "step": 7302 }, { "crossentropy": 2.7017204761505127, "epoch": 0.39712879632398923, "grad_norm": 0.03261123597621918, "grad_norm_var": 2.745739406300519e-05, "learning_rate": 0.0040868329097928915, "loss": 2.7017, "step": 7303 }, { "crossentropy": 2.625856041908264, "epoch": 0.39718317518148943, "grad_norm": 0.034960754215717316, "grad_norm_var": 2.716095288283261e-05, "learning_rate": 0.004085608699336958, "loss": 2.6259, "step": 7304 }, { "crossentropy": 2.5750426054000854, "epoch": 0.39723755403898964, "grad_norm": 0.0321262925863266, "grad_norm_var": 2.6894322327541835e-05, "learning_rate": 0.004084384545590719, "loss": 2.575, "step": 7305 }, { "crossentropy": 2.529132843017578, "epoch": 0.39729193289648984, "grad_norm": 0.03314902260899544, "grad_norm_var": 2.5985524557779056e-05, "learning_rate": 0.004083160448630098, "loss": 2.5291, "step": 7306 }, { "crossentropy": 2.5661884546279907, "epoch": 0.39734631175399004, "grad_norm": 0.031032219529151917, "grad_norm_var": 2.7007278531401835e-05, "learning_rate": 0.0040819364085310115, "loss": 2.5662, "step": 7307 }, { "crossentropy": 2.460864782333374, "epoch": 0.39740069061149025, "grad_norm": 0.033201251178979874, "grad_norm_var": 2.709403638720964e-05, "learning_rate": 0.004080712425369373, "loss": 2.4609, "step": 7308 }, { "crossentropy": 2.5823036432266235, "epoch": 0.39745506946899045, "grad_norm": 0.03375469148159027, "grad_norm_var": 2.711792015599827e-05, "learning_rate": 0.004079488499221092, "loss": 2.5823, "step": 7309 }, { "crossentropy": 2.6295284032821655, "epoch": 0.39750944832649066, "grad_norm": 0.03093516081571579, "grad_norm_var": 2.7944468779819652e-05, "learning_rate": 0.004078264630162078, "loss": 2.6295, "step": 7310 }, { "crossentropy": 2.656758189201355, "epoch": 0.39756382718399086, "grad_norm": 0.03291422128677368, "grad_norm_var": 2.796589430992437e-05, "learning_rate": 0.00407704081826823, "loss": 2.6568, "step": 7311 }, { "crossentropy": 2.5770034790039062, "epoch": 0.39761820604149106, "grad_norm": 0.03373951464891434, "grad_norm_var": 1.6967147353348875e-05, "learning_rate": 0.004075817063615451, "loss": 2.577, "step": 7312 }, { "crossentropy": 2.593321919441223, "epoch": 0.39767258489899127, "grad_norm": 0.03173987567424774, "grad_norm_var": 1.715300414909428e-05, "learning_rate": 0.004074593366279636, "loss": 2.5933, "step": 7313 }, { "crossentropy": 2.6495158672332764, "epoch": 0.39772696375649147, "grad_norm": 0.033650390803813934, "grad_norm_var": 1.7117578046318452e-05, "learning_rate": 0.004073369726336677, "loss": 2.6495, "step": 7314 }, { "crossentropy": 2.5841269493103027, "epoch": 0.3977813426139917, "grad_norm": 0.033361271023750305, "grad_norm_var": 1.710580059370766e-05, "learning_rate": 0.004072146143862465, "loss": 2.5841, "step": 7315 }, { "crossentropy": 2.617843508720398, "epoch": 0.3978357214714919, "grad_norm": 0.0346217080950737, "grad_norm_var": 1.6933186611889765e-05, "learning_rate": 0.004070922618932883, "loss": 2.6178, "step": 7316 }, { "crossentropy": 2.624174952507019, "epoch": 0.3978901003289921, "grad_norm": 0.03214428201317787, "grad_norm_var": 1.5744807000524104e-06, "learning_rate": 0.004069699151623814, "loss": 2.6242, "step": 7317 }, { "crossentropy": 2.7474379539489746, "epoch": 0.3979444791864923, "grad_norm": 0.03792574629187584, "grad_norm_var": 2.8608653386265798e-06, "learning_rate": 0.004068475742011138, "loss": 2.7474, "step": 7318 }, { "crossentropy": 2.6985639333724976, "epoch": 0.3979988580439925, "grad_norm": 0.03276770934462547, "grad_norm_var": 2.849241567959235e-06, "learning_rate": 0.004067252390170726, "loss": 2.6986, "step": 7319 }, { "crossentropy": 2.513669967651367, "epoch": 0.3980532369014927, "grad_norm": 0.03272765129804611, "grad_norm_var": 2.6519900246236085e-06, "learning_rate": 0.004066029096178451, "loss": 2.5137, "step": 7320 }, { "crossentropy": 2.5844926834106445, "epoch": 0.3981076157589929, "grad_norm": 0.03256447985768318, "grad_norm_var": 2.6064042241896734e-06, "learning_rate": 0.004064805860110183, "loss": 2.5845, "step": 7321 }, { "crossentropy": 2.6919738054275513, "epoch": 0.3981619946164931, "grad_norm": 0.033552419394254684, "grad_norm_var": 2.617096396099333e-06, "learning_rate": 0.004063582682041781, "loss": 2.692, "step": 7322 }, { "crossentropy": 2.5213412046432495, "epoch": 0.3982163734739933, "grad_norm": 0.03241041302680969, "grad_norm_var": 2.3439771601701614e-06, "learning_rate": 0.004062359562049108, "loss": 2.5213, "step": 7323 }, { "crossentropy": 2.5603107213974, "epoch": 0.3982707523314935, "grad_norm": 0.03487221151590347, "grad_norm_var": 2.5074727638620006e-06, "learning_rate": 0.004061136500208024, "loss": 2.5603, "step": 7324 }, { "crossentropy": 2.5858348608016968, "epoch": 0.3983251311889937, "grad_norm": 0.031422074884176254, "grad_norm_var": 2.72326521195849e-06, "learning_rate": 0.004059913496594377, "loss": 2.5858, "step": 7325 }, { "crossentropy": 2.645311713218689, "epoch": 0.3983795100464939, "grad_norm": 0.032647307962179184, "grad_norm_var": 2.3873212059381696e-06, "learning_rate": 0.0040586905512840195, "loss": 2.6453, "step": 7326 }, { "crossentropy": 2.682954788208008, "epoch": 0.3984338889039941, "grad_norm": 0.03693351522088051, "grad_norm_var": 3.181499120481305e-06, "learning_rate": 0.004057467664352795, "loss": 2.683, "step": 7327 }, { "crossentropy": 2.630884528160095, "epoch": 0.3984882677614943, "grad_norm": 0.03315476328134537, "grad_norm_var": 3.1894613481510866e-06, "learning_rate": 0.004056244835876547, "loss": 2.6309, "step": 7328 }, { "crossentropy": 2.5517234802246094, "epoch": 0.3985426466189945, "grad_norm": 0.03375834971666336, "grad_norm_var": 2.96205916125424e-06, "learning_rate": 0.004055022065931116, "loss": 2.5517, "step": 7329 }, { "crossentropy": 2.585424304008484, "epoch": 0.3985970254764947, "grad_norm": 0.032235536724328995, "grad_norm_var": 3.088446265932185e-06, "learning_rate": 0.004053799354592334, "loss": 2.5854, "step": 7330 }, { "crossentropy": 2.5072704553604126, "epoch": 0.39865140433399493, "grad_norm": 0.04314350336790085, "grad_norm_var": 8.798631956708803e-06, "learning_rate": 0.004052576701936034, "loss": 2.5073, "step": 7331 }, { "crossentropy": 2.6236214637756348, "epoch": 0.39870578319149513, "grad_norm": 0.032225463539361954, "grad_norm_var": 9.016414176717345e-06, "learning_rate": 0.004051354108038046, "loss": 2.6236, "step": 7332 }, { "crossentropy": 2.7018983364105225, "epoch": 0.39876016204899534, "grad_norm": 0.036620695143938065, "grad_norm_var": 9.143103504260832e-06, "learning_rate": 0.004050131572974188, "loss": 2.7019, "step": 7333 }, { "crossentropy": 2.575827956199646, "epoch": 0.39881454090649554, "grad_norm": 0.033113639801740646, "grad_norm_var": 8.270536207566117e-06, "learning_rate": 0.004048909096820286, "loss": 2.5758, "step": 7334 }, { "crossentropy": 2.7290451526641846, "epoch": 0.39886891976399574, "grad_norm": 0.032829418778419495, "grad_norm_var": 8.260558016429051e-06, "learning_rate": 0.004047686679652156, "loss": 2.729, "step": 7335 }, { "crossentropy": 2.557729125022888, "epoch": 0.39892329862149595, "grad_norm": 0.03333422541618347, "grad_norm_var": 8.179581802875089e-06, "learning_rate": 0.00404646432154561, "loss": 2.5577, "step": 7336 }, { "crossentropy": 2.5540943145751953, "epoch": 0.39897767747899615, "grad_norm": 0.031076252460479736, "grad_norm_var": 8.613003791416163e-06, "learning_rate": 0.004045242022576458, "loss": 2.5541, "step": 7337 }, { "crossentropy": 2.675482392311096, "epoch": 0.39903205633649635, "grad_norm": 0.03637218847870827, "grad_norm_var": 8.957419509506708e-06, "learning_rate": 0.004044019782820505, "loss": 2.6755, "step": 7338 }, { "crossentropy": 2.6295723915100098, "epoch": 0.39908643519399656, "grad_norm": 0.03297390043735504, "grad_norm_var": 8.847742341936554e-06, "learning_rate": 0.004042797602353553, "loss": 2.6296, "step": 7339 }, { "crossentropy": 2.6580722332000732, "epoch": 0.39914081405149676, "grad_norm": 0.03316366672515869, "grad_norm_var": 8.870120696527484e-06, "learning_rate": 0.004041575481251403, "loss": 2.6581, "step": 7340 }, { "crossentropy": 2.624782085418701, "epoch": 0.39919519290899697, "grad_norm": 0.03672397881746292, "grad_norm_var": 8.760237664212575e-06, "learning_rate": 0.004040353419589846, "loss": 2.6248, "step": 7341 }, { "crossentropy": 2.5852922201156616, "epoch": 0.39924957176649717, "grad_norm": 0.03483278676867485, "grad_norm_var": 8.549732574526734e-06, "learning_rate": 0.0040391314174446765, "loss": 2.5853, "step": 7342 }, { "crossentropy": 2.6131073236465454, "epoch": 0.3993039506239974, "grad_norm": 0.039184488356113434, "grad_norm_var": 9.587556069539278e-06, "learning_rate": 0.0040379094748916825, "loss": 2.6131, "step": 7343 }, { "crossentropy": 2.6128039360046387, "epoch": 0.3993583294814976, "grad_norm": 0.03522741049528122, "grad_norm_var": 9.436912767477972e-06, "learning_rate": 0.004036687592006642, "loss": 2.6128, "step": 7344 }, { "crossentropy": 2.6607218980789185, "epoch": 0.3994127083389978, "grad_norm": 0.03164120763540268, "grad_norm_var": 1.0011372368890862e-05, "learning_rate": 0.004035465768865341, "loss": 2.6607, "step": 7345 }, { "crossentropy": 2.665544271469116, "epoch": 0.399467087196498, "grad_norm": 0.032464347779750824, "grad_norm_var": 9.94041483352668e-06, "learning_rate": 0.004034244005543555, "loss": 2.6655, "step": 7346 }, { "crossentropy": 2.494367003440857, "epoch": 0.3995214660539982, "grad_norm": 0.03360327333211899, "grad_norm_var": 4.866828706303188e-06, "learning_rate": 0.004033022302117054, "loss": 2.4944, "step": 7347 }, { "crossentropy": 2.4629454612731934, "epoch": 0.3995758449114984, "grad_norm": 0.03578554466366768, "grad_norm_var": 4.775485364022907e-06, "learning_rate": 0.004031800658661607, "loss": 2.4629, "step": 7348 }, { "crossentropy": 2.5724302530288696, "epoch": 0.3996302237689986, "grad_norm": 0.033169135451316833, "grad_norm_var": 4.456290890115603e-06, "learning_rate": 0.004030579075252985, "loss": 2.5724, "step": 7349 }, { "crossentropy": 2.7113853693008423, "epoch": 0.3996846026264988, "grad_norm": 0.030237987637519836, "grad_norm_var": 5.348812319600343e-06, "learning_rate": 0.00402935755196694, "loss": 2.7114, "step": 7350 }, { "crossentropy": 2.5676814317703247, "epoch": 0.399738981483999, "grad_norm": 0.03387928009033203, "grad_norm_var": 5.265915682136207e-06, "learning_rate": 0.004028136088879238, "loss": 2.5677, "step": 7351 }, { "crossentropy": 2.5936139822006226, "epoch": 0.3997933603414992, "grad_norm": 0.032309260219335556, "grad_norm_var": 5.419739955085194e-06, "learning_rate": 0.004026914686065628, "loss": 2.5936, "step": 7352 }, { "crossentropy": 2.632739305496216, "epoch": 0.3998477391989994, "grad_norm": 0.032996419817209244, "grad_norm_var": 4.92332205171969e-06, "learning_rate": 0.004025693343601863, "loss": 2.6327, "step": 7353 }, { "crossentropy": 2.5783846378326416, "epoch": 0.3999021180564996, "grad_norm": 0.03284706547856331, "grad_norm_var": 4.60160415539501e-06, "learning_rate": 0.0040244720615636885, "loss": 2.5784, "step": 7354 }, { "crossentropy": 2.5412590503692627, "epoch": 0.3999564969139998, "grad_norm": 0.03086879663169384, "grad_norm_var": 5.11464646383504e-06, "learning_rate": 0.004023250840026845, "loss": 2.5413, "step": 7355 }, { "crossentropy": 2.6137754917144775, "epoch": 0.4000108757715, "grad_norm": 0.034622594714164734, "grad_norm_var": 5.146572399243584e-06, "learning_rate": 0.004022029679067075, "loss": 2.6138, "step": 7356 }, { "crossentropy": 2.612385630607605, "epoch": 0.4000652546290002, "grad_norm": 0.033454470336437225, "grad_norm_var": 4.528941279190396e-06, "learning_rate": 0.004020808578760114, "loss": 2.6124, "step": 7357 }, { "crossentropy": 2.6133800745010376, "epoch": 0.4001196334865004, "grad_norm": 0.032933272421360016, "grad_norm_var": 4.434691155943262e-06, "learning_rate": 0.004019587539181691, "loss": 2.6134, "step": 7358 }, { "crossentropy": 2.52221143245697, "epoch": 0.40017401234400063, "grad_norm": 0.03710886836051941, "grad_norm_var": 3.117362452415802e-06, "learning_rate": 0.0040183665604075335, "loss": 2.5222, "step": 7359 }, { "crossentropy": 2.525269627571106, "epoch": 0.40022839120150083, "grad_norm": 0.03553851693868637, "grad_norm_var": 3.202457663942559e-06, "learning_rate": 0.00401714564251337, "loss": 2.5253, "step": 7360 }, { "crossentropy": 2.6495736837387085, "epoch": 0.40028277005900104, "grad_norm": 0.03382882848381996, "grad_norm_var": 3.005689135605185e-06, "learning_rate": 0.004015924785574914, "loss": 2.6496, "step": 7361 }, { "crossentropy": 2.6450694799423218, "epoch": 0.40033714891650124, "grad_norm": 0.032713040709495544, "grad_norm_var": 2.975943594130216e-06, "learning_rate": 0.004014703989667886, "loss": 2.6451, "step": 7362 }, { "crossentropy": 2.56203031539917, "epoch": 0.40039152777400144, "grad_norm": 0.03661723434925079, "grad_norm_var": 3.5877959064359166e-06, "learning_rate": 0.004013483254868001, "loss": 2.562, "step": 7363 }, { "crossentropy": 2.712110757827759, "epoch": 0.40044590663150165, "grad_norm": 0.03724493831396103, "grad_norm_var": 4.130250710429939e-06, "learning_rate": 0.004012262581250961, "loss": 2.7121, "step": 7364 }, { "crossentropy": 2.669063925743103, "epoch": 0.40050028548900185, "grad_norm": 0.03551721200346947, "grad_norm_var": 4.2857526946768795e-06, "learning_rate": 0.004011041968892478, "loss": 2.6691, "step": 7365 }, { "crossentropy": 2.4817439317703247, "epoch": 0.40055466434650205, "grad_norm": 0.03483403101563454, "grad_norm_var": 3.349704921586889e-06, "learning_rate": 0.004009821417868247, "loss": 2.4817, "step": 7366 }, { "crossentropy": 2.7035696506500244, "epoch": 0.40060904320400226, "grad_norm": 0.03353477269411087, "grad_norm_var": 3.3721816001555977e-06, "learning_rate": 0.00400860092825397, "loss": 2.7036, "step": 7367 }, { "crossentropy": 2.6761590242385864, "epoch": 0.40066342206150246, "grad_norm": 0.0312960185110569, "grad_norm_var": 3.689836859348825e-06, "learning_rate": 0.00400738050012534, "loss": 2.6762, "step": 7368 }, { "crossentropy": 2.5367605686187744, "epoch": 0.40071780091900266, "grad_norm": 0.0357455313205719, "grad_norm_var": 3.7495148598592434e-06, "learning_rate": 0.004006160133558046, "loss": 2.5368, "step": 7369 }, { "crossentropy": 2.587629795074463, "epoch": 0.40077217977650287, "grad_norm": 0.032058220356702805, "grad_norm_var": 3.940602618574086e-06, "learning_rate": 0.004004939828627771, "loss": 2.5876, "step": 7370 }, { "crossentropy": 2.5838674306869507, "epoch": 0.4008265586340031, "grad_norm": 0.03295568376779556, "grad_norm_var": 3.273425713600358e-06, "learning_rate": 0.004003719585410204, "loss": 2.5839, "step": 7371 }, { "crossentropy": 2.596969246864319, "epoch": 0.40088093749150333, "grad_norm": 0.03311211243271828, "grad_norm_var": 3.3661987295002344e-06, "learning_rate": 0.004002499403981017, "loss": 2.597, "step": 7372 }, { "crossentropy": 2.658413887023926, "epoch": 0.40093531634900353, "grad_norm": 0.034599486738443375, "grad_norm_var": 3.3219857949794962e-06, "learning_rate": 0.0040012792844158875, "loss": 2.6584, "step": 7373 }, { "crossentropy": 2.523485541343689, "epoch": 0.40098969520650374, "grad_norm": 0.031149111688137054, "grad_norm_var": 3.85852180989765e-06, "learning_rate": 0.004000059226790487, "loss": 2.5235, "step": 7374 }, { "crossentropy": 2.5317527055740356, "epoch": 0.40104407406400394, "grad_norm": 0.03460092842578888, "grad_norm_var": 3.2925897696091273e-06, "learning_rate": 0.00399883923118048, "loss": 2.5318, "step": 7375 }, { "crossentropy": 2.6616557836532593, "epoch": 0.40109845292150414, "grad_norm": 0.032450754195451736, "grad_norm_var": 3.289698010452054e-06, "learning_rate": 0.003997619297661528, "loss": 2.6617, "step": 7376 }, { "crossentropy": 2.499809980392456, "epoch": 0.40115283177900435, "grad_norm": 0.033288583159446716, "grad_norm_var": 3.3124265328221894e-06, "learning_rate": 0.003996399426309296, "loss": 2.4998, "step": 7377 }, { "crossentropy": 2.7145503759384155, "epoch": 0.40120721063650455, "grad_norm": 0.03403080627322197, "grad_norm_var": 3.2198999905195167e-06, "learning_rate": 0.003995179617199435, "loss": 2.7146, "step": 7378 }, { "crossentropy": 2.6452914476394653, "epoch": 0.40126158949400476, "grad_norm": 0.03382652997970581, "grad_norm_var": 2.7103629176181604e-06, "learning_rate": 0.0039939598704075985, "loss": 2.6453, "step": 7379 }, { "crossentropy": 2.660678267478943, "epoch": 0.40131596835150496, "grad_norm": 0.03288205340504646, "grad_norm_var": 1.8758646379507873e-06, "learning_rate": 0.003992740186009432, "loss": 2.6607, "step": 7380 }, { "crossentropy": 2.5614328384399414, "epoch": 0.40137034720900516, "grad_norm": 0.031616948544979095, "grad_norm_var": 1.7737563857913216e-06, "learning_rate": 0.0039915205640805775, "loss": 2.5614, "step": 7381 }, { "crossentropy": 2.591994881629944, "epoch": 0.40142472606650537, "grad_norm": 0.0353136770427227, "grad_norm_var": 1.8895120353132806e-06, "learning_rate": 0.003990301004696682, "loss": 2.592, "step": 7382 }, { "crossentropy": 2.701565742492676, "epoch": 0.40147910492400557, "grad_norm": 0.03163663297891617, "grad_norm_var": 2.0499191294003996e-06, "learning_rate": 0.0039890815079333735, "loss": 2.7016, "step": 7383 }, { "crossentropy": 2.5387396812438965, "epoch": 0.4015334837815058, "grad_norm": 0.033705826848745346, "grad_norm_var": 1.8138940642462945e-06, "learning_rate": 0.003987862073866288, "loss": 2.5387, "step": 7384 }, { "crossentropy": 2.541064739227295, "epoch": 0.401587862639006, "grad_norm": 0.03277638554573059, "grad_norm_var": 1.4010090901989125e-06, "learning_rate": 0.003986642702571055, "loss": 2.5411, "step": 7385 }, { "crossentropy": 2.641740918159485, "epoch": 0.4016422414965062, "grad_norm": 0.03212796151638031, "grad_norm_var": 1.3913911118707338e-06, "learning_rate": 0.003985423394123295, "loss": 2.6417, "step": 7386 }, { "crossentropy": 2.4981924295425415, "epoch": 0.4016966203540064, "grad_norm": 0.03349838778376579, "grad_norm_var": 1.3972149494443273e-06, "learning_rate": 0.003984204148598629, "loss": 2.4982, "step": 7387 }, { "crossentropy": 2.5628154277801514, "epoch": 0.4017509992115066, "grad_norm": 0.035603299736976624, "grad_norm_var": 1.7680181556899964e-06, "learning_rate": 0.003982984966072678, "loss": 2.5628, "step": 7388 }, { "crossentropy": 2.5805951356887817, "epoch": 0.4018053780690068, "grad_norm": 0.03276245668530464, "grad_norm_var": 1.6653482521143565e-06, "learning_rate": 0.003981765846621048, "loss": 2.5806, "step": 7389 }, { "crossentropy": 2.635859727859497, "epoch": 0.401859756926507, "grad_norm": 0.032670214772224426, "grad_norm_var": 1.3931179077446172e-06, "learning_rate": 0.003980546790319353, "loss": 2.6359, "step": 7390 }, { "crossentropy": 2.7280993461608887, "epoch": 0.4019141357840072, "grad_norm": 0.03448260948061943, "grad_norm_var": 1.3734611713794402e-06, "learning_rate": 0.0039793277972431955, "loss": 2.7281, "step": 7391 }, { "crossentropy": 2.661067843437195, "epoch": 0.4019685146415074, "grad_norm": 0.03737485036253929, "grad_norm_var": 2.3365183794782363e-06, "learning_rate": 0.0039781088674681764, "loss": 2.6611, "step": 7392 }, { "crossentropy": 2.6318646669387817, "epoch": 0.4020228934990076, "grad_norm": 0.03476378321647644, "grad_norm_var": 2.4113123360665806e-06, "learning_rate": 0.003976890001069893, "loss": 2.6319, "step": 7393 }, { "crossentropy": 2.6267147064208984, "epoch": 0.4020772723565078, "grad_norm": 0.033503834158182144, "grad_norm_var": 2.404864896637706e-06, "learning_rate": 0.003975671198123936, "loss": 2.6267, "step": 7394 }, { "crossentropy": 2.5071773529052734, "epoch": 0.402131651214008, "grad_norm": 0.03207625821232796, "grad_norm_var": 2.557255380350859e-06, "learning_rate": 0.003974452458705897, "loss": 2.5072, "step": 7395 }, { "crossentropy": 2.6613614559173584, "epoch": 0.4021860300715082, "grad_norm": 0.0435505211353302, "grad_norm_var": 8.721067710770515e-06, "learning_rate": 0.003973233782891362, "loss": 2.6614, "step": 7396 }, { "crossentropy": 2.699031114578247, "epoch": 0.4022404089290084, "grad_norm": 0.03411482647061348, "grad_norm_var": 8.245255544508424e-06, "learning_rate": 0.003972015170755909, "loss": 2.699, "step": 7397 }, { "crossentropy": 2.586833357810974, "epoch": 0.4022947877865086, "grad_norm": 0.032511044293642044, "grad_norm_var": 8.384509931786722e-06, "learning_rate": 0.003970796622375115, "loss": 2.5868, "step": 7398 }, { "crossentropy": 2.64581298828125, "epoch": 0.4023491666440088, "grad_norm": 0.033408861607313156, "grad_norm_var": 7.975700342360836e-06, "learning_rate": 0.00396957813782456, "loss": 2.6458, "step": 7399 }, { "crossentropy": 2.6013476848602295, "epoch": 0.40240354550150903, "grad_norm": 0.033299703150987625, "grad_norm_var": 8.018627007958663e-06, "learning_rate": 0.003968359717179804, "loss": 2.6013, "step": 7400 }, { "crossentropy": 2.6761021614074707, "epoch": 0.40245792435900923, "grad_norm": 0.032623618841171265, "grad_norm_var": 8.050769860564142e-06, "learning_rate": 0.003967141360516417, "loss": 2.6761, "step": 7401 }, { "crossentropy": 2.6272767782211304, "epoch": 0.40251230321650944, "grad_norm": 0.033009301871061325, "grad_norm_var": 7.847218470430755e-06, "learning_rate": 0.003965923067909962, "loss": 2.6273, "step": 7402 }, { "crossentropy": 2.6366838216781616, "epoch": 0.40256668207400964, "grad_norm": 0.031793415546417236, "grad_norm_var": 8.217576219987413e-06, "learning_rate": 0.003964704839435992, "loss": 2.6367, "step": 7403 }, { "crossentropy": 2.535654902458191, "epoch": 0.40262106093150984, "grad_norm": 0.031747911125421524, "grad_norm_var": 8.436408666689875e-06, "learning_rate": 0.003963486675170063, "loss": 2.5357, "step": 7404 }, { "crossentropy": 2.5616869926452637, "epoch": 0.40267543978901005, "grad_norm": 0.036859508603811264, "grad_norm_var": 8.819960512806922e-06, "learning_rate": 0.003962268575187725, "loss": 2.5617, "step": 7405 }, { "crossentropy": 2.5626403093338013, "epoch": 0.40272981864651025, "grad_norm": 0.03184165805578232, "grad_norm_var": 9.035944547748585e-06, "learning_rate": 0.003961050539564522, "loss": 2.5626, "step": 7406 }, { "crossentropy": 2.6719521284103394, "epoch": 0.40278419750401045, "grad_norm": 0.0317213237285614, "grad_norm_var": 9.402956185705635e-06, "learning_rate": 0.003959832568375997, "loss": 2.672, "step": 7407 }, { "crossentropy": 2.5979634523391724, "epoch": 0.40283857636151066, "grad_norm": 0.03193990886211395, "grad_norm_var": 8.812580041515858e-06, "learning_rate": 0.003958614661697685, "loss": 2.598, "step": 7408 }, { "crossentropy": 2.5721147060394287, "epoch": 0.40289295521901086, "grad_norm": 0.03292107954621315, "grad_norm_var": 8.756764930358216e-06, "learning_rate": 0.00395739681960512, "loss": 2.5721, "step": 7409 }, { "crossentropy": 2.6305688619613647, "epoch": 0.40294733407651107, "grad_norm": 0.03665820509195328, "grad_norm_var": 9.35599954973437e-06, "learning_rate": 0.003956179042173836, "loss": 2.6306, "step": 7410 }, { "crossentropy": 2.629836678504944, "epoch": 0.40300171293401127, "grad_norm": 0.038754019886255264, "grad_norm_var": 1.064849132939993e-05, "learning_rate": 0.003954961329479351, "loss": 2.6298, "step": 7411 }, { "crossentropy": 2.5770469903945923, "epoch": 0.4030560917915115, "grad_norm": 0.032646939158439636, "grad_norm_var": 4.444665252156752e-06, "learning_rate": 0.00395374368159719, "loss": 2.577, "step": 7412 }, { "crossentropy": 2.6058998107910156, "epoch": 0.4031104706490117, "grad_norm": 0.03442758694291115, "grad_norm_var": 4.476805564036732e-06, "learning_rate": 0.003952526098602873, "loss": 2.6059, "step": 7413 }, { "crossentropy": 2.5190852880477905, "epoch": 0.4031648495065119, "grad_norm": 0.033246610313653946, "grad_norm_var": 4.412623551967344e-06, "learning_rate": 0.00395130858057191, "loss": 2.5191, "step": 7414 }, { "crossentropy": 2.5588343143463135, "epoch": 0.4032192283640121, "grad_norm": 0.032952770590782166, "grad_norm_var": 4.434586421938244e-06, "learning_rate": 0.003950091127579809, "loss": 2.5588, "step": 7415 }, { "crossentropy": 2.559045672416687, "epoch": 0.4032736072215123, "grad_norm": 0.03156377375125885, "grad_norm_var": 4.6757038532425795e-06, "learning_rate": 0.003948873739702079, "loss": 2.559, "step": 7416 }, { "crossentropy": 2.6187374591827393, "epoch": 0.4033279860790125, "grad_norm": 0.037435662001371384, "grad_norm_var": 5.61247204441002e-06, "learning_rate": 0.00394765641701422, "loss": 2.6187, "step": 7417 }, { "crossentropy": 2.567339539527893, "epoch": 0.4033823649365127, "grad_norm": 0.03681303933262825, "grad_norm_var": 6.156317298151396e-06, "learning_rate": 0.003946439159591728, "loss": 2.5673, "step": 7418 }, { "crossentropy": 2.599404454231262, "epoch": 0.4034367437940129, "grad_norm": 0.03295167163014412, "grad_norm_var": 5.9059231508913505e-06, "learning_rate": 0.0039452219675100994, "loss": 2.5994, "step": 7419 }, { "crossentropy": 2.582564949989319, "epoch": 0.4034911226515131, "grad_norm": 0.043083831667900085, "grad_norm_var": 1.0487931844911184e-05, "learning_rate": 0.0039440048408448175, "loss": 2.5826, "step": 7420 }, { "crossentropy": 2.6441357135772705, "epoch": 0.4035455015090133, "grad_norm": 0.03516722843050957, "grad_norm_var": 1.0188363743039256e-05, "learning_rate": 0.003942787779671375, "loss": 2.6441, "step": 7421 }, { "crossentropy": 2.5464826822280884, "epoch": 0.4035998803665135, "grad_norm": 0.031609538942575455, "grad_norm_var": 1.0278115835152661e-05, "learning_rate": 0.003941570784065245, "loss": 2.5465, "step": 7422 }, { "crossentropy": 2.6816192865371704, "epoch": 0.4036542592240137, "grad_norm": 0.03166621923446655, "grad_norm_var": 1.0299590650677572e-05, "learning_rate": 0.00394035385410191, "loss": 2.6816, "step": 7423 }, { "crossentropy": 2.5254262685775757, "epoch": 0.4037086380815139, "grad_norm": 0.034437183290719986, "grad_norm_var": 9.798679325898988e-06, "learning_rate": 0.003939136989856841, "loss": 2.5254, "step": 7424 }, { "crossentropy": 2.6583242416381836, "epoch": 0.4037630169390141, "grad_norm": 0.0442967414855957, "grad_norm_var": 1.5080719339279474e-05, "learning_rate": 0.003937920191405506, "loss": 2.6583, "step": 7425 }, { "crossentropy": 2.6281810998916626, "epoch": 0.4038173957965143, "grad_norm": 0.032245881855487823, "grad_norm_var": 1.560549770005983e-05, "learning_rate": 0.0039367034588233705, "loss": 2.6282, "step": 7426 }, { "crossentropy": 2.606797933578491, "epoch": 0.4038717746540145, "grad_norm": 0.03394749015569687, "grad_norm_var": 1.4775704548270855e-05, "learning_rate": 0.003935486792185896, "loss": 2.6068, "step": 7427 }, { "crossentropy": 2.5870665311813354, "epoch": 0.40392615351151473, "grad_norm": 0.03416142612695694, "grad_norm_var": 1.4462931563853875e-05, "learning_rate": 0.003934270191568538, "loss": 2.5871, "step": 7428 }, { "crossentropy": 2.6194926500320435, "epoch": 0.40398053236901493, "grad_norm": 0.03255840763449669, "grad_norm_var": 1.4824058711541502e-05, "learning_rate": 0.003933053657046749, "loss": 2.6195, "step": 7429 }, { "crossentropy": 2.5313671827316284, "epoch": 0.40403491122651514, "grad_norm": 0.03231925889849663, "grad_norm_var": 1.5080215174125232e-05, "learning_rate": 0.003931837188695979, "loss": 2.5314, "step": 7430 }, { "crossentropy": 2.588559150695801, "epoch": 0.40408929008401534, "grad_norm": 0.03364108130335808, "grad_norm_var": 1.4937944417222199e-05, "learning_rate": 0.003930620786591668, "loss": 2.5886, "step": 7431 }, { "crossentropy": 2.6067981719970703, "epoch": 0.40414366894151554, "grad_norm": 0.034962136298418045, "grad_norm_var": 1.41622586329213e-05, "learning_rate": 0.00392940445080926, "loss": 2.6068, "step": 7432 }, { "crossentropy": 2.62353777885437, "epoch": 0.40419804779901575, "grad_norm": 0.03555557131767273, "grad_norm_var": 1.3792928705627591e-05, "learning_rate": 0.003928188181424191, "loss": 2.6235, "step": 7433 }, { "crossentropy": 2.6689990758895874, "epoch": 0.40425242665651595, "grad_norm": 0.038110751658678055, "grad_norm_var": 1.42181972987311e-05, "learning_rate": 0.00392697197851189, "loss": 2.669, "step": 7434 }, { "crossentropy": 2.626444935798645, "epoch": 0.40430680551401615, "grad_norm": 0.03257264196872711, "grad_norm_var": 1.4332949780500546e-05, "learning_rate": 0.003925755842147788, "loss": 2.6264, "step": 7435 }, { "crossentropy": 2.5608396530151367, "epoch": 0.40436118437151636, "grad_norm": 0.03239339590072632, "grad_norm_var": 9.983042810942043e-06, "learning_rate": 0.003924539772407306, "loss": 2.5608, "step": 7436 }, { "crossentropy": 2.564718246459961, "epoch": 0.40441556322901656, "grad_norm": 0.03505855053663254, "grad_norm_var": 9.971979748950252e-06, "learning_rate": 0.003923323769365864, "loss": 2.5647, "step": 7437 }, { "crossentropy": 2.6743305921554565, "epoch": 0.40446994208651676, "grad_norm": 0.032406531274318695, "grad_norm_var": 9.720885913194522e-06, "learning_rate": 0.00392210783309888, "loss": 2.6743, "step": 7438 }, { "crossentropy": 2.5319902896881104, "epoch": 0.40452432094401697, "grad_norm": 0.03356519713997841, "grad_norm_var": 9.25513900147523e-06, "learning_rate": 0.003920891963681762, "loss": 2.532, "step": 7439 }, { "crossentropy": 2.6182444095611572, "epoch": 0.40457869980151717, "grad_norm": 0.03261912614107132, "grad_norm_var": 9.48046813825006e-06, "learning_rate": 0.003919676161189919, "loss": 2.6182, "step": 7440 }, { "crossentropy": 2.546432375907898, "epoch": 0.4046330786590174, "grad_norm": 0.03216375783085823, "grad_norm_var": 2.6722165817588106e-06, "learning_rate": 0.0039184604256987535, "loss": 2.5464, "step": 7441 }, { "crossentropy": 2.6008856296539307, "epoch": 0.4046874575165176, "grad_norm": 0.033535879105329514, "grad_norm_var": 2.5359916146436754e-06, "learning_rate": 0.003917244757283662, "loss": 2.6009, "step": 7442 }, { "crossentropy": 2.684719681739807, "epoch": 0.4047418363740178, "grad_norm": 0.04833407700061798, "grad_norm_var": 1.5902094759231062e-05, "learning_rate": 0.003916029156020044, "loss": 2.6847, "step": 7443 }, { "crossentropy": 2.587733268737793, "epoch": 0.404796215231518, "grad_norm": 0.03496631979942322, "grad_norm_var": 1.589311838671332e-05, "learning_rate": 0.003914813621983286, "loss": 2.5877, "step": 7444 }, { "crossentropy": 2.5116934776306152, "epoch": 0.4048505940890182, "grad_norm": 0.032157618552446365, "grad_norm_var": 1.6016140861880673e-05, "learning_rate": 0.0039135981552487755, "loss": 2.5117, "step": 7445 }, { "crossentropy": 2.549926519393921, "epoch": 0.4049049729465184, "grad_norm": 0.0331810861825943, "grad_norm_var": 1.579501000184876e-05, "learning_rate": 0.003912382755891893, "loss": 2.5499, "step": 7446 }, { "crossentropy": 2.711159348487854, "epoch": 0.4049593518040186, "grad_norm": 0.03455404192209244, "grad_norm_var": 1.5718022950414468e-05, "learning_rate": 0.003911167423988021, "loss": 2.7112, "step": 7447 }, { "crossentropy": 2.645883321762085, "epoch": 0.4050137306615188, "grad_norm": 0.032674189656972885, "grad_norm_var": 1.5983083507207367e-05, "learning_rate": 0.003909952159612528, "loss": 2.6459, "step": 7448 }, { "crossentropy": 2.6308475732803345, "epoch": 0.405068109519019, "grad_norm": 0.031103091314435005, "grad_norm_var": 1.666406045237264e-05, "learning_rate": 0.0039087369628407875, "loss": 2.6308, "step": 7449 }, { "crossentropy": 2.5973201990127563, "epoch": 0.4051224883765192, "grad_norm": 0.03352876007556915, "grad_norm_var": 1.567088194408688e-05, "learning_rate": 0.003907521833748161, "loss": 2.5973, "step": 7450 }, { "crossentropy": 2.4884122610092163, "epoch": 0.4051768672340194, "grad_norm": 0.033410970121622086, "grad_norm_var": 1.5549572145483294e-05, "learning_rate": 0.003906306772410014, "loss": 2.4884, "step": 7451 }, { "crossentropy": 2.520238161087036, "epoch": 0.4052312460915196, "grad_norm": 0.03293728455901146, "grad_norm_var": 1.5444061855734398e-05, "learning_rate": 0.003905091778901701, "loss": 2.5202, "step": 7452 }, { "crossentropy": 2.477941155433655, "epoch": 0.4052856249490198, "grad_norm": 0.034199733287096024, "grad_norm_var": 1.538466606905846e-05, "learning_rate": 0.003903876853298573, "loss": 2.4779, "step": 7453 }, { "crossentropy": 2.6775577068328857, "epoch": 0.40534000380652, "grad_norm": 0.03256300464272499, "grad_norm_var": 1.5351207351334954e-05, "learning_rate": 0.0039026619956759817, "loss": 2.6776, "step": 7454 }, { "crossentropy": 2.5783404111862183, "epoch": 0.4053943826640202, "grad_norm": 0.03384082764387131, "grad_norm_var": 1.5336544372851464e-05, "learning_rate": 0.003901447206109272, "loss": 2.5783, "step": 7455 }, { "crossentropy": 2.5474921464920044, "epoch": 0.40544876152152043, "grad_norm": 0.032414160668849945, "grad_norm_var": 1.5379930423162892e-05, "learning_rate": 0.0039002324846737807, "loss": 2.5475, "step": 7456 }, { "crossentropy": 2.6318581104278564, "epoch": 0.40550314037902063, "grad_norm": 0.038783762603998184, "grad_norm_var": 1.6411843457060634e-05, "learning_rate": 0.003899017831444846, "loss": 2.6319, "step": 7457 }, { "crossentropy": 2.5529398918151855, "epoch": 0.40555751923652084, "grad_norm": 0.033507756888866425, "grad_norm_var": 1.641555129117052e-05, "learning_rate": 0.0038978032464977996, "loss": 2.5529, "step": 7458 }, { "crossentropy": 2.5865710973739624, "epoch": 0.40561189809402104, "grad_norm": 0.032784365117549896, "grad_norm_var": 2.865829360078906e-06, "learning_rate": 0.003896588729907966, "loss": 2.5866, "step": 7459 }, { "crossentropy": 2.611057162284851, "epoch": 0.40566627695152124, "grad_norm": 0.03183203190565109, "grad_norm_var": 2.8828855135644283e-06, "learning_rate": 0.003895374281750673, "loss": 2.6111, "step": 7460 }, { "crossentropy": 2.5171152353286743, "epoch": 0.40572065580902145, "grad_norm": 0.038547154515981674, "grad_norm_var": 4.425464932837269e-06, "learning_rate": 0.0038941599021012365, "loss": 2.5171, "step": 7461 }, { "crossentropy": 2.6454416513442993, "epoch": 0.40577503466652165, "grad_norm": 0.033170342445373535, "grad_norm_var": 4.426274779652572e-06, "learning_rate": 0.0038929455910349715, "loss": 2.6454, "step": 7462 }, { "crossentropy": 2.543592929840088, "epoch": 0.40582941352402185, "grad_norm": 0.032233938574790955, "grad_norm_var": 4.511105138296995e-06, "learning_rate": 0.00389173134862719, "loss": 2.5436, "step": 7463 }, { "crossentropy": 2.5937150716781616, "epoch": 0.40588379238152206, "grad_norm": 0.03252440318465233, "grad_norm_var": 4.530911575852794e-06, "learning_rate": 0.003890517174953196, "loss": 2.5937, "step": 7464 }, { "crossentropy": 2.686095118522644, "epoch": 0.40593817123902226, "grad_norm": 0.03242764621973038, "grad_norm_var": 4.202002906268596e-06, "learning_rate": 0.0038893030700882913, "loss": 2.6861, "step": 7465 }, { "crossentropy": 2.6425251960754395, "epoch": 0.40599255009652246, "grad_norm": 0.03424576297402382, "grad_norm_var": 4.220713936347921e-06, "learning_rate": 0.003888089034107777, "loss": 2.6425, "step": 7466 }, { "crossentropy": 2.6470770835876465, "epoch": 0.40604692895402267, "grad_norm": 0.033255599439144135, "grad_norm_var": 4.228499177386045e-06, "learning_rate": 0.003886875067086943, "loss": 2.6471, "step": 7467 }, { "crossentropy": 2.641032099723816, "epoch": 0.40610130781152287, "grad_norm": 0.032253652811050415, "grad_norm_var": 4.327617006636789e-06, "learning_rate": 0.0038856611691010794, "loss": 2.641, "step": 7468 }, { "crossentropy": 2.5921331644058228, "epoch": 0.4061556866690231, "grad_norm": 0.03349197655916214, "grad_norm_var": 4.30813355969713e-06, "learning_rate": 0.003884447340225472, "loss": 2.5921, "step": 7469 }, { "crossentropy": 2.5882474184036255, "epoch": 0.4062100655265233, "grad_norm": 0.03230326622724533, "grad_norm_var": 4.348861300875149e-06, "learning_rate": 0.003883233580535399, "loss": 2.5882, "step": 7470 }, { "crossentropy": 2.491024971008301, "epoch": 0.4062644443840235, "grad_norm": 0.03258230909705162, "grad_norm_var": 4.407616231146862e-06, "learning_rate": 0.0038820198901061394, "loss": 2.491, "step": 7471 }, { "crossentropy": 2.6117244958877563, "epoch": 0.4063188232415237, "grad_norm": 0.030881742015480995, "grad_norm_var": 4.780820172946504e-06, "learning_rate": 0.0038808062690129643, "loss": 2.6117, "step": 7472 }, { "crossentropy": 2.593702793121338, "epoch": 0.4063732020990239, "grad_norm": 0.032342877238988876, "grad_norm_var": 2.7729895141579414e-06, "learning_rate": 0.00387959271733114, "loss": 2.5937, "step": 7473 }, { "crossentropy": 2.56113600730896, "epoch": 0.4064275809565241, "grad_norm": 0.033184681087732315, "grad_norm_var": 2.7586766732888424e-06, "learning_rate": 0.003878379235135933, "loss": 2.5611, "step": 7474 }, { "crossentropy": 2.599774718284607, "epoch": 0.4064819598140243, "grad_norm": 0.03181453049182892, "grad_norm_var": 2.84584595276389e-06, "learning_rate": 0.0038771658225025974, "loss": 2.5998, "step": 7475 }, { "crossentropy": 2.6067962646484375, "epoch": 0.4065363386715245, "grad_norm": 0.03383748605847359, "grad_norm_var": 2.80007984868034e-06, "learning_rate": 0.0038759524795063905, "loss": 2.6068, "step": 7476 }, { "crossentropy": 2.569576144218445, "epoch": 0.4065907175290247, "grad_norm": 0.03491842374205589, "grad_norm_var": 9.723600517632837e-07, "learning_rate": 0.003874739206222565, "loss": 2.5696, "step": 7477 }, { "crossentropy": 2.6739896535873413, "epoch": 0.4066450963865249, "grad_norm": 0.033556677401065826, "grad_norm_var": 9.986126488797324e-07, "learning_rate": 0.0038735260027263634, "loss": 2.674, "step": 7478 }, { "crossentropy": 2.514407157897949, "epoch": 0.4066994752440251, "grad_norm": 0.033019326627254486, "grad_norm_var": 9.709830536794394e-07, "learning_rate": 0.0038723128690930295, "loss": 2.5144, "step": 7479 }, { "crossentropy": 2.68319034576416, "epoch": 0.4067538541015253, "grad_norm": 0.034689441323280334, "grad_norm_var": 1.151184154283289e-06, "learning_rate": 0.0038710998053978008, "loss": 2.6832, "step": 7480 }, { "crossentropy": 2.721849203109741, "epoch": 0.4068082329590255, "grad_norm": 0.03244630992412567, "grad_norm_var": 1.1496563617918125e-06, "learning_rate": 0.0038698868117159075, "loss": 2.7218, "step": 7481 }, { "crossentropy": 2.630092978477478, "epoch": 0.4068626118165257, "grad_norm": 0.03493443503975868, "grad_norm_var": 1.2889585659321545e-06, "learning_rate": 0.003868673888122582, "loss": 2.6301, "step": 7482 }, { "crossentropy": 2.5874717235565186, "epoch": 0.4069169906740259, "grad_norm": 0.0334957055747509, "grad_norm_var": 1.2977177417243611e-06, "learning_rate": 0.003867461034693048, "loss": 2.5875, "step": 7483 }, { "crossentropy": 2.6224619150161743, "epoch": 0.40697136953152613, "grad_norm": 0.031582314521074295, "grad_norm_var": 1.4024992900794058e-06, "learning_rate": 0.0038662482515025233, "loss": 2.6225, "step": 7484 }, { "crossentropy": 2.5158581733703613, "epoch": 0.40702574838902633, "grad_norm": 0.032456040382385254, "grad_norm_var": 1.4109542498731553e-06, "learning_rate": 0.003865035538626225, "loss": 2.5159, "step": 7485 }, { "crossentropy": 2.600654721260071, "epoch": 0.40708012724652654, "grad_norm": 0.0314876027405262, "grad_norm_var": 1.5286190318516361e-06, "learning_rate": 0.003863822896139367, "loss": 2.6007, "step": 7486 }, { "crossentropy": 2.55756676197052, "epoch": 0.40713450610402674, "grad_norm": 0.03343462944030762, "grad_norm_var": 1.5320243702587281e-06, "learning_rate": 0.0038626103241171505, "loss": 2.5576, "step": 7487 }, { "crossentropy": 2.679749846458435, "epoch": 0.40718888496152694, "grad_norm": 0.03205617517232895, "grad_norm_var": 1.28572515840919e-06, "learning_rate": 0.0038613978226347836, "loss": 2.6797, "step": 7488 }, { "crossentropy": 2.659158706665039, "epoch": 0.40724326381902715, "grad_norm": 0.03399825468659401, "grad_norm_var": 1.294618810520944e-06, "learning_rate": 0.003860185391767461, "loss": 2.6592, "step": 7489 }, { "crossentropy": 2.6513240337371826, "epoch": 0.40729764267652735, "grad_norm": 0.0322842076420784, "grad_norm_var": 1.3449754441285355e-06, "learning_rate": 0.003858973031590378, "loss": 2.6513, "step": 7490 }, { "crossentropy": 2.5686761140823364, "epoch": 0.40735202153402755, "grad_norm": 0.03219839185476303, "grad_norm_var": 1.2870759947842975e-06, "learning_rate": 0.0038577607421787243, "loss": 2.5687, "step": 7491 }, { "crossentropy": 2.6464736461639404, "epoch": 0.40740640039152776, "grad_norm": 0.033427901566028595, "grad_norm_var": 1.2600008518342625e-06, "learning_rate": 0.0038565485236076824, "loss": 2.6465, "step": 7492 }, { "crossentropy": 2.5151926279067993, "epoch": 0.40746077924902796, "grad_norm": 0.03208734095096588, "grad_norm_var": 1.083628609871839e-06, "learning_rate": 0.0038553363759524367, "loss": 2.5152, "step": 7493 }, { "crossentropy": 2.5436160564422607, "epoch": 0.40751515810652816, "grad_norm": 0.03180363401770592, "grad_norm_var": 1.1332359657352982e-06, "learning_rate": 0.0038541242992881624, "loss": 2.5436, "step": 7494 }, { "crossentropy": 2.5336880683898926, "epoch": 0.40756953696402837, "grad_norm": 0.03135848790407181, "grad_norm_var": 1.265394104848124e-06, "learning_rate": 0.00385291229369003, "loss": 2.5337, "step": 7495 }, { "crossentropy": 2.6402599811553955, "epoch": 0.40762391582152857, "grad_norm": 0.03128272667527199, "grad_norm_var": 1.1024444542465441e-06, "learning_rate": 0.0038517003592332066, "loss": 2.6403, "step": 7496 }, { "crossentropy": 2.588911294937134, "epoch": 0.4076782946790288, "grad_norm": 0.03482260927557945, "grad_norm_var": 1.4317410444062798e-06, "learning_rate": 0.00385048849599286, "loss": 2.5889, "step": 7497 }, { "crossentropy": 2.7312474250793457, "epoch": 0.407732673536529, "grad_norm": 0.03453310579061508, "grad_norm_var": 1.3206045011731592e-06, "learning_rate": 0.003849276704044141, "loss": 2.7312, "step": 7498 }, { "crossentropy": 2.7216843366622925, "epoch": 0.4077870523940292, "grad_norm": 0.036165010184049606, "grad_norm_var": 2.068942837653703e-06, "learning_rate": 0.0038480649834622097, "loss": 2.7217, "step": 7499 }, { "crossentropy": 2.684593915939331, "epoch": 0.4078414312515294, "grad_norm": 0.032290153205394745, "grad_norm_var": 1.9842817308544e-06, "learning_rate": 0.003846853334322215, "loss": 2.6846, "step": 7500 }, { "crossentropy": 2.7088953256607056, "epoch": 0.4078958101090296, "grad_norm": 0.03332894295454025, "grad_norm_var": 1.9854248252922554e-06, "learning_rate": 0.0038456417566993008, "loss": 2.7089, "step": 7501 }, { "crossentropy": 2.628993511199951, "epoch": 0.4079501889665298, "grad_norm": 0.033874593675136566, "grad_norm_var": 1.8888491969857104e-06, "learning_rate": 0.003844430250668609, "loss": 2.629, "step": 7502 }, { "crossentropy": 2.5853331089019775, "epoch": 0.40800456782403, "grad_norm": 0.034030817449092865, "grad_norm_var": 1.9409128853878703e-06, "learning_rate": 0.003843218816305274, "loss": 2.5853, "step": 7503 }, { "crossentropy": 2.5972816944122314, "epoch": 0.4080589466815302, "grad_norm": 0.0327792689204216, "grad_norm_var": 1.8733015281740517e-06, "learning_rate": 0.003842007453684431, "loss": 2.5973, "step": 7504 }, { "crossentropy": 2.588180184364319, "epoch": 0.4081133255390304, "grad_norm": 0.034905970096588135, "grad_norm_var": 2.0284792128231107e-06, "learning_rate": 0.003840796162881206, "loss": 2.5882, "step": 7505 }, { "crossentropy": 2.6030818223953247, "epoch": 0.4081677043965306, "grad_norm": 0.03256897255778313, "grad_norm_var": 1.998839686706497e-06, "learning_rate": 0.003839584943970722, "loss": 2.6031, "step": 7506 }, { "crossentropy": 2.7048728466033936, "epoch": 0.4082220832540308, "grad_norm": 0.03257826715707779, "grad_norm_var": 1.9563107705524693e-06, "learning_rate": 0.0038383737970280976, "loss": 2.7049, "step": 7507 }, { "crossentropy": 2.5309741497039795, "epoch": 0.408276462111531, "grad_norm": 0.031146952882409096, "grad_norm_var": 2.2242935855287658e-06, "learning_rate": 0.0038371627221284493, "loss": 2.531, "step": 7508 }, { "crossentropy": 2.592730402946472, "epoch": 0.4083308409690312, "grad_norm": 0.03160068020224571, "grad_norm_var": 2.3046305446649257e-06, "learning_rate": 0.0038359517193468817, "loss": 2.5927, "step": 7509 }, { "crossentropy": 2.6451140642166138, "epoch": 0.4083852198265314, "grad_norm": 0.033853184431791306, "grad_norm_var": 2.221958161655379e-06, "learning_rate": 0.0038347407887585052, "loss": 2.6451, "step": 7510 }, { "crossentropy": 2.553077816963196, "epoch": 0.4084395986840316, "grad_norm": 0.03216644749045372, "grad_norm_var": 2.064916134136132e-06, "learning_rate": 0.003833529930438419, "loss": 2.5531, "step": 7511 }, { "crossentropy": 2.579169273376465, "epoch": 0.4084939775415318, "grad_norm": 0.03349415957927704, "grad_norm_var": 1.791834957985553e-06, "learning_rate": 0.003832319144461718, "loss": 2.5792, "step": 7512 }, { "crossentropy": 2.650501847267151, "epoch": 0.40854835639903203, "grad_norm": 0.03235512226819992, "grad_norm_var": 1.6989657382107756e-06, "learning_rate": 0.0038311084309034936, "loss": 2.6505, "step": 7513 }, { "crossentropy": 2.4810279607772827, "epoch": 0.40860273525653223, "grad_norm": 0.0336126871407032, "grad_norm_var": 1.5919294562500059e-06, "learning_rate": 0.0038298977898388376, "loss": 2.481, "step": 7514 }, { "crossentropy": 2.5198867321014404, "epoch": 0.40865711411403244, "grad_norm": 0.03314783424139023, "grad_norm_var": 9.568110746283835e-07, "learning_rate": 0.003828687221342828, "loss": 2.5199, "step": 7515 }, { "crossentropy": 2.5137592554092407, "epoch": 0.40871149297153264, "grad_norm": 0.046595342457294464, "grad_norm_var": 1.2424484549267824e-05, "learning_rate": 0.0038274767254905468, "loss": 2.5138, "step": 7516 }, { "crossentropy": 2.5725133419036865, "epoch": 0.40876587182903285, "grad_norm": 0.03317682817578316, "grad_norm_var": 1.243705558975036e-05, "learning_rate": 0.0038262663023570643, "loss": 2.5725, "step": 7517 }, { "crossentropy": 2.661880850791931, "epoch": 0.40882025068653305, "grad_norm": 0.032772090286016464, "grad_norm_var": 1.2512047919356833e-05, "learning_rate": 0.00382505595201745, "loss": 2.6619, "step": 7518 }, { "crossentropy": 2.554252505302429, "epoch": 0.40887462954403325, "grad_norm": 0.03220842406153679, "grad_norm_var": 1.2663298924941524e-05, "learning_rate": 0.003823845674546774, "loss": 2.5543, "step": 7519 }, { "crossentropy": 2.4993252754211426, "epoch": 0.4089290084015335, "grad_norm": 0.03486054763197899, "grad_norm_var": 1.2682648939331741e-05, "learning_rate": 0.0038226354700200893, "loss": 2.4993, "step": 7520 }, { "crossentropy": 2.580663800239563, "epoch": 0.4089833872590337, "grad_norm": 0.033286359161138535, "grad_norm_var": 1.261104959926483e-05, "learning_rate": 0.0038214253385124566, "loss": 2.5807, "step": 7521 }, { "crossentropy": 2.6316503286361694, "epoch": 0.4090377661165339, "grad_norm": 0.03329439461231232, "grad_norm_var": 1.2533189606486619e-05, "learning_rate": 0.003820215280098926, "loss": 2.6317, "step": 7522 }, { "crossentropy": 2.6122137308120728, "epoch": 0.4090921449740341, "grad_norm": 0.03260127082467079, "grad_norm_var": 1.2529600167916751e-05, "learning_rate": 0.003819005294854543, "loss": 2.6122, "step": 7523 }, { "crossentropy": 2.5018367767333984, "epoch": 0.4091465238315343, "grad_norm": 0.04233443737030029, "grad_norm_var": 1.64531492720404e-05, "learning_rate": 0.0038177953828543495, "loss": 2.5018, "step": 7524 }, { "crossentropy": 2.6451210975646973, "epoch": 0.40920090268903453, "grad_norm": 0.033739011734724045, "grad_norm_var": 1.592370835981604e-05, "learning_rate": 0.0038165855441733864, "loss": 2.6451, "step": 7525 }, { "crossentropy": 2.6440945863723755, "epoch": 0.40925528154653473, "grad_norm": 0.03740859776735306, "grad_norm_var": 1.636275478146676e-05, "learning_rate": 0.003815375778886683, "loss": 2.6441, "step": 7526 }, { "crossentropy": 2.598877191543579, "epoch": 0.40930966040403494, "grad_norm": 0.03310869261622429, "grad_norm_var": 1.6085392706892815e-05, "learning_rate": 0.00381416608706927, "loss": 2.5989, "step": 7527 }, { "crossentropy": 2.470782518386841, "epoch": 0.40936403926153514, "grad_norm": 0.034494318068027496, "grad_norm_var": 1.59638062899101e-05, "learning_rate": 0.0038129564687961717, "loss": 2.4708, "step": 7528 }, { "crossentropy": 2.5531556606292725, "epoch": 0.40941841811903534, "grad_norm": 0.031162142753601074, "grad_norm_var": 1.6463479282575502e-05, "learning_rate": 0.003811746924142404, "loss": 2.5532, "step": 7529 }, { "crossentropy": 2.5523852109909058, "epoch": 0.40947279697653555, "grad_norm": 0.03286508470773697, "grad_norm_var": 1.6623011429811365e-05, "learning_rate": 0.0038105374531829882, "loss": 2.5524, "step": 7530 }, { "crossentropy": 2.7066988945007324, "epoch": 0.40952717583403575, "grad_norm": 0.031808629631996155, "grad_norm_var": 1.703296498589598e-05, "learning_rate": 0.0038093280559929267, "loss": 2.7067, "step": 7531 }, { "crossentropy": 2.659034013748169, "epoch": 0.40958155469153595, "grad_norm": 0.034009817987680435, "grad_norm_var": 7.025598425614415e-06, "learning_rate": 0.0038081187326472307, "loss": 2.659, "step": 7532 }, { "crossentropy": 2.6314351558685303, "epoch": 0.40963593354903616, "grad_norm": 0.03646083548665047, "grad_norm_var": 7.362993457580141e-06, "learning_rate": 0.0038069094832209006, "loss": 2.6314, "step": 7533 }, { "crossentropy": 2.5771613121032715, "epoch": 0.40969031240653636, "grad_norm": 0.036244627088308334, "grad_norm_var": 7.4782474139134875e-06, "learning_rate": 0.0038057003077889306, "loss": 2.5772, "step": 7534 }, { "crossentropy": 2.592492699623108, "epoch": 0.40974469126403656, "grad_norm": 0.03462444990873337, "grad_norm_var": 7.147408659522597e-06, "learning_rate": 0.0038044912064263137, "loss": 2.5925, "step": 7535 }, { "crossentropy": 2.608119249343872, "epoch": 0.40979907012153677, "grad_norm": 0.032779715955257416, "grad_norm_var": 7.323250949803755e-06, "learning_rate": 0.0038032821792080393, "loss": 2.6081, "step": 7536 }, { "crossentropy": 2.622975707054138, "epoch": 0.40985344897903697, "grad_norm": 0.032935310155153275, "grad_norm_var": 7.382559236089545e-06, "learning_rate": 0.0038020732262090875, "loss": 2.623, "step": 7537 }, { "crossentropy": 2.6546467542648315, "epoch": 0.4099078278365372, "grad_norm": 0.03166861832141876, "grad_norm_var": 7.780255887944396e-06, "learning_rate": 0.003800864347504437, "loss": 2.6546, "step": 7538 }, { "crossentropy": 2.5550893545150757, "epoch": 0.4099622066940374, "grad_norm": 0.032472118735313416, "grad_norm_var": 7.80995426866692e-06, "learning_rate": 0.0037996555431690634, "loss": 2.5551, "step": 7539 }, { "crossentropy": 2.5822269916534424, "epoch": 0.4100165855515376, "grad_norm": 0.03617045283317566, "grad_norm_var": 3.5462899197033954e-06, "learning_rate": 0.003798446813277932, "loss": 2.5822, "step": 7540 }, { "crossentropy": 2.5844485759735107, "epoch": 0.4100709644090378, "grad_norm": 0.03537839278578758, "grad_norm_var": 3.6851881444179003e-06, "learning_rate": 0.003797238157906009, "loss": 2.5844, "step": 7541 }, { "crossentropy": 2.7241071462631226, "epoch": 0.410125343266538, "grad_norm": 0.034555282443761826, "grad_norm_var": 2.8875463530993068e-06, "learning_rate": 0.0037960295771282556, "loss": 2.7241, "step": 7542 }, { "crossentropy": 2.507905602455139, "epoch": 0.4101797221240382, "grad_norm": 0.033892277628183365, "grad_norm_var": 2.8540969371108776e-06, "learning_rate": 0.003794821071019624, "loss": 2.5079, "step": 7543 }, { "crossentropy": 2.5168709754943848, "epoch": 0.4102341009815384, "grad_norm": 0.033427782356739044, "grad_norm_var": 2.8328729052261187e-06, "learning_rate": 0.003793612639655068, "loss": 2.5169, "step": 7544 }, { "crossentropy": 2.7609227895736694, "epoch": 0.4102884798390386, "grad_norm": 0.03596283122897148, "grad_norm_var": 2.5985956608409745e-06, "learning_rate": 0.0037924042831095294, "loss": 2.7609, "step": 7545 }, { "crossentropy": 2.632037878036499, "epoch": 0.4103428586965388, "grad_norm": 0.033775974065065384, "grad_norm_var": 2.5030797895392657e-06, "learning_rate": 0.003791196001457951, "loss": 2.632, "step": 7546 }, { "crossentropy": 2.5836920738220215, "epoch": 0.410397237554039, "grad_norm": 0.03334959223866463, "grad_norm_var": 2.173418796111437e-06, "learning_rate": 0.0037899877947752724, "loss": 2.5837, "step": 7547 }, { "crossentropy": 2.5341715812683105, "epoch": 0.4104516164115392, "grad_norm": 0.03352260962128639, "grad_norm_var": 2.2026718183530227e-06, "learning_rate": 0.00378877966313642, "loss": 2.5342, "step": 7548 }, { "crossentropy": 2.632243275642395, "epoch": 0.4105059952690394, "grad_norm": 0.03396476432681084, "grad_norm_var": 1.840076645931212e-06, "learning_rate": 0.003787571606616324, "loss": 2.6322, "step": 7549 }, { "crossentropy": 2.5890605449676514, "epoch": 0.4105603741265396, "grad_norm": 0.0322398841381073, "grad_norm_var": 1.6680842298022817e-06, "learning_rate": 0.003786363625289908, "loss": 2.5891, "step": 7550 }, { "crossentropy": 2.684644341468811, "epoch": 0.4106147529840398, "grad_norm": 0.04353472217917442, "grad_norm_var": 7.615556115252627e-06, "learning_rate": 0.003785155719232087, "loss": 2.6846, "step": 7551 }, { "crossentropy": 2.595014214515686, "epoch": 0.41066913184154, "grad_norm": 0.034921299666166306, "grad_norm_var": 7.453277682783974e-06, "learning_rate": 0.003783947888517775, "loss": 2.595, "step": 7552 }, { "crossentropy": 2.5755231380462646, "epoch": 0.41072351069904023, "grad_norm": 0.033095259219408035, "grad_norm_var": 7.421811260246933e-06, "learning_rate": 0.0037827401332218838, "loss": 2.5755, "step": 7553 }, { "crossentropy": 2.630474090576172, "epoch": 0.41077788955654043, "grad_norm": 0.032986558973789215, "grad_norm_var": 7.033574348397295e-06, "learning_rate": 0.003781532453419313, "loss": 2.6305, "step": 7554 }, { "crossentropy": 2.659002900123596, "epoch": 0.41083226841404064, "grad_norm": 0.03359665721654892, "grad_norm_var": 6.796841529646489e-06, "learning_rate": 0.003780324849184964, "loss": 2.659, "step": 7555 }, { "crossentropy": 2.543203353881836, "epoch": 0.41088664727154084, "grad_norm": 0.03225028142333031, "grad_norm_var": 6.961762525544332e-06, "learning_rate": 0.003779117320593731, "loss": 2.5432, "step": 7556 }, { "crossentropy": 2.50041401386261, "epoch": 0.41094102612904104, "grad_norm": 0.0339253693819046, "grad_norm_var": 6.904822903086987e-06, "learning_rate": 0.003777909867720502, "loss": 2.5004, "step": 7557 }, { "crossentropy": 2.5313446521759033, "epoch": 0.41099540498654125, "grad_norm": 0.03326944261789322, "grad_norm_var": 6.966547776264079e-06, "learning_rate": 0.0037767024906401672, "loss": 2.5313, "step": 7558 }, { "crossentropy": 2.6808395385742188, "epoch": 0.41104978384404145, "grad_norm": 0.03159148991107941, "grad_norm_var": 7.40167988386205e-06, "learning_rate": 0.0037754951894275997, "loss": 2.6808, "step": 7559 }, { "crossentropy": 2.6689398288726807, "epoch": 0.41110416270154165, "grad_norm": 0.0337325781583786, "grad_norm_var": 7.380638731534159e-06, "learning_rate": 0.0037742879641576798, "loss": 2.6689, "step": 7560 }, { "crossentropy": 2.51133930683136, "epoch": 0.41115854155904186, "grad_norm": 0.03575979173183441, "grad_norm_var": 7.332986733648896e-06, "learning_rate": 0.0037730808149052794, "loss": 2.5113, "step": 7561 }, { "crossentropy": 2.57095205783844, "epoch": 0.41121292041654206, "grad_norm": 0.03235390782356262, "grad_norm_var": 7.51982474846141e-06, "learning_rate": 0.0037718737417452614, "loss": 2.571, "step": 7562 }, { "crossentropy": 2.6506240367889404, "epoch": 0.41126729927404226, "grad_norm": 0.03174923360347748, "grad_norm_var": 7.819937637097245e-06, "learning_rate": 0.0037706667447524878, "loss": 2.6506, "step": 7563 }, { "crossentropy": 2.5823711156845093, "epoch": 0.41132167813154247, "grad_norm": 0.033346034586429596, "grad_norm_var": 7.830909434148637e-06, "learning_rate": 0.003769459824001819, "loss": 2.5824, "step": 7564 }, { "crossentropy": 2.5960144996643066, "epoch": 0.41137605698904267, "grad_norm": 0.03452803194522858, "grad_norm_var": 7.855991090665916e-06, "learning_rate": 0.003768252979568104, "loss": 2.596, "step": 7565 }, { "crossentropy": 2.6610374450683594, "epoch": 0.4114304358465429, "grad_norm": 0.0434039905667305, "grad_norm_var": 1.3129952353170086e-05, "learning_rate": 0.0037670462115261907, "loss": 2.661, "step": 7566 }, { "crossentropy": 2.63443660736084, "epoch": 0.4114848147040431, "grad_norm": 0.040442854166030884, "grad_norm_var": 1.0055556080019028e-05, "learning_rate": 0.003765839519950922, "loss": 2.6344, "step": 7567 }, { "crossentropy": 2.6761680841445923, "epoch": 0.4115391935615433, "grad_norm": 0.03320790082216263, "grad_norm_var": 1.0127839774817793e-05, "learning_rate": 0.0037646329049171347, "loss": 2.6762, "step": 7568 }, { "crossentropy": 2.698335886001587, "epoch": 0.4115935724190435, "grad_norm": 0.031221890822052956, "grad_norm_var": 1.0654966576129872e-05, "learning_rate": 0.0037634263664996655, "loss": 2.6983, "step": 7569 }, { "crossentropy": 2.620572805404663, "epoch": 0.4116479512765437, "grad_norm": 0.033634498715400696, "grad_norm_var": 1.0575477749208171e-05, "learning_rate": 0.003762219904773337, "loss": 2.6206, "step": 7570 }, { "crossentropy": 2.6138757467269897, "epoch": 0.4117023301340439, "grad_norm": 0.03384970873594284, "grad_norm_var": 1.0557406598700224e-05, "learning_rate": 0.003761013519812977, "loss": 2.6139, "step": 7571 }, { "crossentropy": 2.596996784210205, "epoch": 0.4117567089915441, "grad_norm": 0.03287064656615257, "grad_norm_var": 1.0414672140528894e-05, "learning_rate": 0.003759807211693405, "loss": 2.597, "step": 7572 }, { "crossentropy": 2.6200404167175293, "epoch": 0.4118110878490443, "grad_norm": 0.03404762223362923, "grad_norm_var": 1.0409410618571881e-05, "learning_rate": 0.0037586009804894322, "loss": 2.62, "step": 7573 }, { "crossentropy": 2.6283161640167236, "epoch": 0.4118654667065445, "grad_norm": 0.03601479157805443, "grad_norm_var": 1.0498441738144995e-05, "learning_rate": 0.003757394826275868, "loss": 2.6283, "step": 7574 }, { "crossentropy": 2.6120685338974, "epoch": 0.4119198455640447, "grad_norm": 0.034797318279743195, "grad_norm_var": 9.904096636764248e-06, "learning_rate": 0.0037561887491275206, "loss": 2.6121, "step": 7575 }, { "crossentropy": 2.675272583961487, "epoch": 0.4119742244215449, "grad_norm": 0.03511045128107071, "grad_norm_var": 9.847770315884194e-06, "learning_rate": 0.0037549827491191855, "loss": 2.6753, "step": 7576 }, { "crossentropy": 2.5696204900741577, "epoch": 0.4120286032790451, "grad_norm": 0.03259632736444473, "grad_norm_var": 1.0056242314624092e-05, "learning_rate": 0.0037537768263256606, "loss": 2.5696, "step": 7577 }, { "crossentropy": 2.58948016166687, "epoch": 0.4120829821365453, "grad_norm": 0.03648736700415611, "grad_norm_var": 9.900833203241412e-06, "learning_rate": 0.0037525709808217357, "loss": 2.5895, "step": 7578 }, { "crossentropy": 2.5421526432037354, "epoch": 0.4121373609940455, "grad_norm": 0.03358437120914459, "grad_norm_var": 9.357060542573024e-06, "learning_rate": 0.0037513652126821935, "loss": 2.5422, "step": 7579 }, { "crossentropy": 2.6314088106155396, "epoch": 0.4121917398515457, "grad_norm": 0.032765232026576996, "grad_norm_var": 9.502083416743875e-06, "learning_rate": 0.0037501595219818174, "loss": 2.6314, "step": 7580 }, { "crossentropy": 2.604061007499695, "epoch": 0.41224611870904593, "grad_norm": 0.03764289990067482, "grad_norm_var": 9.94976826719066e-06, "learning_rate": 0.0037489539087953837, "loss": 2.6041, "step": 7581 }, { "crossentropy": 2.5269566774368286, "epoch": 0.41230049756654613, "grad_norm": 0.03220424801111221, "grad_norm_var": 5.3963347966310174e-06, "learning_rate": 0.003747748373197662, "loss": 2.527, "step": 7582 }, { "crossentropy": 2.519623875617981, "epoch": 0.41235487642404633, "grad_norm": 0.03546854108572006, "grad_norm_var": 2.9381872135047683e-06, "learning_rate": 0.003746542915263419, "loss": 2.5196, "step": 7583 }, { "crossentropy": 2.6698365211486816, "epoch": 0.41240925528154654, "grad_norm": 0.0321178212761879, "grad_norm_var": 3.1412417725398446e-06, "learning_rate": 0.0037453375350674153, "loss": 2.6698, "step": 7584 }, { "crossentropy": 2.448391079902649, "epoch": 0.41246363413904674, "grad_norm": 0.033106423914432526, "grad_norm_var": 2.6586523482686916e-06, "learning_rate": 0.0037441322326844063, "loss": 2.4484, "step": 7585 }, { "crossentropy": 2.549813985824585, "epoch": 0.41251801299654695, "grad_norm": 0.03237321972846985, "grad_norm_var": 2.843701758160841e-06, "learning_rate": 0.0037429270081891473, "loss": 2.5498, "step": 7586 }, { "crossentropy": 2.590774893760681, "epoch": 0.41257239185404715, "grad_norm": 0.03300800547003746, "grad_norm_var": 2.912121185725397e-06, "learning_rate": 0.003741721861656383, "loss": 2.5908, "step": 7587 }, { "crossentropy": 2.666378140449524, "epoch": 0.41262677071154735, "grad_norm": 0.03324440121650696, "grad_norm_var": 2.8639635780350683e-06, "learning_rate": 0.0037405167931608553, "loss": 2.6664, "step": 7588 }, { "crossentropy": 2.5360459089279175, "epoch": 0.41268114956904756, "grad_norm": 0.03370535001158714, "grad_norm_var": 2.8707352271792676e-06, "learning_rate": 0.0037393118027773033, "loss": 2.536, "step": 7589 }, { "crossentropy": 2.5968165397644043, "epoch": 0.41273552842654776, "grad_norm": 0.033769186586141586, "grad_norm_var": 2.586893405049674e-06, "learning_rate": 0.0037381068905804555, "loss": 2.5968, "step": 7590 }, { "crossentropy": 2.683692216873169, "epoch": 0.41278990728404796, "grad_norm": 0.03305256739258766, "grad_norm_var": 2.5623171863701592e-06, "learning_rate": 0.003736902056645043, "loss": 2.6837, "step": 7591 }, { "crossentropy": 2.5634926557540894, "epoch": 0.41284428614154817, "grad_norm": 0.035824451595544815, "grad_norm_var": 2.722287822619756e-06, "learning_rate": 0.0037356973010457886, "loss": 2.5635, "step": 7592 }, { "crossentropy": 2.613430976867676, "epoch": 0.41289866499904837, "grad_norm": 0.03462047874927521, "grad_norm_var": 2.650969537728159e-06, "learning_rate": 0.0037344926238574074, "loss": 2.6134, "step": 7593 }, { "crossentropy": 2.509530186653137, "epoch": 0.4129530438565486, "grad_norm": 0.03276996687054634, "grad_norm_var": 2.25002296830399e-06, "learning_rate": 0.003733288025154613, "loss": 2.5095, "step": 7594 }, { "crossentropy": 2.5222980976104736, "epoch": 0.4130074227140488, "grad_norm": 0.031625378876924515, "grad_norm_var": 2.5210114908154798e-06, "learning_rate": 0.003732083505012118, "loss": 2.5223, "step": 7595 }, { "crossentropy": 2.7291027307510376, "epoch": 0.413061801571549, "grad_norm": 0.03390905633568764, "grad_norm_var": 2.4783489604583594e-06, "learning_rate": 0.003730879063504618, "loss": 2.7291, "step": 7596 }, { "crossentropy": 2.663857936859131, "epoch": 0.4131161804290492, "grad_norm": 0.03369465097784996, "grad_norm_var": 1.3520275659622526e-06, "learning_rate": 0.0037296747007068178, "loss": 2.6639, "step": 7597 }, { "crossentropy": 2.6374478340148926, "epoch": 0.4131705592865494, "grad_norm": 0.03296985104680061, "grad_norm_var": 1.2660008542210906e-06, "learning_rate": 0.003728470416693407, "loss": 2.6374, "step": 7598 }, { "crossentropy": 2.631886601448059, "epoch": 0.4132249381440496, "grad_norm": 0.03214598447084427, "grad_norm_var": 1.0633766322099003e-06, "learning_rate": 0.0037272662115390752, "loss": 2.6319, "step": 7599 }, { "crossentropy": 2.4362094402313232, "epoch": 0.4132793170015498, "grad_norm": 0.031341418623924255, "grad_norm_var": 1.2178463006234468e-06, "learning_rate": 0.0037260620853185084, "loss": 2.4362, "step": 7600 }, { "crossentropy": 2.7186728715896606, "epoch": 0.41333369585905, "grad_norm": 0.033633410930633545, "grad_norm_var": 1.2288023316412803e-06, "learning_rate": 0.0037248580381063806, "loss": 2.7187, "step": 7601 }, { "crossentropy": 2.5812908411026, "epoch": 0.4133880747165502, "grad_norm": 0.031827155500650406, "grad_norm_var": 1.3098534843010319e-06, "learning_rate": 0.00372365406997737, "loss": 2.5813, "step": 7602 }, { "crossentropy": 2.546412706375122, "epoch": 0.4134424535740504, "grad_norm": 0.03250740468502045, "grad_norm_var": 1.338086254833402e-06, "learning_rate": 0.003722450181006145, "loss": 2.5464, "step": 7603 }, { "crossentropy": 2.6539247035980225, "epoch": 0.4134968324315506, "grad_norm": 0.03260833024978638, "grad_norm_var": 1.3566427064660991e-06, "learning_rate": 0.003721246371267367, "loss": 2.6539, "step": 7604 }, { "crossentropy": 2.6619601249694824, "epoch": 0.4135512112890508, "grad_norm": 0.03593182936310768, "grad_norm_var": 1.8386671816361666e-06, "learning_rate": 0.003720042640835698, "loss": 2.662, "step": 7605 }, { "crossentropy": 2.6117810010910034, "epoch": 0.413605590146551, "grad_norm": 0.0360824391245842, "grad_norm_var": 2.328792520543167e-06, "learning_rate": 0.003718838989785791, "loss": 2.6118, "step": 7606 }, { "crossentropy": 2.681046485900879, "epoch": 0.4136599690040512, "grad_norm": 0.03260480985045433, "grad_norm_var": 2.3626037301470333e-06, "learning_rate": 0.0037176354181922937, "loss": 2.681, "step": 7607 }, { "crossentropy": 2.6476523876190186, "epoch": 0.4137143478615514, "grad_norm": 0.031458642333745956, "grad_norm_var": 2.131541862860203e-06, "learning_rate": 0.003716431926129854, "loss": 2.6477, "step": 7608 }, { "crossentropy": 2.6894943714141846, "epoch": 0.4137687267190516, "grad_norm": 0.03278525918722153, "grad_norm_var": 1.9719893226868268e-06, "learning_rate": 0.0037152285136731102, "loss": 2.6895, "step": 7609 }, { "crossentropy": 2.525234341621399, "epoch": 0.41382310557655183, "grad_norm": 0.03306793421506882, "grad_norm_var": 1.9686586352297974e-06, "learning_rate": 0.0037140251808966945, "loss": 2.5252, "step": 7610 }, { "crossentropy": 2.6763272285461426, "epoch": 0.41387748443405203, "grad_norm": 0.03272107616066933, "grad_norm_var": 1.841103363616157e-06, "learning_rate": 0.0037128219278752396, "loss": 2.6763, "step": 7611 }, { "crossentropy": 2.5535470247268677, "epoch": 0.41393186329155224, "grad_norm": 0.031003935262560844, "grad_norm_var": 2.047675864563766e-06, "learning_rate": 0.0037116187546833677, "loss": 2.5535, "step": 7612 }, { "crossentropy": 2.568228006362915, "epoch": 0.41398624214905244, "grad_norm": 0.03225176781415939, "grad_norm_var": 2.0247260871852768e-06, "learning_rate": 0.0037104156613956985, "loss": 2.5682, "step": 7613 }, { "crossentropy": 2.7011876106262207, "epoch": 0.41404062100655264, "grad_norm": 0.03323874995112419, "grad_norm_var": 2.035018438593577e-06, "learning_rate": 0.0037092126480868497, "loss": 2.7012, "step": 7614 }, { "crossentropy": 2.6304733753204346, "epoch": 0.41409499986405285, "grad_norm": 0.03134462609887123, "grad_norm_var": 2.147773461339666e-06, "learning_rate": 0.0037080097148314284, "loss": 2.6305, "step": 7615 }, { "crossentropy": 2.7861714363098145, "epoch": 0.41414937872155305, "grad_norm": 0.03303693234920502, "grad_norm_var": 2.0032346120669863e-06, "learning_rate": 0.0037068068617040397, "loss": 2.7862, "step": 7616 }, { "crossentropy": 2.542797088623047, "epoch": 0.41420375757905326, "grad_norm": 0.03486822172999382, "grad_norm_var": 2.2223245485744467e-06, "learning_rate": 0.003705604088779285, "loss": 2.5428, "step": 7617 }, { "crossentropy": 2.5791362524032593, "epoch": 0.41425813643655346, "grad_norm": 0.03359506279230118, "grad_norm_var": 2.1509405598932186e-06, "learning_rate": 0.003704401396131756, "loss": 2.5791, "step": 7618 }, { "crossentropy": 2.5384833812713623, "epoch": 0.41431251529405366, "grad_norm": 0.032637182623147964, "grad_norm_var": 2.1422722461899295e-06, "learning_rate": 0.0037031987838360447, "loss": 2.5385, "step": 7619 }, { "crossentropy": 2.5487654209136963, "epoch": 0.41436689415155387, "grad_norm": 0.03267677128314972, "grad_norm_var": 2.1382854377308687e-06, "learning_rate": 0.0037019962519667378, "loss": 2.5488, "step": 7620 }, { "crossentropy": 2.5420844554901123, "epoch": 0.41442127300905407, "grad_norm": 0.03235814720392227, "grad_norm_var": 1.5783664718282125e-06, "learning_rate": 0.003700793800598412, "loss": 2.5421, "step": 7621 }, { "crossentropy": 2.7214447259902954, "epoch": 0.4144756518665543, "grad_norm": 0.03487754985690117, "grad_norm_var": 1.1511248014059305e-06, "learning_rate": 0.003699591429805642, "loss": 2.7214, "step": 7622 }, { "crossentropy": 2.6201300621032715, "epoch": 0.4145300307240545, "grad_norm": 0.03599340468645096, "grad_norm_var": 1.7883147671044017e-06, "learning_rate": 0.003698389139663003, "loss": 2.6201, "step": 7623 }, { "crossentropy": 2.7026519775390625, "epoch": 0.4145844095815547, "grad_norm": 0.03363528102636337, "grad_norm_var": 1.638631024246254e-06, "learning_rate": 0.0036971869302450523, "loss": 2.7027, "step": 7624 }, { "crossentropy": 2.595098853111267, "epoch": 0.4146387884390549, "grad_norm": 0.030898284167051315, "grad_norm_var": 1.9480959952178903e-06, "learning_rate": 0.0036959848016263554, "loss": 2.5951, "step": 7625 }, { "crossentropy": 2.6091654300689697, "epoch": 0.4146931672965551, "grad_norm": 0.033143822103738785, "grad_norm_var": 1.9490137197147495e-06, "learning_rate": 0.003694782753881465, "loss": 2.6092, "step": 7626 }, { "crossentropy": 2.5496203899383545, "epoch": 0.4147475461540553, "grad_norm": 0.03273102268576622, "grad_norm_var": 1.9486267171996458e-06, "learning_rate": 0.003693580787084931, "loss": 2.5496, "step": 7627 }, { "crossentropy": 2.614714741706848, "epoch": 0.4148019250115555, "grad_norm": 0.03652751073241234, "grad_norm_var": 2.3720556624602536e-06, "learning_rate": 0.0036923789013112995, "loss": 2.6147, "step": 7628 }, { "crossentropy": 2.606268525123596, "epoch": 0.4148563038690557, "grad_norm": 0.03597782552242279, "grad_norm_var": 2.687509319918418e-06, "learning_rate": 0.003691177096635108, "loss": 2.6063, "step": 7629 }, { "crossentropy": 2.6056710481643677, "epoch": 0.4149106827265559, "grad_norm": 0.03414600342512131, "grad_norm_var": 2.6957048842479738e-06, "learning_rate": 0.0036899753731308937, "loss": 2.6057, "step": 7630 }, { "crossentropy": 2.7076019048690796, "epoch": 0.4149650615840561, "grad_norm": 0.0328306183218956, "grad_norm_var": 2.376356623049319e-06, "learning_rate": 0.003688773730873186, "loss": 2.7076, "step": 7631 }, { "crossentropy": 2.6010385751724243, "epoch": 0.4150194404415563, "grad_norm": 0.03336448594927788, "grad_norm_var": 2.3521010853742402e-06, "learning_rate": 0.0036875721699365085, "loss": 2.601, "step": 7632 }, { "crossentropy": 2.479525327682495, "epoch": 0.4150738192990565, "grad_norm": 0.033376824110746384, "grad_norm_var": 2.272002202869753e-06, "learning_rate": 0.0036863706903953798, "loss": 2.4795, "step": 7633 }, { "crossentropy": 2.7459516525268555, "epoch": 0.4151281981565567, "grad_norm": 0.03433240205049515, "grad_norm_var": 2.298308333143854e-06, "learning_rate": 0.0036851692923243196, "loss": 2.746, "step": 7634 }, { "crossentropy": 2.6724003553390503, "epoch": 0.4151825770140569, "grad_norm": 0.03442748263478279, "grad_norm_var": 2.2403479119637026e-06, "learning_rate": 0.0036839679757978312, "loss": 2.6724, "step": 7635 }, { "crossentropy": 2.549406051635742, "epoch": 0.4152369558715571, "grad_norm": 0.03484738990664482, "grad_norm_var": 2.2007439702921455e-06, "learning_rate": 0.003682766740890423, "loss": 2.5494, "step": 7636 }, { "crossentropy": 2.591102123260498, "epoch": 0.4152913347290573, "grad_norm": 0.033793602138757706, "grad_norm_var": 2.0216495895395334e-06, "learning_rate": 0.003681565587676594, "loss": 2.5911, "step": 7637 }, { "crossentropy": 2.4788999557495117, "epoch": 0.41534571358655753, "grad_norm": 0.03186161816120148, "grad_norm_var": 2.259963468645374e-06, "learning_rate": 0.0036803645162308374, "loss": 2.4789, "step": 7638 }, { "crossentropy": 2.666898727416992, "epoch": 0.41540009244405773, "grad_norm": 0.03227691352367401, "grad_norm_var": 2.070013182897621e-06, "learning_rate": 0.0036791635266276448, "loss": 2.6669, "step": 7639 }, { "crossentropy": 2.4538501501083374, "epoch": 0.41545447130155794, "grad_norm": 0.03511028736829758, "grad_norm_var": 2.2059099117889443e-06, "learning_rate": 0.0036779626189414956, "loss": 2.4539, "step": 7640 }, { "crossentropy": 2.5402872562408447, "epoch": 0.41550885015905814, "grad_norm": 0.03151514381170273, "grad_norm_var": 1.996963620801151e-06, "learning_rate": 0.0036767617932468737, "loss": 2.5403, "step": 7641 }, { "crossentropy": 2.5908762216567993, "epoch": 0.41556322901655834, "grad_norm": 0.03271050751209259, "grad_norm_var": 2.0446703232053383e-06, "learning_rate": 0.0036755610496182534, "loss": 2.5909, "step": 7642 }, { "crossentropy": 2.523309111595154, "epoch": 0.41561760787405855, "grad_norm": 0.0379781499505043, "grad_norm_var": 3.0599973022421105e-06, "learning_rate": 0.0036743603881301003, "loss": 2.5233, "step": 7643 }, { "crossentropy": 2.6000304222106934, "epoch": 0.41567198673155875, "grad_norm": 0.035053614526987076, "grad_norm_var": 2.7122906319463716e-06, "learning_rate": 0.0036731598088568795, "loss": 2.6, "step": 7644 }, { "crossentropy": 2.6770546436309814, "epoch": 0.41572636558905895, "grad_norm": 0.033545564860105515, "grad_norm_var": 2.4325728628819703e-06, "learning_rate": 0.0036719593118730532, "loss": 2.6771, "step": 7645 }, { "crossentropy": 2.54064404964447, "epoch": 0.41578074444655916, "grad_norm": 0.03302576392889023, "grad_norm_var": 2.462785252267616e-06, "learning_rate": 0.0036707588972530692, "loss": 2.5406, "step": 7646 }, { "crossentropy": 2.4223908185958862, "epoch": 0.41583512330405936, "grad_norm": 0.03121720254421234, "grad_norm_var": 2.8239361919840086e-06, "learning_rate": 0.0036695585650713797, "loss": 2.4224, "step": 7647 }, { "crossentropy": 2.5944453477859497, "epoch": 0.41588950216155957, "grad_norm": 0.033310454338788986, "grad_norm_var": 2.826192198522072e-06, "learning_rate": 0.003668358315402429, "loss": 2.5944, "step": 7648 }, { "crossentropy": 2.685790181159973, "epoch": 0.41594388101905977, "grad_norm": 0.03242027014493942, "grad_norm_var": 2.918084275368993e-06, "learning_rate": 0.0036671581483206533, "loss": 2.6858, "step": 7649 }, { "crossentropy": 2.58060085773468, "epoch": 0.41599825987656, "grad_norm": 0.030782286077737808, "grad_norm_var": 3.3539735612342704e-06, "learning_rate": 0.003665958063900485, "loss": 2.5806, "step": 7650 }, { "crossentropy": 2.672533392906189, "epoch": 0.4160526387340602, "grad_norm": 0.03132328391075134, "grad_norm_var": 3.5174101337388798e-06, "learning_rate": 0.003664758062216357, "loss": 2.6725, "step": 7651 }, { "crossentropy": 2.614671230316162, "epoch": 0.4161070175915604, "grad_norm": 0.03281240165233612, "grad_norm_var": 3.3219872627470702e-06, "learning_rate": 0.0036635581433426875, "loss": 2.6147, "step": 7652 }, { "crossentropy": 2.659835934638977, "epoch": 0.4161613964490606, "grad_norm": 0.033326856791973114, "grad_norm_var": 3.2890817948799667e-06, "learning_rate": 0.0036623583073538968, "loss": 2.6598, "step": 7653 }, { "crossentropy": 2.649250030517578, "epoch": 0.4162157753065608, "grad_norm": 0.032208122313022614, "grad_norm_var": 3.243211438432268e-06, "learning_rate": 0.0036611585543243966, "loss": 2.6493, "step": 7654 }, { "crossentropy": 2.6399935483932495, "epoch": 0.416270154164061, "grad_norm": 0.033467669039964676, "grad_norm_var": 3.210906831641452e-06, "learning_rate": 0.0036599588843285926, "loss": 2.64, "step": 7655 }, { "crossentropy": 2.6420000791549683, "epoch": 0.4163245330215612, "grad_norm": 0.03227635845541954, "grad_norm_var": 2.958154562220557e-06, "learning_rate": 0.0036587592974408924, "loss": 2.642, "step": 7656 }, { "crossentropy": 2.6102993488311768, "epoch": 0.4163789118790614, "grad_norm": 0.034160029143095016, "grad_norm_var": 2.8943531199439205e-06, "learning_rate": 0.0036575597937356873, "loss": 2.6103, "step": 7657 }, { "crossentropy": 2.6103140115737915, "epoch": 0.4164332907365616, "grad_norm": 0.036166444420814514, "grad_norm_var": 3.460813199059459e-06, "learning_rate": 0.0036563603732873737, "loss": 2.6103, "step": 7658 }, { "crossentropy": 2.5433939695358276, "epoch": 0.4164876695940618, "grad_norm": 0.03127440810203552, "grad_norm_var": 2.103424843526563e-06, "learning_rate": 0.0036551610361703376, "loss": 2.5434, "step": 7659 }, { "crossentropy": 2.6396725177764893, "epoch": 0.416542048451562, "grad_norm": 0.0339733250439167, "grad_norm_var": 1.865896814083554e-06, "learning_rate": 0.0036539617824589594, "loss": 2.6397, "step": 7660 }, { "crossentropy": 2.404963970184326, "epoch": 0.4165964273090622, "grad_norm": 0.03251483663916588, "grad_norm_var": 1.834046149595756e-06, "learning_rate": 0.0036527626122276153, "loss": 2.405, "step": 7661 }, { "crossentropy": 2.545815587043762, "epoch": 0.4166508061665624, "grad_norm": 0.031652554869651794, "grad_norm_var": 1.9043837214242793e-06, "learning_rate": 0.0036515635255506803, "loss": 2.5458, "step": 7662 }, { "crossentropy": 2.595018148422241, "epoch": 0.4167051850240626, "grad_norm": 0.03449495509266853, "grad_norm_var": 1.93639319663851e-06, "learning_rate": 0.003650364522502517, "loss": 2.595, "step": 7663 }, { "crossentropy": 2.584190011024475, "epoch": 0.4167595638815628, "grad_norm": 0.035751789808273315, "grad_norm_var": 2.447304275253412e-06, "learning_rate": 0.003649165603157488, "loss": 2.5842, "step": 7664 }, { "crossentropy": 2.619794487953186, "epoch": 0.416813942739063, "grad_norm": 0.03703825920820236, "grad_norm_var": 3.3999049367335864e-06, "learning_rate": 0.00364796676758995, "loss": 2.6198, "step": 7665 }, { "crossentropy": 2.6579272747039795, "epoch": 0.41686832159656323, "grad_norm": 0.035162732005119324, "grad_norm_var": 3.1132172033041436e-06, "learning_rate": 0.00364676801587425, "loss": 2.6579, "step": 7666 }, { "crossentropy": 2.60320782661438, "epoch": 0.41692270045406343, "grad_norm": 0.03363681584596634, "grad_norm_var": 2.745365733371659e-06, "learning_rate": 0.0036455693480847394, "loss": 2.6032, "step": 7667 }, { "crossentropy": 2.573800206184387, "epoch": 0.41697707931156364, "grad_norm": 0.03215871751308441, "grad_norm_var": 2.853342167624673e-06, "learning_rate": 0.0036443707642957526, "loss": 2.5738, "step": 7668 }, { "crossentropy": 2.567469596862793, "epoch": 0.4170314581690639, "grad_norm": 0.033020224422216415, "grad_norm_var": 2.8746375516261817e-06, "learning_rate": 0.003643172264581628, "loss": 2.5675, "step": 7669 }, { "crossentropy": 2.6640177965164185, "epoch": 0.4170858370265641, "grad_norm": 0.034327611327171326, "grad_norm_var": 2.738087399945371e-06, "learning_rate": 0.003641973849016695, "loss": 2.664, "step": 7670 }, { "crossentropy": 2.6307953596115112, "epoch": 0.4171402158840643, "grad_norm": 0.03293890878558159, "grad_norm_var": 2.780210769258845e-06, "learning_rate": 0.0036407755176752765, "loss": 2.6308, "step": 7671 }, { "crossentropy": 2.55881667137146, "epoch": 0.4171945947415645, "grad_norm": 0.034385599195957184, "grad_norm_var": 2.634199812169095e-06, "learning_rate": 0.003639577270631693, "loss": 2.5588, "step": 7672 }, { "crossentropy": 2.6364694833755493, "epoch": 0.4172489735990647, "grad_norm": 0.03312161937355995, "grad_norm_var": 2.6678167570401885e-06, "learning_rate": 0.0036383791079602607, "loss": 2.6365, "step": 7673 }, { "crossentropy": 2.601107597351074, "epoch": 0.4173033524565649, "grad_norm": 0.03310293331742287, "grad_norm_var": 2.3086717563865097e-06, "learning_rate": 0.003637181029735286, "loss": 2.6011, "step": 7674 }, { "crossentropy": 2.6270354986190796, "epoch": 0.4173577313140651, "grad_norm": 0.03328467532992363, "grad_norm_var": 1.921900417419196e-06, "learning_rate": 0.0036359830360310724, "loss": 2.627, "step": 7675 }, { "crossentropy": 2.601077675819397, "epoch": 0.4174121101715653, "grad_norm": 0.03169545903801918, "grad_norm_var": 2.1891007839186054e-06, "learning_rate": 0.003634785126921921, "loss": 2.6011, "step": 7676 }, { "crossentropy": 2.579974412918091, "epoch": 0.4174664890290655, "grad_norm": 0.03226018697023392, "grad_norm_var": 2.231457890046878e-06, "learning_rate": 0.003633587302482122, "loss": 2.58, "step": 7677 }, { "crossentropy": 2.613873243331909, "epoch": 0.4175208678865657, "grad_norm": 0.032960109412670135, "grad_norm_var": 1.9940767551605726e-06, "learning_rate": 0.003632389562785964, "loss": 2.6139, "step": 7678 }, { "crossentropy": 2.5947500467300415, "epoch": 0.41757524674406593, "grad_norm": 0.03273390606045723, "grad_norm_var": 2.0033102713760848e-06, "learning_rate": 0.0036311919079077314, "loss": 2.5947, "step": 7679 }, { "crossentropy": 2.5698622465133667, "epoch": 0.41762962560156613, "grad_norm": 0.03195192664861679, "grad_norm_var": 1.8148967285071144e-06, "learning_rate": 0.0036299943379217, "loss": 2.5699, "step": 7680 }, { "crossentropy": 2.5627551078796387, "epoch": 0.41768400445906634, "grad_norm": 0.032834652811288834, "grad_norm_var": 8.583865889671971e-07, "learning_rate": 0.003628796852902143, "loss": 2.5628, "step": 7681 }, { "crossentropy": 2.546412229537964, "epoch": 0.41773838331656654, "grad_norm": 0.03546145558357239, "grad_norm_var": 9.461816001847072e-07, "learning_rate": 0.003627599452923327, "loss": 2.5464, "step": 7682 }, { "crossentropy": 2.630091071128845, "epoch": 0.41779276217406675, "grad_norm": 0.031252019107341766, "grad_norm_var": 1.1364033945147437e-06, "learning_rate": 0.0036264021380595112, "loss": 2.6301, "step": 7683 }, { "crossentropy": 2.6295015811920166, "epoch": 0.41784714103156695, "grad_norm": 0.032403185963630676, "grad_norm_var": 1.1137554067702783e-06, "learning_rate": 0.003625204908384958, "loss": 2.6295, "step": 7684 }, { "crossentropy": 2.5653076171875, "epoch": 0.41790151988906715, "grad_norm": 0.03303232416510582, "grad_norm_var": 1.1138239584161195e-06, "learning_rate": 0.003624007763973911, "loss": 2.5653, "step": 7685 }, { "crossentropy": 2.5651493072509766, "epoch": 0.41795589874656736, "grad_norm": 0.03320689499378204, "grad_norm_var": 9.915740200215165e-07, "learning_rate": 0.003622810704900621, "loss": 2.5651, "step": 7686 }, { "crossentropy": 2.5708149671554565, "epoch": 0.41801027760406756, "grad_norm": 0.03296598792076111, "grad_norm_var": 9.917093654489168e-07, "learning_rate": 0.0036216137312393283, "loss": 2.5708, "step": 7687 }, { "crossentropy": 2.581836223602295, "epoch": 0.41806465646156776, "grad_norm": 0.032401345670223236, "grad_norm_var": 8.489299259954615e-07, "learning_rate": 0.003620416843064266, "loss": 2.5818, "step": 7688 }, { "crossentropy": 2.5593851804733276, "epoch": 0.41811903531906797, "grad_norm": 0.03945479914546013, "grad_norm_var": 3.634266395392306e-06, "learning_rate": 0.0036192200404496634, "loss": 2.5594, "step": 7689 }, { "crossentropy": 2.6521507501602173, "epoch": 0.41817341417656817, "grad_norm": 0.03416132926940918, "grad_norm_var": 3.6923285912406067e-06, "learning_rate": 0.003618023323469749, "loss": 2.6522, "step": 7690 }, { "crossentropy": 2.670129179954529, "epoch": 0.4182277930340684, "grad_norm": 0.033124975860118866, "grad_norm_var": 3.6932644285835845e-06, "learning_rate": 0.0036168266921987385, "loss": 2.6701, "step": 7691 }, { "crossentropy": 2.6871367692947388, "epoch": 0.4182821718915686, "grad_norm": 0.03527441620826721, "grad_norm_var": 3.754970564240345e-06, "learning_rate": 0.003615630146710848, "loss": 2.6871, "step": 7692 }, { "crossentropy": 2.653587579727173, "epoch": 0.4183365507490688, "grad_norm": 0.03598082810640335, "grad_norm_var": 4.021253261613881e-06, "learning_rate": 0.003614433687080285, "loss": 2.6536, "step": 7693 }, { "crossentropy": 2.5985294580459595, "epoch": 0.418390929606569, "grad_norm": 0.032309457659721375, "grad_norm_var": 4.111901491648853e-06, "learning_rate": 0.0036132373133812507, "loss": 2.5985, "step": 7694 }, { "crossentropy": 2.505179524421692, "epoch": 0.4184453084640692, "grad_norm": 0.03326067328453064, "grad_norm_var": 4.064245501671631e-06, "learning_rate": 0.0036120410256879486, "loss": 2.5052, "step": 7695 }, { "crossentropy": 2.664920210838318, "epoch": 0.4184996873215694, "grad_norm": 0.0345391184091568, "grad_norm_var": 3.882247172764102e-06, "learning_rate": 0.0036108448240745657, "loss": 2.6649, "step": 7696 }, { "crossentropy": 2.590181827545166, "epoch": 0.4185540661790696, "grad_norm": 0.03339463099837303, "grad_norm_var": 3.825739854285542e-06, "learning_rate": 0.003609648708615292, "loss": 2.5902, "step": 7697 }, { "crossentropy": 2.546104073524475, "epoch": 0.4186084450365698, "grad_norm": 0.031164046376943588, "grad_norm_var": 4.07895473821344e-06, "learning_rate": 0.003608452679384311, "loss": 2.5461, "step": 7698 }, { "crossentropy": 2.678858995437622, "epoch": 0.41866282389407, "grad_norm": 0.03245924785733223, "grad_norm_var": 3.788822340073233e-06, "learning_rate": 0.0036072567364557957, "loss": 2.6789, "step": 7699 }, { "crossentropy": 2.4312562942504883, "epoch": 0.4187172027515702, "grad_norm": 0.03228097781538963, "grad_norm_var": 3.8108186331336594e-06, "learning_rate": 0.0036060608799039183, "loss": 2.4313, "step": 7700 }, { "crossentropy": 2.6879228353500366, "epoch": 0.4187715816090704, "grad_norm": 0.03267236053943634, "grad_norm_var": 3.850395416774415e-06, "learning_rate": 0.003604865109802848, "loss": 2.6879, "step": 7701 }, { "crossentropy": 2.585716128349304, "epoch": 0.4188259604665706, "grad_norm": 0.034272029995918274, "grad_norm_var": 3.856144858419413e-06, "learning_rate": 0.0036036694262267413, "loss": 2.5857, "step": 7702 }, { "crossentropy": 2.506662964820862, "epoch": 0.4188803393240708, "grad_norm": 0.03277359902858734, "grad_norm_var": 3.878114603873308e-06, "learning_rate": 0.0036024738292497554, "loss": 2.5067, "step": 7703 }, { "crossentropy": 2.6136369705200195, "epoch": 0.418934718181571, "grad_norm": 0.03429390862584114, "grad_norm_var": 3.769164748762312e-06, "learning_rate": 0.003601278318946041, "loss": 2.6136, "step": 7704 }, { "crossentropy": 2.3900164365768433, "epoch": 0.4189890970390712, "grad_norm": 0.03211876377463341, "grad_norm_var": 1.6392616960327066e-06, "learning_rate": 0.003600082895389738, "loss": 2.39, "step": 7705 }, { "crossentropy": 2.5966944694519043, "epoch": 0.4190434758965714, "grad_norm": 0.03258489817380905, "grad_norm_var": 1.6303591786670512e-06, "learning_rate": 0.0035988875586549928, "loss": 2.5967, "step": 7706 }, { "crossentropy": 2.585669159889221, "epoch": 0.41909785475407163, "grad_norm": 0.03381786867976189, "grad_norm_var": 1.6459052522172294e-06, "learning_rate": 0.003597692308815932, "loss": 2.5857, "step": 7707 }, { "crossentropy": 2.647527813911438, "epoch": 0.41915223361157183, "grad_norm": 0.03298330679535866, "grad_norm_var": 1.3784084299230116e-06, "learning_rate": 0.003596497145946687, "loss": 2.6475, "step": 7708 }, { "crossentropy": 2.607440948486328, "epoch": 0.41920661246907204, "grad_norm": 0.032801348716020584, "grad_norm_var": 8.235511714757906e-07, "learning_rate": 0.0035953020701213825, "loss": 2.6074, "step": 7709 }, { "crossentropy": 2.6265556812286377, "epoch": 0.41926099132657224, "grad_norm": 0.03629201650619507, "grad_norm_var": 1.457251858754706e-06, "learning_rate": 0.0035941070814141324, "loss": 2.6266, "step": 7710 }, { "crossentropy": 2.616441249847412, "epoch": 0.41931537018407244, "grad_norm": 0.032518256455659866, "grad_norm_var": 1.4888426181504378e-06, "learning_rate": 0.0035929121798990494, "loss": 2.6164, "step": 7711 }, { "crossentropy": 2.5851627588272095, "epoch": 0.41936974904157265, "grad_norm": 0.03447309136390686, "grad_norm_var": 1.4771974760720285e-06, "learning_rate": 0.0035917173656502435, "loss": 2.5852, "step": 7712 }, { "crossentropy": 2.5877187252044678, "epoch": 0.41942412789907285, "grad_norm": 0.03302779793739319, "grad_norm_var": 1.4751722631507642e-06, "learning_rate": 0.003590522638741812, "loss": 2.5877, "step": 7713 }, { "crossentropy": 2.537270188331604, "epoch": 0.41947850675657306, "grad_norm": 0.03507867455482483, "grad_norm_var": 1.3920169068130465e-06, "learning_rate": 0.0035893279992478535, "loss": 2.5373, "step": 7714 }, { "crossentropy": 2.6825393438339233, "epoch": 0.41953288561407326, "grad_norm": 0.03288709372282028, "grad_norm_var": 1.349619744993718e-06, "learning_rate": 0.0035881334472424578, "loss": 2.6825, "step": 7715 }, { "crossentropy": 2.5330417156219482, "epoch": 0.41958726447157346, "grad_norm": 0.03320792317390442, "grad_norm_var": 1.2613416599969081e-06, "learning_rate": 0.0035869389827997067, "loss": 2.533, "step": 7716 }, { "crossentropy": 2.572913646697998, "epoch": 0.41964164332907367, "grad_norm": 0.03324738144874573, "grad_norm_var": 1.2194968488974296e-06, "learning_rate": 0.0035857446059936837, "loss": 2.5729, "step": 7717 }, { "crossentropy": 2.684448480606079, "epoch": 0.41969602218657387, "grad_norm": 0.035519372671842575, "grad_norm_var": 1.441207758710814e-06, "learning_rate": 0.0035845503168984638, "loss": 2.6844, "step": 7718 }, { "crossentropy": 2.621618151664734, "epoch": 0.4197504010440741, "grad_norm": 0.034098587930202484, "grad_norm_var": 1.4046568311775731e-06, "learning_rate": 0.0035833561155881118, "loss": 2.6216, "step": 7719 }, { "crossentropy": 2.528489589691162, "epoch": 0.4198047799015743, "grad_norm": 0.03224804997444153, "grad_norm_var": 1.4999886001655638e-06, "learning_rate": 0.003582162002136694, "loss": 2.5285, "step": 7720 }, { "crossentropy": 2.6602373123168945, "epoch": 0.4198591587590745, "grad_norm": 0.03222094476222992, "grad_norm_var": 1.4810528844009033e-06, "learning_rate": 0.003580967976618266, "loss": 2.6602, "step": 7721 }, { "crossentropy": 2.610177993774414, "epoch": 0.4199135376165747, "grad_norm": 0.0341709740459919, "grad_norm_var": 1.4314526829629914e-06, "learning_rate": 0.00357977403910688, "loss": 2.6102, "step": 7722 }, { "crossentropy": 2.5683406591415405, "epoch": 0.4199679164740749, "grad_norm": 0.0335145927965641, "grad_norm_var": 1.4309001117438862e-06, "learning_rate": 0.0035785801896765857, "loss": 2.5683, "step": 7723 }, { "crossentropy": 2.6014273166656494, "epoch": 0.4200222953315751, "grad_norm": 0.03396191820502281, "grad_norm_var": 1.4046658350380196e-06, "learning_rate": 0.0035773864284014225, "loss": 2.6014, "step": 7724 }, { "crossentropy": 2.5038291215896606, "epoch": 0.4200766741890753, "grad_norm": 0.0332571379840374, "grad_norm_var": 1.3627787052143218e-06, "learning_rate": 0.0035761927553554264, "loss": 2.5038, "step": 7725 }, { "crossentropy": 2.6117297410964966, "epoch": 0.4201310530465755, "grad_norm": 0.033747076988220215, "grad_norm_var": 8.991457767356289e-07, "learning_rate": 0.0035749991706126288, "loss": 2.6117, "step": 7726 }, { "crossentropy": 2.551530122756958, "epoch": 0.4201854319040757, "grad_norm": 0.03337288275361061, "grad_norm_var": 8.245292556469109e-07, "learning_rate": 0.0035738056742470525, "loss": 2.5515, "step": 7727 }, { "crossentropy": 2.5451172590255737, "epoch": 0.4202398107615759, "grad_norm": 0.03382192924618721, "grad_norm_var": 7.775791341072328e-07, "learning_rate": 0.00357261226633272, "loss": 2.5451, "step": 7728 }, { "crossentropy": 2.523942708969116, "epoch": 0.4202941896190761, "grad_norm": 0.03405216708779335, "grad_norm_var": 7.66867625421469e-07, "learning_rate": 0.0035714189469436444, "loss": 2.5239, "step": 7729 }, { "crossentropy": 2.5299957990646362, "epoch": 0.4203485684765763, "grad_norm": 0.03546123579144478, "grad_norm_var": 8.488673783233272e-07, "learning_rate": 0.003570225716153833, "loss": 2.53, "step": 7730 }, { "crossentropy": 2.637394905090332, "epoch": 0.4204029473340765, "grad_norm": 0.033294256776571274, "grad_norm_var": 8.164909730305296e-07, "learning_rate": 0.003569032574037289, "loss": 2.6374, "step": 7731 }, { "crossentropy": 2.6403164863586426, "epoch": 0.4204573261915767, "grad_norm": 0.03598596155643463, "grad_norm_var": 1.1166493855324643e-06, "learning_rate": 0.0035678395206680136, "loss": 2.6403, "step": 7732 }, { "crossentropy": 2.7059249877929688, "epoch": 0.4205117050490769, "grad_norm": 0.036659106612205505, "grad_norm_var": 1.5593654158652103e-06, "learning_rate": 0.003566646556119992, "loss": 2.7059, "step": 7733 }, { "crossentropy": 2.6480451822280884, "epoch": 0.4205660839065771, "grad_norm": 0.032100219279527664, "grad_norm_var": 1.636862888874241e-06, "learning_rate": 0.003565453680467218, "loss": 2.648, "step": 7734 }, { "crossentropy": 2.5235695838928223, "epoch": 0.42062046276407733, "grad_norm": 0.032100193202495575, "grad_norm_var": 1.826337277136686e-06, "learning_rate": 0.0035642608937836674, "loss": 2.5236, "step": 7735 }, { "crossentropy": 2.636171817779541, "epoch": 0.42067484162157753, "grad_norm": 0.036655183881521225, "grad_norm_var": 2.1588429271754764e-06, "learning_rate": 0.003563068196143318, "loss": 2.6362, "step": 7736 }, { "crossentropy": 2.6528000831604004, "epoch": 0.42072922047907774, "grad_norm": 0.03342698514461517, "grad_norm_var": 1.9598928784845052e-06, "learning_rate": 0.003561875587620141, "loss": 2.6528, "step": 7737 }, { "crossentropy": 2.609197974205017, "epoch": 0.42078359933657794, "grad_norm": 0.03276412561535835, "grad_norm_var": 2.070067866778193e-06, "learning_rate": 0.0035606830682880963, "loss": 2.6092, "step": 7738 }, { "crossentropy": 2.6089422702789307, "epoch": 0.42083797819407814, "grad_norm": 0.03271999582648277, "grad_norm_var": 2.162115075357516e-06, "learning_rate": 0.003559490638221147, "loss": 2.6089, "step": 7739 }, { "crossentropy": 2.5662548542022705, "epoch": 0.42089235705157835, "grad_norm": 0.031103303655982018, "grad_norm_var": 2.6725991670309973e-06, "learning_rate": 0.0035582982974932467, "loss": 2.5663, "step": 7740 }, { "crossentropy": 2.749099612236023, "epoch": 0.42094673590907855, "grad_norm": 0.03384983167052269, "grad_norm_var": 2.6530286634741565e-06, "learning_rate": 0.003557106046178341, "loss": 2.7491, "step": 7741 }, { "crossentropy": 2.559411406517029, "epoch": 0.42100111476657875, "grad_norm": 0.033377137035131454, "grad_norm_var": 2.6651619939030743e-06, "learning_rate": 0.003555913884350372, "loss": 2.5594, "step": 7742 }, { "crossentropy": 2.5995023250579834, "epoch": 0.42105549362407896, "grad_norm": 0.03367660194635391, "grad_norm_var": 2.6537712659360868e-06, "learning_rate": 0.0035547218120832802, "loss": 2.5995, "step": 7743 }, { "crossentropy": 2.733278751373291, "epoch": 0.42110987248157916, "grad_norm": 0.03613153100013733, "grad_norm_var": 2.9891378886893418e-06, "learning_rate": 0.0035535298294509915, "loss": 2.7333, "step": 7744 }, { "crossentropy": 2.4591907262802124, "epoch": 0.42116425133907937, "grad_norm": 0.03406354412436485, "grad_norm_var": 2.9892859954346126e-06, "learning_rate": 0.0035523379365274355, "loss": 2.4592, "step": 7745 }, { "crossentropy": 2.5831124782562256, "epoch": 0.42121863019657957, "grad_norm": 0.034793347120285034, "grad_norm_var": 2.8835291943542315e-06, "learning_rate": 0.0035511461333865312, "loss": 2.5831, "step": 7746 }, { "crossentropy": 2.6334720849990845, "epoch": 0.4212730090540798, "grad_norm": 0.03383029252290726, "grad_norm_var": 2.8568482476960664e-06, "learning_rate": 0.0035499544201021927, "loss": 2.6335, "step": 7747 }, { "crossentropy": 2.557037115097046, "epoch": 0.42132738791158, "grad_norm": 0.03192669898271561, "grad_norm_var": 2.7860291551584736e-06, "learning_rate": 0.00354876279674833, "loss": 2.557, "step": 7748 }, { "crossentropy": 2.541709542274475, "epoch": 0.4213817667690802, "grad_norm": 0.03261542692780495, "grad_norm_var": 2.2118262847472023e-06, "learning_rate": 0.0035475712633988433, "loss": 2.5417, "step": 7749 }, { "crossentropy": 2.5451364517211914, "epoch": 0.4214361456265804, "grad_norm": 0.03330771625041962, "grad_norm_var": 2.0863001101830883e-06, "learning_rate": 0.0035463798201276343, "loss": 2.5451, "step": 7750 }, { "crossentropy": 2.743005394935608, "epoch": 0.4214905244840806, "grad_norm": 0.03520672395825386, "grad_norm_var": 2.1008013181140663e-06, "learning_rate": 0.0035451884670085945, "loss": 2.743, "step": 7751 }, { "crossentropy": 2.6688008308410645, "epoch": 0.4215449033415808, "grad_norm": 0.03351840749382973, "grad_norm_var": 1.486289217724372e-06, "learning_rate": 0.003543997204115608, "loss": 2.6688, "step": 7752 }, { "crossentropy": 2.615739345550537, "epoch": 0.421599282199081, "grad_norm": 0.03330609202384949, "grad_norm_var": 1.4886935858148768e-06, "learning_rate": 0.0035428060315225583, "loss": 2.6157, "step": 7753 }, { "crossentropy": 2.5517078638076782, "epoch": 0.4216536610565812, "grad_norm": 0.0314779207110405, "grad_norm_var": 1.7203316193516026e-06, "learning_rate": 0.0035416149493033206, "loss": 2.5517, "step": 7754 }, { "crossentropy": 2.490576982498169, "epoch": 0.4217080399140814, "grad_norm": 0.03308640420436859, "grad_norm_var": 1.6939606734627053e-06, "learning_rate": 0.0035404239575317622, "loss": 2.4906, "step": 7755 }, { "crossentropy": 2.616336226463318, "epoch": 0.4217624187715816, "grad_norm": 0.033712513744831085, "grad_norm_var": 1.3015127440986484e-06, "learning_rate": 0.00353923305628175, "loss": 2.6163, "step": 7756 }, { "crossentropy": 2.6429957151412964, "epoch": 0.4218167976290818, "grad_norm": 0.03202715888619423, "grad_norm_var": 1.4526871822434506e-06, "learning_rate": 0.003538042245627143, "loss": 2.643, "step": 7757 }, { "crossentropy": 2.549505591392517, "epoch": 0.421871176486582, "grad_norm": 0.037694256752729416, "grad_norm_var": 2.544741235460668e-06, "learning_rate": 0.003536851525641792, "loss": 2.5495, "step": 7758 }, { "crossentropy": 2.562925934791565, "epoch": 0.4219255553440822, "grad_norm": 0.03417731449007988, "grad_norm_var": 2.5539474137728847e-06, "learning_rate": 0.003535660896399544, "loss": 2.5629, "step": 7759 }, { "crossentropy": 2.593280553817749, "epoch": 0.4219799342015824, "grad_norm": 0.03310740739107132, "grad_norm_var": 2.1873172905495855e-06, "learning_rate": 0.0035344703579742454, "loss": 2.5933, "step": 7760 }, { "crossentropy": 2.5587871074676514, "epoch": 0.4220343130590826, "grad_norm": 0.032717734575271606, "grad_norm_var": 2.2201560665097152e-06, "learning_rate": 0.0035332799104397256, "loss": 2.5588, "step": 7761 }, { "crossentropy": 2.5193123817443848, "epoch": 0.4220886919165828, "grad_norm": 0.032667022198438644, "grad_norm_var": 2.1450134984316372e-06, "learning_rate": 0.0035320895538698206, "loss": 2.5193, "step": 7762 }, { "crossentropy": 2.609911561012268, "epoch": 0.42214307077408303, "grad_norm": 0.033714983612298965, "grad_norm_var": 2.1392088743704223e-06, "learning_rate": 0.0035308992883383516, "loss": 2.6099, "step": 7763 }, { "crossentropy": 2.5439263582229614, "epoch": 0.42219744963158323, "grad_norm": 0.03368275612592697, "grad_norm_var": 1.9889756994015868e-06, "learning_rate": 0.0035297091139191395, "loss": 2.5439, "step": 7764 }, { "crossentropy": 2.5790419578552246, "epoch": 0.42225182848908344, "grad_norm": 0.03479156643152237, "grad_norm_var": 2.027929281573209e-06, "learning_rate": 0.0035285190306859975, "loss": 2.579, "step": 7765 }, { "crossentropy": 2.5320773124694824, "epoch": 0.42230620734658364, "grad_norm": 0.031427767127752304, "grad_norm_var": 2.331417885344476e-06, "learning_rate": 0.0035273290387127323, "loss": 2.5321, "step": 7766 }, { "crossentropy": 2.5196373462677, "epoch": 0.42236058620408384, "grad_norm": 0.03345179185271263, "grad_norm_var": 2.129168353792732e-06, "learning_rate": 0.0035261391380731477, "loss": 2.5196, "step": 7767 }, { "crossentropy": 2.531937003135681, "epoch": 0.42241496506158405, "grad_norm": 0.033590417355298996, "grad_norm_var": 2.1305326383704026e-06, "learning_rate": 0.0035249493288410406, "loss": 2.5319, "step": 7768 }, { "crossentropy": 2.666514754295349, "epoch": 0.42246934391908425, "grad_norm": 0.03197474032640457, "grad_norm_var": 2.2605698950878273e-06, "learning_rate": 0.0035237596110902006, "loss": 2.6665, "step": 7769 }, { "crossentropy": 2.6269372701644897, "epoch": 0.42252372277658445, "grad_norm": 0.03379072993993759, "grad_norm_var": 2.0233343520257666e-06, "learning_rate": 0.0035225699848944114, "loss": 2.6269, "step": 7770 }, { "crossentropy": 2.751792311668396, "epoch": 0.42257810163408466, "grad_norm": 0.03987245634198189, "grad_norm_var": 4.549062992727168e-06, "learning_rate": 0.0035213804503274582, "loss": 2.7518, "step": 7771 }, { "crossentropy": 2.5809131860733032, "epoch": 0.42263248049158486, "grad_norm": 0.034935999661684036, "grad_norm_var": 4.6120291565051225e-06, "learning_rate": 0.003520191007463107, "loss": 2.5809, "step": 7772 }, { "crossentropy": 2.666801333427429, "epoch": 0.42268685934908506, "grad_norm": 0.030727561563253403, "grad_norm_var": 5.055370980255649e-06, "learning_rate": 0.0035190016563751317, "loss": 2.6668, "step": 7773 }, { "crossentropy": 2.6696484088897705, "epoch": 0.42274123820658527, "grad_norm": 0.034355007112026215, "grad_norm_var": 4.060852556006464e-06, "learning_rate": 0.0035178123971372935, "loss": 2.6696, "step": 7774 }, { "crossentropy": 2.7089359760284424, "epoch": 0.42279561706408547, "grad_norm": 0.032831862568855286, "grad_norm_var": 4.085957720955981e-06, "learning_rate": 0.003516623229823348, "loss": 2.7089, "step": 7775 }, { "crossentropy": 2.6505258083343506, "epoch": 0.4228499959215857, "grad_norm": 0.033205680549144745, "grad_norm_var": 4.080074240271856e-06, "learning_rate": 0.0035154341545070474, "loss": 2.6505, "step": 7776 }, { "crossentropy": 2.5623096227645874, "epoch": 0.4229043747790859, "grad_norm": 0.035522181540727615, "grad_norm_var": 4.238502780201138e-06, "learning_rate": 0.0035142451712621343, "loss": 2.5623, "step": 7777 }, { "crossentropy": 2.6443753242492676, "epoch": 0.4229587536365861, "grad_norm": 0.03928416594862938, "grad_norm_var": 5.989752426761343e-06, "learning_rate": 0.003513056280162352, "loss": 2.6444, "step": 7778 }, { "crossentropy": 2.560957670211792, "epoch": 0.4230131324940863, "grad_norm": 0.03322254866361618, "grad_norm_var": 6.036587883446558e-06, "learning_rate": 0.0035118674812814332, "loss": 2.561, "step": 7779 }, { "crossentropy": 2.574753761291504, "epoch": 0.4230675113515865, "grad_norm": 0.03262840211391449, "grad_norm_var": 6.174100161966909e-06, "learning_rate": 0.003510678774693106, "loss": 2.5748, "step": 7780 }, { "crossentropy": 2.601384401321411, "epoch": 0.4231218902090867, "grad_norm": 0.0333251953125, "grad_norm_var": 6.173435401997847e-06, "learning_rate": 0.0035094901604710905, "loss": 2.6014, "step": 7781 }, { "crossentropy": 2.5862892866134644, "epoch": 0.4231762690665869, "grad_norm": 0.03629227727651596, "grad_norm_var": 5.978108787540957e-06, "learning_rate": 0.00350830163868911, "loss": 2.5863, "step": 7782 }, { "crossentropy": 2.455802321434021, "epoch": 0.4232306479240871, "grad_norm": 0.03300786018371582, "grad_norm_var": 6.041412836838012e-06, "learning_rate": 0.003507113209420867, "loss": 2.4558, "step": 7783 }, { "crossentropy": 2.5507270097732544, "epoch": 0.4232850267815873, "grad_norm": 0.036214232444763184, "grad_norm_var": 6.228539052146633e-06, "learning_rate": 0.003505924872740072, "loss": 2.5507, "step": 7784 }, { "crossentropy": 2.6631189584732056, "epoch": 0.4233394056390875, "grad_norm": 0.03247951343655586, "grad_norm_var": 6.077909464376659e-06, "learning_rate": 0.003504736628720425, "loss": 2.6631, "step": 7785 }, { "crossentropy": 2.5884742736816406, "epoch": 0.4233937844965877, "grad_norm": 0.04079777002334595, "grad_norm_var": 8.501691635181876e-06, "learning_rate": 0.003503548477435617, "loss": 2.5885, "step": 7786 }, { "crossentropy": 2.7260656356811523, "epoch": 0.4234481633540879, "grad_norm": 0.032428909093141556, "grad_norm_var": 7.0483401287881055e-06, "learning_rate": 0.003502360418959336, "loss": 2.7261, "step": 7787 }, { "crossentropy": 2.6071112155914307, "epoch": 0.4235025422115881, "grad_norm": 0.03537948429584503, "grad_norm_var": 7.089151661258432e-06, "learning_rate": 0.003501172453365268, "loss": 2.6071, "step": 7788 }, { "crossentropy": 2.5511080026626587, "epoch": 0.4235569210690883, "grad_norm": 0.03204847872257233, "grad_norm_var": 6.537065655879554e-06, "learning_rate": 0.0034999845807270856, "loss": 2.5511, "step": 7789 }, { "crossentropy": 2.5771273374557495, "epoch": 0.4236112999265885, "grad_norm": 0.03327804058790207, "grad_norm_var": 6.639563292104346e-06, "learning_rate": 0.003498796801118463, "loss": 2.5771, "step": 7790 }, { "crossentropy": 2.6217029094696045, "epoch": 0.42366567878408873, "grad_norm": 0.033621013164520264, "grad_norm_var": 6.5033153093805215e-06, "learning_rate": 0.0034976091146130617, "loss": 2.6217, "step": 7791 }, { "crossentropy": 2.63490891456604, "epoch": 0.42372005764158893, "grad_norm": 0.03249417617917061, "grad_norm_var": 6.662106170793267e-06, "learning_rate": 0.003496421521284541, "loss": 2.6349, "step": 7792 }, { "crossentropy": 2.5214366912841797, "epoch": 0.42377443649908914, "grad_norm": 0.03364948183298111, "grad_norm_var": 6.626440482258161e-06, "learning_rate": 0.0034952340212065594, "loss": 2.5214, "step": 7793 }, { "crossentropy": 2.58722186088562, "epoch": 0.42382881535658934, "grad_norm": 0.03194909170269966, "grad_norm_var": 5.197198266988316e-06, "learning_rate": 0.0034940466144527576, "loss": 2.5872, "step": 7794 }, { "crossentropy": 2.6627572774887085, "epoch": 0.42388319421408954, "grad_norm": 0.03291988745331764, "grad_norm_var": 5.231312360835273e-06, "learning_rate": 0.0034928593010967817, "loss": 2.6628, "step": 7795 }, { "crossentropy": 2.569196343421936, "epoch": 0.42393757307158975, "grad_norm": 0.03334101289510727, "grad_norm_var": 5.141554287301358e-06, "learning_rate": 0.003491672081212268, "loss": 2.5692, "step": 7796 }, { "crossentropy": 2.614857792854309, "epoch": 0.42399195192908995, "grad_norm": 0.033720921725034714, "grad_norm_var": 5.118287712780704e-06, "learning_rate": 0.0034904849548728447, "loss": 2.6149, "step": 7797 }, { "crossentropy": 2.5663020610809326, "epoch": 0.42404633078659015, "grad_norm": 0.032084278762340546, "grad_norm_var": 4.9256211150973854e-06, "learning_rate": 0.003489297922152136, "loss": 2.5663, "step": 7798 }, { "crossentropy": 2.5697858333587646, "epoch": 0.42410070964409036, "grad_norm": 0.03401976451277733, "grad_norm_var": 4.8944282610665616e-06, "learning_rate": 0.0034881109831237634, "loss": 2.5698, "step": 7799 }, { "crossentropy": 2.561147689819336, "epoch": 0.42415508850159056, "grad_norm": 0.033924225717782974, "grad_norm_var": 4.4779025283834635e-06, "learning_rate": 0.003486924137861338, "loss": 2.5611, "step": 7800 }, { "crossentropy": 2.49426805973053, "epoch": 0.42420946735909076, "grad_norm": 0.03338024765253067, "grad_norm_var": 4.3900184276153435e-06, "learning_rate": 0.0034857373864384663, "loss": 2.4943, "step": 7801 }, { "crossentropy": 2.674677610397339, "epoch": 0.42426384621659097, "grad_norm": 0.033517759293317795, "grad_norm_var": 8.029475022680587e-07, "learning_rate": 0.0034845507289287514, "loss": 2.6747, "step": 7802 }, { "crossentropy": 2.581896185874939, "epoch": 0.42431822507409117, "grad_norm": 0.0333031490445137, "grad_norm_var": 7.567772259889708e-07, "learning_rate": 0.003483364165405785, "loss": 2.5819, "step": 7803 }, { "crossentropy": 2.585553288459778, "epoch": 0.4243726039315914, "grad_norm": 0.03281690552830696, "grad_norm_var": 4.530818641321673e-07, "learning_rate": 0.003482177695943163, "loss": 2.5856, "step": 7804 }, { "crossentropy": 2.632657289505005, "epoch": 0.4244269827890916, "grad_norm": 0.030915411189198494, "grad_norm_var": 6.966043426362509e-07, "learning_rate": 0.0034809913206144613, "loss": 2.6327, "step": 7805 }, { "crossentropy": 2.5522453784942627, "epoch": 0.4244813616465918, "grad_norm": 0.03427606821060181, "grad_norm_var": 7.880776440068963e-07, "learning_rate": 0.0034798050394932627, "loss": 2.5522, "step": 7806 }, { "crossentropy": 2.6072793006896973, "epoch": 0.424535740504092, "grad_norm": 0.03482823446393013, "grad_norm_var": 9.59673845203956e-07, "learning_rate": 0.0034786188526531397, "loss": 2.6073, "step": 7807 }, { "crossentropy": 2.6261494159698486, "epoch": 0.4245901193615922, "grad_norm": 0.03436116501688957, "grad_norm_var": 1.0027486847271102e-06, "learning_rate": 0.0034774327601676557, "loss": 2.6261, "step": 7808 }, { "crossentropy": 2.6809325218200684, "epoch": 0.4246444982190924, "grad_norm": 0.03162454068660736, "grad_norm_var": 1.1681687302291978e-06, "learning_rate": 0.0034762467621103723, "loss": 2.6809, "step": 7809 }, { "crossentropy": 2.5808002948760986, "epoch": 0.4246988770765926, "grad_norm": 0.03266220912337303, "grad_norm_var": 1.08230454383891e-06, "learning_rate": 0.0034750608585548456, "loss": 2.5808, "step": 7810 }, { "crossentropy": 2.541062116622925, "epoch": 0.4247532559340928, "grad_norm": 0.031497176736593246, "grad_norm_var": 1.2678249826970073e-06, "learning_rate": 0.0034738750495746217, "loss": 2.5411, "step": 7811 }, { "crossentropy": 2.6165764331817627, "epoch": 0.424807634791593, "grad_norm": 0.03145769238471985, "grad_norm_var": 1.439548784504469e-06, "learning_rate": 0.0034726893352432443, "loss": 2.6166, "step": 7812 }, { "crossentropy": 2.615699529647827, "epoch": 0.4248620136490932, "grad_norm": 0.031781598925590515, "grad_norm_var": 1.4944950265954275e-06, "learning_rate": 0.0034715037156342514, "loss": 2.6157, "step": 7813 }, { "crossentropy": 2.6003609895706177, "epoch": 0.4249163925065934, "grad_norm": 0.03201406076550484, "grad_norm_var": 1.5024698027126753e-06, "learning_rate": 0.0034703181908211714, "loss": 2.6004, "step": 7814 }, { "crossentropy": 2.57954204082489, "epoch": 0.4249707713640936, "grad_norm": 0.03226218372583389, "grad_norm_var": 1.4328378638358742e-06, "learning_rate": 0.0034691327608775335, "loss": 2.5795, "step": 7815 }, { "crossentropy": 2.6221410036087036, "epoch": 0.4250251502215938, "grad_norm": 0.03378506004810333, "grad_norm_var": 1.412982123007037e-06, "learning_rate": 0.0034679474258768523, "loss": 2.6221, "step": 7816 }, { "crossentropy": 2.565833568572998, "epoch": 0.425079529079094, "grad_norm": 0.03135629743337631, "grad_norm_var": 1.50708108393933e-06, "learning_rate": 0.003466762185892644, "loss": 2.5658, "step": 7817 }, { "crossentropy": 2.5100878477096558, "epoch": 0.4251339079365943, "grad_norm": 0.03168283775448799, "grad_norm_var": 1.5061219616525294e-06, "learning_rate": 0.0034655770409984165, "loss": 2.5101, "step": 7818 }, { "crossentropy": 2.767340898513794, "epoch": 0.4251882867940945, "grad_norm": 0.03294847905635834, "grad_norm_var": 1.4778495358690277e-06, "learning_rate": 0.0034643919912676693, "loss": 2.7673, "step": 7819 }, { "crossentropy": 2.6263214349746704, "epoch": 0.4252426656515947, "grad_norm": 0.032430894672870636, "grad_norm_var": 1.4717200502346958e-06, "learning_rate": 0.003463207036773899, "loss": 2.6263, "step": 7820 }, { "crossentropy": 2.6393171548843384, "epoch": 0.4252970445090949, "grad_norm": 0.03484261780977249, "grad_norm_var": 1.6097194232428391e-06, "learning_rate": 0.0034620221775905962, "loss": 2.6393, "step": 7821 }, { "crossentropy": 2.647670865058899, "epoch": 0.4253514233665951, "grad_norm": 0.03316323086619377, "grad_norm_var": 1.458932796434033e-06, "learning_rate": 0.0034608374137912435, "loss": 2.6477, "step": 7822 }, { "crossentropy": 2.6170848608016968, "epoch": 0.4254058022240953, "grad_norm": 0.03168550878763199, "grad_norm_var": 1.1712940594553004e-06, "learning_rate": 0.00345965274544932, "loss": 2.6171, "step": 7823 }, { "crossentropy": 2.6142265796661377, "epoch": 0.4254601810815955, "grad_norm": 0.033154457807540894, "grad_norm_var": 9.583828163590875e-07, "learning_rate": 0.003458468172638298, "loss": 2.6142, "step": 7824 }, { "crossentropy": 2.6197551488876343, "epoch": 0.4255145599390957, "grad_norm": 0.030807282775640488, "grad_norm_var": 1.0842788771956806e-06, "learning_rate": 0.0034572836954316413, "loss": 2.6198, "step": 7825 }, { "crossentropy": 2.4780246019363403, "epoch": 0.4255689387965959, "grad_norm": 0.03424505144357681, "grad_norm_var": 1.3076584849789144e-06, "learning_rate": 0.0034560993139028107, "loss": 2.478, "step": 7826 }, { "crossentropy": 2.6128435134887695, "epoch": 0.4256233176540961, "grad_norm": 0.03548290580511093, "grad_norm_var": 1.7970184881678509e-06, "learning_rate": 0.003454915028125263, "loss": 2.6128, "step": 7827 }, { "crossentropy": 2.533428907394409, "epoch": 0.4256776965115963, "grad_norm": 0.03351518139243126, "grad_norm_var": 1.722504563155848e-06, "learning_rate": 0.003453730838172444, "loss": 2.5334, "step": 7828 }, { "crossentropy": 2.5345041751861572, "epoch": 0.4257320753690965, "grad_norm": 0.031150173395872116, "grad_norm_var": 1.8350443652920268e-06, "learning_rate": 0.003452546744117798, "loss": 2.5345, "step": 7829 }, { "crossentropy": 2.6968733072280884, "epoch": 0.4257864542265967, "grad_norm": 0.0324910432100296, "grad_norm_var": 1.8003682067987392e-06, "learning_rate": 0.0034513627460347595, "loss": 2.6969, "step": 7830 }, { "crossentropy": 2.5604735612869263, "epoch": 0.4258408330840969, "grad_norm": 0.032279521226882935, "grad_norm_var": 1.7991143825455205e-06, "learning_rate": 0.0034501788439967587, "loss": 2.5605, "step": 7831 }, { "crossentropy": 2.5804333686828613, "epoch": 0.42589521194159713, "grad_norm": 0.03192422538995743, "grad_norm_var": 1.7745489250745385e-06, "learning_rate": 0.003448995038077224, "loss": 2.5804, "step": 7832 }, { "crossentropy": 2.6061651706695557, "epoch": 0.42594959079909733, "grad_norm": 0.03157036751508713, "grad_norm_var": 1.7391320570908063e-06, "learning_rate": 0.003447811328349567, "loss": 2.6062, "step": 7833 }, { "crossentropy": 2.5897029638290405, "epoch": 0.42600396965659754, "grad_norm": 0.031978052109479904, "grad_norm_var": 1.7041140570665687e-06, "learning_rate": 0.0034466277148872066, "loss": 2.5897, "step": 7834 }, { "crossentropy": 2.5247833728790283, "epoch": 0.42605834851409774, "grad_norm": 0.033376023173332214, "grad_norm_var": 1.7280324872197157e-06, "learning_rate": 0.0034454441977635475, "loss": 2.5248, "step": 7835 }, { "crossentropy": 2.5478198528289795, "epoch": 0.42611272737159794, "grad_norm": 0.03627532348036766, "grad_norm_var": 2.4850964199978992e-06, "learning_rate": 0.00344426077705199, "loss": 2.5478, "step": 7836 }, { "crossentropy": 2.6775147914886475, "epoch": 0.42616710622909815, "grad_norm": 0.030732810497283936, "grad_norm_var": 2.5290246460418774e-06, "learning_rate": 0.0034430774528259257, "loss": 2.6775, "step": 7837 }, { "crossentropy": 2.681431293487549, "epoch": 0.42622148508659835, "grad_norm": 0.03444048389792442, "grad_norm_var": 2.7031560932616604e-06, "learning_rate": 0.0034418942251587497, "loss": 2.6814, "step": 7838 }, { "crossentropy": 2.692383289337158, "epoch": 0.42627586394409855, "grad_norm": 0.033577945083379745, "grad_norm_var": 2.640910723817648e-06, "learning_rate": 0.0034407110941238397, "loss": 2.6924, "step": 7839 }, { "crossentropy": 2.5946465730667114, "epoch": 0.42633024280159876, "grad_norm": 0.03378792852163315, "grad_norm_var": 2.684311423451803e-06, "learning_rate": 0.0034395280597945745, "loss": 2.5946, "step": 7840 }, { "crossentropy": 2.4500821828842163, "epoch": 0.42638462165909896, "grad_norm": 0.03394566476345062, "grad_norm_var": 2.391920676126739e-06, "learning_rate": 0.0034383451222443247, "loss": 2.4501, "step": 7841 }, { "crossentropy": 2.614022135734558, "epoch": 0.42643900051659916, "grad_norm": 0.034037843346595764, "grad_norm_var": 2.3649938777919414e-06, "learning_rate": 0.0034371622815464525, "loss": 2.614, "step": 7842 }, { "crossentropy": 2.626440167427063, "epoch": 0.42649337937409937, "grad_norm": 0.03243399038910866, "grad_norm_var": 2.001813820358816e-06, "learning_rate": 0.003435979537774322, "loss": 2.6264, "step": 7843 }, { "crossentropy": 2.667911410331726, "epoch": 0.42654775823159957, "grad_norm": 0.03172588348388672, "grad_norm_var": 2.071796384304353e-06, "learning_rate": 0.0034347968910012788, "loss": 2.6679, "step": 7844 }, { "crossentropy": 2.640276789665222, "epoch": 0.4266021370890998, "grad_norm": 0.031751759350299835, "grad_norm_var": 1.9574318338386535e-06, "learning_rate": 0.003433614341300675, "loss": 2.6403, "step": 7845 }, { "crossentropy": 2.639467477798462, "epoch": 0.4266565159466, "grad_norm": 0.032766811549663544, "grad_norm_var": 1.9473113390820636e-06, "learning_rate": 0.0034324318887458496, "loss": 2.6395, "step": 7846 }, { "crossentropy": 2.522157311439514, "epoch": 0.4267108948041002, "grad_norm": 0.030983759090304375, "grad_norm_var": 2.161657501360001e-06, "learning_rate": 0.0034312495334101363, "loss": 2.5222, "step": 7847 }, { "crossentropy": 2.560562491416931, "epoch": 0.4267652736616004, "grad_norm": 0.041902076452970505, "grad_norm_var": 7.176576795822837e-06, "learning_rate": 0.003430067275366863, "loss": 2.5606, "step": 7848 }, { "crossentropy": 2.4684170484542847, "epoch": 0.4268196525191006, "grad_norm": 0.03246210142970085, "grad_norm_var": 7.002147414078086e-06, "learning_rate": 0.003428885114689356, "loss": 2.4684, "step": 7849 }, { "crossentropy": 2.4608594179153442, "epoch": 0.4268740313766008, "grad_norm": 0.03381343185901642, "grad_norm_var": 6.837509648348209e-06, "learning_rate": 0.003427703051450929, "loss": 2.4609, "step": 7850 }, { "crossentropy": 2.6236629486083984, "epoch": 0.426928410234101, "grad_norm": 0.03360813111066818, "grad_norm_var": 6.8331447501427155e-06, "learning_rate": 0.0034265210857248917, "loss": 2.6237, "step": 7851 }, { "crossentropy": 2.6867462396621704, "epoch": 0.4269827890916012, "grad_norm": 0.03384445607662201, "grad_norm_var": 6.3484353433238775e-06, "learning_rate": 0.0034253392175845523, "loss": 2.6867, "step": 7852 }, { "crossentropy": 2.573890447616577, "epoch": 0.4270371679491014, "grad_norm": 0.033675599843263626, "grad_norm_var": 5.808453390339002e-06, "learning_rate": 0.003424157447103203, "loss": 2.5739, "step": 7853 }, { "crossentropy": 2.653854250907898, "epoch": 0.4270915468066016, "grad_norm": 0.032609909772872925, "grad_norm_var": 5.830411581860317e-06, "learning_rate": 0.003422975774354141, "loss": 2.6539, "step": 7854 }, { "crossentropy": 2.471514105796814, "epoch": 0.4271459256641018, "grad_norm": 0.034100137650966644, "grad_norm_var": 5.848846169518778e-06, "learning_rate": 0.0034217941994106526, "loss": 2.4715, "step": 7855 }, { "crossentropy": 2.711750864982605, "epoch": 0.427200304521602, "grad_norm": 0.031304534524679184, "grad_norm_var": 6.168957380071763e-06, "learning_rate": 0.003420612722346016, "loss": 2.7118, "step": 7856 }, { "crossentropy": 2.511385917663574, "epoch": 0.4272546833791022, "grad_norm": 0.03258325532078743, "grad_norm_var": 6.1922718988874215e-06, "learning_rate": 0.003419431343233507, "loss": 2.5114, "step": 7857 }, { "crossentropy": 2.478097438812256, "epoch": 0.4273090622366024, "grad_norm": 0.03299800679087639, "grad_norm_var": 6.164516597778946e-06, "learning_rate": 0.0034182500621463907, "loss": 2.4781, "step": 7858 }, { "crossentropy": 2.678514242172241, "epoch": 0.4273634410941026, "grad_norm": 0.03227325156331062, "grad_norm_var": 6.184375263089967e-06, "learning_rate": 0.0034170688791579297, "loss": 2.6785, "step": 7859 }, { "crossentropy": 2.577869176864624, "epoch": 0.42741781995160283, "grad_norm": 0.03316160663962364, "grad_norm_var": 6.01662242084148e-06, "learning_rate": 0.0034158877943413837, "loss": 2.5779, "step": 7860 }, { "crossentropy": 2.6639480590820312, "epoch": 0.42747219880910303, "grad_norm": 0.03293778747320175, "grad_norm_var": 5.849437242908548e-06, "learning_rate": 0.0034147068077699985, "loss": 2.6639, "step": 7861 }, { "crossentropy": 2.6445627212524414, "epoch": 0.42752657766660324, "grad_norm": 0.03265256807208061, "grad_norm_var": 5.8604928683537625e-06, "learning_rate": 0.00341352591951702, "loss": 2.6446, "step": 7862 }, { "crossentropy": 2.6634398698806763, "epoch": 0.42758095652410344, "grad_norm": 0.03199446573853493, "grad_norm_var": 5.594422925149037e-06, "learning_rate": 0.0034123451296556844, "loss": 2.6634, "step": 7863 }, { "crossentropy": 2.668820858001709, "epoch": 0.42763533538160364, "grad_norm": 0.033622629940509796, "grad_norm_var": 5.980492048209806e-07, "learning_rate": 0.0034111644382592227, "loss": 2.6688, "step": 7864 }, { "crossentropy": 2.5172133445739746, "epoch": 0.42768971423910385, "grad_norm": 0.03260987997055054, "grad_norm_var": 5.892564901651317e-07, "learning_rate": 0.003409983845400862, "loss": 2.5172, "step": 7865 }, { "crossentropy": 2.588637590408325, "epoch": 0.42774409309660405, "grad_norm": 0.03380633518099785, "grad_norm_var": 5.884775095637539e-07, "learning_rate": 0.003408803351153822, "loss": 2.5886, "step": 7866 }, { "crossentropy": 2.570589065551758, "epoch": 0.42779847195410425, "grad_norm": 0.03511087968945503, "grad_norm_var": 8.541904632117699e-07, "learning_rate": 0.003407622955591313, "loss": 2.5706, "step": 7867 }, { "crossentropy": 2.4889928102493286, "epoch": 0.42785285081160446, "grad_norm": 0.03501160815358162, "grad_norm_var": 1.0582439871897308e-06, "learning_rate": 0.003406442658786544, "loss": 2.489, "step": 7868 }, { "crossentropy": 2.61014986038208, "epoch": 0.42790722966910466, "grad_norm": 0.03142431005835533, "grad_norm_var": 1.2182268719989597e-06, "learning_rate": 0.0034052624608127175, "loss": 2.6101, "step": 7869 }, { "crossentropy": 2.518144369125366, "epoch": 0.42796160852660486, "grad_norm": 0.03270372748374939, "grad_norm_var": 1.2137400576899804e-06, "learning_rate": 0.0034040823617430237, "loss": 2.5181, "step": 7870 }, { "crossentropy": 2.569535970687866, "epoch": 0.42801598738410507, "grad_norm": 0.03268460929393768, "grad_norm_var": 1.1348154400143856e-06, "learning_rate": 0.0034029023616506562, "loss": 2.5695, "step": 7871 }, { "crossentropy": 2.6554312705993652, "epoch": 0.42807036624160527, "grad_norm": 0.03324597701430321, "grad_norm_var": 9.496328114865758e-07, "learning_rate": 0.0034017224606087935, "loss": 2.6554, "step": 7872 }, { "crossentropy": 2.589988946914673, "epoch": 0.4281247450991055, "grad_norm": 0.03294668719172478, "grad_norm_var": 9.352073700290171e-07, "learning_rate": 0.0034005426586906154, "loss": 2.59, "step": 7873 }, { "crossentropy": 2.5687650442123413, "epoch": 0.4281791239566057, "grad_norm": 0.03185166418552399, "grad_norm_var": 1.0289570942361808e-06, "learning_rate": 0.0033993629559692894, "loss": 2.5688, "step": 7874 }, { "crossentropy": 2.5719878673553467, "epoch": 0.4282335028141059, "grad_norm": 0.03368830308318138, "grad_norm_var": 1.0165391181501022e-06, "learning_rate": 0.00339818335251798, "loss": 2.572, "step": 7875 }, { "crossentropy": 2.6150530576705933, "epoch": 0.4282878816716061, "grad_norm": 0.032750554382801056, "grad_norm_var": 1.0232194875720372e-06, "learning_rate": 0.0033970038484098463, "loss": 2.6151, "step": 7876 }, { "crossentropy": 2.5127623081207275, "epoch": 0.4283422605291063, "grad_norm": 0.03126402199268341, "grad_norm_var": 1.2267302394933579e-06, "learning_rate": 0.0033958244437180406, "loss": 2.5128, "step": 7877 }, { "crossentropy": 2.6715439558029175, "epoch": 0.4283966393866065, "grad_norm": 0.03410157933831215, "grad_norm_var": 1.2984617577472984e-06, "learning_rate": 0.0033946451385157062, "loss": 2.6715, "step": 7878 }, { "crossentropy": 2.4714020490646362, "epoch": 0.4284510182441067, "grad_norm": 0.03327284753322601, "grad_norm_var": 1.2205026543585703e-06, "learning_rate": 0.003393465932875982, "loss": 2.4714, "step": 7879 }, { "crossentropy": 2.5191537141799927, "epoch": 0.4285053971016069, "grad_norm": 0.032101526856422424, "grad_norm_var": 1.2653981205858664e-06, "learning_rate": 0.003392286826872007, "loss": 2.5192, "step": 7880 }, { "crossentropy": 2.630276918411255, "epoch": 0.4285597759591071, "grad_norm": 0.033706385642290115, "grad_norm_var": 1.2782579451169872e-06, "learning_rate": 0.0033911078205769007, "loss": 2.6303, "step": 7881 }, { "crossentropy": 2.6248375177383423, "epoch": 0.4286141548166073, "grad_norm": 0.03258896991610527, "grad_norm_var": 1.2569529743072215e-06, "learning_rate": 0.0033899289140637877, "loss": 2.6248, "step": 7882 }, { "crossentropy": 2.5756070613861084, "epoch": 0.4286685336741075, "grad_norm": 0.032939597964286804, "grad_norm_var": 9.487067782439536e-07, "learning_rate": 0.003388750107405784, "loss": 2.5756, "step": 7883 }, { "crossentropy": 2.6573944091796875, "epoch": 0.4287229125316077, "grad_norm": 0.032226234674453735, "grad_norm_var": 6.466548600577692e-07, "learning_rate": 0.003387571400675996, "loss": 2.6574, "step": 7884 }, { "crossentropy": 2.6866984367370605, "epoch": 0.4287772913891079, "grad_norm": 0.032931920140981674, "grad_norm_var": 5.285466753096948e-07, "learning_rate": 0.003386392793947527, "loss": 2.6867, "step": 7885 }, { "crossentropy": 2.567530035972595, "epoch": 0.4288316702466081, "grad_norm": 0.03139055520296097, "grad_norm_var": 6.554183672064298e-07, "learning_rate": 0.00338521428729347, "loss": 2.5675, "step": 7886 }, { "crossentropy": 2.5899293422698975, "epoch": 0.4288860491041083, "grad_norm": 0.031989578157663345, "grad_norm_var": 6.898827558755871e-07, "learning_rate": 0.0033840358807869187, "loss": 2.5899, "step": 7887 }, { "crossentropy": 2.597191333770752, "epoch": 0.42894042796160853, "grad_norm": 0.030422676354646683, "grad_norm_var": 9.777541609949185e-07, "learning_rate": 0.003382857574500957, "loss": 2.5972, "step": 7888 }, { "crossentropy": 2.5868860483169556, "epoch": 0.42899480681910873, "grad_norm": 0.033146850764751434, "grad_norm_var": 9.91890912249195e-07, "learning_rate": 0.00338167936850866, "loss": 2.5869, "step": 7889 }, { "crossentropy": 2.6008503437042236, "epoch": 0.42904918567660894, "grad_norm": 0.032253578305244446, "grad_norm_var": 9.65993296869229e-07, "learning_rate": 0.0033805012628830987, "loss": 2.6009, "step": 7890 }, { "crossentropy": 2.567317485809326, "epoch": 0.42910356453410914, "grad_norm": 0.031194010749459267, "grad_norm_var": 9.757527037168676e-07, "learning_rate": 0.003379323257697341, "loss": 2.5673, "step": 7891 }, { "crossentropy": 2.6425641775131226, "epoch": 0.42915794339160934, "grad_norm": 0.03256971016526222, "grad_norm_var": 9.691644727410533e-07, "learning_rate": 0.003378145353024442, "loss": 2.6426, "step": 7892 }, { "crossentropy": 2.617590069770813, "epoch": 0.42921232224910955, "grad_norm": 0.032318003475666046, "grad_norm_var": 8.815888812070075e-07, "learning_rate": 0.003376967548937457, "loss": 2.6176, "step": 7893 }, { "crossentropy": 2.562502145767212, "epoch": 0.42926670110660975, "grad_norm": 0.03676939010620117, "grad_norm_var": 1.914917026531977e-06, "learning_rate": 0.0033757898455094326, "loss": 2.5625, "step": 7894 }, { "crossentropy": 2.6038562059402466, "epoch": 0.42932107996410995, "grad_norm": 0.03367991745471954, "grad_norm_var": 1.9610405892053387e-06, "learning_rate": 0.0033746122428134065, "loss": 2.6039, "step": 7895 }, { "crossentropy": 2.454442262649536, "epoch": 0.42937545882161016, "grad_norm": 0.032934121787548065, "grad_norm_var": 1.944666123391174e-06, "learning_rate": 0.0033734347409224126, "loss": 2.4544, "step": 7896 }, { "crossentropy": 2.5642893314361572, "epoch": 0.42942983767911036, "grad_norm": 0.03212243691086769, "grad_norm_var": 1.8871020747404972e-06, "learning_rate": 0.003372257339909483, "loss": 2.5643, "step": 7897 }, { "crossentropy": 2.6528435945510864, "epoch": 0.42948421653661056, "grad_norm": 0.032011061906814575, "grad_norm_var": 1.908235898748093e-06, "learning_rate": 0.003371080039847633, "loss": 2.6528, "step": 7898 }, { "crossentropy": 2.654272675514221, "epoch": 0.42953859539411077, "grad_norm": 0.032640207558870316, "grad_norm_var": 1.8985344139624358e-06, "learning_rate": 0.003369902840809881, "loss": 2.6543, "step": 7899 }, { "crossentropy": 2.656502366065979, "epoch": 0.42959297425161097, "grad_norm": 0.03400246426463127, "grad_norm_var": 2.0220004732415897e-06, "learning_rate": 0.0033687257428692352, "loss": 2.6565, "step": 7900 }, { "crossentropy": 2.5383050441741943, "epoch": 0.4296473531091112, "grad_norm": 0.03285815566778183, "grad_norm_var": 2.0195533335235258e-06, "learning_rate": 0.0033675487460986977, "loss": 2.5383, "step": 7901 }, { "crossentropy": 2.5031027793884277, "epoch": 0.4297017319666114, "grad_norm": 0.03121308609843254, "grad_norm_var": 2.0511795907462155e-06, "learning_rate": 0.003366371850571266, "loss": 2.5031, "step": 7902 }, { "crossentropy": 2.597394585609436, "epoch": 0.4297561108241116, "grad_norm": 0.03180794045329094, "grad_norm_var": 2.0688200659674106e-06, "learning_rate": 0.0033651950563599264, "loss": 2.5974, "step": 7903 }, { "crossentropy": 2.5636682510375977, "epoch": 0.4298104896816118, "grad_norm": 0.03125214949250221, "grad_norm_var": 1.8686423333379021e-06, "learning_rate": 0.0033640183635376677, "loss": 2.5637, "step": 7904 }, { "crossentropy": 2.6016987562179565, "epoch": 0.429864868539112, "grad_norm": 0.03279456868767738, "grad_norm_var": 1.8541564605802171e-06, "learning_rate": 0.0033628417721774644, "loss": 2.6017, "step": 7905 }, { "crossentropy": 2.5832881927490234, "epoch": 0.4299192473966122, "grad_norm": 0.03216022998094559, "grad_norm_var": 1.8596513028321422e-06, "learning_rate": 0.003361665282352288, "loss": 2.5833, "step": 7906 }, { "crossentropy": 2.6205536127090454, "epoch": 0.4299736262541124, "grad_norm": 0.03395384922623634, "grad_norm_var": 1.8015913344187595e-06, "learning_rate": 0.003360488894135102, "loss": 2.6206, "step": 7907 }, { "crossentropy": 2.501394510269165, "epoch": 0.4300280051116126, "grad_norm": 0.03229675441980362, "grad_norm_var": 1.8152825641548631e-06, "learning_rate": 0.0033593126075988694, "loss": 2.5014, "step": 7908 }, { "crossentropy": 2.5840134620666504, "epoch": 0.4300823839691128, "grad_norm": 0.032137248665094376, "grad_norm_var": 1.8289626048171095e-06, "learning_rate": 0.003358136422816537, "loss": 2.584, "step": 7909 }, { "crossentropy": 2.538154721260071, "epoch": 0.430136762826613, "grad_norm": 0.03287196159362793, "grad_norm_var": 7.102075309692659e-07, "learning_rate": 0.0033569603398610525, "loss": 2.5382, "step": 7910 }, { "crossentropy": 2.5585397481918335, "epoch": 0.4301911416841132, "grad_norm": 0.03452279046177864, "grad_norm_var": 8.820417516985924e-07, "learning_rate": 0.0033557843588053575, "loss": 2.5585, "step": 7911 }, { "crossentropy": 2.4334723949432373, "epoch": 0.4302455205416134, "grad_norm": 0.0333106555044651, "grad_norm_var": 9.077430781346055e-07, "learning_rate": 0.003354608479722383, "loss": 2.4335, "step": 7912 }, { "crossentropy": 2.7364526987075806, "epoch": 0.4302998993991136, "grad_norm": 0.03629268705844879, "grad_norm_var": 1.7167822335330262e-06, "learning_rate": 0.0033534327026850577, "loss": 2.7365, "step": 7913 }, { "crossentropy": 2.5756667852401733, "epoch": 0.4303542782566138, "grad_norm": 0.03413211554288864, "grad_norm_var": 1.7514098789186046e-06, "learning_rate": 0.0033522570277662984, "loss": 2.5757, "step": 7914 }, { "crossentropy": 2.535485625267029, "epoch": 0.430408657114114, "grad_norm": 0.03317190706729889, "grad_norm_var": 1.742478224768471e-06, "learning_rate": 0.0033510814550390235, "loss": 2.5355, "step": 7915 }, { "crossentropy": 2.59210205078125, "epoch": 0.4304630359716142, "grad_norm": 0.035628460347652435, "grad_norm_var": 2.114503888372745e-06, "learning_rate": 0.0033499059845761405, "loss": 2.5921, "step": 7916 }, { "crossentropy": 2.574906349182129, "epoch": 0.43051741482911443, "grad_norm": 0.03241222724318504, "grad_norm_var": 2.1443013190791092e-06, "learning_rate": 0.003348730616450549, "loss": 2.5749, "step": 7917 }, { "crossentropy": 2.6147676706314087, "epoch": 0.43057179368661463, "grad_norm": 0.032496318221092224, "grad_norm_var": 1.9205375749472194e-06, "learning_rate": 0.0033475553507351444, "loss": 2.6148, "step": 7918 }, { "crossentropy": 2.686104416847229, "epoch": 0.43062617254411484, "grad_norm": 0.03342033922672272, "grad_norm_var": 1.7831904119163776e-06, "learning_rate": 0.0033463801875028188, "loss": 2.6861, "step": 7919 }, { "crossentropy": 2.6242282390594482, "epoch": 0.43068055140161504, "grad_norm": 0.03140721842646599, "grad_norm_var": 1.7422821237506394e-06, "learning_rate": 0.0033452051268264495, "loss": 2.6242, "step": 7920 }, { "crossentropy": 2.6138755083084106, "epoch": 0.43073493025911525, "grad_norm": 0.03253272920846939, "grad_norm_var": 1.7646694619871672e-06, "learning_rate": 0.0033440301687789166, "loss": 2.6139, "step": 7921 }, { "crossentropy": 2.612175464630127, "epoch": 0.43078930911661545, "grad_norm": 0.031568873673677444, "grad_norm_var": 1.8761351230503834e-06, "learning_rate": 0.00334285531343309, "loss": 2.6122, "step": 7922 }, { "crossentropy": 2.57041597366333, "epoch": 0.43084368797411565, "grad_norm": 0.030536053702235222, "grad_norm_var": 2.28991665308779e-06, "learning_rate": 0.00334168056086183, "loss": 2.5704, "step": 7923 }, { "crossentropy": 2.5589864253997803, "epoch": 0.43089806683161586, "grad_norm": 0.03130011260509491, "grad_norm_var": 2.451580952255112e-06, "learning_rate": 0.003340505911137997, "loss": 2.559, "step": 7924 }, { "crossentropy": 2.5435184240341187, "epoch": 0.43095244568911606, "grad_norm": 0.03338915854692459, "grad_norm_var": 2.408219010871732e-06, "learning_rate": 0.003339331364334438, "loss": 2.5435, "step": 7925 }, { "crossentropy": 2.6054201126098633, "epoch": 0.43100682454661626, "grad_norm": 0.032605960965156555, "grad_norm_var": 2.419384891867238e-06, "learning_rate": 0.0033381569205240013, "loss": 2.6054, "step": 7926 }, { "crossentropy": 2.6269928216934204, "epoch": 0.43106120340411647, "grad_norm": 0.032680075615644455, "grad_norm_var": 2.2686404024154036e-06, "learning_rate": 0.003336982579779524, "loss": 2.627, "step": 7927 }, { "crossentropy": 2.6292006969451904, "epoch": 0.43111558226161667, "grad_norm": 0.03408809006214142, "grad_norm_var": 2.3458419493590688e-06, "learning_rate": 0.003335808342173835, "loss": 2.6292, "step": 7928 }, { "crossentropy": 2.504300832748413, "epoch": 0.4311699611191169, "grad_norm": 0.032155539840459824, "grad_norm_var": 1.5876386562587668e-06, "learning_rate": 0.003334634207779761, "loss": 2.5043, "step": 7929 }, { "crossentropy": 2.556706428527832, "epoch": 0.4312243399766171, "grad_norm": 0.03394968807697296, "grad_norm_var": 1.5553786964204631e-06, "learning_rate": 0.003333460176670123, "loss": 2.5567, "step": 7930 }, { "crossentropy": 2.616901159286499, "epoch": 0.4312787188341173, "grad_norm": 0.03408539295196533, "grad_norm_var": 1.6639229287896401e-06, "learning_rate": 0.0033322862489177287, "loss": 2.6169, "step": 7931 }, { "crossentropy": 2.5321569442749023, "epoch": 0.4313330976916175, "grad_norm": 0.03522919863462448, "grad_norm_var": 1.5215040665015328e-06, "learning_rate": 0.003331112424595387, "loss": 2.5322, "step": 7932 }, { "crossentropy": 2.599471092224121, "epoch": 0.4313874765491177, "grad_norm": 0.03423460200428963, "grad_norm_var": 1.649168550052699e-06, "learning_rate": 0.0033299387037758975, "loss": 2.5995, "step": 7933 }, { "crossentropy": 2.4984817504882812, "epoch": 0.4314418554066179, "grad_norm": 0.03428768739104271, "grad_norm_var": 1.764070081866154e-06, "learning_rate": 0.0033287650865320524, "loss": 2.4985, "step": 7934 }, { "crossentropy": 2.610628366470337, "epoch": 0.4314962342641181, "grad_norm": 0.03311251848936081, "grad_norm_var": 1.7513826058929699e-06, "learning_rate": 0.003327591572936637, "loss": 2.6106, "step": 7935 }, { "crossentropy": 2.562228798866272, "epoch": 0.4315506131216183, "grad_norm": 0.03178275749087334, "grad_norm_var": 1.6830630922562977e-06, "learning_rate": 0.0033264181630624353, "loss": 2.5622, "step": 7936 }, { "crossentropy": 2.633962392807007, "epoch": 0.4316049919791185, "grad_norm": 0.031133968383073807, "grad_norm_var": 1.8871129007190368e-06, "learning_rate": 0.003325244856982218, "loss": 2.634, "step": 7937 }, { "crossentropy": 2.6374577283859253, "epoch": 0.4316593708366187, "grad_norm": 0.0329117625951767, "grad_norm_var": 1.7643948683525167e-06, "learning_rate": 0.003324071654768754, "loss": 2.6375, "step": 7938 }, { "crossentropy": 2.6466554403305054, "epoch": 0.4317137496941189, "grad_norm": 0.03194185718894005, "grad_norm_var": 1.4321310824978613e-06, "learning_rate": 0.0033228985564948024, "loss": 2.6467, "step": 7939 }, { "crossentropy": 2.656372308731079, "epoch": 0.4317681285516191, "grad_norm": 0.033169932663440704, "grad_norm_var": 1.2130050199440334e-06, "learning_rate": 0.003321725562233118, "loss": 2.6564, "step": 7940 }, { "crossentropy": 2.4793907403945923, "epoch": 0.4318225074091193, "grad_norm": 0.03417797014117241, "grad_norm_var": 1.2746929316444222e-06, "learning_rate": 0.0033205526720564526, "loss": 2.4794, "step": 7941 }, { "crossentropy": 2.5689717531204224, "epoch": 0.4318768862666195, "grad_norm": 0.0334932766854763, "grad_norm_var": 1.2510551429163625e-06, "learning_rate": 0.003319379886037542, "loss": 2.569, "step": 7942 }, { "crossentropy": 2.4562138319015503, "epoch": 0.4319312651241197, "grad_norm": 0.03222731873393059, "grad_norm_var": 1.2999105721518574e-06, "learning_rate": 0.0033182072042491242, "loss": 2.4562, "step": 7943 }, { "crossentropy": 2.6928558349609375, "epoch": 0.4319856439816199, "grad_norm": 0.036205265671014786, "grad_norm_var": 1.8169724345593514e-06, "learning_rate": 0.0033170346267639293, "loss": 2.6929, "step": 7944 }, { "crossentropy": 2.525252342224121, "epoch": 0.43204002283912013, "grad_norm": 0.03267683461308479, "grad_norm_var": 1.748768013247933e-06, "learning_rate": 0.003315862153654677, "loss": 2.5253, "step": 7945 }, { "crossentropy": 2.638484835624695, "epoch": 0.43209440169662033, "grad_norm": 0.03432196006178856, "grad_norm_var": 1.784031530999095e-06, "learning_rate": 0.0033146897849940823, "loss": 2.6385, "step": 7946 }, { "crossentropy": 2.6618025302886963, "epoch": 0.43214878055412054, "grad_norm": 0.03548634052276611, "grad_norm_var": 2.0278091372486278e-06, "learning_rate": 0.0033135175208548586, "loss": 2.6618, "step": 7947 }, { "crossentropy": 2.5494693517684937, "epoch": 0.43220315941162074, "grad_norm": 0.04026659205555916, "grad_norm_var": 4.758679915813311e-06, "learning_rate": 0.0033123453613097043, "loss": 2.5495, "step": 7948 }, { "crossentropy": 2.5618035793304443, "epoch": 0.43225753826912094, "grad_norm": 0.03478705883026123, "grad_norm_var": 4.806865262659404e-06, "learning_rate": 0.003311173306431317, "loss": 2.5618, "step": 7949 }, { "crossentropy": 2.5675792694091797, "epoch": 0.43231191712662115, "grad_norm": 0.031050078570842743, "grad_norm_var": 5.28339189158807e-06, "learning_rate": 0.003310001356292388, "loss": 2.5676, "step": 7950 }, { "crossentropy": 2.6308329105377197, "epoch": 0.43236629598412135, "grad_norm": 0.032047662883996964, "grad_norm_var": 5.43363959139605e-06, "learning_rate": 0.0033088295109655976, "loss": 2.6308, "step": 7951 }, { "crossentropy": 2.5268423557281494, "epoch": 0.43242067484162156, "grad_norm": 0.031477924436330795, "grad_norm_var": 5.513512875919962e-06, "learning_rate": 0.003307657770523628, "loss": 2.5268, "step": 7952 }, { "crossentropy": 2.495948553085327, "epoch": 0.43247505369912176, "grad_norm": 0.033397428691387177, "grad_norm_var": 5.093709692381247e-06, "learning_rate": 0.0033064861350391413, "loss": 2.4959, "step": 7953 }, { "crossentropy": 2.6524442434310913, "epoch": 0.43252943255662196, "grad_norm": 0.033768050372600555, "grad_norm_var": 5.0464076099319044e-06, "learning_rate": 0.003305314604584807, "loss": 2.6524, "step": 7954 }, { "crossentropy": 2.580203652381897, "epoch": 0.43258381141412217, "grad_norm": 0.03298117592930794, "grad_norm_var": 4.859062208948754e-06, "learning_rate": 0.003304143179233282, "loss": 2.5802, "step": 7955 }, { "crossentropy": 2.666436791419983, "epoch": 0.43263819027162237, "grad_norm": 0.032482896000146866, "grad_norm_var": 4.95048802828148e-06, "learning_rate": 0.0033029718590572156, "loss": 2.6664, "step": 7956 }, { "crossentropy": 2.679882287979126, "epoch": 0.4326925691291226, "grad_norm": 0.03559981659054756, "grad_norm_var": 5.147929612499674e-06, "learning_rate": 0.003301800644129251, "loss": 2.6799, "step": 7957 }, { "crossentropy": 2.4805243015289307, "epoch": 0.4327469479866228, "grad_norm": 0.03367634490132332, "grad_norm_var": 5.140295298615832e-06, "learning_rate": 0.0033006295345220293, "loss": 2.4805, "step": 7958 }, { "crossentropy": 2.6889337301254272, "epoch": 0.432801326844123, "grad_norm": 0.03331340104341507, "grad_norm_var": 4.97131870481222e-06, "learning_rate": 0.003299458530308178, "loss": 2.6889, "step": 7959 }, { "crossentropy": 2.510694742202759, "epoch": 0.4328557057016232, "grad_norm": 0.035403333604335785, "grad_norm_var": 4.772633826588951e-06, "learning_rate": 0.003298287631560324, "loss": 2.5107, "step": 7960 }, { "crossentropy": 2.5862938165664673, "epoch": 0.4329100845591234, "grad_norm": 0.034517448395490646, "grad_norm_var": 4.679024168004603e-06, "learning_rate": 0.0032971168383510853, "loss": 2.5863, "step": 7961 }, { "crossentropy": 2.6275854110717773, "epoch": 0.4329644634166236, "grad_norm": 0.03286653012037277, "grad_norm_var": 4.7559421420687954e-06, "learning_rate": 0.0032959461507530703, "loss": 2.6276, "step": 7962 }, { "crossentropy": 2.590813994407654, "epoch": 0.4330188422741238, "grad_norm": 0.03656357154250145, "grad_norm_var": 5.049834086874998e-06, "learning_rate": 0.0032947755688388877, "loss": 2.5908, "step": 7963 }, { "crossentropy": 2.6062487363815308, "epoch": 0.433073221131624, "grad_norm": 0.033963706344366074, "grad_norm_var": 2.2768524112795967e-06, "learning_rate": 0.0032936050926811356, "loss": 2.6062, "step": 7964 }, { "crossentropy": 2.626265048980713, "epoch": 0.4331275999891242, "grad_norm": 0.0328381210565567, "grad_norm_var": 2.2105969768504775e-06, "learning_rate": 0.003292434722352403, "loss": 2.6263, "step": 7965 }, { "crossentropy": 2.6690151691436768, "epoch": 0.4331819788466244, "grad_norm": 0.03190230578184128, "grad_norm_var": 1.9779778008066976e-06, "learning_rate": 0.003291264457925279, "loss": 2.669, "step": 7966 }, { "crossentropy": 2.469625473022461, "epoch": 0.43323635770412466, "grad_norm": 0.032999422401189804, "grad_norm_var": 1.8439469395998069e-06, "learning_rate": 0.003290094299472339, "loss": 2.4696, "step": 7967 }, { "crossentropy": 2.5861724615097046, "epoch": 0.43329073656162487, "grad_norm": 0.03201965615153313, "grad_norm_var": 1.7083257664029493e-06, "learning_rate": 0.0032889242470661552, "loss": 2.5862, "step": 7968 }, { "crossentropy": 2.633140802383423, "epoch": 0.43334511541912507, "grad_norm": 0.0332825742661953, "grad_norm_var": 1.7129158833580534e-06, "learning_rate": 0.0032877543007792977, "loss": 2.6331, "step": 7969 }, { "crossentropy": 2.5761585235595703, "epoch": 0.4333994942766253, "grad_norm": 0.03245819732546806, "grad_norm_var": 1.7971115605460696e-06, "learning_rate": 0.0032865844606843188, "loss": 2.5762, "step": 7970 }, { "crossentropy": 2.638046622276306, "epoch": 0.4334538731341255, "grad_norm": 0.032220542430877686, "grad_norm_var": 1.891394857872486e-06, "learning_rate": 0.003285414726853775, "loss": 2.638, "step": 7971 }, { "crossentropy": 2.6504112482070923, "epoch": 0.4335082519916257, "grad_norm": 0.032568495720624924, "grad_norm_var": 1.8801673602462007e-06, "learning_rate": 0.003284245099360213, "loss": 2.6504, "step": 7972 }, { "crossentropy": 2.5340861082077026, "epoch": 0.4335626308491259, "grad_norm": 0.032893892377614975, "grad_norm_var": 1.5845639422075916e-06, "learning_rate": 0.003283075578276169, "loss": 2.5341, "step": 7973 }, { "crossentropy": 2.5825018882751465, "epoch": 0.4336170097066261, "grad_norm": 0.033531591296195984, "grad_norm_var": 1.5794392750553254e-06, "learning_rate": 0.003281906163674176, "loss": 2.5825, "step": 7974 }, { "crossentropy": 2.6091054677963257, "epoch": 0.4336713885641263, "grad_norm": 0.041280630975961685, "grad_norm_var": 5.524934454094697e-06, "learning_rate": 0.0032807368556267624, "loss": 2.6091, "step": 7975 }, { "crossentropy": 2.4771169424057007, "epoch": 0.4337257674216265, "grad_norm": 0.03244708105921745, "grad_norm_var": 5.451732127916402e-06, "learning_rate": 0.0032795676542064457, "loss": 2.4771, "step": 7976 }, { "crossentropy": 2.5584338903427124, "epoch": 0.4337801462791267, "grad_norm": 0.03454101085662842, "grad_norm_var": 5.454501134458112e-06, "learning_rate": 0.003278398559485739, "loss": 2.5584, "step": 7977 }, { "crossentropy": 2.5252137184143066, "epoch": 0.4338345251366269, "grad_norm": 0.032546516507864, "grad_norm_var": 5.49427069248028e-06, "learning_rate": 0.0032772295715371505, "loss": 2.5252, "step": 7978 }, { "crossentropy": 2.656451106071472, "epoch": 0.4338889039941271, "grad_norm": 0.03290596604347229, "grad_norm_var": 4.899062839410171e-06, "learning_rate": 0.0032760606904331745, "loss": 2.6565, "step": 7979 }, { "crossentropy": 2.632015824317932, "epoch": 0.4339432828516273, "grad_norm": 0.032985933125019073, "grad_norm_var": 4.885322748311195e-06, "learning_rate": 0.003274891916246311, "loss": 2.632, "step": 7980 }, { "crossentropy": 2.5946507453918457, "epoch": 0.4339976617091275, "grad_norm": 0.03325384855270386, "grad_norm_var": 4.868367841959614e-06, "learning_rate": 0.003273723249049039, "loss": 2.5947, "step": 7981 }, { "crossentropy": 2.586604356765747, "epoch": 0.4340520405666277, "grad_norm": 0.03368867188692093, "grad_norm_var": 4.719458945113587e-06, "learning_rate": 0.003272554688913843, "loss": 2.5866, "step": 7982 }, { "crossentropy": 2.579321503639221, "epoch": 0.4341064194241279, "grad_norm": 0.03241077437996864, "grad_norm_var": 4.778559857477177e-06, "learning_rate": 0.003271386235913196, "loss": 2.5793, "step": 7983 }, { "crossentropy": 2.627661943435669, "epoch": 0.4341607982816281, "grad_norm": 0.032291170209646225, "grad_norm_var": 4.731758687185651e-06, "learning_rate": 0.003270217890119561, "loss": 2.6277, "step": 7984 }, { "crossentropy": 2.463018536567688, "epoch": 0.4342151771391283, "grad_norm": 0.03138011693954468, "grad_norm_var": 5.0021317799298085e-06, "learning_rate": 0.0032690496516054003, "loss": 2.463, "step": 7985 }, { "crossentropy": 2.5720441341400146, "epoch": 0.43426955599662853, "grad_norm": 0.031893856823444366, "grad_norm_var": 5.0882211534023294e-06, "learning_rate": 0.003267881520443168, "loss": 2.572, "step": 7986 }, { "crossentropy": 2.4932862520217896, "epoch": 0.43432393485412873, "grad_norm": 0.033733367919921875, "grad_norm_var": 5.01301823118133e-06, "learning_rate": 0.0032667134967053067, "loss": 2.4933, "step": 7987 }, { "crossentropy": 2.6134281158447266, "epoch": 0.43437831371162894, "grad_norm": 0.03641285374760628, "grad_norm_var": 5.512006037743505e-06, "learning_rate": 0.00326554558046426, "loss": 2.6134, "step": 7988 }, { "crossentropy": 2.6508275270462036, "epoch": 0.43443269256912914, "grad_norm": 0.038707658648490906, "grad_norm_var": 7.048208621153694e-06, "learning_rate": 0.0032643777717924595, "loss": 2.6508, "step": 7989 }, { "crossentropy": 2.5440051555633545, "epoch": 0.43448707142662935, "grad_norm": 0.03460991382598877, "grad_norm_var": 7.053436964449036e-06, "learning_rate": 0.003263210070762329, "loss": 2.544, "step": 7990 }, { "crossentropy": 2.553180694580078, "epoch": 0.43454145028412955, "grad_norm": 0.03464056923985481, "grad_norm_var": 3.423522101475331e-06, "learning_rate": 0.003262042477446293, "loss": 2.5532, "step": 7991 }, { "crossentropy": 2.472288727760315, "epoch": 0.43459582914162975, "grad_norm": 0.03353014960885048, "grad_norm_var": 3.322679405697489e-06, "learning_rate": 0.0032608749919167622, "loss": 2.4723, "step": 7992 }, { "crossentropy": 2.631904721260071, "epoch": 0.43465020799912996, "grad_norm": 0.034810204058885574, "grad_norm_var": 3.3566487711799684e-06, "learning_rate": 0.0032597076142461424, "loss": 2.6319, "step": 7993 }, { "crossentropy": 2.531038999557495, "epoch": 0.43470458685663016, "grad_norm": 0.03384063020348549, "grad_norm_var": 3.2558000581036614e-06, "learning_rate": 0.0032585403445068353, "loss": 2.531, "step": 7994 }, { "crossentropy": 2.6200459003448486, "epoch": 0.43475896571413036, "grad_norm": 0.035450950264930725, "grad_norm_var": 3.350964507665456e-06, "learning_rate": 0.003257373182771231, "loss": 2.62, "step": 7995 }, { "crossentropy": 2.58919095993042, "epoch": 0.43481334457163057, "grad_norm": 0.0344739593565464, "grad_norm_var": 3.292614749590926e-06, "learning_rate": 0.0032562061291117163, "loss": 2.5892, "step": 7996 }, { "crossentropy": 2.505921721458435, "epoch": 0.43486772342913077, "grad_norm": 0.0340067483484745, "grad_norm_var": 3.2460581218825166e-06, "learning_rate": 0.003255039183600674, "loss": 2.5059, "step": 7997 }, { "crossentropy": 2.637160897254944, "epoch": 0.434922102286631, "grad_norm": 0.03387267142534256, "grad_norm_var": 3.237651108247269e-06, "learning_rate": 0.0032538723463104735, "loss": 2.6372, "step": 7998 }, { "crossentropy": 2.6121264696121216, "epoch": 0.4349764811441312, "grad_norm": 0.03554607927799225, "grad_norm_var": 3.133704824697982e-06, "learning_rate": 0.003252705617313482, "loss": 2.6121, "step": 7999 }, { "crossentropy": 2.627420425415039, "epoch": 0.4350308600016314, "grad_norm": 0.032190922647714615, "grad_norm_var": 3.1615185385404827e-06, "learning_rate": 0.0032515389966820607, "loss": 2.6274, "step": 8000 }, { "crossentropy": 2.6774003505706787, "epoch": 0.4350852388591316, "grad_norm": 0.032689712941646576, "grad_norm_var": 2.7555785772531244e-06, "learning_rate": 0.003250372484488558, "loss": 2.6774, "step": 8001 }, { "crossentropy": 2.6634950637817383, "epoch": 0.4351396177166318, "grad_norm": 0.033572837710380554, "grad_norm_var": 2.370585706260031e-06, "learning_rate": 0.0032492060808053235, "loss": 2.6635, "step": 8002 }, { "crossentropy": 2.6535258293151855, "epoch": 0.435193996574132, "grad_norm": 0.03261806443333626, "grad_norm_var": 2.5631625444295808e-06, "learning_rate": 0.0032480397857046974, "loss": 2.6535, "step": 8003 }, { "crossentropy": 2.608312487602234, "epoch": 0.4352483754316322, "grad_norm": 0.032790761440992355, "grad_norm_var": 2.4283590770233023e-06, "learning_rate": 0.003246873599259009, "loss": 2.6083, "step": 8004 }, { "crossentropy": 2.590663433074951, "epoch": 0.4353027542891324, "grad_norm": 0.031084299087524414, "grad_norm_var": 1.4884300839572313e-06, "learning_rate": 0.003245707521540585, "loss": 2.5907, "step": 8005 }, { "crossentropy": 2.7033663988113403, "epoch": 0.4353571331466326, "grad_norm": 0.03314938396215439, "grad_norm_var": 1.4509897250731979e-06, "learning_rate": 0.0032445415526217475, "loss": 2.7034, "step": 8006 }, { "crossentropy": 2.594679832458496, "epoch": 0.4354115120041328, "grad_norm": 0.03264307230710983, "grad_norm_var": 1.434344968749788e-06, "learning_rate": 0.0032433756925748037, "loss": 2.5947, "step": 8007 }, { "crossentropy": 2.5790385007858276, "epoch": 0.435465890861633, "grad_norm": 0.032825127243995667, "grad_norm_var": 1.4641657760184126e-06, "learning_rate": 0.003242209941472063, "loss": 2.579, "step": 8008 }, { "crossentropy": 2.6137847900390625, "epoch": 0.4355202697191332, "grad_norm": 0.03390044718980789, "grad_norm_var": 1.3536707724523996e-06, "learning_rate": 0.0032410442993858223, "loss": 2.6138, "step": 8009 }, { "crossentropy": 2.595751643180847, "epoch": 0.4355746485766334, "grad_norm": 0.034873802214860916, "grad_norm_var": 1.4788843797683878e-06, "learning_rate": 0.0032398787663883745, "loss": 2.5958, "step": 8010 }, { "crossentropy": 2.5950080156326294, "epoch": 0.4356290274341336, "grad_norm": 0.03571280464529991, "grad_norm_var": 1.551964167147551e-06, "learning_rate": 0.0032387133425520053, "loss": 2.595, "step": 8011 }, { "crossentropy": 2.5402002334594727, "epoch": 0.4356834062916338, "grad_norm": 0.03452923148870468, "grad_norm_var": 1.5593555235113762e-06, "learning_rate": 0.003237548027948991, "loss": 2.5402, "step": 8012 }, { "crossentropy": 2.5723278522491455, "epoch": 0.435737785149134, "grad_norm": 0.03426044061779976, "grad_norm_var": 1.5805064806082477e-06, "learning_rate": 0.0032363828226516056, "loss": 2.5723, "step": 8013 }, { "crossentropy": 2.596060037612915, "epoch": 0.43579216400663423, "grad_norm": 0.03163134679198265, "grad_norm_var": 1.7879569622967008e-06, "learning_rate": 0.003235217726732115, "loss": 2.5961, "step": 8014 }, { "crossentropy": 2.611953616142273, "epoch": 0.43584654286413443, "grad_norm": 0.0343744270503521, "grad_norm_var": 1.5347673804915844e-06, "learning_rate": 0.003234052740262774, "loss": 2.612, "step": 8015 }, { "crossentropy": 2.5825215578079224, "epoch": 0.43590092172163464, "grad_norm": 0.03175535798072815, "grad_norm_var": 1.611204095461743e-06, "learning_rate": 0.0032328878633158344, "loss": 2.5825, "step": 8016 }, { "crossentropy": 2.57823383808136, "epoch": 0.43595530057913484, "grad_norm": 0.032374825328588486, "grad_norm_var": 1.64200369147637e-06, "learning_rate": 0.003231723095963545, "loss": 2.5782, "step": 8017 }, { "crossentropy": 2.6728051900863647, "epoch": 0.43600967943663504, "grad_norm": 0.03355887159705162, "grad_norm_var": 1.641425910106573e-06, "learning_rate": 0.0032305584382781383, "loss": 2.6728, "step": 8018 }, { "crossentropy": 2.591244101524353, "epoch": 0.43606405829413525, "grad_norm": 0.03151407092809677, "grad_norm_var": 1.8113782006554576e-06, "learning_rate": 0.0032293938903318483, "loss": 2.5912, "step": 8019 }, { "crossentropy": 2.5963001251220703, "epoch": 0.43611843715163545, "grad_norm": 0.03169316053390503, "grad_norm_var": 1.944536340366526e-06, "learning_rate": 0.0032282294521968993, "loss": 2.5963, "step": 8020 }, { "crossentropy": 2.5530298948287964, "epoch": 0.43617281600913566, "grad_norm": 0.03224771469831467, "grad_norm_var": 1.7137314898232313e-06, "learning_rate": 0.0032270651239455074, "loss": 2.553, "step": 8021 }, { "crossentropy": 2.6567158699035645, "epoch": 0.43622719486663586, "grad_norm": 0.03137478977441788, "grad_norm_var": 1.9202261915595687e-06, "learning_rate": 0.003225900905649884, "loss": 2.6567, "step": 8022 }, { "crossentropy": 2.546731114387512, "epoch": 0.43628157372413606, "grad_norm": 0.03519410267472267, "grad_norm_var": 2.1785689152851543e-06, "learning_rate": 0.0032247367973822307, "loss": 2.5467, "step": 8023 }, { "crossentropy": 2.5646674633026123, "epoch": 0.43633595258163627, "grad_norm": 0.033564966171979904, "grad_norm_var": 2.171973911580377e-06, "learning_rate": 0.003223572799214747, "loss": 2.5647, "step": 8024 }, { "crossentropy": 2.611230969429016, "epoch": 0.43639033143913647, "grad_norm": 0.03157751262187958, "grad_norm_var": 2.3186132969888263e-06, "learning_rate": 0.003222408911219623, "loss": 2.6112, "step": 8025 }, { "crossentropy": 2.664358377456665, "epoch": 0.4364447102966367, "grad_norm": 0.03167955204844475, "grad_norm_var": 2.2178204879360596e-06, "learning_rate": 0.0032212451334690407, "loss": 2.6644, "step": 8026 }, { "crossentropy": 2.4819332361221313, "epoch": 0.4364990891541369, "grad_norm": 0.03319667652249336, "grad_norm_var": 1.683337455528749e-06, "learning_rate": 0.0032200814660351754, "loss": 2.4819, "step": 8027 }, { "crossentropy": 2.453280806541443, "epoch": 0.4365534680116371, "grad_norm": 0.03154643252491951, "grad_norm_var": 1.544894203203695e-06, "learning_rate": 0.0032189179089902016, "loss": 2.4533, "step": 8028 }, { "crossentropy": 2.6118886470794678, "epoch": 0.4366078468691373, "grad_norm": 0.03258565440773964, "grad_norm_var": 1.3486384921684858e-06, "learning_rate": 0.003217754462406275, "loss": 2.6119, "step": 8029 }, { "crossentropy": 2.5310477018356323, "epoch": 0.4366622257266375, "grad_norm": 0.037556275725364685, "grad_norm_var": 2.8629048713488604e-06, "learning_rate": 0.003216591126355556, "loss": 2.531, "step": 8030 }, { "crossentropy": 2.722301721572876, "epoch": 0.4367166045841377, "grad_norm": 0.03166551515460014, "grad_norm_var": 2.7753256820703746e-06, "learning_rate": 0.0032154279009101944, "loss": 2.7223, "step": 8031 }, { "crossentropy": 2.708080768585205, "epoch": 0.4367709834416379, "grad_norm": 0.03295154869556427, "grad_norm_var": 2.7152338389369383e-06, "learning_rate": 0.0032142647861423293, "loss": 2.7081, "step": 8032 }, { "crossentropy": 2.5961902141571045, "epoch": 0.4368253622991381, "grad_norm": 0.03349142521619797, "grad_norm_var": 2.7346816127140255e-06, "learning_rate": 0.003213101782124096, "loss": 2.5962, "step": 8033 }, { "crossentropy": 2.6043660640716553, "epoch": 0.4368797411566383, "grad_norm": 0.0321044959127903, "grad_norm_var": 2.72697513502169e-06, "learning_rate": 0.003211938888927627, "loss": 2.6044, "step": 8034 }, { "crossentropy": 2.605987787246704, "epoch": 0.4369341200141385, "grad_norm": 0.03203335776925087, "grad_norm_var": 2.658498045242779e-06, "learning_rate": 0.0032107761066250398, "loss": 2.606, "step": 8035 }, { "crossentropy": 2.4820016622543335, "epoch": 0.4369884988716387, "grad_norm": 0.034136295318603516, "grad_norm_var": 2.677857831600144e-06, "learning_rate": 0.003209613435288451, "loss": 2.482, "step": 8036 }, { "crossentropy": 2.597915291786194, "epoch": 0.4370428777291389, "grad_norm": 0.0318496897816658, "grad_norm_var": 2.724055478954488e-06, "learning_rate": 0.0032084508749899666, "loss": 2.5979, "step": 8037 }, { "crossentropy": 2.636873245239258, "epoch": 0.4370972565866391, "grad_norm": 0.03062213957309723, "grad_norm_var": 2.913199801510222e-06, "learning_rate": 0.0032072884258016886, "loss": 2.6369, "step": 8038 }, { "crossentropy": 2.582236886024475, "epoch": 0.4371516354441393, "grad_norm": 0.03135674446821213, "grad_norm_var": 2.6391543494698928e-06, "learning_rate": 0.003206126087795713, "loss": 2.5822, "step": 8039 }, { "crossentropy": 2.5143537521362305, "epoch": 0.4372060143016395, "grad_norm": 0.031466223299503326, "grad_norm_var": 2.649987267984375e-06, "learning_rate": 0.003204963861044122, "loss": 2.5144, "step": 8040 }, { "crossentropy": 2.6923654079437256, "epoch": 0.4372603931591397, "grad_norm": 0.0323207750916481, "grad_norm_var": 2.594212427559026e-06, "learning_rate": 0.003203801745619001, "loss": 2.6924, "step": 8041 }, { "crossentropy": 2.635264754295349, "epoch": 0.43731477201663993, "grad_norm": 0.032654572278261185, "grad_norm_var": 2.5423956517421516e-06, "learning_rate": 0.0032026397415924213, "loss": 2.6353, "step": 8042 }, { "crossentropy": 2.6296985149383545, "epoch": 0.43736915087414013, "grad_norm": 0.030887845903635025, "grad_norm_var": 2.6906846661890632e-06, "learning_rate": 0.0032014778490364482, "loss": 2.6297, "step": 8043 }, { "crossentropy": 2.6060596704483032, "epoch": 0.43742352973164034, "grad_norm": 0.03257029876112938, "grad_norm_var": 2.6326052229759417e-06, "learning_rate": 0.0032003160680231413, "loss": 2.6061, "step": 8044 }, { "crossentropy": 2.571301221847534, "epoch": 0.43747790858914054, "grad_norm": 0.032150667160749435, "grad_norm_var": 2.6403798630868108e-06, "learning_rate": 0.0031991543986245575, "loss": 2.5713, "step": 8045 }, { "crossentropy": 2.461327314376831, "epoch": 0.43753228744664074, "grad_norm": 0.0315411314368248, "grad_norm_var": 8.373925273562228e-07, "learning_rate": 0.0031979928409127355, "loss": 2.4613, "step": 8046 }, { "crossentropy": 2.6044541597366333, "epoch": 0.43758666630414095, "grad_norm": 0.0307256318628788, "grad_norm_var": 9.48640309649841e-07, "learning_rate": 0.0031968313949597192, "loss": 2.6045, "step": 8047 }, { "crossentropy": 2.570168137550354, "epoch": 0.43764104516164115, "grad_norm": 0.03432776778936386, "grad_norm_var": 1.231723758212712e-06, "learning_rate": 0.0031956700608375375, "loss": 2.5702, "step": 8048 }, { "crossentropy": 2.603001356124878, "epoch": 0.43769542401914135, "grad_norm": 0.034075550734996796, "grad_norm_var": 1.3583070838859545e-06, "learning_rate": 0.0031945088386182174, "loss": 2.603, "step": 8049 }, { "crossentropy": 2.5890064239501953, "epoch": 0.43774980287664156, "grad_norm": 0.032323405146598816, "grad_norm_var": 1.3592019970846506e-06, "learning_rate": 0.0031933477283737767, "loss": 2.589, "step": 8050 }, { "crossentropy": 2.5899910926818848, "epoch": 0.43780418173414176, "grad_norm": 0.03218816965818405, "grad_norm_var": 1.3574638686605334e-06, "learning_rate": 0.003192186730176223, "loss": 2.59, "step": 8051 }, { "crossentropy": 2.653356671333313, "epoch": 0.43785856059164197, "grad_norm": 0.031738635152578354, "grad_norm_var": 1.0976900655592278e-06, "learning_rate": 0.0031910258440975637, "loss": 2.6534, "step": 8052 }, { "crossentropy": 2.7011512517929077, "epoch": 0.43791293944914217, "grad_norm": 0.03835923597216606, "grad_norm_var": 3.5722606795615094e-06, "learning_rate": 0.0031898650702097964, "loss": 2.7012, "step": 8053 }, { "crossentropy": 2.6265949010849, "epoch": 0.4379673183066424, "grad_norm": 0.03240837901830673, "grad_norm_var": 3.334724097872552e-06, "learning_rate": 0.0031887044085849095, "loss": 2.6266, "step": 8054 }, { "crossentropy": 2.5524615049362183, "epoch": 0.4380216971641426, "grad_norm": 0.03392385318875313, "grad_norm_var": 3.3318616253328232e-06, "learning_rate": 0.0031875438592948845, "loss": 2.5525, "step": 8055 }, { "crossentropy": 2.760704278945923, "epoch": 0.4380760760216428, "grad_norm": 0.034094370901584625, "grad_norm_var": 3.3210979888359065e-06, "learning_rate": 0.0031863834224117034, "loss": 2.7607, "step": 8056 }, { "crossentropy": 2.622041702270508, "epoch": 0.438130454879143, "grad_norm": 0.031802572309970856, "grad_norm_var": 3.377428399372005e-06, "learning_rate": 0.0031852230980073284, "loss": 2.622, "step": 8057 }, { "crossentropy": 2.657583713531494, "epoch": 0.4381848337366432, "grad_norm": 0.03333943709731102, "grad_norm_var": 3.387915707631084e-06, "learning_rate": 0.0031840628861537256, "loss": 2.6576, "step": 8058 }, { "crossentropy": 2.6818161010742188, "epoch": 0.4382392125941434, "grad_norm": 0.03270973265171051, "grad_norm_var": 3.1057165786463683e-06, "learning_rate": 0.0031829027869228507, "loss": 2.6818, "step": 8059 }, { "crossentropy": 2.5868369340896606, "epoch": 0.4382935914516436, "grad_norm": 0.03448619320988655, "grad_norm_var": 3.2209121224664624e-06, "learning_rate": 0.00318174280038665, "loss": 2.5868, "step": 8060 }, { "crossentropy": 2.629751205444336, "epoch": 0.4383479703091438, "grad_norm": 0.035614270716905594, "grad_norm_var": 3.5151154561656345e-06, "learning_rate": 0.0031805829266170664, "loss": 2.6298, "step": 8061 }, { "crossentropy": 2.61489737033844, "epoch": 0.438402349166644, "grad_norm": 0.03306565061211586, "grad_norm_var": 3.2919469375241687e-06, "learning_rate": 0.0031794231656860315, "loss": 2.6149, "step": 8062 }, { "crossentropy": 2.6526561975479126, "epoch": 0.4384567280241442, "grad_norm": 0.03405393660068512, "grad_norm_var": 2.7757695669714325e-06, "learning_rate": 0.003178263517665476, "loss": 2.6527, "step": 8063 }, { "crossentropy": 2.4520868062973022, "epoch": 0.4385111068816444, "grad_norm": 0.03304272145032883, "grad_norm_var": 2.7640405600307463e-06, "learning_rate": 0.0031771039826273195, "loss": 2.4521, "step": 8064 }, { "crossentropy": 2.5831053256988525, "epoch": 0.4385654857391446, "grad_norm": 0.03174443170428276, "grad_norm_var": 2.9486009919844902e-06, "learning_rate": 0.0031759445606434735, "loss": 2.5831, "step": 8065 }, { "crossentropy": 2.5487420558929443, "epoch": 0.4386198645966448, "grad_norm": 0.0313238762319088, "grad_norm_var": 3.1586434966275497e-06, "learning_rate": 0.0031747852517858446, "loss": 2.5487, "step": 8066 }, { "crossentropy": 2.6585607528686523, "epoch": 0.438674243454145, "grad_norm": 0.03192427009344101, "grad_norm_var": 3.2045268313841854e-06, "learning_rate": 0.003173626056126335, "loss": 2.6586, "step": 8067 }, { "crossentropy": 2.575210213661194, "epoch": 0.4387286223116452, "grad_norm": 0.03241748362779617, "grad_norm_var": 3.0873007824440877e-06, "learning_rate": 0.0031724669737368317, "loss": 2.5752, "step": 8068 }, { "crossentropy": 2.5269581079483032, "epoch": 0.4387830011691454, "grad_norm": 0.034570373594760895, "grad_norm_var": 1.4763746204677144e-06, "learning_rate": 0.003171308004689224, "loss": 2.527, "step": 8069 }, { "crossentropy": 2.706245183944702, "epoch": 0.43883738002664563, "grad_norm": 0.03265365585684776, "grad_norm_var": 1.4556325548170656e-06, "learning_rate": 0.00317014914905539, "loss": 2.7062, "step": 8070 }, { "crossentropy": 2.5155051946640015, "epoch": 0.43889175888414583, "grad_norm": 0.03369303420186043, "grad_norm_var": 1.4358519855234285e-06, "learning_rate": 0.0031689904069071993, "loss": 2.5155, "step": 8071 }, { "crossentropy": 2.593339681625366, "epoch": 0.43894613774164604, "grad_norm": 0.03410613164305687, "grad_norm_var": 1.437328167245601e-06, "learning_rate": 0.0031678317783165157, "loss": 2.5933, "step": 8072 }, { "crossentropy": 2.4435455799102783, "epoch": 0.43900051659914624, "grad_norm": 0.03225983306765556, "grad_norm_var": 1.3676829376231548e-06, "learning_rate": 0.003166673263355199, "loss": 2.4435, "step": 8073 }, { "crossentropy": 2.5978336334228516, "epoch": 0.43905489545664644, "grad_norm": 0.033437520265579224, "grad_norm_var": 1.3702670898687574e-06, "learning_rate": 0.0031655148620950953, "loss": 2.5978, "step": 8074 }, { "crossentropy": 2.717528462409973, "epoch": 0.43910927431414665, "grad_norm": 0.034304458647966385, "grad_norm_var": 1.4262559510471359e-06, "learning_rate": 0.003164356574608051, "loss": 2.7175, "step": 8075 }, { "crossentropy": 2.631586194038391, "epoch": 0.43916365317164685, "grad_norm": 0.031013118103146553, "grad_norm_var": 1.6278915046061126e-06, "learning_rate": 0.0031631984009658997, "loss": 2.6316, "step": 8076 }, { "crossentropy": 2.606832504272461, "epoch": 0.43921803202914705, "grad_norm": 0.03186115249991417, "grad_norm_var": 1.238343529258778e-06, "learning_rate": 0.0031620403412404704, "loss": 2.6068, "step": 8077 }, { "crossentropy": 2.5461217164993286, "epoch": 0.43927241088664726, "grad_norm": 0.03276422247290611, "grad_norm_var": 1.2350327165571202e-06, "learning_rate": 0.003160882395503588, "loss": 2.5461, "step": 8078 }, { "crossentropy": 2.6238280534744263, "epoch": 0.43932678974414746, "grad_norm": 0.03255251422524452, "grad_norm_var": 1.129531548154959e-06, "learning_rate": 0.0031597245638270623, "loss": 2.6238, "step": 8079 }, { "crossentropy": 2.560318112373352, "epoch": 0.43938116860164766, "grad_norm": 0.03323600813746452, "grad_norm_var": 1.139943895019886e-06, "learning_rate": 0.0031585668462827043, "loss": 2.5603, "step": 8080 }, { "crossentropy": 2.6111252307891846, "epoch": 0.43943554745914787, "grad_norm": 0.03314901888370514, "grad_norm_var": 1.07654114273382e-06, "learning_rate": 0.003157409242942315, "loss": 2.6111, "step": 8081 }, { "crossentropy": 2.5655548572540283, "epoch": 0.43948992631664807, "grad_norm": 0.034461501985788345, "grad_norm_var": 1.0620960709891662e-06, "learning_rate": 0.003156251753877685, "loss": 2.5656, "step": 8082 }, { "crossentropy": 2.6990623474121094, "epoch": 0.4395443051741483, "grad_norm": 0.03826995939016342, "grad_norm_var": 2.647285951964242e-06, "learning_rate": 0.003155094379160602, "loss": 2.6991, "step": 8083 }, { "crossentropy": 2.658927798271179, "epoch": 0.4395986840316485, "grad_norm": 0.032265663146972656, "grad_norm_var": 2.669058150307407e-06, "learning_rate": 0.0031539371188628474, "loss": 2.6589, "step": 8084 }, { "crossentropy": 2.6280311346054077, "epoch": 0.4396530628891487, "grad_norm": 0.03256816416978836, "grad_norm_var": 2.6104728233524437e-06, "learning_rate": 0.00315277997305619, "loss": 2.628, "step": 8085 }, { "crossentropy": 2.6655102968215942, "epoch": 0.4397074417466489, "grad_norm": 0.03293343260884285, "grad_norm_var": 2.59172979221732e-06, "learning_rate": 0.003151622941812398, "loss": 2.6655, "step": 8086 }, { "crossentropy": 2.6138997077941895, "epoch": 0.4397618206041491, "grad_norm": 0.03405625745654106, "grad_norm_var": 2.6187808083618826e-06, "learning_rate": 0.003150466025203227, "loss": 2.6139, "step": 8087 }, { "crossentropy": 2.487260103225708, "epoch": 0.4398161994616493, "grad_norm": 0.031680330634117126, "grad_norm_var": 2.7347009031991928e-06, "learning_rate": 0.003149309223300428, "loss": 2.4873, "step": 8088 }, { "crossentropy": 2.662761092185974, "epoch": 0.4398705783191495, "grad_norm": 0.03914323449134827, "grad_norm_var": 4.855344035884857e-06, "learning_rate": 0.003148152536175748, "loss": 2.6628, "step": 8089 }, { "crossentropy": 2.5861300230026245, "epoch": 0.4399249571766497, "grad_norm": 0.03572572022676468, "grad_norm_var": 5.131172561041369e-06, "learning_rate": 0.003146995963900919, "loss": 2.5861, "step": 8090 }, { "crossentropy": 2.586982011795044, "epoch": 0.4399793360341499, "grad_norm": 0.031465083360672, "grad_norm_var": 5.424781318303845e-06, "learning_rate": 0.003145839506547673, "loss": 2.587, "step": 8091 }, { "crossentropy": 2.5837467908859253, "epoch": 0.4400337148916501, "grad_norm": 0.03167535737156868, "grad_norm_var": 5.226282284933002e-06, "learning_rate": 0.003144683164187734, "loss": 2.5837, "step": 8092 }, { "crossentropy": 2.5781071186065674, "epoch": 0.4400880937491503, "grad_norm": 0.0330306701362133, "grad_norm_var": 5.038596157247212e-06, "learning_rate": 0.003143526936892813, "loss": 2.5781, "step": 8093 }, { "crossentropy": 2.5485496520996094, "epoch": 0.4401424726066505, "grad_norm": 0.032598190009593964, "grad_norm_var": 5.060726655560452e-06, "learning_rate": 0.0031423708247346207, "loss": 2.5485, "step": 8094 }, { "crossentropy": 2.634928584098816, "epoch": 0.4401968514641507, "grad_norm": 0.03350376337766647, "grad_norm_var": 4.974824814385261e-06, "learning_rate": 0.0031412148277848596, "loss": 2.6349, "step": 8095 }, { "crossentropy": 2.4629435539245605, "epoch": 0.4402512303216509, "grad_norm": 0.03549709543585777, "grad_norm_var": 5.143877447285015e-06, "learning_rate": 0.003140058946115221, "loss": 2.4629, "step": 8096 }, { "crossentropy": 2.4613125324249268, "epoch": 0.4403056091791511, "grad_norm": 0.03352271020412445, "grad_norm_var": 5.116359890808969e-06, "learning_rate": 0.0031389031797973933, "loss": 2.4613, "step": 8097 }, { "crossentropy": 2.6158220767974854, "epoch": 0.44035998803665133, "grad_norm": 0.03437319025397301, "grad_norm_var": 5.110233588387358e-06, "learning_rate": 0.0031377475289030557, "loss": 2.6158, "step": 8098 }, { "crossentropy": 2.5708162784576416, "epoch": 0.44041436689415153, "grad_norm": 0.032315321266651154, "grad_norm_var": 3.852279463195662e-06, "learning_rate": 0.003136591993503878, "loss": 2.5708, "step": 8099 }, { "crossentropy": 2.621479034423828, "epoch": 0.44046874575165174, "grad_norm": 0.03289886191487312, "grad_norm_var": 3.771258604341685e-06, "learning_rate": 0.0031354365736715284, "loss": 2.6215, "step": 8100 }, { "crossentropy": 2.5698907375335693, "epoch": 0.44052312460915194, "grad_norm": 0.03299408033490181, "grad_norm_var": 3.7261740119124792e-06, "learning_rate": 0.0031342812694776655, "loss": 2.5699, "step": 8101 }, { "crossentropy": 2.590519428253174, "epoch": 0.44057750346665214, "grad_norm": 0.033505529165267944, "grad_norm_var": 3.696674552992827e-06, "learning_rate": 0.0031331260809939377, "loss": 2.5905, "step": 8102 }, { "crossentropy": 2.5856692790985107, "epoch": 0.44063188232415235, "grad_norm": 0.03189460188150406, "grad_norm_var": 3.864161304900406e-06, "learning_rate": 0.003131971008291991, "loss": 2.5857, "step": 8103 }, { "crossentropy": 2.5532532930374146, "epoch": 0.44068626118165255, "grad_norm": 0.032482050359249115, "grad_norm_var": 3.7109957446296304e-06, "learning_rate": 0.0031308160514434604, "loss": 2.5533, "step": 8104 }, { "crossentropy": 2.5907713174819946, "epoch": 0.44074064003915275, "grad_norm": 0.03295091912150383, "grad_norm_var": 1.4805275258525722e-06, "learning_rate": 0.003129661210519974, "loss": 2.5908, "step": 8105 }, { "crossentropy": 2.6174103021621704, "epoch": 0.44079501889665296, "grad_norm": 0.032001983374357224, "grad_norm_var": 1.0693539668205077e-06, "learning_rate": 0.0031285064855931573, "loss": 2.6174, "step": 8106 }, { "crossentropy": 2.586121916770935, "epoch": 0.44084939775415316, "grad_norm": 0.030846701934933662, "grad_norm_var": 1.2131582297925883e-06, "learning_rate": 0.0031273518767346233, "loss": 2.5861, "step": 8107 }, { "crossentropy": 2.510667085647583, "epoch": 0.44090377661165336, "grad_norm": 0.03078492358326912, "grad_norm_var": 1.4058151674725538e-06, "learning_rate": 0.0031261973840159794, "loss": 2.5107, "step": 8108 }, { "crossentropy": 2.510436773300171, "epoch": 0.44095815546915357, "grad_norm": 0.032303210347890854, "grad_norm_var": 1.4189447168959223e-06, "learning_rate": 0.003125043007508828, "loss": 2.5104, "step": 8109 }, { "crossentropy": 2.5335766077041626, "epoch": 0.44101253432665377, "grad_norm": 0.03312305361032486, "grad_norm_var": 1.423468973747718e-06, "learning_rate": 0.0031238887472847606, "loss": 2.5336, "step": 8110 }, { "crossentropy": 2.523042917251587, "epoch": 0.441066913184154, "grad_norm": 0.08731123059988022, "grad_norm_var": 0.00018733643890963823, "learning_rate": 0.0031227346034153627, "loss": 2.523, "step": 8111 }, { "crossentropy": 2.625598669052124, "epoch": 0.4411212920416542, "grad_norm": 0.03250237926840782, "grad_norm_var": 0.00018816777979594281, "learning_rate": 0.003121580575972216, "loss": 2.6256, "step": 8112 }, { "crossentropy": 2.68182635307312, "epoch": 0.4411756708991544, "grad_norm": 0.037205152213573456, "grad_norm_var": 0.00018780478096100912, "learning_rate": 0.0031204266650268907, "loss": 2.6818, "step": 8113 }, { "crossentropy": 2.5747692584991455, "epoch": 0.4412300497566546, "grad_norm": 0.03284605219960213, "grad_norm_var": 0.0001883262436369215, "learning_rate": 0.003119272870650951, "loss": 2.5748, "step": 8114 }, { "crossentropy": 2.6847003698349, "epoch": 0.4412844286141548, "grad_norm": 0.03366604819893837, "grad_norm_var": 0.00018775454329722773, "learning_rate": 0.0031181191929159553, "loss": 2.6847, "step": 8115 }, { "crossentropy": 2.54558527469635, "epoch": 0.44133880747165505, "grad_norm": 0.0334136039018631, "grad_norm_var": 0.0001875440377490354, "learning_rate": 0.003116965631893451, "loss": 2.5456, "step": 8116 }, { "crossentropy": 2.5418821573257446, "epoch": 0.44139318632915525, "grad_norm": 0.03872247040271759, "grad_norm_var": 0.00018711615998604503, "learning_rate": 0.003115812187654985, "loss": 2.5419, "step": 8117 }, { "crossentropy": 2.561345934867859, "epoch": 0.44144756518665546, "grad_norm": 0.035951800644397736, "grad_norm_var": 0.00018648167040112942, "learning_rate": 0.003114658860272088, "loss": 2.5613, "step": 8118 }, { "crossentropy": 2.619479537010193, "epoch": 0.44150194404415566, "grad_norm": 0.03551286831498146, "grad_norm_var": 0.0001849573082220861, "learning_rate": 0.003113505649816292, "loss": 2.6195, "step": 8119 }, { "crossentropy": 2.673067092895508, "epoch": 0.44155632290165586, "grad_norm": 0.03248141333460808, "grad_norm_var": 0.0001849576899931831, "learning_rate": 0.0031123525563591177, "loss": 2.6731, "step": 8120 }, { "crossentropy": 2.620977997779846, "epoch": 0.44161070175915607, "grad_norm": 0.033257994800806046, "grad_norm_var": 0.00018479876287936733, "learning_rate": 0.003111199579972077, "loss": 2.621, "step": 8121 }, { "crossentropy": 2.6527833938598633, "epoch": 0.44166508061665627, "grad_norm": 0.03215713053941727, "grad_norm_var": 0.0001846969662358903, "learning_rate": 0.003110046720726676, "loss": 2.6528, "step": 8122 }, { "crossentropy": 2.4271174669265747, "epoch": 0.4417194594741565, "grad_norm": 0.033674877136945724, "grad_norm_var": 0.0001828745021808746, "learning_rate": 0.003108893978694418, "loss": 2.4271, "step": 8123 }, { "crossentropy": 2.6518261432647705, "epoch": 0.4417738383316567, "grad_norm": 0.03244401887059212, "grad_norm_var": 0.0001816313943835361, "learning_rate": 0.0031077413539467914, "loss": 2.6518, "step": 8124 }, { "crossentropy": 2.6557408571243286, "epoch": 0.4418282171891569, "grad_norm": 0.0326107032597065, "grad_norm_var": 0.00018143302111752092, "learning_rate": 0.003106588846555281, "loss": 2.6557, "step": 8125 }, { "crossentropy": 2.5666418075561523, "epoch": 0.4418825960466571, "grad_norm": 0.032083023339509964, "grad_norm_var": 0.00018208054540771548, "learning_rate": 0.0031054364565913668, "loss": 2.5666, "step": 8126 }, { "crossentropy": 2.6736223697662354, "epoch": 0.4419369749041573, "grad_norm": 0.03184734284877777, "grad_norm_var": 4.0600625653954e-06, "learning_rate": 0.003104284184126515, "loss": 2.6736, "step": 8127 }, { "crossentropy": 2.573097825050354, "epoch": 0.4419913537616575, "grad_norm": 0.031004633754491806, "grad_norm_var": 4.454118191125206e-06, "learning_rate": 0.003103132029232192, "loss": 2.5731, "step": 8128 }, { "crossentropy": 2.673425078392029, "epoch": 0.4420457326191577, "grad_norm": 0.032321181148290634, "grad_norm_var": 3.649340749236653e-06, "learning_rate": 0.0031019799919798532, "loss": 2.6734, "step": 8129 }, { "crossentropy": 2.4847525358200073, "epoch": 0.4421001114766579, "grad_norm": 0.032599929720163345, "grad_norm_var": 3.670474970889475e-06, "learning_rate": 0.0031008280724409442, "loss": 2.4848, "step": 8130 }, { "crossentropy": 2.4351468086242676, "epoch": 0.4421544903341581, "grad_norm": 0.03337983414530754, "grad_norm_var": 3.6638893630526353e-06, "learning_rate": 0.0030996762706869086, "loss": 2.4351, "step": 8131 }, { "crossentropy": 2.6931209564208984, "epoch": 0.4422088691916583, "grad_norm": 0.03195006400346756, "grad_norm_var": 3.7836766030797827e-06, "learning_rate": 0.0030985245867891785, "loss": 2.6931, "step": 8132 }, { "crossentropy": 2.665973663330078, "epoch": 0.4422632480491585, "grad_norm": 0.0335211306810379, "grad_norm_var": 1.6792929069798952e-06, "learning_rate": 0.0030973730208191798, "loss": 2.666, "step": 8133 }, { "crossentropy": 2.5874695777893066, "epoch": 0.4423176269066587, "grad_norm": 0.033712610602378845, "grad_norm_var": 1.088950260785757e-06, "learning_rate": 0.0030962215728483344, "loss": 2.5875, "step": 8134 }, { "crossentropy": 2.518603801727295, "epoch": 0.4423720057641589, "grad_norm": 0.03269055113196373, "grad_norm_var": 5.60241874681712e-07, "learning_rate": 0.003095070242948051, "loss": 2.5186, "step": 8135 }, { "crossentropy": 2.626005172729492, "epoch": 0.4424263846216591, "grad_norm": 0.03439147770404816, "grad_norm_var": 7.558906722530494e-07, "learning_rate": 0.003093919031189735, "loss": 2.626, "step": 8136 }, { "crossentropy": 2.5811671018600464, "epoch": 0.4424807634791593, "grad_norm": 0.03282170370221138, "grad_norm_var": 7.369511670028232e-07, "learning_rate": 0.003092767937644785, "loss": 2.5812, "step": 8137 }, { "crossentropy": 2.6483001708984375, "epoch": 0.4425351423366595, "grad_norm": 0.03291328251361847, "grad_norm_var": 7.178899358544858e-07, "learning_rate": 0.003091616962384587, "loss": 2.6483, "step": 8138 }, { "crossentropy": 2.507085680961609, "epoch": 0.44258952119415973, "grad_norm": 0.03210644796490669, "grad_norm_var": 6.777845374561497e-07, "learning_rate": 0.003090466105480527, "loss": 2.5071, "step": 8139 }, { "crossentropy": 2.5641201734542847, "epoch": 0.44264390005165993, "grad_norm": 0.03301684930920601, "grad_norm_var": 6.82570511084668e-07, "learning_rate": 0.00308931536700398, "loss": 2.5641, "step": 8140 }, { "crossentropy": 2.6084436178207397, "epoch": 0.44269827890916014, "grad_norm": 0.032401662319898605, "grad_norm_var": 6.873912054559024e-07, "learning_rate": 0.003088164747026312, "loss": 2.6084, "step": 8141 }, { "crossentropy": 2.583435297012329, "epoch": 0.44275265776666034, "grad_norm": 0.03149740397930145, "grad_norm_var": 7.548618596877587e-07, "learning_rate": 0.003087014245618882, "loss": 2.5834, "step": 8142 }, { "crossentropy": 2.4945744276046753, "epoch": 0.44280703662416054, "grad_norm": 0.03270252048969269, "grad_norm_var": 7.106435625389128e-07, "learning_rate": 0.003085863862853049, "loss": 2.4946, "step": 8143 }, { "crossentropy": 2.6850258111953735, "epoch": 0.44286141548166075, "grad_norm": 0.03310057148337364, "grad_norm_var": 5.143658037299117e-07, "learning_rate": 0.003084713598800151, "loss": 2.685, "step": 8144 }, { "crossentropy": 2.600988984107971, "epoch": 0.44291579433916095, "grad_norm": 0.031569600105285645, "grad_norm_var": 5.997026975436415e-07, "learning_rate": 0.0030835634535315327, "loss": 2.601, "step": 8145 }, { "crossentropy": 2.667667508125305, "epoch": 0.44297017319666115, "grad_norm": 0.03356018662452698, "grad_norm_var": 6.351134708356664e-07, "learning_rate": 0.003082413427118521, "loss": 2.6677, "step": 8146 }, { "crossentropy": 2.6495046615600586, "epoch": 0.44302455205416136, "grad_norm": 0.03296070918440819, "grad_norm_var": 6.155612482469999e-07, "learning_rate": 0.0030812635196324403, "loss": 2.6495, "step": 8147 }, { "crossentropy": 2.6582525968551636, "epoch": 0.44307893091166156, "grad_norm": 0.03268851712346077, "grad_norm_var": 5.652396721113802e-07, "learning_rate": 0.0030801137311446086, "loss": 2.6583, "step": 8148 }, { "crossentropy": 2.468478202819824, "epoch": 0.44313330976916177, "grad_norm": 0.03047395683825016, "grad_norm_var": 8.742977007720192e-07, "learning_rate": 0.0030789640617263315, "loss": 2.4685, "step": 8149 }, { "crossentropy": 2.6587257385253906, "epoch": 0.44318768862666197, "grad_norm": 0.03335503116250038, "grad_norm_var": 8.322467377884357e-07, "learning_rate": 0.003077814511448913, "loss": 2.6587, "step": 8150 }, { "crossentropy": 2.5597463846206665, "epoch": 0.44324206748416217, "grad_norm": 0.03300199657678604, "grad_norm_var": 8.403811407985216e-07, "learning_rate": 0.0030766650803836482, "loss": 2.5597, "step": 8151 }, { "crossentropy": 2.5661988258361816, "epoch": 0.4432964463416624, "grad_norm": 0.031094681471586227, "grad_norm_var": 7.586273405821384e-07, "learning_rate": 0.00307551576860182, "loss": 2.5662, "step": 8152 }, { "crossentropy": 2.5866916179656982, "epoch": 0.4433508251991626, "grad_norm": 0.03272939845919609, "grad_norm_var": 7.54635254923451e-07, "learning_rate": 0.0030743665761747093, "loss": 2.5867, "step": 8153 }, { "crossentropy": 2.512001156806946, "epoch": 0.4434052040566628, "grad_norm": 0.032779280096292496, "grad_norm_var": 7.474497253958141e-07, "learning_rate": 0.003073217503173591, "loss": 2.512, "step": 8154 }, { "crossentropy": 2.494475841522217, "epoch": 0.443459582914163, "grad_norm": 0.03322809934616089, "grad_norm_var": 7.762083023330243e-07, "learning_rate": 0.003072068549669724, "loss": 2.4945, "step": 8155 }, { "crossentropy": 2.673658609390259, "epoch": 0.4435139617716632, "grad_norm": 0.03310083970427513, "grad_norm_var": 7.823249394880154e-07, "learning_rate": 0.0030709197157343694, "loss": 2.6737, "step": 8156 }, { "crossentropy": 2.4686983823776245, "epoch": 0.4435683406291634, "grad_norm": 0.03478671982884407, "grad_norm_var": 1.1017253511551624e-06, "learning_rate": 0.003069771001438776, "loss": 2.4687, "step": 8157 }, { "crossentropy": 2.5252941846847534, "epoch": 0.4436227194866636, "grad_norm": 0.03268380090594292, "grad_norm_var": 1.005102497713646e-06, "learning_rate": 0.0030686224068541845, "loss": 2.5253, "step": 8158 }, { "crossentropy": 2.699261426925659, "epoch": 0.4436770983441638, "grad_norm": 0.03287637233734131, "grad_norm_var": 1.0061576441165355e-06, "learning_rate": 0.003067473932051832, "loss": 2.6993, "step": 8159 }, { "crossentropy": 2.566880941390991, "epoch": 0.443731477201664, "grad_norm": 0.03299909830093384, "grad_norm_var": 1.0020493893372288e-06, "learning_rate": 0.0030663255771029426, "loss": 2.5669, "step": 8160 }, { "crossentropy": 2.554604649543762, "epoch": 0.4437858560591642, "grad_norm": 0.030477240681648254, "grad_norm_var": 1.247533338690843e-06, "learning_rate": 0.0030651773420787387, "loss": 2.5546, "step": 8161 }, { "crossentropy": 2.544794201850891, "epoch": 0.4438402349166644, "grad_norm": 0.032554060220718384, "grad_norm_var": 1.1920193977651839e-06, "learning_rate": 0.0030640292270504334, "loss": 2.5448, "step": 8162 }, { "crossentropy": 2.6418488025665283, "epoch": 0.4438946137741646, "grad_norm": 0.03209516406059265, "grad_norm_var": 1.1985834302753133e-06, "learning_rate": 0.0030628812320892297, "loss": 2.6418, "step": 8163 }, { "crossentropy": 2.5087645053863525, "epoch": 0.4439489926316648, "grad_norm": 0.0338408462703228, "grad_norm_var": 1.301663934231848e-06, "learning_rate": 0.003061733357266325, "loss": 2.5088, "step": 8164 }, { "crossentropy": 2.5738011598587036, "epoch": 0.444003371489165, "grad_norm": 0.03156474232673645, "grad_norm_var": 1.0624875302863343e-06, "learning_rate": 0.0030605856026529135, "loss": 2.5738, "step": 8165 }, { "crossentropy": 2.5695377588272095, "epoch": 0.4440577503466652, "grad_norm": 0.031568314880132675, "grad_norm_var": 1.1054765980435255e-06, "learning_rate": 0.003059437968320172, "loss": 2.5695, "step": 8166 }, { "crossentropy": 2.5667097568511963, "epoch": 0.44411212920416543, "grad_norm": 0.03471197932958603, "grad_norm_var": 1.3830090846006042e-06, "learning_rate": 0.003058290454339279, "loss": 2.5667, "step": 8167 }, { "crossentropy": 2.6306378841400146, "epoch": 0.44416650806166563, "grad_norm": 0.03349697217345238, "grad_norm_var": 1.231693709759461e-06, "learning_rate": 0.0030571430607814034, "loss": 2.6306, "step": 8168 }, { "crossentropy": 2.6741803884506226, "epoch": 0.44422088691916584, "grad_norm": 0.032098859548568726, "grad_norm_var": 1.266119008337616e-06, "learning_rate": 0.0030559957877177024, "loss": 2.6742, "step": 8169 }, { "crossentropy": 2.612557530403137, "epoch": 0.44427526577666604, "grad_norm": 0.033682990819215775, "grad_norm_var": 1.3141958310165676e-06, "learning_rate": 0.003054848635219331, "loss": 2.6126, "step": 8170 }, { "crossentropy": 2.55901837348938, "epoch": 0.44432964463416624, "grad_norm": 0.032089170068502426, "grad_norm_var": 1.3394276776210653e-06, "learning_rate": 0.0030537016033574324, "loss": 2.559, "step": 8171 }, { "crossentropy": 2.5165170431137085, "epoch": 0.44438402349166645, "grad_norm": 0.031719811260700226, "grad_norm_var": 1.4012453823189845e-06, "learning_rate": 0.0030525546922031467, "loss": 2.5165, "step": 8172 }, { "crossentropy": 2.6035778522491455, "epoch": 0.44443840234916665, "grad_norm": 0.03381653502583504, "grad_norm_var": 1.1905132534258007e-06, "learning_rate": 0.0030514079018276043, "loss": 2.6036, "step": 8173 }, { "crossentropy": 2.506852149963379, "epoch": 0.44449278120666685, "grad_norm": 0.03069097362458706, "grad_norm_var": 1.4276820914026992e-06, "learning_rate": 0.0030502612323019254, "loss": 2.5069, "step": 8174 }, { "crossentropy": 2.5701757669448853, "epoch": 0.44454716006416706, "grad_norm": 0.034438688308000565, "grad_norm_var": 1.6549495429530552e-06, "learning_rate": 0.003049114683697227, "loss": 2.5702, "step": 8175 }, { "crossentropy": 2.5749138593673706, "epoch": 0.44460153892166726, "grad_norm": 0.035661276429891586, "grad_norm_var": 2.2341166416889178e-06, "learning_rate": 0.00304796825608462, "loss": 2.5749, "step": 8176 }, { "crossentropy": 2.5255465507507324, "epoch": 0.44465591777916746, "grad_norm": 0.03189967945218086, "grad_norm_var": 1.923509553103372e-06, "learning_rate": 0.003046821949535199, "loss": 2.5255, "step": 8177 }, { "crossentropy": 2.5903828144073486, "epoch": 0.44471029663666767, "grad_norm": 0.03534860163927078, "grad_norm_var": 2.2936456411309087e-06, "learning_rate": 0.0030456757641200605, "loss": 2.5904, "step": 8178 }, { "crossentropy": 2.5299121141433716, "epoch": 0.44476467549416787, "grad_norm": 0.0325622521340847, "grad_norm_var": 2.2481091481779306e-06, "learning_rate": 0.00304452969991029, "loss": 2.5299, "step": 8179 }, { "crossentropy": 2.5412309169769287, "epoch": 0.4448190543516681, "grad_norm": 0.031200924888253212, "grad_norm_var": 2.413930603239039e-06, "learning_rate": 0.0030433837569769635, "loss": 2.5412, "step": 8180 }, { "crossentropy": 2.5335038900375366, "epoch": 0.4448734332091683, "grad_norm": 0.03345396742224693, "grad_norm_var": 2.298267418048931e-06, "learning_rate": 0.003042237935391151, "loss": 2.5335, "step": 8181 }, { "crossentropy": 2.622833490371704, "epoch": 0.4449278120666685, "grad_norm": 0.03365856036543846, "grad_norm_var": 2.164646423779652e-06, "learning_rate": 0.0030410922352239202, "loss": 2.6228, "step": 8182 }, { "crossentropy": 2.583766460418701, "epoch": 0.4449821909241687, "grad_norm": 0.03221122547984123, "grad_norm_var": 2.037425304132236e-06, "learning_rate": 0.003039946656546319, "loss": 2.5838, "step": 8183 }, { "crossentropy": 2.5588449239730835, "epoch": 0.4450365697816689, "grad_norm": 0.03392072394490242, "grad_norm_var": 2.07661953359143e-06, "learning_rate": 0.003038801199429401, "loss": 2.5588, "step": 8184 }, { "crossentropy": 2.7101885080337524, "epoch": 0.4450909486391691, "grad_norm": 0.030898408964276314, "grad_norm_var": 2.315467854512943e-06, "learning_rate": 0.0030376558639442033, "loss": 2.7102, "step": 8185 }, { "crossentropy": 2.5767199993133545, "epoch": 0.4451453274966693, "grad_norm": 0.03490028157830238, "grad_norm_var": 2.5265028986228792e-06, "learning_rate": 0.0030365106501617596, "loss": 2.5767, "step": 8186 }, { "crossentropy": 2.5584148168563843, "epoch": 0.4451997063541695, "grad_norm": 0.032216593623161316, "grad_norm_var": 2.5115426482336893e-06, "learning_rate": 0.0030353655581530957, "loss": 2.5584, "step": 8187 }, { "crossentropy": 2.562140107154846, "epoch": 0.4452540852116697, "grad_norm": 0.03178078308701515, "grad_norm_var": 2.5010635041427943e-06, "learning_rate": 0.003034220587989226, "loss": 2.5621, "step": 8188 }, { "crossentropy": 2.576614737510681, "epoch": 0.4453084640691699, "grad_norm": 0.03262614831328392, "grad_norm_var": 2.466570222875774e-06, "learning_rate": 0.003033075739741164, "loss": 2.5766, "step": 8189 }, { "crossentropy": 2.5826526880264282, "epoch": 0.4453628429266701, "grad_norm": 0.0312553234398365, "grad_norm_var": 2.3152262395580044e-06, "learning_rate": 0.0030319310134799115, "loss": 2.5827, "step": 8190 }, { "crossentropy": 2.594952702522278, "epoch": 0.4454172217841703, "grad_norm": 0.032737258821725845, "grad_norm_var": 2.1702523570976105e-06, "learning_rate": 0.0030307864092764627, "loss": 2.595, "step": 8191 }, { "crossentropy": 2.5793217420578003, "epoch": 0.4454716006416705, "grad_norm": 0.03256624937057495, "grad_norm_var": 1.6277015842292483e-06, "learning_rate": 0.0030296419272018028, "loss": 2.5793, "step": 8192 }, { "crossentropy": 2.4680819511413574, "epoch": 0.4455259794991707, "grad_norm": 0.031042927876114845, "grad_norm_var": 1.7652655305666013e-06, "learning_rate": 0.003028497567326917, "loss": 2.4681, "step": 8193 }, { "crossentropy": 2.5415165424346924, "epoch": 0.4455803583566709, "grad_norm": 0.03272281214594841, "grad_norm_var": 1.2509614623098467e-06, "learning_rate": 0.003027353329722771, "loss": 2.5415, "step": 8194 }, { "crossentropy": 2.566909670829773, "epoch": 0.44563473721417113, "grad_norm": 0.031221523880958557, "grad_norm_var": 1.349436491198637e-06, "learning_rate": 0.003026209214460334, "loss": 2.5669, "step": 8195 }, { "crossentropy": 2.5380396842956543, "epoch": 0.44568911607167133, "grad_norm": 0.03180782496929169, "grad_norm_var": 1.2753584474449348e-06, "learning_rate": 0.003025065221610561, "loss": 2.538, "step": 8196 }, { "crossentropy": 2.556109070777893, "epoch": 0.44574349492917154, "grad_norm": 0.03174639120697975, "grad_norm_var": 1.2264642243829555e-06, "learning_rate": 0.0030239213512444018, "loss": 2.5561, "step": 8197 }, { "crossentropy": 2.550152897834778, "epoch": 0.44579787378667174, "grad_norm": 0.03155158460140228, "grad_norm_var": 1.1312708741414527e-06, "learning_rate": 0.0030227776034327993, "loss": 2.5502, "step": 8198 }, { "crossentropy": 2.459182858467102, "epoch": 0.44585225264417194, "grad_norm": 0.03059251606464386, "grad_norm_var": 1.2926936252418675e-06, "learning_rate": 0.003021633978246684, "loss": 2.4592, "step": 8199 }, { "crossentropy": 2.584670662879944, "epoch": 0.44590663150167215, "grad_norm": 0.03334891423583031, "grad_norm_var": 1.1742544018639514e-06, "learning_rate": 0.0030204904757569857, "loss": 2.5847, "step": 8200 }, { "crossentropy": 2.549191474914551, "epoch": 0.44596101035917235, "grad_norm": 0.03139809891581535, "grad_norm_var": 1.1122373651558607e-06, "learning_rate": 0.0030193470960346237, "loss": 2.5492, "step": 8201 }, { "crossentropy": 2.5604366064071655, "epoch": 0.44601538921667255, "grad_norm": 0.03149256110191345, "grad_norm_var": 5.632714997484313e-07, "learning_rate": 0.0030182038391505074, "loss": 2.5604, "step": 8202 }, { "crossentropy": 2.6307097673416138, "epoch": 0.44606976807417276, "grad_norm": 0.03329505771398544, "grad_norm_var": 6.841175981234556e-07, "learning_rate": 0.0030170607051755406, "loss": 2.6307, "step": 8203 }, { "crossentropy": 2.600603938102722, "epoch": 0.44612414693167296, "grad_norm": 0.03773025795817375, "grad_norm_var": 2.7628450391622606e-06, "learning_rate": 0.003015917694180623, "loss": 2.6006, "step": 8204 }, { "crossentropy": 2.643423318862915, "epoch": 0.44617852578917316, "grad_norm": 0.03253849595785141, "grad_norm_var": 2.7597585588804887e-06, "learning_rate": 0.003014774806236637, "loss": 2.6434, "step": 8205 }, { "crossentropy": 2.5340023040771484, "epoch": 0.44623290464667337, "grad_norm": 0.03157738596200943, "grad_norm_var": 2.7207161154915813e-06, "learning_rate": 0.003013632041414468, "loss": 2.534, "step": 8206 }, { "crossentropy": 2.636570930480957, "epoch": 0.44628728350417357, "grad_norm": 0.034998759627342224, "grad_norm_var": 3.161473885697587e-06, "learning_rate": 0.003012489399784989, "loss": 2.6366, "step": 8207 }, { "crossentropy": 2.6081539392471313, "epoch": 0.4463416623616738, "grad_norm": 0.03178996965289116, "grad_norm_var": 3.189895229830268e-06, "learning_rate": 0.003011346881419063, "loss": 2.6082, "step": 8208 }, { "crossentropy": 2.6710941791534424, "epoch": 0.446396041219174, "grad_norm": 0.03384988382458687, "grad_norm_var": 3.1637889900750757e-06, "learning_rate": 0.003010204486387549, "loss": 2.6711, "step": 8209 }, { "crossentropy": 2.4323230981826782, "epoch": 0.4464504200766742, "grad_norm": 0.031028427183628082, "grad_norm_var": 3.3163532575761674e-06, "learning_rate": 0.003009062214761299, "loss": 2.4323, "step": 8210 }, { "crossentropy": 2.498234748840332, "epoch": 0.4465047989341744, "grad_norm": 0.03572136536240578, "grad_norm_var": 3.816043417781354e-06, "learning_rate": 0.003007920066611154, "loss": 2.4982, "step": 8211 }, { "crossentropy": 2.468083620071411, "epoch": 0.4465591777916746, "grad_norm": 0.03195160999894142, "grad_norm_var": 3.7987126401785773e-06, "learning_rate": 0.0030067780420079495, "loss": 2.4681, "step": 8212 }, { "crossentropy": 2.604963541030884, "epoch": 0.4466135566491748, "grad_norm": 0.033290259540081024, "grad_norm_var": 3.7332267944722417e-06, "learning_rate": 0.003005636141022512, "loss": 2.605, "step": 8213 }, { "crossentropy": 2.5456353425979614, "epoch": 0.446667935506675, "grad_norm": 0.03352649509906769, "grad_norm_var": 3.6259568095858262e-06, "learning_rate": 0.0030044943637256606, "loss": 2.5456, "step": 8214 }, { "crossentropy": 2.622693181037903, "epoch": 0.4467223143641752, "grad_norm": 0.030697327107191086, "grad_norm_var": 3.5928856779647116e-06, "learning_rate": 0.003003352710188211, "loss": 2.6227, "step": 8215 }, { "crossentropy": 2.541813850402832, "epoch": 0.4467766932216754, "grad_norm": 0.03136912360787392, "grad_norm_var": 3.7496301759455674e-06, "learning_rate": 0.0030022111804809625, "loss": 2.5418, "step": 8216 }, { "crossentropy": 2.4761048555374146, "epoch": 0.4468310720791756, "grad_norm": 0.032084014266729355, "grad_norm_var": 3.642506598203893e-06, "learning_rate": 0.0030010697746747153, "loss": 2.4761, "step": 8217 }, { "crossentropy": 2.5374046564102173, "epoch": 0.4468854509366758, "grad_norm": 0.033785294741392136, "grad_norm_var": 3.5304585264202156e-06, "learning_rate": 0.002999928492840257, "loss": 2.5374, "step": 8218 }, { "crossentropy": 2.619412064552307, "epoch": 0.446939829794176, "grad_norm": 0.03704190254211426, "grad_norm_var": 4.516769591978292e-06, "learning_rate": 0.0029987873350483685, "loss": 2.6194, "step": 8219 }, { "crossentropy": 2.521608352661133, "epoch": 0.4469942086516762, "grad_norm": 0.03236750140786171, "grad_norm_var": 3.154500687578959e-06, "learning_rate": 0.002997646301369823, "loss": 2.5216, "step": 8220 }, { "crossentropy": 2.628045439720154, "epoch": 0.4470485875091764, "grad_norm": 0.032016005367040634, "grad_norm_var": 3.2020497695774143e-06, "learning_rate": 0.002996505391875388, "loss": 2.628, "step": 8221 }, { "crossentropy": 2.594700574874878, "epoch": 0.4471029663666766, "grad_norm": 0.03345298767089844, "grad_norm_var": 3.080289816043136e-06, "learning_rate": 0.002995364606635821, "loss": 2.5947, "step": 8222 }, { "crossentropy": 2.5946918725967407, "epoch": 0.4471573452241768, "grad_norm": 0.03271022066473961, "grad_norm_var": 2.8162461085412886e-06, "learning_rate": 0.0029942239457218718, "loss": 2.5947, "step": 8223 }, { "crossentropy": 2.5162367820739746, "epoch": 0.44721172408167703, "grad_norm": 0.03428471460938454, "grad_norm_var": 2.830127567644315e-06, "learning_rate": 0.0029930834092042847, "loss": 2.5162, "step": 8224 }, { "crossentropy": 2.5586408376693726, "epoch": 0.44726610293917723, "grad_norm": 0.033956095576286316, "grad_norm_var": 2.8418264348128996e-06, "learning_rate": 0.002991942997153791, "loss": 2.5586, "step": 8225 }, { "crossentropy": 2.553961157798767, "epoch": 0.44732048179667744, "grad_norm": 0.032450221478939056, "grad_norm_var": 2.579208557049184e-06, "learning_rate": 0.002990802709641124, "loss": 2.554, "step": 8226 }, { "crossentropy": 2.5441036224365234, "epoch": 0.44737486065417764, "grad_norm": 0.03354104235768318, "grad_norm_var": 2.134344822518619e-06, "learning_rate": 0.0029896625467369965, "loss": 2.5441, "step": 8227 }, { "crossentropy": 2.7211241722106934, "epoch": 0.44742923951167785, "grad_norm": 0.03626096993684769, "grad_norm_var": 2.6737741053448408e-06, "learning_rate": 0.0029885225085121243, "loss": 2.7211, "step": 8228 }, { "crossentropy": 2.601128101348877, "epoch": 0.44748361836917805, "grad_norm": 0.03335556015372276, "grad_norm_var": 2.673937210643379e-06, "learning_rate": 0.002987382595037212, "loss": 2.6011, "step": 8229 }, { "crossentropy": 2.4236247539520264, "epoch": 0.44753799722667825, "grad_norm": 0.03403870388865471, "grad_norm_var": 2.7053783401185043e-06, "learning_rate": 0.002986242806382954, "loss": 2.4236, "step": 8230 }, { "crossentropy": 2.613566517829895, "epoch": 0.44759237608417846, "grad_norm": 0.03220579773187637, "grad_norm_var": 2.3164327409249305e-06, "learning_rate": 0.002985103142620039, "loss": 2.6136, "step": 8231 }, { "crossentropy": 2.6082632541656494, "epoch": 0.44764675494167866, "grad_norm": 0.031191028654575348, "grad_norm_var": 2.3674122586848805e-06, "learning_rate": 0.0029839636038191497, "loss": 2.6083, "step": 8232 }, { "crossentropy": 2.5338741540908813, "epoch": 0.44770113379917886, "grad_norm": 0.033610567450523376, "grad_norm_var": 2.2408522996339905e-06, "learning_rate": 0.002982824190050958, "loss": 2.5339, "step": 8233 }, { "crossentropy": 2.594637155532837, "epoch": 0.44775551265667907, "grad_norm": 0.03273109719157219, "grad_norm_var": 2.2725694141631964e-06, "learning_rate": 0.002981684901386129, "loss": 2.5946, "step": 8234 }, { "crossentropy": 2.625279426574707, "epoch": 0.44780989151417927, "grad_norm": 0.03191838413476944, "grad_norm_var": 1.4600803308025453e-06, "learning_rate": 0.0029805457378953215, "loss": 2.6253, "step": 8235 }, { "crossentropy": 2.6110235452651978, "epoch": 0.4478642703716795, "grad_norm": 0.03145173564553261, "grad_norm_var": 1.6056803635144945e-06, "learning_rate": 0.002979406699649183, "loss": 2.611, "step": 8236 }, { "crossentropy": 2.6376830339431763, "epoch": 0.4479186492291797, "grad_norm": 0.033446647226810455, "grad_norm_var": 1.5318922354377756e-06, "learning_rate": 0.002978267786718358, "loss": 2.6377, "step": 8237 }, { "crossentropy": 2.557761073112488, "epoch": 0.4479730280866799, "grad_norm": 0.033236462622880936, "grad_norm_var": 1.5264464671815859e-06, "learning_rate": 0.0029771289991734817, "loss": 2.5578, "step": 8238 }, { "crossentropy": 2.48984158039093, "epoch": 0.4480274069441801, "grad_norm": 0.03164394572377205, "grad_norm_var": 1.659933255959469e-06, "learning_rate": 0.002975990337085177, "loss": 2.4898, "step": 8239 }, { "crossentropy": 2.555256485939026, "epoch": 0.4480817858016803, "grad_norm": 0.03196018561720848, "grad_norm_var": 1.6250945341106382e-06, "learning_rate": 0.0029748518005240665, "loss": 2.5553, "step": 8240 }, { "crossentropy": 2.645003080368042, "epoch": 0.4481361646591805, "grad_norm": 0.03332635015249252, "grad_norm_var": 1.5643451281741513e-06, "learning_rate": 0.002973713389560758, "loss": 2.645, "step": 8241 }, { "crossentropy": 2.588639974594116, "epoch": 0.4481905435166807, "grad_norm": 0.032470736652612686, "grad_norm_var": 1.563146479146143e-06, "learning_rate": 0.0029725751042658545, "loss": 2.5886, "step": 8242 }, { "crossentropy": 2.6010931730270386, "epoch": 0.4482449223741809, "grad_norm": 0.033048540353775024, "grad_norm_var": 1.53616681900041e-06, "learning_rate": 0.0029714369447099553, "loss": 2.6011, "step": 8243 }, { "crossentropy": 2.4097580909729004, "epoch": 0.4482993012316811, "grad_norm": 0.032535940408706665, "grad_norm_var": 7.184891760064297e-07, "learning_rate": 0.0029702989109636445, "loss": 2.4098, "step": 8244 }, { "crossentropy": 2.5565192699432373, "epoch": 0.4483536800891813, "grad_norm": 0.03178742527961731, "grad_norm_var": 7.216742239713261e-07, "learning_rate": 0.0029691610030975024, "loss": 2.5565, "step": 8245 }, { "crossentropy": 2.569956660270691, "epoch": 0.4484080589466815, "grad_norm": 0.03345032408833504, "grad_norm_var": 6.255581424460232e-07, "learning_rate": 0.0029680232211821036, "loss": 2.57, "step": 8246 }, { "crossentropy": 2.5059863328933716, "epoch": 0.4484624378041817, "grad_norm": 0.03164973482489586, "grad_norm_var": 6.667664678732903e-07, "learning_rate": 0.0029668855652880076, "loss": 2.506, "step": 8247 }, { "crossentropy": 2.5483193397521973, "epoch": 0.4485168166616819, "grad_norm": 0.03234374150633812, "grad_norm_var": 5.53826533232898e-07, "learning_rate": 0.0029657480354857747, "loss": 2.5483, "step": 8248 }, { "crossentropy": 2.6274118423461914, "epoch": 0.4485711955191821, "grad_norm": 0.031847115606069565, "grad_norm_var": 4.960533306135977e-07, "learning_rate": 0.002964610631845952, "loss": 2.6274, "step": 8249 }, { "crossentropy": 2.6199235916137695, "epoch": 0.4486255743766823, "grad_norm": 0.03215329721570015, "grad_norm_var": 4.935702919826405e-07, "learning_rate": 0.00296347335443908, "loss": 2.6199, "step": 8250 }, { "crossentropy": 2.6276966333389282, "epoch": 0.4486799532341825, "grad_norm": 0.03293972089886665, "grad_norm_var": 4.942818643493529e-07, "learning_rate": 0.0029623362033356928, "loss": 2.6277, "step": 8251 }, { "crossentropy": 2.586115598678589, "epoch": 0.44873433209168273, "grad_norm": 0.03095175325870514, "grad_norm_var": 5.768372952720535e-07, "learning_rate": 0.002961199178606315, "loss": 2.5861, "step": 8252 }, { "crossentropy": 2.6592193841934204, "epoch": 0.44878871094918293, "grad_norm": 0.03349633514881134, "grad_norm_var": 5.837634162925154e-07, "learning_rate": 0.0029600622803214603, "loss": 2.6592, "step": 8253 }, { "crossentropy": 2.565217137336731, "epoch": 0.44884308980668314, "grad_norm": 0.03574280068278313, "grad_norm_var": 1.2466757988251197e-06, "learning_rate": 0.002958925508551644, "loss": 2.5652, "step": 8254 }, { "crossentropy": 2.640221357345581, "epoch": 0.44889746866418334, "grad_norm": 0.03155028447508812, "grad_norm_var": 1.2589667109828376e-06, "learning_rate": 0.0029577888633673644, "loss": 2.6402, "step": 8255 }, { "crossentropy": 2.636228322982788, "epoch": 0.44895184752168354, "grad_norm": 0.03211523965001106, "grad_norm_var": 1.247688583207405e-06, "learning_rate": 0.002956652344839115, "loss": 2.6362, "step": 8256 }, { "crossentropy": 2.6035544872283936, "epoch": 0.44900622637918375, "grad_norm": 0.03140977770090103, "grad_norm_var": 1.28860790255698e-06, "learning_rate": 0.002955515953037383, "loss": 2.6036, "step": 8257 }, { "crossentropy": 2.697839856147766, "epoch": 0.44906060523668395, "grad_norm": 0.03236522525548935, "grad_norm_var": 1.289269385851682e-06, "learning_rate": 0.0029543796880326448, "loss": 2.6978, "step": 8258 }, { "crossentropy": 2.5093268156051636, "epoch": 0.44911498409418416, "grad_norm": 0.031104732304811478, "grad_norm_var": 1.3733256564822198e-06, "learning_rate": 0.0029532435498953696, "loss": 2.5093, "step": 8259 }, { "crossentropy": 2.7371933460235596, "epoch": 0.44916936295168436, "grad_norm": 0.03495684266090393, "grad_norm_var": 1.8028010821347086e-06, "learning_rate": 0.002952107538696024, "loss": 2.7372, "step": 8260 }, { "crossentropy": 2.5702754259109497, "epoch": 0.44922374180918456, "grad_norm": 0.035115160048007965, "grad_norm_var": 2.182508505291919e-06, "learning_rate": 0.002950971654505058, "loss": 2.5703, "step": 8261 }, { "crossentropy": 2.5905165672302246, "epoch": 0.44927812066668477, "grad_norm": 0.031766969710588455, "grad_norm_var": 2.191094422668988e-06, "learning_rate": 0.0029498358973929196, "loss": 2.5905, "step": 8262 }, { "crossentropy": 2.61608350276947, "epoch": 0.44933249952418497, "grad_norm": 0.03397384285926819, "grad_norm_var": 2.235985276427023e-06, "learning_rate": 0.002948700267430049, "loss": 2.6161, "step": 8263 }, { "crossentropy": 2.525704503059387, "epoch": 0.4493868783816852, "grad_norm": 0.03164307773113251, "grad_norm_var": 2.303645768364504e-06, "learning_rate": 0.002947564764686873, "loss": 2.5257, "step": 8264 }, { "crossentropy": 2.602947235107422, "epoch": 0.44944125723918543, "grad_norm": 0.03389127552509308, "grad_norm_var": 2.3335053894813287e-06, "learning_rate": 0.0029464293892338183, "loss": 2.6029, "step": 8265 }, { "crossentropy": 2.6105782985687256, "epoch": 0.44949563609668564, "grad_norm": 0.03491804748773575, "grad_norm_var": 2.5641788117885625e-06, "learning_rate": 0.0029452941411412994, "loss": 2.6106, "step": 8266 }, { "crossentropy": 2.5565608739852905, "epoch": 0.44955001495418584, "grad_norm": 0.033419881016016006, "grad_norm_var": 2.5749650080691505e-06, "learning_rate": 0.002944159020479721, "loss": 2.5566, "step": 8267 }, { "crossentropy": 2.689401149749756, "epoch": 0.44960439381168604, "grad_norm": 0.03254466876387596, "grad_norm_var": 2.2929349760252826e-06, "learning_rate": 0.002943024027319484, "loss": 2.6894, "step": 8268 }, { "crossentropy": 2.5285547971725464, "epoch": 0.44965877266918625, "grad_norm": 0.03368433564901352, "grad_norm_var": 2.3044299612847683e-06, "learning_rate": 0.0029418891617309785, "loss": 2.5286, "step": 8269 }, { "crossentropy": 2.580980062484741, "epoch": 0.44971315152668645, "grad_norm": 0.032822735607624054, "grad_norm_var": 1.8230533076144435e-06, "learning_rate": 0.0029407544237845876, "loss": 2.581, "step": 8270 }, { "crossentropy": 2.642251491546631, "epoch": 0.44976753038418665, "grad_norm": 0.033879783004522324, "grad_norm_var": 1.7258684561980338e-06, "learning_rate": 0.002939619813550689, "loss": 2.6423, "step": 8271 }, { "crossentropy": 2.469171404838562, "epoch": 0.44982190924168686, "grad_norm": 0.03154442086815834, "grad_norm_var": 1.821237535337463e-06, "learning_rate": 0.0029384853310996484, "loss": 2.4692, "step": 8272 }, { "crossentropy": 2.5381321907043457, "epoch": 0.44987628809918706, "grad_norm": 0.03270822390913963, "grad_norm_var": 1.6400395139517498e-06, "learning_rate": 0.002937350976501825, "loss": 2.5381, "step": 8273 }, { "crossentropy": 2.504091739654541, "epoch": 0.44993066695668726, "grad_norm": 0.03228595852851868, "grad_norm_var": 1.6486862712036232e-06, "learning_rate": 0.0029362167498275726, "loss": 2.5041, "step": 8274 }, { "crossentropy": 2.612615466117859, "epoch": 0.44998504581418747, "grad_norm": 0.03340568020939827, "grad_norm_var": 1.3547952167342405e-06, "learning_rate": 0.0029350826511472306, "loss": 2.6126, "step": 8275 }, { "crossentropy": 2.59557569026947, "epoch": 0.45003942467168767, "grad_norm": 0.03183187171816826, "grad_norm_var": 1.2685643277906185e-06, "learning_rate": 0.002933948680531139, "loss": 2.5956, "step": 8276 }, { "crossentropy": 2.6032615900039673, "epoch": 0.4500938035291879, "grad_norm": 0.03211076930165291, "grad_norm_var": 1.0213606152567033e-06, "learning_rate": 0.002932814838049625, "loss": 2.6033, "step": 8277 }, { "crossentropy": 2.6566847562789917, "epoch": 0.4501481823866881, "grad_norm": 0.03436537832021713, "grad_norm_var": 1.050117161152664e-06, "learning_rate": 0.002931681123773007, "loss": 2.6567, "step": 8278 }, { "crossentropy": 2.5738309621810913, "epoch": 0.4502025612441883, "grad_norm": 0.033591464161872864, "grad_norm_var": 1.0128871938038426e-06, "learning_rate": 0.002930547537771597, "loss": 2.5738, "step": 8279 }, { "crossentropy": 2.6893025636672974, "epoch": 0.4502569401016885, "grad_norm": 0.03276769071817398, "grad_norm_var": 8.8239712401512e-07, "learning_rate": 0.0029294140801157, "loss": 2.6893, "step": 8280 }, { "crossentropy": 2.5308923721313477, "epoch": 0.4503113189591887, "grad_norm": 0.03268619254231453, "grad_norm_var": 8.477499715571375e-07, "learning_rate": 0.002928280750875609, "loss": 2.5309, "step": 8281 }, { "crossentropy": 2.5758506059646606, "epoch": 0.4503656978166889, "grad_norm": 0.03276301920413971, "grad_norm_var": 5.970672785579958e-07, "learning_rate": 0.0029271475501216173, "loss": 2.5759, "step": 8282 }, { "crossentropy": 2.677563190460205, "epoch": 0.4504200766741891, "grad_norm": 0.033013828098773956, "grad_norm_var": 5.792665079364101e-07, "learning_rate": 0.0029260144779240005, "loss": 2.6776, "step": 8283 }, { "crossentropy": 2.4614484310150146, "epoch": 0.4504744555316893, "grad_norm": 0.032137271016836166, "grad_norm_var": 6.076037490640913e-07, "learning_rate": 0.002924881534353032, "loss": 2.4614, "step": 8284 }, { "crossentropy": 2.6677825450897217, "epoch": 0.4505288343891895, "grad_norm": 0.03332767263054848, "grad_norm_var": 5.758732982663952e-07, "learning_rate": 0.0029237487194789776, "loss": 2.6678, "step": 8285 }, { "crossentropy": 2.538966417312622, "epoch": 0.4505832132466897, "grad_norm": 0.03160238638520241, "grad_norm_var": 6.697467227229081e-07, "learning_rate": 0.0029226160333720897, "loss": 2.539, "step": 8286 }, { "crossentropy": 2.6133419275283813, "epoch": 0.4506375921041899, "grad_norm": 0.03330075740814209, "grad_norm_var": 6.03582309911752e-07, "learning_rate": 0.00292148347610262, "loss": 2.6133, "step": 8287 }, { "crossentropy": 2.6580599546432495, "epoch": 0.4506919709616901, "grad_norm": 0.032674554735422134, "grad_norm_var": 5.069949751213346e-07, "learning_rate": 0.002920351047740808, "loss": 2.6581, "step": 8288 }, { "crossentropy": 2.615023612976074, "epoch": 0.4507463498191903, "grad_norm": 0.032630834728479385, "grad_norm_var": 5.081697139900157e-07, "learning_rate": 0.0029192187483568833, "loss": 2.615, "step": 8289 }, { "crossentropy": 2.5737093687057495, "epoch": 0.4508007286766905, "grad_norm": 0.0318843275308609, "grad_norm_var": 5.447590533913873e-07, "learning_rate": 0.0029180865780210723, "loss": 2.5737, "step": 8290 }, { "crossentropy": 2.4695605039596558, "epoch": 0.4508551075341907, "grad_norm": 0.03104204498231411, "grad_norm_var": 6.891393389942772e-07, "learning_rate": 0.002916954536803593, "loss": 2.4696, "step": 8291 }, { "crossentropy": 2.4525188207626343, "epoch": 0.45090948639169093, "grad_norm": 0.03215736895799637, "grad_norm_var": 6.620718383662603e-07, "learning_rate": 0.00291582262477465, "loss": 2.4525, "step": 8292 }, { "crossentropy": 2.6068620681762695, "epoch": 0.45096386524919113, "grad_norm": 0.032760847359895706, "grad_norm_var": 6.436114284058634e-07, "learning_rate": 0.002914690842004444, "loss": 2.6069, "step": 8293 }, { "crossentropy": 2.5171467065811157, "epoch": 0.45101824410669134, "grad_norm": 0.03186878189444542, "grad_norm_var": 4.685180234717852e-07, "learning_rate": 0.0029135591885631686, "loss": 2.5171, "step": 8294 }, { "crossentropy": 2.574385166168213, "epoch": 0.45107262296419154, "grad_norm": 0.03329470008611679, "grad_norm_var": 4.3135165251211434e-07, "learning_rate": 0.0029124276645210068, "loss": 2.5744, "step": 8295 }, { "crossentropy": 2.617701768875122, "epoch": 0.45112700182169174, "grad_norm": 0.033429104834795, "grad_norm_var": 4.827842013027516e-07, "learning_rate": 0.002911296269948135, "loss": 2.6177, "step": 8296 }, { "crossentropy": 2.6056004762649536, "epoch": 0.45118138067919195, "grad_norm": 0.03512869030237198, "grad_norm_var": 9.04606037110594e-07, "learning_rate": 0.002910165004914721, "loss": 2.6056, "step": 8297 }, { "crossentropy": 2.6256308555603027, "epoch": 0.45123575953669215, "grad_norm": 0.0316920168697834, "grad_norm_var": 9.656567437268572e-07, "learning_rate": 0.002909033869490925, "loss": 2.6256, "step": 8298 }, { "crossentropy": 2.604103207588196, "epoch": 0.45129013839419235, "grad_norm": 0.03139915317296982, "grad_norm_var": 1.0441568588607687e-06, "learning_rate": 0.0029079028637469012, "loss": 2.6041, "step": 8299 }, { "crossentropy": 2.583150267601013, "epoch": 0.45134451725169256, "grad_norm": 0.03151155635714531, "grad_norm_var": 1.1006121547412942e-06, "learning_rate": 0.002906771987752788, "loss": 2.5832, "step": 8300 }, { "crossentropy": 2.6646299362182617, "epoch": 0.45139889610919276, "grad_norm": 0.03267065808176994, "grad_norm_var": 1.053469418641377e-06, "learning_rate": 0.0029056412415787254, "loss": 2.6646, "step": 8301 }, { "crossentropy": 2.5965906381607056, "epoch": 0.45145327496669296, "grad_norm": 0.032917775213718414, "grad_norm_var": 1.0146195706100715e-06, "learning_rate": 0.0029045106252948427, "loss": 2.5966, "step": 8302 }, { "crossentropy": 2.595668315887451, "epoch": 0.45150765382419317, "grad_norm": 0.032748911529779434, "grad_norm_var": 9.764037748032634e-07, "learning_rate": 0.0029033801389712556, "loss": 2.5957, "step": 8303 }, { "crossentropy": 2.6497602462768555, "epoch": 0.45156203268169337, "grad_norm": 0.032727763056755066, "grad_norm_var": 9.77902747012121e-07, "learning_rate": 0.002902249782678076, "loss": 2.6498, "step": 8304 }, { "crossentropy": 2.5881941318511963, "epoch": 0.4516164115391936, "grad_norm": 0.03526535630226135, "grad_norm_var": 1.460629043923536e-06, "learning_rate": 0.002901119556485413, "loss": 2.5882, "step": 8305 }, { "crossentropy": 2.6109588146209717, "epoch": 0.4516707903966938, "grad_norm": 0.033136773854494095, "grad_norm_var": 1.429772225223696e-06, "learning_rate": 0.0028999894604633558, "loss": 2.611, "step": 8306 }, { "crossentropy": 2.6490955352783203, "epoch": 0.451725169254194, "grad_norm": 0.0319036990404129, "grad_norm_var": 1.281737345958563e-06, "learning_rate": 0.0028988594946819947, "loss": 2.6491, "step": 8307 }, { "crossentropy": 2.5187326669692993, "epoch": 0.4517795481116942, "grad_norm": 0.03257329761981964, "grad_norm_var": 1.2575587590655153e-06, "learning_rate": 0.002897729659211409, "loss": 2.5187, "step": 8308 }, { "crossentropy": 2.5395030975341797, "epoch": 0.4518339269691944, "grad_norm": 0.0326862595975399, "grad_norm_var": 1.2584382333387841e-06, "learning_rate": 0.0028965999541216703, "loss": 2.5395, "step": 8309 }, { "crossentropy": 2.583140730857849, "epoch": 0.4518883058266946, "grad_norm": 0.031698331236839294, "grad_norm_var": 1.281637089751928e-06, "learning_rate": 0.002895470379482843, "loss": 2.5831, "step": 8310 }, { "crossentropy": 2.526582717895508, "epoch": 0.4519426846841948, "grad_norm": 0.032809123396873474, "grad_norm_var": 1.2642804380841657e-06, "learning_rate": 0.0028943409353649765, "loss": 2.5266, "step": 8311 }, { "crossentropy": 2.458467125892639, "epoch": 0.451997063541695, "grad_norm": 0.032946836203336716, "grad_norm_var": 1.23634827426568e-06, "learning_rate": 0.0028932116218381245, "loss": 2.4585, "step": 8312 }, { "crossentropy": 2.5740185976028442, "epoch": 0.4520514423991952, "grad_norm": 0.033429063856601715, "grad_norm_var": 8.752393416484756e-07, "learning_rate": 0.002892082438972325, "loss": 2.574, "step": 8313 }, { "crossentropy": 2.6721001863479614, "epoch": 0.4521058212566954, "grad_norm": 0.03289368748664856, "grad_norm_var": 8.14837609957227e-07, "learning_rate": 0.0028909533868376057, "loss": 2.6721, "step": 8314 }, { "crossentropy": 2.507233142852783, "epoch": 0.4521602001141956, "grad_norm": 0.0346369668841362, "grad_norm_var": 9.052753481740099e-07, "learning_rate": 0.002889824465503989, "loss": 2.5072, "step": 8315 }, { "crossentropy": 2.633960485458374, "epoch": 0.4522145789716958, "grad_norm": 0.03381650894880295, "grad_norm_var": 8.07621929458692e-07, "learning_rate": 0.002888695675041496, "loss": 2.634, "step": 8316 }, { "crossentropy": 2.594136953353882, "epoch": 0.452268957829196, "grad_norm": 0.03250603750348091, "grad_norm_var": 8.177257058575652e-07, "learning_rate": 0.002887567015520126, "loss": 2.5941, "step": 8317 }, { "crossentropy": 2.5151480436325073, "epoch": 0.4523233366866962, "grad_norm": 0.033367518335580826, "grad_norm_var": 8.228268613815389e-07, "learning_rate": 0.0028864384870098815, "loss": 2.5151, "step": 8318 }, { "crossentropy": 2.507817506790161, "epoch": 0.4523777155441964, "grad_norm": 0.03379892185330391, "grad_norm_var": 8.465529563359923e-07, "learning_rate": 0.0028853100895807507, "loss": 2.5078, "step": 8319 }, { "crossentropy": 2.5626198053359985, "epoch": 0.4524320944016966, "grad_norm": 0.03427630290389061, "grad_norm_var": 9.118769817369083e-07, "learning_rate": 0.0028841818233027172, "loss": 2.5626, "step": 8320 }, { "crossentropy": 2.534256935119629, "epoch": 0.45248647325919683, "grad_norm": 0.03377804905176163, "grad_norm_var": 6.473071897734768e-07, "learning_rate": 0.002883053688245756, "loss": 2.5343, "step": 8321 }, { "crossentropy": 2.6865556240081787, "epoch": 0.45254085211669703, "grad_norm": 0.03284534811973572, "grad_norm_var": 6.527828100335217e-07, "learning_rate": 0.0028819256844798274, "loss": 2.6866, "step": 8322 }, { "crossentropy": 2.5036942958831787, "epoch": 0.45259523097419724, "grad_norm": 0.033045634627342224, "grad_norm_var": 5.486549330681786e-07, "learning_rate": 0.0028807978120748957, "loss": 2.5037, "step": 8323 }, { "crossentropy": 2.6142226457595825, "epoch": 0.45264960983169744, "grad_norm": 0.0344928577542305, "grad_norm_var": 6.200237837481378e-07, "learning_rate": 0.0028796700711009094, "loss": 2.6142, "step": 8324 }, { "crossentropy": 2.5526739358901978, "epoch": 0.45270398868919765, "grad_norm": 0.0350470207631588, "grad_norm_var": 7.706878474361227e-07, "learning_rate": 0.0028785424616278067, "loss": 2.5527, "step": 8325 }, { "crossentropy": 2.623267889022827, "epoch": 0.45275836754669785, "grad_norm": 0.033048246055841446, "grad_norm_var": 5.671820239305213e-07, "learning_rate": 0.0028774149837255203, "loss": 2.6233, "step": 8326 }, { "crossentropy": 2.514968752861023, "epoch": 0.45281274640419805, "grad_norm": 0.03289789333939552, "grad_norm_var": 5.589512934824532e-07, "learning_rate": 0.002876287637463982, "loss": 2.515, "step": 8327 }, { "crossentropy": 2.4855494499206543, "epoch": 0.45286712526169826, "grad_norm": 0.031110616400837898, "grad_norm_var": 9.177664381956972e-07, "learning_rate": 0.002875160422913103, "loss": 2.4855, "step": 8328 }, { "crossentropy": 2.6468183994293213, "epoch": 0.45292150411919846, "grad_norm": 0.034125201404094696, "grad_norm_var": 9.473254793449509e-07, "learning_rate": 0.002874033340142793, "loss": 2.6468, "step": 8329 }, { "crossentropy": 2.587341070175171, "epoch": 0.45297588297669866, "grad_norm": 0.031777314841747284, "grad_norm_var": 1.1125542729621078e-06, "learning_rate": 0.0028729063892229535, "loss": 2.5873, "step": 8330 }, { "crossentropy": 2.6337099075317383, "epoch": 0.45303026183419887, "grad_norm": 0.034111734479665756, "grad_norm_var": 1.0439160780579497e-06, "learning_rate": 0.0028717795702234766, "loss": 2.6337, "step": 8331 }, { "crossentropy": 2.616352677345276, "epoch": 0.45308464069169907, "grad_norm": 0.03255723416805267, "grad_norm_var": 1.069370471419454e-06, "learning_rate": 0.0028706528832142465, "loss": 2.6164, "step": 8332 }, { "crossentropy": 2.6269123554229736, "epoch": 0.4531390195491993, "grad_norm": 0.03301002457737923, "grad_norm_var": 1.031951831453751e-06, "learning_rate": 0.0028695263282651397, "loss": 2.6269, "step": 8333 }, { "crossentropy": 2.5310977697372437, "epoch": 0.4531933984066995, "grad_norm": 0.03063547983765602, "grad_norm_var": 1.4850129253358153e-06, "learning_rate": 0.0028683999054460235, "loss": 2.5311, "step": 8334 }, { "crossentropy": 2.721462845802307, "epoch": 0.4532477772641997, "grad_norm": 0.034903138875961304, "grad_norm_var": 1.6553061684769585e-06, "learning_rate": 0.00286727361482676, "loss": 2.7215, "step": 8335 }, { "crossentropy": 2.5464388132095337, "epoch": 0.4533021561216999, "grad_norm": 0.03106723353266716, "grad_norm_var": 1.850772483184002e-06, "learning_rate": 0.002866147456477196, "loss": 2.5464, "step": 8336 }, { "crossentropy": 2.5976392030715942, "epoch": 0.4533565349792001, "grad_norm": 0.03215951845049858, "grad_norm_var": 1.8527042333218626e-06, "learning_rate": 0.0028650214304671764, "loss": 2.5976, "step": 8337 }, { "crossentropy": 2.67695152759552, "epoch": 0.4534109138367003, "grad_norm": 0.031797684729099274, "grad_norm_var": 1.932731771462715e-06, "learning_rate": 0.002863895536866541, "loss": 2.677, "step": 8338 }, { "crossentropy": 2.5191497802734375, "epoch": 0.4534652926942005, "grad_norm": 0.03247102349996567, "grad_norm_var": 1.9392740220420715e-06, "learning_rate": 0.00286276977574511, "loss": 2.5191, "step": 8339 }, { "crossentropy": 2.6226311922073364, "epoch": 0.4535196715517007, "grad_norm": 0.032855041325092316, "grad_norm_var": 1.7428741984655173e-06, "learning_rate": 0.0028616441471727063, "loss": 2.6226, "step": 8340 }, { "crossentropy": 2.5861207246780396, "epoch": 0.4535740504092009, "grad_norm": 0.03221860155463219, "grad_norm_var": 1.3665817343967337e-06, "learning_rate": 0.002860518651219138, "loss": 2.5861, "step": 8341 }, { "crossentropy": 2.6105822324752808, "epoch": 0.4536284292667011, "grad_norm": 0.03580259904265404, "grad_norm_var": 2.0249546650706e-06, "learning_rate": 0.0028593932879542082, "loss": 2.6106, "step": 8342 }, { "crossentropy": 2.5363118648529053, "epoch": 0.4536828081242013, "grad_norm": 0.031475648283958435, "grad_norm_var": 2.1174110773368304e-06, "learning_rate": 0.0028582680574477117, "loss": 2.5363, "step": 8343 }, { "crossentropy": 2.6239954233169556, "epoch": 0.4537371869817015, "grad_norm": 0.03183082863688469, "grad_norm_var": 2.003937793493051e-06, "learning_rate": 0.002857142959769433, "loss": 2.624, "step": 8344 }, { "crossentropy": 2.5573781728744507, "epoch": 0.4537915658392017, "grad_norm": 0.03191046044230461, "grad_norm_var": 1.8822311701091956e-06, "learning_rate": 0.0028560179949891507, "loss": 2.5574, "step": 8345 }, { "crossentropy": 2.6919567584991455, "epoch": 0.4538459446967019, "grad_norm": 0.03338006138801575, "grad_norm_var": 1.880549225001006e-06, "learning_rate": 0.0028548931631766326, "loss": 2.692, "step": 8346 }, { "crossentropy": 2.6417908668518066, "epoch": 0.4539003235542021, "grad_norm": 0.03240855783224106, "grad_norm_var": 1.7268713974782912e-06, "learning_rate": 0.002853768464401644, "loss": 2.6418, "step": 8347 }, { "crossentropy": 2.599171757698059, "epoch": 0.4539547024117023, "grad_norm": 0.031416188925504684, "grad_norm_var": 1.8041318447717738e-06, "learning_rate": 0.002852643898733929, "loss": 2.5992, "step": 8348 }, { "crossentropy": 2.632535696029663, "epoch": 0.45400908126920253, "grad_norm": 0.03183850273489952, "grad_norm_var": 1.803820527633039e-06, "learning_rate": 0.0028515194662432418, "loss": 2.6325, "step": 8349 }, { "crossentropy": 2.5784389972686768, "epoch": 0.45406346012670273, "grad_norm": 0.03501637279987335, "grad_norm_var": 1.9810205482464114e-06, "learning_rate": 0.0028503951669993127, "loss": 2.5784, "step": 8350 }, { "crossentropy": 2.5008801221847534, "epoch": 0.45411783898420294, "grad_norm": 0.031709033995866776, "grad_norm_var": 1.6631279691351267e-06, "learning_rate": 0.0028492710010718713, "loss": 2.5009, "step": 8351 }, { "crossentropy": 2.649869203567505, "epoch": 0.45417221784170314, "grad_norm": 0.03207313269376755, "grad_norm_var": 1.5395920099349943e-06, "learning_rate": 0.002848146968530637, "loss": 2.6499, "step": 8352 }, { "crossentropy": 2.5222039222717285, "epoch": 0.45422659669920334, "grad_norm": 0.03194426745176315, "grad_norm_var": 1.5529112818490319e-06, "learning_rate": 0.002847023069445321, "loss": 2.5222, "step": 8353 }, { "crossentropy": 2.551866292953491, "epoch": 0.45428097555670355, "grad_norm": 0.03199916332960129, "grad_norm_var": 1.536333017883249e-06, "learning_rate": 0.0028458993038856283, "loss": 2.5519, "step": 8354 }, { "crossentropy": 2.501519560813904, "epoch": 0.45433535441420375, "grad_norm": 0.032145749777555466, "grad_norm_var": 1.5451497270853783e-06, "learning_rate": 0.0028447756719212516, "loss": 2.5015, "step": 8355 }, { "crossentropy": 2.5657953023910522, "epoch": 0.45438973327170395, "grad_norm": 0.03347694128751755, "grad_norm_var": 1.5986367552576577e-06, "learning_rate": 0.0028436521736218785, "loss": 2.5658, "step": 8356 }, { "crossentropy": 2.630833864212036, "epoch": 0.45444411212920416, "grad_norm": 0.03272576257586479, "grad_norm_var": 1.592953265231826e-06, "learning_rate": 0.0028425288090571875, "loss": 2.6308, "step": 8357 }, { "crossentropy": 2.4415860176086426, "epoch": 0.45449849098670436, "grad_norm": 0.0342281274497509, "grad_norm_var": 1.069706804486074e-06, "learning_rate": 0.00284140557829685, "loss": 2.4416, "step": 8358 }, { "crossentropy": 2.5805559158325195, "epoch": 0.45455286984420457, "grad_norm": 0.03558928146958351, "grad_norm_var": 1.5799283328981594e-06, "learning_rate": 0.0028402824814105243, "loss": 2.5806, "step": 8359 }, { "crossentropy": 2.58212149143219, "epoch": 0.45460724870170477, "grad_norm": 0.03346147760748863, "grad_norm_var": 1.5504501766446775e-06, "learning_rate": 0.0028391595184678636, "loss": 2.5821, "step": 8360 }, { "crossentropy": 2.5462377071380615, "epoch": 0.454661627559205, "grad_norm": 0.031384214758872986, "grad_norm_var": 1.6324680104260377e-06, "learning_rate": 0.002838036689538519, "loss": 2.5462, "step": 8361 }, { "crossentropy": 2.5712753534317017, "epoch": 0.4547160064167052, "grad_norm": 0.03199264779686928, "grad_norm_var": 1.645434035623244e-06, "learning_rate": 0.002836913994692121, "loss": 2.5713, "step": 8362 }, { "crossentropy": 2.6072490215301514, "epoch": 0.4547703852742054, "grad_norm": 0.032199520617723465, "grad_norm_var": 1.6566528474091143e-06, "learning_rate": 0.002835791433998301, "loss": 2.6072, "step": 8363 }, { "crossentropy": 2.5827651023864746, "epoch": 0.4548247641317056, "grad_norm": 0.03195449337363243, "grad_norm_var": 1.5826176874620308e-06, "learning_rate": 0.002834669007526678, "loss": 2.5828, "step": 8364 }, { "crossentropy": 2.567683696746826, "epoch": 0.4548791429892058, "grad_norm": 0.031277406960725784, "grad_norm_var": 1.6692642677359113e-06, "learning_rate": 0.0028335467153468652, "loss": 2.5677, "step": 8365 }, { "crossentropy": 2.5417128801345825, "epoch": 0.454933521846706, "grad_norm": 0.03371039405465126, "grad_norm_var": 1.3722680469881987e-06, "learning_rate": 0.002832424557528467, "loss": 2.5417, "step": 8366 }, { "crossentropy": 2.5643398761749268, "epoch": 0.4549879007042062, "grad_norm": 0.03461204469203949, "grad_norm_var": 1.5475496302587896e-06, "learning_rate": 0.0028313025341410725, "loss": 2.5643, "step": 8367 }, { "crossentropy": 2.606081485748291, "epoch": 0.4550422795617064, "grad_norm": 0.03153528273105621, "grad_norm_var": 1.6176421366307278e-06, "learning_rate": 0.0028301806452542748, "loss": 2.6061, "step": 8368 }, { "crossentropy": 2.662323832511902, "epoch": 0.4550966584192066, "grad_norm": 0.03304114192724228, "grad_norm_var": 1.5728353797426117e-06, "learning_rate": 0.0028290588909376537, "loss": 2.6623, "step": 8369 }, { "crossentropy": 2.556233763694763, "epoch": 0.4551510372767068, "grad_norm": 0.03141957148909569, "grad_norm_var": 1.658296080418673e-06, "learning_rate": 0.002827937271260773, "loss": 2.5562, "step": 8370 }, { "crossentropy": 2.5208016633987427, "epoch": 0.455205416134207, "grad_norm": 0.03136678412556648, "grad_norm_var": 1.7638738685316415e-06, "learning_rate": 0.0028268157862931967, "loss": 2.5208, "step": 8371 }, { "crossentropy": 2.6717220544815063, "epoch": 0.4552597949917072, "grad_norm": 0.035791635513305664, "grad_norm_var": 2.323570295886754e-06, "learning_rate": 0.002825694436104482, "loss": 2.6717, "step": 8372 }, { "crossentropy": 2.5672807693481445, "epoch": 0.4553141738492074, "grad_norm": 0.031291671097278595, "grad_norm_var": 2.484108136986916e-06, "learning_rate": 0.0028245732207641708, "loss": 2.5673, "step": 8373 }, { "crossentropy": 2.6008880138397217, "epoch": 0.4553685527067076, "grad_norm": 0.031609755009412766, "grad_norm_var": 2.415232935944432e-06, "learning_rate": 0.002823452140341799, "loss": 2.6009, "step": 8374 }, { "crossentropy": 2.578020691871643, "epoch": 0.4554229315642078, "grad_norm": 0.031733348965644836, "grad_norm_var": 1.8281130200044233e-06, "learning_rate": 0.0028223311949068966, "loss": 2.578, "step": 8375 }, { "crossentropy": 2.613271713256836, "epoch": 0.455477310421708, "grad_norm": 0.038016803562641144, "grad_norm_var": 3.7704734737992183e-06, "learning_rate": 0.002821210384528984, "loss": 2.6133, "step": 8376 }, { "crossentropy": 2.6076313257217407, "epoch": 0.45553168927920823, "grad_norm": 0.031875208020210266, "grad_norm_var": 3.7004789839654533e-06, "learning_rate": 0.0028200897092775733, "loss": 2.6076, "step": 8377 }, { "crossentropy": 2.675060272216797, "epoch": 0.45558606813670843, "grad_norm": 0.03301604464650154, "grad_norm_var": 3.6674755543512443e-06, "learning_rate": 0.0028189691692221625, "loss": 2.6751, "step": 8378 }, { "crossentropy": 2.543497323989868, "epoch": 0.45564044699420864, "grad_norm": 0.031194845214486122, "grad_norm_var": 3.8080785577902873e-06, "learning_rate": 0.0028178487644322527, "loss": 2.5435, "step": 8379 }, { "crossentropy": 2.723421573638916, "epoch": 0.45569482585170884, "grad_norm": 0.04292354732751846, "grad_norm_var": 1.0215227950801499e-05, "learning_rate": 0.0028167284949773305, "loss": 2.7234, "step": 8380 }, { "crossentropy": 2.6212295293807983, "epoch": 0.45574920470920904, "grad_norm": 0.03525666147470474, "grad_norm_var": 1.0078190214368601e-05, "learning_rate": 0.002815608360926869, "loss": 2.6212, "step": 8381 }, { "crossentropy": 2.5977855920791626, "epoch": 0.45580358356670925, "grad_norm": 0.035189494490623474, "grad_norm_var": 1.0226899197676012e-05, "learning_rate": 0.002814488362350338, "loss": 2.5978, "step": 8382 }, { "crossentropy": 2.6029257774353027, "epoch": 0.45585796242420945, "grad_norm": 0.03459510952234268, "grad_norm_var": 1.0224952801786988e-05, "learning_rate": 0.0028133684993172058, "loss": 2.6029, "step": 8383 }, { "crossentropy": 2.5107990503311157, "epoch": 0.45591234128170965, "grad_norm": 0.03220875561237335, "grad_norm_var": 1.005523014437081e-05, "learning_rate": 0.002812248771896919, "loss": 2.5108, "step": 8384 }, { "crossentropy": 2.5791783332824707, "epoch": 0.45596672013920986, "grad_norm": 0.03214017301797867, "grad_norm_var": 1.0195100867348482e-05, "learning_rate": 0.0028111291801589233, "loss": 2.5792, "step": 8385 }, { "crossentropy": 2.502281427383423, "epoch": 0.45602109899671006, "grad_norm": 0.03191657364368439, "grad_norm_var": 1.005764353557216e-05, "learning_rate": 0.002810009724172656, "loss": 2.5023, "step": 8386 }, { "crossentropy": 2.6182775497436523, "epoch": 0.45607547785421026, "grad_norm": 0.030980601906776428, "grad_norm_var": 1.0190085473726545e-05, "learning_rate": 0.0028088904040075423, "loss": 2.6183, "step": 8387 }, { "crossentropy": 2.566606640815735, "epoch": 0.45612985671171047, "grad_norm": 0.031847964972257614, "grad_norm_var": 1.0080043587166196e-05, "learning_rate": 0.002807771219733004, "loss": 2.5666, "step": 8388 }, { "crossentropy": 2.5949676036834717, "epoch": 0.45618423556921067, "grad_norm": 0.03540435805916786, "grad_norm_var": 9.93319774420542e-06, "learning_rate": 0.0028066521714184512, "loss": 2.595, "step": 8389 }, { "crossentropy": 2.4194525480270386, "epoch": 0.4562386144267109, "grad_norm": 0.043412722647190094, "grad_norm_var": 1.5280836354414567e-05, "learning_rate": 0.002805533259133285, "loss": 2.4195, "step": 8390 }, { "crossentropy": 2.4271703958511353, "epoch": 0.4562929932842111, "grad_norm": 0.030430292710661888, "grad_norm_var": 1.586451381655838e-05, "learning_rate": 0.0028044144829469033, "loss": 2.4272, "step": 8391 }, { "crossentropy": 2.5866862535476685, "epoch": 0.4563473721417113, "grad_norm": 0.033440690487623215, "grad_norm_var": 1.4966876742276749e-05, "learning_rate": 0.0028032958429286855, "loss": 2.5867, "step": 8392 }, { "crossentropy": 2.5817476511001587, "epoch": 0.4564017509992115, "grad_norm": 0.03268140181899071, "grad_norm_var": 1.4766784399915187e-05, "learning_rate": 0.00280217733914801, "loss": 2.5817, "step": 8393 }, { "crossentropy": 2.5434627532958984, "epoch": 0.4564561298567117, "grad_norm": 0.03347228839993477, "grad_norm_var": 1.4709903369354513e-05, "learning_rate": 0.00280105897167425, "loss": 2.5435, "step": 8394 }, { "crossentropy": 2.5741037130355835, "epoch": 0.4565105087142119, "grad_norm": 0.03375612199306488, "grad_norm_var": 1.4095871795487602e-05, "learning_rate": 0.0027999407405767613, "loss": 2.5741, "step": 8395 }, { "crossentropy": 2.633743405342102, "epoch": 0.4565648875717121, "grad_norm": 0.033754751086235046, "grad_norm_var": 8.873170553783102e-06, "learning_rate": 0.0027988226459248968, "loss": 2.6337, "step": 8396 }, { "crossentropy": 2.6399794816970825, "epoch": 0.4566192664292123, "grad_norm": 0.033416748046875, "grad_norm_var": 8.72261548175468e-06, "learning_rate": 0.002797704687787999, "loss": 2.64, "step": 8397 }, { "crossentropy": 2.6130177974700928, "epoch": 0.4566736452867125, "grad_norm": 0.0319984145462513, "grad_norm_var": 8.710628619372481e-06, "learning_rate": 0.002796586866235404, "loss": 2.613, "step": 8398 }, { "crossentropy": 2.7168664932250977, "epoch": 0.4567280241442127, "grad_norm": 0.03536064922809601, "grad_norm_var": 8.862501053945648e-06, "learning_rate": 0.0027954691813364364, "loss": 2.7169, "step": 8399 }, { "crossentropy": 2.611027717590332, "epoch": 0.4567824030017129, "grad_norm": 0.03606768324971199, "grad_norm_var": 9.121677532351518e-06, "learning_rate": 0.0027943516331604156, "loss": 2.611, "step": 8400 }, { "crossentropy": 2.636495351791382, "epoch": 0.4568367818592131, "grad_norm": 0.033714644610881805, "grad_norm_var": 8.937593848659533e-06, "learning_rate": 0.0027932342217766503, "loss": 2.6365, "step": 8401 }, { "crossentropy": 2.6164214611053467, "epoch": 0.4568911607167133, "grad_norm": 0.032118089497089386, "grad_norm_var": 8.888089195075105e-06, "learning_rate": 0.0027921169472544427, "loss": 2.6164, "step": 8402 }, { "crossentropy": 2.4827383756637573, "epoch": 0.4569455395742135, "grad_norm": 0.034363411366939545, "grad_norm_var": 8.301827967997185e-06, "learning_rate": 0.0027909998096630815, "loss": 2.4827, "step": 8403 }, { "crossentropy": 2.548738718032837, "epoch": 0.4569999184317137, "grad_norm": 0.033929917961359024, "grad_norm_var": 7.953826997222668e-06, "learning_rate": 0.002789882809071852, "loss": 2.5487, "step": 8404 }, { "crossentropy": 2.6571770906448364, "epoch": 0.45705429728921393, "grad_norm": 0.033244192600250244, "grad_norm_var": 7.900789493484267e-06, "learning_rate": 0.002788765945550033, "loss": 2.6572, "step": 8405 }, { "crossentropy": 2.5340166091918945, "epoch": 0.45710867614671413, "grad_norm": 0.033256322145462036, "grad_norm_var": 1.699584330428734e-06, "learning_rate": 0.0027876492191668866, "loss": 2.534, "step": 8406 }, { "crossentropy": 2.520366907119751, "epoch": 0.45716305500421434, "grad_norm": 0.031246954575181007, "grad_norm_var": 1.4137800877758356e-06, "learning_rate": 0.0027865326299916734, "loss": 2.5204, "step": 8407 }, { "crossentropy": 2.5750759840011597, "epoch": 0.45721743386171454, "grad_norm": 0.032448943704366684, "grad_norm_var": 1.4816265983417687e-06, "learning_rate": 0.0027854161780936425, "loss": 2.5751, "step": 8408 }, { "crossentropy": 2.6240639686584473, "epoch": 0.45727181271921474, "grad_norm": 0.03160625696182251, "grad_norm_var": 1.6607429734016247e-06, "learning_rate": 0.0027842998635420357, "loss": 2.6241, "step": 8409 }, { "crossentropy": 2.62615966796875, "epoch": 0.45732619157671495, "grad_norm": 0.03291761130094528, "grad_norm_var": 1.6716463250028425e-06, "learning_rate": 0.002783183686406086, "loss": 2.6262, "step": 8410 }, { "crossentropy": 2.6137478351593018, "epoch": 0.45738057043421515, "grad_norm": 0.03400421515107155, "grad_norm_var": 1.6897528618487472e-06, "learning_rate": 0.002782067646755017, "loss": 2.6137, "step": 8411 }, { "crossentropy": 2.567873239517212, "epoch": 0.45743494929171535, "grad_norm": 0.03125286102294922, "grad_norm_var": 1.942797444755848e-06, "learning_rate": 0.0027809517446580447, "loss": 2.5679, "step": 8412 }, { "crossentropy": 2.6930668354034424, "epoch": 0.45748932814921556, "grad_norm": 0.0337507463991642, "grad_norm_var": 1.9601265051103485e-06, "learning_rate": 0.0027798359801843764, "loss": 2.6931, "step": 8413 }, { "crossentropy": 2.570613145828247, "epoch": 0.4575437070067158, "grad_norm": 0.03178372606635094, "grad_norm_var": 1.9975475036168445e-06, "learning_rate": 0.0027787203534032135, "loss": 2.5706, "step": 8414 }, { "crossentropy": 2.5755860805511475, "epoch": 0.457598085864216, "grad_norm": 0.03146158531308174, "grad_norm_var": 1.8201016752357555e-06, "learning_rate": 0.002777604864383738, "loss": 2.5756, "step": 8415 }, { "crossentropy": 2.465865135192871, "epoch": 0.4576524647217162, "grad_norm": 0.03266717493534088, "grad_norm_var": 1.1283261321835217e-06, "learning_rate": 0.002776489513195142, "loss": 2.4659, "step": 8416 }, { "crossentropy": 2.6058379411697388, "epoch": 0.4577068435792164, "grad_norm": 0.03228454664349556, "grad_norm_var": 1.0694308198606286e-06, "learning_rate": 0.002775374299906591, "loss": 2.6058, "step": 8417 }, { "crossentropy": 2.5757142305374146, "epoch": 0.45776122243671663, "grad_norm": 0.0328538678586483, "grad_norm_var": 1.0514730086144993e-06, "learning_rate": 0.0027742592245872523, "loss": 2.5757, "step": 8418 }, { "crossentropy": 2.528563380241394, "epoch": 0.45781560129421683, "grad_norm": 0.03351039066910744, "grad_norm_var": 9.068533535887587e-07, "learning_rate": 0.0027731442873062807, "loss": 2.5286, "step": 8419 }, { "crossentropy": 2.6123745441436768, "epoch": 0.45786998015171704, "grad_norm": 0.03291785344481468, "grad_norm_var": 7.966320199130252e-07, "learning_rate": 0.002772029488132825, "loss": 2.6124, "step": 8420 }, { "crossentropy": 2.575935482978821, "epoch": 0.45792435900921724, "grad_norm": 0.032399434596300125, "grad_norm_var": 7.659099468335182e-07, "learning_rate": 0.002770914827136023, "loss": 2.5759, "step": 8421 }, { "crossentropy": 2.569661855697632, "epoch": 0.45797873786671744, "grad_norm": 0.03296465054154396, "grad_norm_var": 7.426950102637815e-07, "learning_rate": 0.0027698003043850053, "loss": 2.5697, "step": 8422 }, { "crossentropy": 2.6478753089904785, "epoch": 0.45803311672421765, "grad_norm": 0.03184906765818596, "grad_norm_var": 6.644017550636964e-07, "learning_rate": 0.0027686859199488934, "loss": 2.6479, "step": 8423 }, { "crossentropy": 2.5646610260009766, "epoch": 0.45808749558171785, "grad_norm": 0.03269340470433235, "grad_norm_var": 6.651017783093164e-07, "learning_rate": 0.0027675716738968003, "loss": 2.5647, "step": 8424 }, { "crossentropy": 2.5779380798339844, "epoch": 0.45814187443921806, "grad_norm": 0.030847081914544106, "grad_norm_var": 7.973949560780794e-07, "learning_rate": 0.0027664575662978332, "loss": 2.5779, "step": 8425 }, { "crossentropy": 2.5762914419174194, "epoch": 0.45819625329671826, "grad_norm": 0.03226816654205322, "grad_norm_var": 7.884503325331169e-07, "learning_rate": 0.0027653435972210816, "loss": 2.5763, "step": 8426 }, { "crossentropy": 2.5025604963302612, "epoch": 0.45825063215421846, "grad_norm": 0.0329316183924675, "grad_norm_var": 6.40841414460214e-07, "learning_rate": 0.002764229766735638, "loss": 2.5026, "step": 8427 }, { "crossentropy": 2.567525267601013, "epoch": 0.45830501101171867, "grad_norm": 0.03387409821152687, "grad_norm_var": 6.685583454554688e-07, "learning_rate": 0.002763116074910582, "loss": 2.5675, "step": 8428 }, { "crossentropy": 2.5141793489456177, "epoch": 0.45835938986921887, "grad_norm": 0.0328085795044899, "grad_norm_var": 5.75218838604396e-07, "learning_rate": 0.00276200252181498, "loss": 2.5142, "step": 8429 }, { "crossentropy": 2.5939788818359375, "epoch": 0.4584137687267191, "grad_norm": 0.03196106478571892, "grad_norm_var": 5.600776742036567e-07, "learning_rate": 0.0027608891075178955, "loss": 2.594, "step": 8430 }, { "crossentropy": 2.3855923414230347, "epoch": 0.4584681475842193, "grad_norm": 0.03145648539066315, "grad_norm_var": 5.607978457285023e-07, "learning_rate": 0.0027597758320883803, "loss": 2.3856, "step": 8431 }, { "crossentropy": 2.4933987855911255, "epoch": 0.4585225264417195, "grad_norm": 0.030773675069212914, "grad_norm_var": 7.472118949009473e-07, "learning_rate": 0.0027586626955954795, "loss": 2.4934, "step": 8432 }, { "crossentropy": 2.537519335746765, "epoch": 0.4585769052992197, "grad_norm": 0.03247464820742607, "grad_norm_var": 7.465537032435207e-07, "learning_rate": 0.0027575496981082285, "loss": 2.5375, "step": 8433 }, { "crossentropy": 2.4698292016983032, "epoch": 0.4586312841567199, "grad_norm": 0.031190140172839165, "grad_norm_var": 8.214236590376219e-07, "learning_rate": 0.002756436839695655, "loss": 2.4698, "step": 8434 }, { "crossentropy": 2.587138533592224, "epoch": 0.4586856630142201, "grad_norm": 0.031975723803043365, "grad_norm_var": 7.224902161258867e-07, "learning_rate": 0.002755324120426777, "loss": 2.5871, "step": 8435 }, { "crossentropy": 2.4995453357696533, "epoch": 0.4587400418717203, "grad_norm": 0.032035816460847855, "grad_norm_var": 6.880563390411343e-07, "learning_rate": 0.0027542115403706062, "loss": 2.4995, "step": 8436 }, { "crossentropy": 2.5526280403137207, "epoch": 0.4587944207292205, "grad_norm": 0.03441859781742096, "grad_norm_var": 1.008279165423314e-06, "learning_rate": 0.002753099099596138, "loss": 2.5526, "step": 8437 }, { "crossentropy": 2.555483341217041, "epoch": 0.4588487995867207, "grad_norm": 0.03424043953418732, "grad_norm_var": 1.2260138938787462e-06, "learning_rate": 0.002751986798172371, "loss": 2.5555, "step": 8438 }, { "crossentropy": 2.572594165802002, "epoch": 0.4589031784442209, "grad_norm": 0.03135387599468231, "grad_norm_var": 1.2752337297198263e-06, "learning_rate": 0.002750874636168289, "loss": 2.5726, "step": 8439 }, { "crossentropy": 2.6121712923049927, "epoch": 0.4589575573017211, "grad_norm": 0.035582441836595535, "grad_norm_var": 1.936313733393831e-06, "learning_rate": 0.002749762613652863, "loss": 2.6122, "step": 8440 }, { "crossentropy": 2.4671133756637573, "epoch": 0.4590119361592213, "grad_norm": 0.03476862981915474, "grad_norm_var": 2.0269167278041377e-06, "learning_rate": 0.0027486507306950605, "loss": 2.4671, "step": 8441 }, { "crossentropy": 2.5181920528411865, "epoch": 0.4590663150167215, "grad_norm": 0.03157981485128403, "grad_norm_var": 2.101407707287041e-06, "learning_rate": 0.0027475389873638434, "loss": 2.5182, "step": 8442 }, { "crossentropy": 2.6020402908325195, "epoch": 0.4591206938742217, "grad_norm": 0.03151192516088486, "grad_norm_var": 2.1862042662865525e-06, "learning_rate": 0.0027464273837281574, "loss": 2.602, "step": 8443 }, { "crossentropy": 2.537494421005249, "epoch": 0.4591750727317219, "grad_norm": 0.0312489066272974, "grad_norm_var": 2.179845223524328e-06, "learning_rate": 0.002745315919856943, "loss": 2.5375, "step": 8444 }, { "crossentropy": 2.602605104446411, "epoch": 0.4592294515892221, "grad_norm": 0.031499527394771576, "grad_norm_var": 2.226331668827884e-06, "learning_rate": 0.002744204595819134, "loss": 2.6026, "step": 8445 }, { "crossentropy": 2.569359540939331, "epoch": 0.45928383044672233, "grad_norm": 0.03154103830456734, "grad_norm_var": 2.2607909037291774e-06, "learning_rate": 0.002743093411683653, "loss": 2.5694, "step": 8446 }, { "crossentropy": 2.5879346132278442, "epoch": 0.45933820930422253, "grad_norm": 0.03469769284129143, "grad_norm_var": 2.529841797065677e-06, "learning_rate": 0.002741982367519417, "loss": 2.5879, "step": 8447 }, { "crossentropy": 2.6321840286254883, "epoch": 0.45939258816172274, "grad_norm": 0.032819993793964386, "grad_norm_var": 2.3053145506733707e-06, "learning_rate": 0.0027408714633953245, "loss": 2.6322, "step": 8448 }, { "crossentropy": 2.5498135089874268, "epoch": 0.45944696701922294, "grad_norm": 0.032063256949186325, "grad_norm_var": 2.3273592108965607e-06, "learning_rate": 0.0027397606993802793, "loss": 2.5498, "step": 8449 }, { "crossentropy": 2.5727975368499756, "epoch": 0.45950134587672314, "grad_norm": 0.03191297501325607, "grad_norm_var": 2.2185465616627082e-06, "learning_rate": 0.0027386500755431707, "loss": 2.5728, "step": 8450 }, { "crossentropy": 2.629109025001526, "epoch": 0.45955572473422335, "grad_norm": 0.03346976265311241, "grad_norm_var": 2.2131458106960095e-06, "learning_rate": 0.0027375395919528745, "loss": 2.6291, "step": 8451 }, { "crossentropy": 2.6629440784454346, "epoch": 0.45961010359172355, "grad_norm": 0.03232775256037712, "grad_norm_var": 2.18886132294023e-06, "learning_rate": 0.0027364292486782614, "loss": 2.6629, "step": 8452 }, { "crossentropy": 2.588089346885681, "epoch": 0.45966448244922375, "grad_norm": 0.032154012471437454, "grad_norm_var": 2.0251215394791045e-06, "learning_rate": 0.0027353190457882004, "loss": 2.5881, "step": 8453 }, { "crossentropy": 2.5549949407577515, "epoch": 0.45971886130672396, "grad_norm": 0.032098397612571716, "grad_norm_var": 1.864295740909326e-06, "learning_rate": 0.0027342089833515383, "loss": 2.555, "step": 8454 }, { "crossentropy": 2.6162017583847046, "epoch": 0.45977324016422416, "grad_norm": 0.031025933101773262, "grad_norm_var": 1.9228542067839874e-06, "learning_rate": 0.0027330990614371224, "loss": 2.6162, "step": 8455 }, { "crossentropy": 2.5598005056381226, "epoch": 0.45982761902172437, "grad_norm": 0.0392465814948082, "grad_norm_var": 4.258683874659487e-06, "learning_rate": 0.00273198928011379, "loss": 2.5598, "step": 8456 }, { "crossentropy": 2.670503616333008, "epoch": 0.45988199787922457, "grad_norm": 0.03617498278617859, "grad_norm_var": 4.7612150862703815e-06, "learning_rate": 0.0027308796394503675, "loss": 2.6705, "step": 8457 }, { "crossentropy": 2.5878058671951294, "epoch": 0.4599363767367248, "grad_norm": 0.03195227310061455, "grad_norm_var": 4.707512568760906e-06, "learning_rate": 0.0027297701395156758, "loss": 2.5878, "step": 8458 }, { "crossentropy": 2.609142541885376, "epoch": 0.459990755594225, "grad_norm": 0.03336216136813164, "grad_norm_var": 4.589136958086232e-06, "learning_rate": 0.00272866078037852, "loss": 2.6091, "step": 8459 }, { "crossentropy": 2.563493251800537, "epoch": 0.4600451344517252, "grad_norm": 0.03150544315576553, "grad_norm_var": 4.5342195034590485e-06, "learning_rate": 0.0027275515621077075, "loss": 2.5635, "step": 8460 }, { "crossentropy": 2.5518386363983154, "epoch": 0.4600995133092254, "grad_norm": 0.032526273280382156, "grad_norm_var": 4.3959617344605515e-06, "learning_rate": 0.00272644248477203, "loss": 2.5518, "step": 8461 }, { "crossentropy": 2.619627594947815, "epoch": 0.4601538921667256, "grad_norm": 0.03317996859550476, "grad_norm_var": 4.233025561597283e-06, "learning_rate": 0.0027253335484402684, "loss": 2.6196, "step": 8462 }, { "crossentropy": 2.5841197967529297, "epoch": 0.4602082710242258, "grad_norm": 0.0334458202123642, "grad_norm_var": 4.073864771016742e-06, "learning_rate": 0.002724224753181197, "loss": 2.5841, "step": 8463 }, { "crossentropy": 2.614776849746704, "epoch": 0.460262649881726, "grad_norm": 0.03299026936292648, "grad_norm_var": 4.069794304198274e-06, "learning_rate": 0.0027231160990635883, "loss": 2.6148, "step": 8464 }, { "crossentropy": 2.5755438804626465, "epoch": 0.4603170287392262, "grad_norm": 0.03324216976761818, "grad_norm_var": 3.99530758223945e-06, "learning_rate": 0.0027220075861561954, "loss": 2.5755, "step": 8465 }, { "crossentropy": 2.6090691089630127, "epoch": 0.4603714075967264, "grad_norm": 0.03489525988698006, "grad_norm_var": 4.053958196332493e-06, "learning_rate": 0.0027208992145277677, "loss": 2.6091, "step": 8466 }, { "crossentropy": 2.5395835638046265, "epoch": 0.4604257864542266, "grad_norm": 0.033309243619441986, "grad_norm_var": 4.053001438300664e-06, "learning_rate": 0.002719790984247046, "loss": 2.5396, "step": 8467 }, { "crossentropy": 2.4876949787139893, "epoch": 0.4604801653117268, "grad_norm": 0.031589481979608536, "grad_norm_var": 4.186687049124759e-06, "learning_rate": 0.0027186828953827615, "loss": 2.4877, "step": 8468 }, { "crossentropy": 2.4919567108154297, "epoch": 0.460534544169227, "grad_norm": 0.0329444520175457, "grad_norm_var": 4.105628951415899e-06, "learning_rate": 0.0027175749480036373, "loss": 2.492, "step": 8469 }, { "crossentropy": 2.643662452697754, "epoch": 0.4605889230267272, "grad_norm": 0.03626563772559166, "grad_norm_var": 4.499431451119894e-06, "learning_rate": 0.0027164671421783864, "loss": 2.6437, "step": 8470 }, { "crossentropy": 2.565975785255432, "epoch": 0.4606433018842274, "grad_norm": 0.036325737833976746, "grad_norm_var": 4.433515733263486e-06, "learning_rate": 0.002715359477975715, "loss": 2.566, "step": 8471 }, { "crossentropy": 2.7517181634902954, "epoch": 0.4606976807417276, "grad_norm": 0.03156284615397453, "grad_norm_var": 2.6815259367572616e-06, "learning_rate": 0.00271425195546432, "loss": 2.7517, "step": 8472 }, { "crossentropy": 2.5535141229629517, "epoch": 0.4607520595992278, "grad_norm": 0.032813455909490585, "grad_norm_var": 2.168437820871764e-06, "learning_rate": 0.0027131445747128865, "loss": 2.5535, "step": 8473 }, { "crossentropy": 2.545216202735901, "epoch": 0.46080643845672803, "grad_norm": 0.033750537782907486, "grad_norm_var": 2.0607346653117122e-06, "learning_rate": 0.002712037335790092, "loss": 2.5452, "step": 8474 }, { "crossentropy": 2.5483731031417847, "epoch": 0.46086081731422823, "grad_norm": 0.03467780724167824, "grad_norm_var": 2.1698583609212435e-06, "learning_rate": 0.002710930238764613, "loss": 2.5484, "step": 8475 }, { "crossentropy": 2.546465754508972, "epoch": 0.46091519617172844, "grad_norm": 0.03362305834889412, "grad_norm_var": 1.9041823572905786e-06, "learning_rate": 0.0027098232837051046, "loss": 2.5465, "step": 8476 }, { "crossentropy": 2.6880598068237305, "epoch": 0.46096957502922864, "grad_norm": 0.03094676323235035, "grad_norm_var": 2.28021068864521e-06, "learning_rate": 0.002708716470680221, "loss": 2.6881, "step": 8477 }, { "crossentropy": 2.6492063999176025, "epoch": 0.46102395388672884, "grad_norm": 0.032216716557741165, "grad_norm_var": 2.3757926024062093e-06, "learning_rate": 0.002707609799758606, "loss": 2.6492, "step": 8478 }, { "crossentropy": 2.5385433435440063, "epoch": 0.46107833274422905, "grad_norm": 0.0333520881831646, "grad_norm_var": 2.3759247055585845e-06, "learning_rate": 0.002706503271008893, "loss": 2.5385, "step": 8479 }, { "crossentropy": 2.6113078594207764, "epoch": 0.46113271160172925, "grad_norm": 0.03227231279015541, "grad_norm_var": 2.4479949219505796e-06, "learning_rate": 0.00270539688449971, "loss": 2.6113, "step": 8480 }, { "crossentropy": 2.6308748722076416, "epoch": 0.46118709045922945, "grad_norm": 0.033843111246824265, "grad_norm_var": 2.4609863169514397e-06, "learning_rate": 0.0027042906402996725, "loss": 2.6309, "step": 8481 }, { "crossentropy": 2.643514394760132, "epoch": 0.46124146931672966, "grad_norm": 0.03597499057650566, "grad_norm_var": 2.74921707603067e-06, "learning_rate": 0.002703184538477389, "loss": 2.6435, "step": 8482 }, { "crossentropy": 2.5991131067276, "epoch": 0.46129584817422986, "grad_norm": 0.033689599484205246, "grad_norm_var": 2.7502704262627088e-06, "learning_rate": 0.0027020785791014596, "loss": 2.5991, "step": 8483 }, { "crossentropy": 2.5316468477249146, "epoch": 0.46135022703173006, "grad_norm": 0.033211931586265564, "grad_norm_var": 2.5035429821618766e-06, "learning_rate": 0.0027009727622404763, "loss": 2.5316, "step": 8484 }, { "crossentropy": 2.586809515953064, "epoch": 0.46140460588923027, "grad_norm": 0.03234962001442909, "grad_norm_var": 2.5770099612062856e-06, "learning_rate": 0.002699867087963015, "loss": 2.5868, "step": 8485 }, { "crossentropy": 2.525742530822754, "epoch": 0.46145898474673047, "grad_norm": 0.031889379024505615, "grad_norm_var": 2.1921891706730277e-06, "learning_rate": 0.0026987615563376557, "loss": 2.5257, "step": 8486 }, { "crossentropy": 2.553780436515808, "epoch": 0.4615133636042307, "grad_norm": 0.03305163234472275, "grad_norm_var": 1.533110147549837e-06, "learning_rate": 0.0026976561674329582, "loss": 2.5538, "step": 8487 }, { "crossentropy": 2.4943279027938843, "epoch": 0.4615677424617309, "grad_norm": 0.0449041984975338, "grad_norm_var": 9.964825990964653e-06, "learning_rate": 0.002696550921317478, "loss": 2.4943, "step": 8488 }, { "crossentropy": 2.661044716835022, "epoch": 0.4616221213192311, "grad_norm": 0.03270096331834793, "grad_norm_var": 9.982070732745502e-06, "learning_rate": 0.0026954458180597626, "loss": 2.661, "step": 8489 }, { "crossentropy": 2.5750911235809326, "epoch": 0.4616765001767313, "grad_norm": 0.03213750943541527, "grad_norm_var": 1.0177567326200921e-05, "learning_rate": 0.0026943408577283483, "loss": 2.5751, "step": 8490 }, { "crossentropy": 2.547587037086487, "epoch": 0.4617308790342315, "grad_norm": 0.03267280384898186, "grad_norm_var": 1.0194848641938943e-05, "learning_rate": 0.0026932360403917653, "loss": 2.5476, "step": 8491 }, { "crossentropy": 2.629668951034546, "epoch": 0.4617852578917317, "grad_norm": 0.03255234286189079, "grad_norm_var": 1.0274243186249758e-05, "learning_rate": 0.002692131366118532, "loss": 2.6297, "step": 8492 }, { "crossentropy": 2.584335446357727, "epoch": 0.4618396367492319, "grad_norm": 0.033506136387586594, "grad_norm_var": 9.774686549013136e-06, "learning_rate": 0.0026910268349771603, "loss": 2.5843, "step": 8493 }, { "crossentropy": 2.592529773712158, "epoch": 0.4618940156067321, "grad_norm": 0.03276880085468292, "grad_norm_var": 9.679372696246892e-06, "learning_rate": 0.002689922447036151, "loss": 2.5925, "step": 8494 }, { "crossentropy": 2.5421887636184692, "epoch": 0.4619483944642323, "grad_norm": 0.03256675601005554, "grad_norm_var": 9.765327307979104e-06, "learning_rate": 0.0026888182023640005, "loss": 2.5422, "step": 8495 }, { "crossentropy": 2.626768469810486, "epoch": 0.4620027733217325, "grad_norm": 0.03355690464377403, "grad_norm_var": 9.614380863307941e-06, "learning_rate": 0.0026877141010291862, "loss": 2.6268, "step": 8496 }, { "crossentropy": 2.6487916707992554, "epoch": 0.4620571521792327, "grad_norm": 0.032646242529153824, "grad_norm_var": 9.702783736860686e-06, "learning_rate": 0.0026866101431001887, "loss": 2.6488, "step": 8497 }, { "crossentropy": 2.563838839530945, "epoch": 0.4621115310367329, "grad_norm": 0.032486941665410995, "grad_norm_var": 9.433632193078568e-06, "learning_rate": 0.002685506328645476, "loss": 2.5638, "step": 8498 }, { "crossentropy": 2.6610432863235474, "epoch": 0.4621659098942331, "grad_norm": 0.032246582210063934, "grad_norm_var": 9.535615037696507e-06, "learning_rate": 0.002684402657733501, "loss": 2.661, "step": 8499 }, { "crossentropy": 2.601485252380371, "epoch": 0.4622202887517333, "grad_norm": 0.0320359505712986, "grad_norm_var": 9.659854467130903e-06, "learning_rate": 0.0026832991304327146, "loss": 2.6015, "step": 8500 }, { "crossentropy": 2.6003708839416504, "epoch": 0.4622746676092335, "grad_norm": 0.033107053488492966, "grad_norm_var": 9.591697491777758e-06, "learning_rate": 0.0026821957468115567, "loss": 2.6004, "step": 8501 }, { "crossentropy": 2.530584454536438, "epoch": 0.46232904646673373, "grad_norm": 0.03178611025214195, "grad_norm_var": 9.613534232285648e-06, "learning_rate": 0.002681092506938457, "loss": 2.5306, "step": 8502 }, { "crossentropy": 2.521554470062256, "epoch": 0.46238342532423393, "grad_norm": 0.030764739960432053, "grad_norm_var": 1.0052855869709573e-05, "learning_rate": 0.002679989410881838, "loss": 2.5216, "step": 8503 }, { "crossentropy": 2.556352972984314, "epoch": 0.46243780418173414, "grad_norm": 0.03380657732486725, "grad_norm_var": 5.46357923989343e-07, "learning_rate": 0.002678886458710113, "loss": 2.5564, "step": 8504 }, { "crossentropy": 2.5381916761398315, "epoch": 0.46249218303923434, "grad_norm": 0.034360628575086594, "grad_norm_var": 7.44418048932947e-07, "learning_rate": 0.0026777836504916857, "loss": 2.5382, "step": 8505 }, { "crossentropy": 2.571093201637268, "epoch": 0.46254656189673454, "grad_norm": 0.03199898451566696, "grad_norm_var": 7.55778092676312e-07, "learning_rate": 0.0026766809862949536, "loss": 2.5711, "step": 8506 }, { "crossentropy": 2.5224260091781616, "epoch": 0.46260094075423475, "grad_norm": 0.03209282457828522, "grad_norm_var": 7.772785943399122e-07, "learning_rate": 0.0026755784661882985, "loss": 2.5224, "step": 8507 }, { "crossentropy": 2.61913001537323, "epoch": 0.46265531961173495, "grad_norm": 0.03288819640874863, "grad_norm_var": 7.802811557312712e-07, "learning_rate": 0.002674476090240098, "loss": 2.6191, "step": 8508 }, { "crossentropy": 2.5276893377304077, "epoch": 0.46270969846923515, "grad_norm": 0.03246347978711128, "grad_norm_var": 7.311127051031739e-07, "learning_rate": 0.0026733738585187263, "loss": 2.5277, "step": 8509 }, { "crossentropy": 2.5939568281173706, "epoch": 0.46276407732673536, "grad_norm": 0.03190404549241066, "grad_norm_var": 7.582200827545756e-07, "learning_rate": 0.002672271771092537, "loss": 2.594, "step": 8510 }, { "crossentropy": 2.6232492923736572, "epoch": 0.46281845618423556, "grad_norm": 0.033038441091775894, "grad_norm_var": 7.735251483601338e-07, "learning_rate": 0.0026711698280298822, "loss": 2.6232, "step": 8511 }, { "crossentropy": 2.5960041284561157, "epoch": 0.46287283504173576, "grad_norm": 0.03189739212393761, "grad_norm_var": 7.281592301418167e-07, "learning_rate": 0.002670068029399103, "loss": 2.596, "step": 8512 }, { "crossentropy": 2.538774847984314, "epoch": 0.46292721389923597, "grad_norm": 0.03291274607181549, "grad_norm_var": 7.388514998753667e-07, "learning_rate": 0.0026689663752685333, "loss": 2.5388, "step": 8513 }, { "crossentropy": 2.6165491342544556, "epoch": 0.46298159275673617, "grad_norm": 0.03178223595023155, "grad_norm_var": 7.698874462438073e-07, "learning_rate": 0.0026678648657064976, "loss": 2.6165, "step": 8514 }, { "crossentropy": 2.6353580951690674, "epoch": 0.4630359716142364, "grad_norm": 0.03503458946943283, "grad_norm_var": 1.182730482482408e-06, "learning_rate": 0.002666763500781304, "loss": 2.6354, "step": 8515 }, { "crossentropy": 2.5740606784820557, "epoch": 0.4630903504717366, "grad_norm": 0.03838949650526047, "grad_norm_var": 3.213366542659188e-06, "learning_rate": 0.0026656622805612653, "loss": 2.5741, "step": 8516 }, { "crossentropy": 2.5880061388015747, "epoch": 0.4631447293292368, "grad_norm": 0.03471381217241287, "grad_norm_var": 3.3946089804448575e-06, "learning_rate": 0.002664561205114677, "loss": 2.588, "step": 8517 }, { "crossentropy": 2.623470664024353, "epoch": 0.463199108186737, "grad_norm": 0.033619437366724014, "grad_norm_var": 3.279925464255475e-06, "learning_rate": 0.0026634602745098245, "loss": 2.6235, "step": 8518 }, { "crossentropy": 2.551325559616089, "epoch": 0.4632534870442372, "grad_norm": 0.032178379595279694, "grad_norm_var": 2.9403045409288783e-06, "learning_rate": 0.002662359488814985, "loss": 2.5513, "step": 8519 }, { "crossentropy": 2.5317349433898926, "epoch": 0.4633078659017374, "grad_norm": 0.03100406378507614, "grad_norm_var": 3.2484614886145342e-06, "learning_rate": 0.0026612588480984345, "loss": 2.5317, "step": 8520 }, { "crossentropy": 2.549183964729309, "epoch": 0.4633622447592376, "grad_norm": 0.03211596980690956, "grad_norm_var": 3.1987729187063844e-06, "learning_rate": 0.0026601583524284277, "loss": 2.5492, "step": 8521 }, { "crossentropy": 2.519104838371277, "epoch": 0.4634166236167378, "grad_norm": 0.03276338055729866, "grad_norm_var": 3.1330515983918337e-06, "learning_rate": 0.0026590580018732188, "loss": 2.5191, "step": 8522 }, { "crossentropy": 2.6080280542373657, "epoch": 0.463471002474238, "grad_norm": 0.045201752334833145, "grad_norm_var": 1.2200459858166014e-05, "learning_rate": 0.00265795779650105, "loss": 2.608, "step": 8523 }, { "crossentropy": 2.517425298690796, "epoch": 0.4635253813317382, "grad_norm": 0.03335113823413849, "grad_norm_var": 1.215330069857471e-05, "learning_rate": 0.002656857736380155, "loss": 2.5174, "step": 8524 }, { "crossentropy": 2.6035454273223877, "epoch": 0.4635797601892384, "grad_norm": 0.03303191810846329, "grad_norm_var": 1.2064759815577641e-05, "learning_rate": 0.002655757821578761, "loss": 2.6035, "step": 8525 }, { "crossentropy": 2.469746470451355, "epoch": 0.4636341390467386, "grad_norm": 0.03247472643852234, "grad_norm_var": 1.1930678499959673e-05, "learning_rate": 0.0026546580521650772, "loss": 2.4697, "step": 8526 }, { "crossentropy": 2.5754189491271973, "epoch": 0.4636885179042388, "grad_norm": 0.03207647427916527, "grad_norm_var": 1.2107914257538688e-05, "learning_rate": 0.0026535584282073167, "loss": 2.5754, "step": 8527 }, { "crossentropy": 2.6096807718276978, "epoch": 0.463742896761739, "grad_norm": 0.03566432371735573, "grad_norm_var": 1.1984319603832299e-05, "learning_rate": 0.0026524589497736763, "loss": 2.6097, "step": 8528 }, { "crossentropy": 2.5847795009613037, "epoch": 0.4637972756192392, "grad_norm": 0.03385317698121071, "grad_norm_var": 1.1885125495191459e-05, "learning_rate": 0.002651359616932342, "loss": 2.5848, "step": 8529 }, { "crossentropy": 2.584435224533081, "epoch": 0.46385165447673943, "grad_norm": 0.0316329225897789, "grad_norm_var": 1.1934721110305687e-05, "learning_rate": 0.002650260429751492, "loss": 2.5844, "step": 8530 }, { "crossentropy": 2.570441722869873, "epoch": 0.46390603333423963, "grad_norm": 0.032736245542764664, "grad_norm_var": 1.2007304916044948e-05, "learning_rate": 0.002649161388299304, "loss": 2.5704, "step": 8531 }, { "crossentropy": 2.660125970840454, "epoch": 0.46396041219173983, "grad_norm": 0.040652524679899216, "grad_norm_var": 1.3636636874417701e-05, "learning_rate": 0.0026480624926439322, "loss": 2.6601, "step": 8532 }, { "crossentropy": 2.5188422203063965, "epoch": 0.46401479104924004, "grad_norm": 0.03272394835948944, "grad_norm_var": 1.3745635479143053e-05, "learning_rate": 0.002646963742853532, "loss": 2.5188, "step": 8533 }, { "crossentropy": 2.5610166788101196, "epoch": 0.46406916990674024, "grad_norm": 0.032951224595308304, "grad_norm_var": 1.3813464534155086e-05, "learning_rate": 0.0026458651389962475, "loss": 2.561, "step": 8534 }, { "crossentropy": 2.5798975229263306, "epoch": 0.46412354876424045, "grad_norm": 0.03361044079065323, "grad_norm_var": 1.3588897801638095e-05, "learning_rate": 0.0026447666811402117, "loss": 2.5799, "step": 8535 }, { "crossentropy": 2.6040470600128174, "epoch": 0.46417792762174065, "grad_norm": 0.03371468931436539, "grad_norm_var": 1.292367598994485e-05, "learning_rate": 0.0026436683693535513, "loss": 2.604, "step": 8536 }, { "crossentropy": 2.555311441421509, "epoch": 0.46423230647924085, "grad_norm": 0.03341690078377724, "grad_norm_var": 1.2653273615123989e-05, "learning_rate": 0.002642570203704381, "loss": 2.5553, "step": 8537 }, { "crossentropy": 2.6740429401397705, "epoch": 0.46428668533674106, "grad_norm": 0.034375716000795364, "grad_norm_var": 1.2471225054470844e-05, "learning_rate": 0.002641472184260809, "loss": 2.674, "step": 8538 }, { "crossentropy": 2.561858057975769, "epoch": 0.46434106419424126, "grad_norm": 0.03145447373390198, "grad_norm_var": 4.606026041264107e-06, "learning_rate": 0.0026403743110909356, "loss": 2.5619, "step": 8539 }, { "crossentropy": 2.690108895301819, "epoch": 0.46439544305174146, "grad_norm": 0.031256843358278275, "grad_norm_var": 4.951756497287701e-06, "learning_rate": 0.0026392765842628454, "loss": 2.6901, "step": 8540 }, { "crossentropy": 2.5403610467910767, "epoch": 0.46444982190924167, "grad_norm": 0.03322118893265724, "grad_norm_var": 4.942771924262845e-06, "learning_rate": 0.0026381790038446187, "loss": 2.5404, "step": 8541 }, { "crossentropy": 2.686956524848938, "epoch": 0.46450420076674187, "grad_norm": 0.032909542322158813, "grad_norm_var": 4.895815149831099e-06, "learning_rate": 0.002637081569904332, "loss": 2.687, "step": 8542 }, { "crossentropy": 2.6107629537582397, "epoch": 0.4645585796242421, "grad_norm": 0.03474957123398781, "grad_norm_var": 4.829459519693075e-06, "learning_rate": 0.0026359842825100415, "loss": 2.6108, "step": 8543 }, { "crossentropy": 2.780712842941284, "epoch": 0.4646129584817423, "grad_norm": 0.03326307609677315, "grad_norm_var": 4.555395335929108e-06, "learning_rate": 0.002634887141729802, "loss": 2.7807, "step": 8544 }, { "crossentropy": 2.4482710361480713, "epoch": 0.4646673373392425, "grad_norm": 0.03249529376626015, "grad_norm_var": 4.612604966896851e-06, "learning_rate": 0.002633790147631657, "loss": 2.4483, "step": 8545 }, { "crossentropy": 2.5326664447784424, "epoch": 0.4647217161967427, "grad_norm": 0.03098258562386036, "grad_norm_var": 4.796408445761887e-06, "learning_rate": 0.0026326933002836406, "loss": 2.5327, "step": 8546 }, { "crossentropy": 2.6798219680786133, "epoch": 0.4647760950542429, "grad_norm": 0.03433233126997948, "grad_norm_var": 4.812852205466585e-06, "learning_rate": 0.0026315965997537787, "loss": 2.6798, "step": 8547 }, { "crossentropy": 2.6091716289520264, "epoch": 0.4648304739117431, "grad_norm": 0.031918879598379135, "grad_norm_var": 1.2591534261521906e-06, "learning_rate": 0.002630500046110088, "loss": 2.6092, "step": 8548 }, { "crossentropy": 2.4809333086013794, "epoch": 0.4648848527692433, "grad_norm": 0.03202686458826065, "grad_norm_var": 1.311560534433343e-06, "learning_rate": 0.0026294036394205757, "loss": 2.4809, "step": 8549 }, { "crossentropy": 2.6297433376312256, "epoch": 0.4649392316267435, "grad_norm": 0.03245540335774422, "grad_norm_var": 1.3246943752838787e-06, "learning_rate": 0.0026283073797532387, "loss": 2.6297, "step": 8550 }, { "crossentropy": 2.558923363685608, "epoch": 0.4649936104842437, "grad_norm": 0.03111107461154461, "grad_norm_var": 1.47386476478407e-06, "learning_rate": 0.002627211267176069, "loss": 2.5589, "step": 8551 }, { "crossentropy": 2.502704620361328, "epoch": 0.4650479893417439, "grad_norm": 0.03126000240445137, "grad_norm_var": 1.5282679373224486e-06, "learning_rate": 0.0026261153017570404, "loss": 2.5027, "step": 8552 }, { "crossentropy": 2.4726479053497314, "epoch": 0.4651023681992441, "grad_norm": 0.032110098749399185, "grad_norm_var": 1.4886320914772021e-06, "learning_rate": 0.002625019483564132, "loss": 2.4726, "step": 8553 }, { "crossentropy": 2.658514380455017, "epoch": 0.4651567470567443, "grad_norm": 0.03500107675790787, "grad_norm_var": 1.669875793109075e-06, "learning_rate": 0.0026239238126652986, "loss": 2.6585, "step": 8554 }, { "crossentropy": 2.5503716468811035, "epoch": 0.4652111259142445, "grad_norm": 0.0316910520195961, "grad_norm_var": 1.6393130536349053e-06, "learning_rate": 0.002622828289128496, "loss": 2.5504, "step": 8555 }, { "crossentropy": 2.539566993713379, "epoch": 0.4652655047717447, "grad_norm": 0.031608697026968, "grad_norm_var": 1.5864280150585294e-06, "learning_rate": 0.002621732913021666, "loss": 2.5396, "step": 8556 }, { "crossentropy": 2.4758540391921997, "epoch": 0.4653198836292449, "grad_norm": 0.031734272837638855, "grad_norm_var": 1.595716112651133e-06, "learning_rate": 0.0026206376844127434, "loss": 2.4759, "step": 8557 }, { "crossentropy": 2.5308743715286255, "epoch": 0.4653742624867451, "grad_norm": 0.03185127303004265, "grad_norm_var": 1.6048363197522972e-06, "learning_rate": 0.0026195426033696536, "loss": 2.5309, "step": 8558 }, { "crossentropy": 2.58630108833313, "epoch": 0.46542864134424533, "grad_norm": 0.030329566448926926, "grad_norm_var": 1.448237359360366e-06, "learning_rate": 0.002618447669960312, "loss": 2.5863, "step": 8559 }, { "crossentropy": 2.528002977371216, "epoch": 0.46548302020174553, "grad_norm": 0.03180653229355812, "grad_norm_var": 1.3618935539308022e-06, "learning_rate": 0.0026173528842526254, "loss": 2.528, "step": 8560 }, { "crossentropy": 2.62679123878479, "epoch": 0.46553739905924574, "grad_norm": 0.03261754289269447, "grad_norm_var": 1.370172432176714e-06, "learning_rate": 0.0026162582463144925, "loss": 2.6268, "step": 8561 }, { "crossentropy": 2.4902799129486084, "epoch": 0.465591777916746, "grad_norm": 0.03322935849428177, "grad_norm_var": 1.3652091798249985e-06, "learning_rate": 0.002615163756213802, "loss": 2.4903, "step": 8562 }, { "crossentropy": 2.5948331356048584, "epoch": 0.4656461567742462, "grad_norm": 0.032568659633398056, "grad_norm_var": 1.0564823301609556e-06, "learning_rate": 0.0026140694140184277, "loss": 2.5948, "step": 8563 }, { "crossentropy": 2.5942351818084717, "epoch": 0.4657005356317464, "grad_norm": 0.03494234383106232, "grad_norm_var": 1.5618468324656338e-06, "learning_rate": 0.0026129752197962463, "loss": 2.5942, "step": 8564 }, { "crossentropy": 2.4570518732070923, "epoch": 0.4657549144892466, "grad_norm": 0.032756660133600235, "grad_norm_var": 1.5713310183397872e-06, "learning_rate": 0.0026118811736151187, "loss": 2.4571, "step": 8565 }, { "crossentropy": 2.5784542560577393, "epoch": 0.4658092933467468, "grad_norm": 0.03170742839574814, "grad_norm_var": 1.5925047746810958e-06, "learning_rate": 0.0026107872755428917, "loss": 2.5785, "step": 8566 }, { "crossentropy": 2.553678274154663, "epoch": 0.465863672204247, "grad_norm": 0.032281145453453064, "grad_norm_var": 1.4972130892460315e-06, "learning_rate": 0.0026096935256474108, "loss": 2.5537, "step": 8567 }, { "crossentropy": 2.636300206184387, "epoch": 0.4659180510617472, "grad_norm": 0.03236271068453789, "grad_norm_var": 1.4139093613506233e-06, "learning_rate": 0.002608599923996509, "loss": 2.6363, "step": 8568 }, { "crossentropy": 2.5987731218338013, "epoch": 0.4659724299192474, "grad_norm": 0.03152712062001228, "grad_norm_var": 1.458648923168398e-06, "learning_rate": 0.002607506470658011, "loss": 2.5988, "step": 8569 }, { "crossentropy": 2.471076726913452, "epoch": 0.4660268087767476, "grad_norm": 0.03212426230311394, "grad_norm_var": 9.689748476463904e-07, "learning_rate": 0.00260641316569973, "loss": 2.4711, "step": 8570 }, { "crossentropy": 2.643521308898926, "epoch": 0.46608118763424783, "grad_norm": 0.03317078948020935, "grad_norm_var": 1.0061685070614685e-06, "learning_rate": 0.0026053200091894733, "loss": 2.6435, "step": 8571 }, { "crossentropy": 2.589324474334717, "epoch": 0.46613556649174803, "grad_norm": 0.03261187672615051, "grad_norm_var": 9.781182368106521e-07, "learning_rate": 0.0026042270011950373, "loss": 2.5893, "step": 8572 }, { "crossentropy": 2.6644420623779297, "epoch": 0.46618994534924824, "grad_norm": 0.03200293332338333, "grad_norm_var": 9.605249506924296e-07, "learning_rate": 0.0026031341417842107, "loss": 2.6644, "step": 8573 }, { "crossentropy": 2.6414791345596313, "epoch": 0.46624432420674844, "grad_norm": 0.031040169298648834, "grad_norm_var": 1.0575404816369019e-06, "learning_rate": 0.0026020414310247665, "loss": 2.6415, "step": 8574 }, { "crossentropy": 2.5312023162841797, "epoch": 0.46629870306424864, "grad_norm": 0.03218533843755722, "grad_norm_var": 7.809106510954395e-07, "learning_rate": 0.002600948868984479, "loss": 2.5312, "step": 8575 }, { "crossentropy": 2.4964710474014282, "epoch": 0.46635308192174885, "grad_norm": 0.032742977142333984, "grad_norm_var": 7.574447561347712e-07, "learning_rate": 0.0025998564557311076, "loss": 2.4965, "step": 8576 }, { "crossentropy": 2.623053789138794, "epoch": 0.46640746077924905, "grad_norm": 0.03528323397040367, "grad_norm_var": 1.2462003818223636e-06, "learning_rate": 0.0025987641913324, "loss": 2.6231, "step": 8577 }, { "crossentropy": 2.5623010396957397, "epoch": 0.46646183963674925, "grad_norm": 0.03448103368282318, "grad_norm_var": 1.439378630564934e-06, "learning_rate": 0.002597672075856096, "loss": 2.5623, "step": 8578 }, { "crossentropy": 2.5422500371932983, "epoch": 0.46651621849424946, "grad_norm": 0.03190230205655098, "grad_norm_var": 1.482068888744581e-06, "learning_rate": 0.002596580109369934, "loss": 2.5423, "step": 8579 }, { "crossentropy": 2.5051807165145874, "epoch": 0.46657059735174966, "grad_norm": 0.03163627162575722, "grad_norm_var": 1.1746142511758738e-06, "learning_rate": 0.002595488291941631, "loss": 2.5052, "step": 8580 }, { "crossentropy": 2.496731400489807, "epoch": 0.46662497620924986, "grad_norm": 0.03087715059518814, "grad_norm_var": 1.328201705082475e-06, "learning_rate": 0.002594396623638903, "loss": 2.4967, "step": 8581 }, { "crossentropy": 2.5503536462783813, "epoch": 0.46667935506675007, "grad_norm": 0.031384099274873734, "grad_norm_var": 1.3633445031570208e-06, "learning_rate": 0.002593305104529453, "loss": 2.5504, "step": 8582 }, { "crossentropy": 2.5836498737335205, "epoch": 0.46673373392425027, "grad_norm": 0.03207414969801903, "grad_norm_var": 1.3679459421868841e-06, "learning_rate": 0.002592213734680977, "loss": 2.5836, "step": 8583 }, { "crossentropy": 2.593231439590454, "epoch": 0.4667881127817505, "grad_norm": 0.03213658556342125, "grad_norm_var": 1.370393720948974e-06, "learning_rate": 0.0025911225141611624, "loss": 2.5932, "step": 8584 }, { "crossentropy": 2.5905826091766357, "epoch": 0.4668424916392507, "grad_norm": 0.03182845935225487, "grad_norm_var": 1.3440609259893673e-06, "learning_rate": 0.002590031443037679, "loss": 2.5906, "step": 8585 }, { "crossentropy": 2.437637448310852, "epoch": 0.4668968704967509, "grad_norm": 0.030533498153090477, "grad_norm_var": 1.5485293559813156e-06, "learning_rate": 0.0025889405213782015, "loss": 2.4376, "step": 8586 }, { "crossentropy": 2.636489987373352, "epoch": 0.4669512493542511, "grad_norm": 0.033957261592149734, "grad_norm_var": 1.6844599459899054e-06, "learning_rate": 0.002587849749250387, "loss": 2.6365, "step": 8587 }, { "crossentropy": 2.4957001209259033, "epoch": 0.4670056282117513, "grad_norm": 0.03238480165600777, "grad_norm_var": 1.6780079407265144e-06, "learning_rate": 0.0025867591267218805, "loss": 2.4957, "step": 8588 }, { "crossentropy": 2.7652653455734253, "epoch": 0.4670600070692515, "grad_norm": 0.0347842238843441, "grad_norm_var": 2.0594236279316507e-06, "learning_rate": 0.0025856686538603202, "loss": 2.7653, "step": 8589 }, { "crossentropy": 2.592527389526367, "epoch": 0.4671143859267517, "grad_norm": 0.03157179057598114, "grad_norm_var": 1.9770148534669344e-06, "learning_rate": 0.0025845783307333436, "loss": 2.5925, "step": 8590 }, { "crossentropy": 2.508966326713562, "epoch": 0.4671687647842519, "grad_norm": 0.03293221816420555, "grad_norm_var": 1.9820179249705465e-06, "learning_rate": 0.0025834881574085646, "loss": 2.509, "step": 8591 }, { "crossentropy": 2.6205445528030396, "epoch": 0.4672231436417521, "grad_norm": 0.03333074226975441, "grad_norm_var": 2.0201531831687123e-06, "learning_rate": 0.0025823981339535974, "loss": 2.6205, "step": 8592 }, { "crossentropy": 2.61955189704895, "epoch": 0.4672775224992523, "grad_norm": 0.03231857717037201, "grad_norm_var": 1.496421967333419e-06, "learning_rate": 0.0025813082604360443, "loss": 2.6196, "step": 8593 }, { "crossentropy": 2.6022154092788696, "epoch": 0.4673319013567525, "grad_norm": 0.031725332140922546, "grad_norm_var": 1.2002848173396497e-06, "learning_rate": 0.0025802185369234974, "loss": 2.6022, "step": 8594 }, { "crossentropy": 2.6519335508346558, "epoch": 0.4673862802142527, "grad_norm": 0.030840963125228882, "grad_norm_var": 1.3143847034990796e-06, "learning_rate": 0.0025791289634835434, "loss": 2.6519, "step": 8595 }, { "crossentropy": 2.621951937675476, "epoch": 0.4674406590717529, "grad_norm": 0.03389343246817589, "grad_norm_var": 1.4797768003131275e-06, "learning_rate": 0.0025780395401837504, "loss": 2.622, "step": 8596 }, { "crossentropy": 2.619803547859192, "epoch": 0.4674950379292531, "grad_norm": 0.03280922397971153, "grad_norm_var": 1.350193833447967e-06, "learning_rate": 0.0025769502670916888, "loss": 2.6198, "step": 8597 }, { "crossentropy": 2.603432536125183, "epoch": 0.4675494167867533, "grad_norm": 0.03113405965268612, "grad_norm_var": 1.3881895790522508e-06, "learning_rate": 0.002575861144274914, "loss": 2.6034, "step": 8598 }, { "crossentropy": 2.499164342880249, "epoch": 0.46760379564425353, "grad_norm": 0.03164978325366974, "grad_norm_var": 1.4173706870998504e-06, "learning_rate": 0.0025747721718009696, "loss": 2.4992, "step": 8599 }, { "crossentropy": 2.619599938392639, "epoch": 0.46765817450175373, "grad_norm": 0.031935740262269974, "grad_norm_var": 1.4259935177352486e-06, "learning_rate": 0.0025736833497373925, "loss": 2.6196, "step": 8600 }, { "crossentropy": 2.6180518865585327, "epoch": 0.46771255335925394, "grad_norm": 0.03330358862876892, "grad_norm_var": 1.4590451599284677e-06, "learning_rate": 0.002572594678151715, "loss": 2.6181, "step": 8601 }, { "crossentropy": 2.544143557548523, "epoch": 0.46776693221675414, "grad_norm": 0.030809128656983376, "grad_norm_var": 1.3935782347064089e-06, "learning_rate": 0.0025715061571114505, "loss": 2.5441, "step": 8602 }, { "crossentropy": 2.5598950386047363, "epoch": 0.46782131107425434, "grad_norm": 0.031168336048722267, "grad_norm_var": 1.3234280089536766e-06, "learning_rate": 0.0025704177866841113, "loss": 2.5599, "step": 8603 }, { "crossentropy": 2.5780107975006104, "epoch": 0.46787568993175455, "grad_norm": 0.03149591386318207, "grad_norm_var": 1.3612188800427346e-06, "learning_rate": 0.002569329566937195, "loss": 2.578, "step": 8604 }, { "crossentropy": 2.5784233808517456, "epoch": 0.46793006878925475, "grad_norm": 0.03307044878602028, "grad_norm_var": 9.614634897120068e-07, "learning_rate": 0.002568241497938193, "loss": 2.5784, "step": 8605 }, { "crossentropy": 2.5780062675476074, "epoch": 0.46798444764675495, "grad_norm": 0.03316836804151535, "grad_norm_var": 1.0031567951841128e-06, "learning_rate": 0.002567153579754586, "loss": 2.578, "step": 8606 }, { "crossentropy": 2.5283336639404297, "epoch": 0.46803882650425516, "grad_norm": 0.03329480066895485, "grad_norm_var": 1.0456061506922219e-06, "learning_rate": 0.002566065812453846, "loss": 2.5283, "step": 8607 }, { "crossentropy": 2.6309157609939575, "epoch": 0.46809320536175536, "grad_norm": 0.033701494336128235, "grad_norm_var": 1.1077815125845688e-06, "learning_rate": 0.0025649781961034346, "loss": 2.6309, "step": 8608 }, { "crossentropy": 2.4468849897384644, "epoch": 0.46814758421925556, "grad_norm": 0.03217921778559685, "grad_norm_var": 1.1080917634393214e-06, "learning_rate": 0.002563890730770808, "loss": 2.4469, "step": 8609 }, { "crossentropy": 2.597504734992981, "epoch": 0.46820196307675577, "grad_norm": 0.03206193819642067, "grad_norm_var": 1.0911212866054521e-06, "learning_rate": 0.002562803416523405, "loss": 2.5975, "step": 8610 }, { "crossentropy": 2.5389819145202637, "epoch": 0.46825634193425597, "grad_norm": 0.03251396119594574, "grad_norm_var": 9.44545154518333e-07, "learning_rate": 0.00256171625342866, "loss": 2.539, "step": 8611 }, { "crossentropy": 2.721321225166321, "epoch": 0.4683107207917562, "grad_norm": 0.03125715255737305, "grad_norm_var": 8.493450372828628e-07, "learning_rate": 0.0025606292415540034, "loss": 2.7213, "step": 8612 }, { "crossentropy": 2.5915167331695557, "epoch": 0.4683650996492564, "grad_norm": 0.03282245993614197, "grad_norm_var": 8.503921886629373e-07, "learning_rate": 0.002559542380966845, "loss": 2.5915, "step": 8613 }, { "crossentropy": 2.6637399196624756, "epoch": 0.4684194785067566, "grad_norm": 0.03411030024290085, "grad_norm_var": 9.71931115079557e-07, "learning_rate": 0.002558455671734593, "loss": 2.6637, "step": 8614 }, { "crossentropy": 2.6172664165496826, "epoch": 0.4684738573642568, "grad_norm": 0.03375701978802681, "grad_norm_var": 1.0361697573392388e-06, "learning_rate": 0.0025573691139246448, "loss": 2.6173, "step": 8615 }, { "crossentropy": 2.610128164291382, "epoch": 0.468528236221757, "grad_norm": 0.034178100526332855, "grad_norm_var": 1.169584160762525e-06, "learning_rate": 0.002556282707604386, "loss": 2.6101, "step": 8616 }, { "crossentropy": 2.505358338356018, "epoch": 0.4685826150792572, "grad_norm": 0.034116972237825394, "grad_norm_var": 1.2784797269978106e-06, "learning_rate": 0.002555196452841195, "loss": 2.5054, "step": 8617 }, { "crossentropy": 2.513274669647217, "epoch": 0.4686369939367574, "grad_norm": 0.032330598682165146, "grad_norm_var": 1.033161334000595e-06, "learning_rate": 0.002554110349702442, "loss": 2.5133, "step": 8618 }, { "crossentropy": 2.565992593765259, "epoch": 0.4686913727942576, "grad_norm": 0.03403434902429581, "grad_norm_var": 9.128220157444388e-07, "learning_rate": 0.0025530243982554834, "loss": 2.566, "step": 8619 }, { "crossentropy": 2.6090999841690063, "epoch": 0.4687457516517578, "grad_norm": 0.03227333724498749, "grad_norm_var": 7.940848553942073e-07, "learning_rate": 0.0025519385985676712, "loss": 2.6091, "step": 8620 }, { "crossentropy": 2.638800024986267, "epoch": 0.468800130509258, "grad_norm": 0.03133317455649376, "grad_norm_var": 9.790017054565777e-07, "learning_rate": 0.002550852950706347, "loss": 2.6388, "step": 8621 }, { "crossentropy": 2.654738187789917, "epoch": 0.4688545093667582, "grad_norm": 0.03561863675713539, "grad_norm_var": 1.4269447018470475e-06, "learning_rate": 0.002549767454738835, "loss": 2.6547, "step": 8622 }, { "crossentropy": 2.5780482292175293, "epoch": 0.4689088882242584, "grad_norm": 0.033956896513700485, "grad_norm_var": 1.471630741089793e-06, "learning_rate": 0.0025486821107324653, "loss": 2.578, "step": 8623 }, { "crossentropy": 2.6087000370025635, "epoch": 0.4689632670817586, "grad_norm": 0.03356846049427986, "grad_norm_var": 1.4627833853220542e-06, "learning_rate": 0.0025475969187545434, "loss": 2.6087, "step": 8624 }, { "crossentropy": 2.4254558086395264, "epoch": 0.4690176459392588, "grad_norm": 0.0328180268406868, "grad_norm_var": 1.4071323460001597e-06, "learning_rate": 0.002546511878872374, "loss": 2.4255, "step": 8625 }, { "crossentropy": 2.5305891036987305, "epoch": 0.469072024796759, "grad_norm": 0.03494228050112724, "grad_norm_var": 1.499355953207826e-06, "learning_rate": 0.0025454269911532514, "loss": 2.5306, "step": 8626 }, { "crossentropy": 2.7293132543563843, "epoch": 0.4691264036542592, "grad_norm": 0.033200379461050034, "grad_norm_var": 1.452106289144955e-06, "learning_rate": 0.002544342255664458, "loss": 2.7293, "step": 8627 }, { "crossentropy": 2.5421239137649536, "epoch": 0.46918078251175943, "grad_norm": 0.03184330463409424, "grad_norm_var": 1.3065082436026938e-06, "learning_rate": 0.002543257672473268, "loss": 2.5421, "step": 8628 }, { "crossentropy": 2.4956876039505005, "epoch": 0.46923516136925963, "grad_norm": 0.03471982851624489, "grad_norm_var": 1.377427547027965e-06, "learning_rate": 0.0025421732416469472, "loss": 2.4957, "step": 8629 }, { "crossentropy": 2.5849506855010986, "epoch": 0.46928954022675984, "grad_norm": 0.03298480063676834, "grad_norm_var": 1.3725326477670442e-06, "learning_rate": 0.0025410889632527505, "loss": 2.585, "step": 8630 }, { "crossentropy": 2.505145311355591, "epoch": 0.46934391908426004, "grad_norm": 0.03233383595943451, "grad_norm_var": 1.4465112674217143e-06, "learning_rate": 0.0025400048373579237, "loss": 2.5051, "step": 8631 }, { "crossentropy": 2.5802336931228638, "epoch": 0.46939829794176025, "grad_norm": 0.03152585029602051, "grad_norm_var": 1.6077515245899523e-06, "learning_rate": 0.0025389208640297055, "loss": 2.5802, "step": 8632 }, { "crossentropy": 2.4983372688293457, "epoch": 0.46945267679926045, "grad_norm": 0.03167184069752693, "grad_norm_var": 1.690634619167369e-06, "learning_rate": 0.002537837043335317, "loss": 2.4983, "step": 8633 }, { "crossentropy": 2.608092784881592, "epoch": 0.46950705565676065, "grad_norm": 0.03182986006140709, "grad_norm_var": 1.7558206124020901e-06, "learning_rate": 0.0025367533753419837, "loss": 2.6081, "step": 8634 }, { "crossentropy": 2.4269970655441284, "epoch": 0.46956143451426086, "grad_norm": 0.03153414651751518, "grad_norm_var": 1.8153420293947712e-06, "learning_rate": 0.002535669860116907, "loss": 2.427, "step": 8635 }, { "crossentropy": 2.526772379875183, "epoch": 0.46961581337176106, "grad_norm": 0.031120868399739265, "grad_norm_var": 1.992291904853597e-06, "learning_rate": 0.0025345864977272888, "loss": 2.5268, "step": 8636 }, { "crossentropy": 2.6327922344207764, "epoch": 0.46967019222926126, "grad_norm": 0.03169409558176994, "grad_norm_var": 1.929237527225818e-06, "learning_rate": 0.0025335032882403173, "loss": 2.6328, "step": 8637 }, { "crossentropy": 2.608566999435425, "epoch": 0.46972457108676147, "grad_norm": 0.031623829156160355, "grad_norm_var": 1.4440675036414176e-06, "learning_rate": 0.002532420231723172, "loss": 2.6086, "step": 8638 }, { "crossentropy": 2.5656442642211914, "epoch": 0.46977894994426167, "grad_norm": 0.032065022736787796, "grad_norm_var": 1.3218369029057194e-06, "learning_rate": 0.002531337328243024, "loss": 2.5656, "step": 8639 }, { "crossentropy": 2.5645623207092285, "epoch": 0.4698333288017619, "grad_norm": 0.0336340107023716, "grad_norm_var": 1.331729829884493e-06, "learning_rate": 0.002530254577867033, "loss": 2.5646, "step": 8640 }, { "crossentropy": 2.5221073627471924, "epoch": 0.4698877076592621, "grad_norm": 0.032475266605615616, "grad_norm_var": 1.3232300987382616e-06, "learning_rate": 0.00252917198066235, "loss": 2.5221, "step": 8641 }, { "crossentropy": 2.5956586599349976, "epoch": 0.4699420865167623, "grad_norm": 0.03271201252937317, "grad_norm_var": 8.929694451223346e-07, "learning_rate": 0.0025280895366961184, "loss": 2.5957, "step": 8642 }, { "crossentropy": 2.5953208208084106, "epoch": 0.4699964653742625, "grad_norm": 0.03179869428277016, "grad_norm_var": 8.494648744570745e-07, "learning_rate": 0.0025270072460354697, "loss": 2.5953, "step": 8643 }, { "crossentropy": 2.592957377433777, "epoch": 0.4700508442317627, "grad_norm": 0.03286679461598396, "grad_norm_var": 8.631266037830195e-07, "learning_rate": 0.0025259251087475254, "loss": 2.593, "step": 8644 }, { "crossentropy": 2.510520815849304, "epoch": 0.4701052230892629, "grad_norm": 0.03300972282886505, "grad_norm_var": 4.911685512514468e-07, "learning_rate": 0.0025248431248993965, "loss": 2.5105, "step": 8645 }, { "crossentropy": 2.5902732610702515, "epoch": 0.4701596019467631, "grad_norm": 0.0340607725083828, "grad_norm_var": 6.789789717557508e-07, "learning_rate": 0.002523761294558194, "loss": 2.5903, "step": 8646 }, { "crossentropy": 2.586684465408325, "epoch": 0.4702139808042633, "grad_norm": 0.03187910467386246, "grad_norm_var": 6.866553389806105e-07, "learning_rate": 0.002522679617791005, "loss": 2.5867, "step": 8647 }, { "crossentropy": 2.567142605781555, "epoch": 0.4702683596617635, "grad_norm": 0.032716114073991776, "grad_norm_var": 6.652176059626733e-07, "learning_rate": 0.0025215980946649164, "loss": 2.5671, "step": 8648 }, { "crossentropy": 2.5740758180618286, "epoch": 0.4703227385192637, "grad_norm": 0.03140813112258911, "grad_norm_var": 6.914139143708211e-07, "learning_rate": 0.0025205167252470035, "loss": 2.5741, "step": 8649 }, { "crossentropy": 2.5473592281341553, "epoch": 0.4703771173767639, "grad_norm": 0.03300539031624794, "grad_norm_var": 7.077321502254082e-07, "learning_rate": 0.002519435509604332, "loss": 2.5474, "step": 8650 }, { "crossentropy": 2.5705196857452393, "epoch": 0.4704314962342641, "grad_norm": 0.032521411776542664, "grad_norm_var": 6.612225534071189e-07, "learning_rate": 0.002518354447803959, "loss": 2.5705, "step": 8651 }, { "crossentropy": 2.5955890417099, "epoch": 0.4704858750917643, "grad_norm": 0.03241805359721184, "grad_norm_var": 5.430872647260142e-07, "learning_rate": 0.002517273539912926, "loss": 2.5956, "step": 8652 }, { "crossentropy": 2.545315146446228, "epoch": 0.4705402539492645, "grad_norm": 0.03366309031844139, "grad_norm_var": 5.756505443908651e-07, "learning_rate": 0.0025161927859982747, "loss": 2.5453, "step": 8653 }, { "crossentropy": 2.5207154750823975, "epoch": 0.4705946328067647, "grad_norm": 0.03495733439922333, "grad_norm_var": 8.291395606723854e-07, "learning_rate": 0.0025151121861270343, "loss": 2.5207, "step": 8654 }, { "crossentropy": 2.414412498474121, "epoch": 0.4706490116642649, "grad_norm": 0.03202382102608681, "grad_norm_var": 8.334175260987324e-07, "learning_rate": 0.0025140317403662166, "loss": 2.4144, "step": 8655 }, { "crossentropy": 2.489852547645569, "epoch": 0.47070339052176513, "grad_norm": 0.03278093412518501, "grad_norm_var": 7.865241090326362e-07, "learning_rate": 0.002512951448782831, "loss": 2.4899, "step": 8656 }, { "crossentropy": 2.5005760192871094, "epoch": 0.47075776937926533, "grad_norm": 0.03302391991019249, "grad_norm_var": 7.838837912388374e-07, "learning_rate": 0.0025118713114438817, "loss": 2.5006, "step": 8657 }, { "crossentropy": 2.6050705909729004, "epoch": 0.47081214823676554, "grad_norm": 0.032873038202524185, "grad_norm_var": 7.835544815444512e-07, "learning_rate": 0.002510791328416352, "loss": 2.6051, "step": 8658 }, { "crossentropy": 2.5071799755096436, "epoch": 0.47086652709426574, "grad_norm": 0.033634934574365616, "grad_norm_var": 7.459816836215156e-07, "learning_rate": 0.0025097114997672232, "loss": 2.5072, "step": 8659 }, { "crossentropy": 2.542224168777466, "epoch": 0.47092090595176594, "grad_norm": 0.030293233692646027, "grad_norm_var": 1.1808182823385786e-06, "learning_rate": 0.002508631825563466, "loss": 2.5422, "step": 8660 }, { "crossentropy": 2.5464717149734497, "epoch": 0.47097528480926615, "grad_norm": 0.031004132702946663, "grad_norm_var": 1.3672607454105842e-06, "learning_rate": 0.00250755230587204, "loss": 2.5465, "step": 8661 }, { "crossentropy": 2.675298810005188, "epoch": 0.47102966366676635, "grad_norm": 0.03459722176194191, "grad_norm_var": 1.4867651545337797e-06, "learning_rate": 0.0025064729407598984, "loss": 2.6753, "step": 8662 }, { "crossentropy": 2.535642385482788, "epoch": 0.47108404252426656, "grad_norm": 0.035264752805233, "grad_norm_var": 1.8438993869572395e-06, "learning_rate": 0.0025053937302939766, "loss": 2.5356, "step": 8663 }, { "crossentropy": 2.680638551712036, "epoch": 0.47113842138176676, "grad_norm": 0.03359917178750038, "grad_norm_var": 1.8725637614014212e-06, "learning_rate": 0.0025043146745412116, "loss": 2.6806, "step": 8664 }, { "crossentropy": 2.5165003538131714, "epoch": 0.47119280023926696, "grad_norm": 0.031417056918144226, "grad_norm_var": 1.8707435290691988e-06, "learning_rate": 0.002503235773568525, "loss": 2.5165, "step": 8665 }, { "crossentropy": 2.6907507181167603, "epoch": 0.47124717909676717, "grad_norm": 0.033251602202653885, "grad_norm_var": 1.8766020101955757e-06, "learning_rate": 0.002502157027442827, "loss": 2.6908, "step": 8666 }, { "crossentropy": 2.6689549684524536, "epoch": 0.47130155795426737, "grad_norm": 0.03276631236076355, "grad_norm_var": 1.8661031895089112e-06, "learning_rate": 0.002501078436231019, "loss": 2.669, "step": 8667 }, { "crossentropy": 2.648499011993408, "epoch": 0.4713559368117676, "grad_norm": 0.03276762738823891, "grad_norm_var": 1.8478730637192976e-06, "learning_rate": 0.0025000000000000014, "loss": 2.6485, "step": 8668 }, { "crossentropy": 2.593273878097534, "epoch": 0.4714103156692678, "grad_norm": 0.03202288970351219, "grad_norm_var": 1.8698824195145877e-06, "learning_rate": 0.00249892171881665, "loss": 2.5933, "step": 8669 }, { "crossentropy": 2.640802264213562, "epoch": 0.471464694526768, "grad_norm": 0.035525865852832794, "grad_norm_var": 2.0466168307748614e-06, "learning_rate": 0.002497843592747842, "loss": 2.6408, "step": 8670 }, { "crossentropy": 2.590069890022278, "epoch": 0.4715190733842682, "grad_norm": 0.034274764358997345, "grad_norm_var": 2.091948886174958e-06, "learning_rate": 0.0024967656218604422, "loss": 2.5901, "step": 8671 }, { "crossentropy": 2.7039719820022583, "epoch": 0.4715734522417684, "grad_norm": 0.033880360424518585, "grad_norm_var": 2.1253273302808495e-06, "learning_rate": 0.0024956878062213045, "loss": 2.704, "step": 8672 }, { "crossentropy": 2.538041353225708, "epoch": 0.4716278310992686, "grad_norm": 0.03269590809941292, "grad_norm_var": 2.1370107137169086e-06, "learning_rate": 0.0024946101458972743, "loss": 2.538, "step": 8673 }, { "crossentropy": 2.6155295372009277, "epoch": 0.4716822099567688, "grad_norm": 0.03533782809972763, "grad_norm_var": 2.43659899623592e-06, "learning_rate": 0.0024935326409551875, "loss": 2.6155, "step": 8674 }, { "crossentropy": 2.6157373189926147, "epoch": 0.471736588814269, "grad_norm": 0.032017387449741364, "grad_norm_var": 2.521605316444066e-06, "learning_rate": 0.0024924552914618687, "loss": 2.6157, "step": 8675 }, { "crossentropy": 2.509481430053711, "epoch": 0.4717909676717692, "grad_norm": 0.03739635646343231, "grad_norm_var": 2.950695744027597e-06, "learning_rate": 0.0024913780974841373, "loss": 2.5095, "step": 8676 }, { "crossentropy": 2.5017718076705933, "epoch": 0.4718453465292694, "grad_norm": 0.034107938408851624, "grad_norm_var": 2.4728500520497057e-06, "learning_rate": 0.0024903010590887954, "loss": 2.5018, "step": 8677 }, { "crossentropy": 2.5716270208358765, "epoch": 0.4718997253867696, "grad_norm": 0.033330921083688736, "grad_norm_var": 2.4397653190594283e-06, "learning_rate": 0.0024892241763426405, "loss": 2.5716, "step": 8678 }, { "crossentropy": 2.6196635961532593, "epoch": 0.4719541042442698, "grad_norm": 0.03271622583270073, "grad_norm_var": 2.32369373734381e-06, "learning_rate": 0.0024881474493124644, "loss": 2.6197, "step": 8679 }, { "crossentropy": 2.555114269256592, "epoch": 0.47200848310177, "grad_norm": 0.031624238938093185, "grad_norm_var": 2.559590648216978e-06, "learning_rate": 0.002487070878065041, "loss": 2.5551, "step": 8680 }, { "crossentropy": 2.56659996509552, "epoch": 0.4720628619592702, "grad_norm": 0.03580833598971367, "grad_norm_var": 2.576944256415849e-06, "learning_rate": 0.002485994462667138, "loss": 2.5666, "step": 8681 }, { "crossentropy": 2.5509713888168335, "epoch": 0.4721172408167704, "grad_norm": 0.03204607591032982, "grad_norm_var": 2.743109727027885e-06, "learning_rate": 0.002484918203185515, "loss": 2.551, "step": 8682 }, { "crossentropy": 2.6938297748565674, "epoch": 0.4721716196742706, "grad_norm": 0.03149022161960602, "grad_norm_var": 2.9943796414781215e-06, "learning_rate": 0.0024838420996869197, "loss": 2.6938, "step": 8683 }, { "crossentropy": 2.511459231376648, "epoch": 0.47222599853177083, "grad_norm": 0.03102090395987034, "grad_norm_var": 3.3708179452875217e-06, "learning_rate": 0.0024827661522380923, "loss": 2.5115, "step": 8684 }, { "crossentropy": 2.569196105003357, "epoch": 0.47228037738927103, "grad_norm": 0.031410276889801025, "grad_norm_var": 3.511333880815712e-06, "learning_rate": 0.0024816903609057614, "loss": 2.5692, "step": 8685 }, { "crossentropy": 2.5189236402511597, "epoch": 0.47233475624677124, "grad_norm": 0.034420888870954514, "grad_norm_var": 3.277051901430271e-06, "learning_rate": 0.0024806147257566457, "loss": 2.5189, "step": 8686 }, { "crossentropy": 2.5509226322174072, "epoch": 0.47238913510427144, "grad_norm": 0.0333506241440773, "grad_norm_var": 3.2163162841942893e-06, "learning_rate": 0.002479539246857456, "loss": 2.5509, "step": 8687 }, { "crossentropy": 2.534317970275879, "epoch": 0.47244351396177164, "grad_norm": 0.03222058340907097, "grad_norm_var": 3.258046577837602e-06, "learning_rate": 0.002478463924274895, "loss": 2.5343, "step": 8688 }, { "crossentropy": 2.586983561515808, "epoch": 0.47249789281927185, "grad_norm": 0.03315398842096329, "grad_norm_var": 3.241156457172674e-06, "learning_rate": 0.0024773887580756466, "loss": 2.587, "step": 8689 }, { "crossentropy": 2.4604806900024414, "epoch": 0.47255227167677205, "grad_norm": 0.03094935044646263, "grad_norm_var": 3.2031642466365336e-06, "learning_rate": 0.002476313748326399, "loss": 2.4605, "step": 8690 }, { "crossentropy": 2.601337194442749, "epoch": 0.47260665053427225, "grad_norm": 0.03149065375328064, "grad_norm_var": 3.285407661892748e-06, "learning_rate": 0.0024752388950938183, "loss": 2.6013, "step": 8691 }, { "crossentropy": 2.5754106044769287, "epoch": 0.47266102939177246, "grad_norm": 0.03408857807517052, "grad_norm_var": 1.98997741248535e-06, "learning_rate": 0.0024741641984445673, "loss": 2.5754, "step": 8692 }, { "crossentropy": 2.5833228826522827, "epoch": 0.47271540824927266, "grad_norm": 0.03358297795057297, "grad_norm_var": 1.908783504858781e-06, "learning_rate": 0.0024730896584452978, "loss": 2.5833, "step": 8693 }, { "crossentropy": 2.4248188734054565, "epoch": 0.47276978710677287, "grad_norm": 0.03217582404613495, "grad_norm_var": 1.8902378033374174e-06, "learning_rate": 0.0024720152751626524, "loss": 2.4248, "step": 8694 }, { "crossentropy": 2.5491198301315308, "epoch": 0.47282416596427307, "grad_norm": 0.03309444338083267, "grad_norm_var": 1.9051978728002686e-06, "learning_rate": 0.002470941048663262, "loss": 2.5491, "step": 8695 }, { "crossentropy": 2.4856395721435547, "epoch": 0.4728785448217733, "grad_norm": 0.03349236398935318, "grad_norm_var": 1.8751645726065945e-06, "learning_rate": 0.0024698669790137505, "loss": 2.4856, "step": 8696 }, { "crossentropy": 2.5958101749420166, "epoch": 0.4729329236792735, "grad_norm": 0.03437909483909607, "grad_norm_var": 1.417593299920837e-06, "learning_rate": 0.00246879306628073, "loss": 2.5958, "step": 8697 }, { "crossentropy": 2.6253212690353394, "epoch": 0.4729873025367737, "grad_norm": 0.033272884786129, "grad_norm_var": 1.4132118688560025e-06, "learning_rate": 0.002467719310530803, "loss": 2.6253, "step": 8698 }, { "crossentropy": 2.561974883079529, "epoch": 0.4730416813942739, "grad_norm": 0.032467518001794815, "grad_norm_var": 1.3120585187047547e-06, "learning_rate": 0.0024666457118305662, "loss": 2.562, "step": 8699 }, { "crossentropy": 2.5185450315475464, "epoch": 0.4730960602517741, "grad_norm": 0.03262649103999138, "grad_norm_var": 1.0953766681124724e-06, "learning_rate": 0.0024655722702465973, "loss": 2.5185, "step": 8700 }, { "crossentropy": 2.552359938621521, "epoch": 0.4731504391092743, "grad_norm": 0.034259114414453506, "grad_norm_var": 1.042059952798396e-06, "learning_rate": 0.002464498985845474, "loss": 2.5524, "step": 8701 }, { "crossentropy": 2.726384997367859, "epoch": 0.4732048179667745, "grad_norm": 0.03449460491538048, "grad_norm_var": 1.0557353311517354e-06, "learning_rate": 0.0024634258586937624, "loss": 2.7264, "step": 8702 }, { "crossentropy": 2.5681698322296143, "epoch": 0.4732591968242747, "grad_norm": 0.031002281233668327, "grad_norm_var": 1.3121289978951782e-06, "learning_rate": 0.002462352888858012, "loss": 2.5682, "step": 8703 }, { "crossentropy": 2.6144847869873047, "epoch": 0.4733135756817749, "grad_norm": 0.030483491718769073, "grad_norm_var": 1.6631605711300007e-06, "learning_rate": 0.0024612800764047694, "loss": 2.6145, "step": 8704 }, { "crossentropy": 2.544817328453064, "epoch": 0.4733679545392751, "grad_norm": 0.03252766653895378, "grad_norm_var": 1.6592317603956181e-06, "learning_rate": 0.0024602074214005697, "loss": 2.5448, "step": 8705 }, { "crossentropy": 2.6244364976882935, "epoch": 0.4734223333967753, "grad_norm": 0.031822800636291504, "grad_norm_var": 1.4943909246417498e-06, "learning_rate": 0.0024591349239119382, "loss": 2.6244, "step": 8706 }, { "crossentropy": 2.472290873527527, "epoch": 0.4734767122542755, "grad_norm": 0.03344469889998436, "grad_norm_var": 1.3843946185660423e-06, "learning_rate": 0.00245806258400539, "loss": 2.4723, "step": 8707 }, { "crossentropy": 2.5962904691696167, "epoch": 0.4735310911117757, "grad_norm": 0.032285988330841064, "grad_norm_var": 1.3140486791684968e-06, "learning_rate": 0.0024569904017474305, "loss": 2.5963, "step": 8708 }, { "crossentropy": 2.6482725143432617, "epoch": 0.4735854699692759, "grad_norm": 0.031698260456323624, "grad_norm_var": 1.3489156185880844e-06, "learning_rate": 0.0024559183772045546, "loss": 2.6483, "step": 8709 }, { "crossentropy": 2.585112452507019, "epoch": 0.4736398488267761, "grad_norm": 0.03248358145356178, "grad_norm_var": 1.3324860870180295e-06, "learning_rate": 0.0024548465104432516, "loss": 2.5851, "step": 8710 }, { "crossentropy": 2.604732632637024, "epoch": 0.4736942276842764, "grad_norm": 0.0321175679564476, "grad_norm_var": 1.3459242802500571e-06, "learning_rate": 0.0024537748015299914, "loss": 2.6047, "step": 8711 }, { "crossentropy": 2.6534225940704346, "epoch": 0.4737486065417766, "grad_norm": 0.03291282430291176, "grad_norm_var": 1.3040386918790887e-06, "learning_rate": 0.0024527032505312456, "loss": 2.6534, "step": 8712 }, { "crossentropy": 2.6335219144821167, "epoch": 0.4738029853992768, "grad_norm": 0.033528488129377365, "grad_norm_var": 1.1522968515833282e-06, "learning_rate": 0.0024516318575134722, "loss": 2.6335, "step": 8713 }, { "crossentropy": 2.5991272926330566, "epoch": 0.473857364256777, "grad_norm": 0.03529646620154381, "grad_norm_var": 1.5926746151587729e-06, "learning_rate": 0.0024505606225431126, "loss": 2.5991, "step": 8714 }, { "crossentropy": 2.5808063745498657, "epoch": 0.4739117431142772, "grad_norm": 0.03197535127401352, "grad_norm_var": 1.6241027691541217e-06, "learning_rate": 0.0024494895456866044, "loss": 2.5808, "step": 8715 }, { "crossentropy": 2.6600507497787476, "epoch": 0.4739661219717774, "grad_norm": 0.031862206757068634, "grad_norm_var": 1.6665711996989198e-06, "learning_rate": 0.002448418627010381, "loss": 2.6601, "step": 8716 }, { "crossentropy": 2.4761979579925537, "epoch": 0.4740205008292776, "grad_norm": 0.03127497807145119, "grad_norm_var": 1.5778076951592958e-06, "learning_rate": 0.002447347866580853, "loss": 2.4762, "step": 8717 }, { "crossentropy": 2.509027600288391, "epoch": 0.4740748796867778, "grad_norm": 0.03445414453744888, "grad_norm_var": 1.5668837407617573e-06, "learning_rate": 0.002446277264464431, "loss": 2.509, "step": 8718 }, { "crossentropy": 2.5602328777313232, "epoch": 0.474129258544278, "grad_norm": 0.03298496454954147, "grad_norm_var": 1.430340109698225e-06, "learning_rate": 0.0024452068207275117, "loss": 2.5602, "step": 8719 }, { "crossentropy": 2.6289758682250977, "epoch": 0.4741836374017782, "grad_norm": 0.03176875412464142, "grad_norm_var": 1.1756638187659465e-06, "learning_rate": 0.002444136535436484, "loss": 2.629, "step": 8720 }, { "crossentropy": 2.6694120168685913, "epoch": 0.4742380162592784, "grad_norm": 0.03272249549627304, "grad_norm_var": 1.1747954329542505e-06, "learning_rate": 0.0024430664086577277, "loss": 2.6694, "step": 8721 }, { "crossentropy": 2.688665509223938, "epoch": 0.4742923951167786, "grad_norm": 0.03124450147151947, "grad_norm_var": 1.260605413495354e-06, "learning_rate": 0.0024419964404576045, "loss": 2.6887, "step": 8722 }, { "crossentropy": 2.5608052015304565, "epoch": 0.4743467739742788, "grad_norm": 0.033674951642751694, "grad_norm_var": 1.2889779353994838e-06, "learning_rate": 0.002440926630902479, "loss": 2.5608, "step": 8723 }, { "crossentropy": 2.3586843013763428, "epoch": 0.474401152831779, "grad_norm": 0.030641408637166023, "grad_norm_var": 1.5362687128489625e-06, "learning_rate": 0.0024398569800587007, "loss": 2.3587, "step": 8724 }, { "crossentropy": 2.667136549949646, "epoch": 0.47445553168927923, "grad_norm": 0.03198692202568054, "grad_norm_var": 1.509077235652351e-06, "learning_rate": 0.002438787487992603, "loss": 2.6671, "step": 8725 }, { "crossentropy": 2.58849835395813, "epoch": 0.47450991054677943, "grad_norm": 0.03193848207592964, "grad_norm_var": 1.5330641007652805e-06, "learning_rate": 0.0024377181547705165, "loss": 2.5885, "step": 8726 }, { "crossentropy": 2.4814332723617554, "epoch": 0.47456428940427964, "grad_norm": 0.030732305720448494, "grad_norm_var": 1.7280730843574043e-06, "learning_rate": 0.0024366489804587645, "loss": 2.4814, "step": 8727 }, { "crossentropy": 2.5898325443267822, "epoch": 0.47461866826177984, "grad_norm": 0.032098203897476196, "grad_norm_var": 1.717915509287284e-06, "learning_rate": 0.0024355799651236505, "loss": 2.5898, "step": 8728 }, { "crossentropy": 2.551036834716797, "epoch": 0.47467304711928004, "grad_norm": 0.033205509185791016, "grad_norm_var": 1.6752584868522912e-06, "learning_rate": 0.002434511108831477, "loss": 2.551, "step": 8729 }, { "crossentropy": 2.5421944856643677, "epoch": 0.47472742597678025, "grad_norm": 0.03185831382870674, "grad_norm_var": 1.070840732860824e-06, "learning_rate": 0.002433442411648532, "loss": 2.5422, "step": 8730 }, { "crossentropy": 2.535807967185974, "epoch": 0.47478180483428045, "grad_norm": 0.031199932098388672, "grad_norm_var": 1.1266290197329068e-06, "learning_rate": 0.002432373873641097, "loss": 2.5358, "step": 8731 }, { "crossentropy": 2.5289872884750366, "epoch": 0.47483618369178066, "grad_norm": 0.032427869737148285, "grad_norm_var": 1.1284660401626618e-06, "learning_rate": 0.0024313054948754413, "loss": 2.529, "step": 8732 }, { "crossentropy": 2.506751537322998, "epoch": 0.47489056254928086, "grad_norm": 0.031420834362506866, "grad_norm_var": 1.1130050718173117e-06, "learning_rate": 0.002430237275417821, "loss": 2.5068, "step": 8733 }, { "crossentropy": 2.5711185932159424, "epoch": 0.47494494140678106, "grad_norm": 0.09280365705490112, "grad_norm_var": 0.00023185034804743289, "learning_rate": 0.0024291692153344904, "loss": 2.5711, "step": 8734 }, { "crossentropy": 2.4901092052459717, "epoch": 0.47499932026428127, "grad_norm": 0.037938568741083145, "grad_norm_var": 0.00023152846112652932, "learning_rate": 0.0024281013146916905, "loss": 2.4901, "step": 8735 }, { "crossentropy": 2.6086106300354004, "epoch": 0.47505369912178147, "grad_norm": 0.033896662294864655, "grad_norm_var": 0.00023058148289115438, "learning_rate": 0.0024270335735556477, "loss": 2.6086, "step": 8736 }, { "crossentropy": 2.610690116882324, "epoch": 0.4751080779792817, "grad_norm": 0.03217834234237671, "grad_norm_var": 0.0002308549735361602, "learning_rate": 0.0024259659919925814, "loss": 2.6107, "step": 8737 }, { "crossentropy": 2.6936819553375244, "epoch": 0.4751624568367819, "grad_norm": 0.03278103843331337, "grad_norm_var": 0.00022998669680235085, "learning_rate": 0.0024248985700687084, "loss": 2.6937, "step": 8738 }, { "crossentropy": 2.5729490518569946, "epoch": 0.4752168356942821, "grad_norm": 0.033569999039173126, "grad_norm_var": 0.00023002410446439398, "learning_rate": 0.002423831307850223, "loss": 2.5729, "step": 8739 }, { "crossentropy": 2.6539312601089478, "epoch": 0.4752712145517823, "grad_norm": 0.03665580227971077, "grad_norm_var": 0.00022775329200959784, "learning_rate": 0.0024227642054033193, "loss": 2.6539, "step": 8740 }, { "crossentropy": 2.562968373298645, "epoch": 0.4753255934092825, "grad_norm": 0.032375551760196686, "grad_norm_var": 0.0002275201563750562, "learning_rate": 0.002421697262794176, "loss": 2.563, "step": 8741 }, { "crossentropy": 2.6129170656204224, "epoch": 0.4753799722667827, "grad_norm": 0.03321664407849312, "grad_norm_var": 0.00022681206373661005, "learning_rate": 0.002420630480088965, "loss": 2.6129, "step": 8742 }, { "crossentropy": 2.576493263244629, "epoch": 0.4754343511242829, "grad_norm": 0.031369298696517944, "grad_norm_var": 0.00022632441962407892, "learning_rate": 0.0024195638573538468, "loss": 2.5765, "step": 8743 }, { "crossentropy": 2.547930359840393, "epoch": 0.4754887299817831, "grad_norm": 0.031092196702957153, "grad_norm_var": 0.00022701998992804464, "learning_rate": 0.0024184973946549727, "loss": 2.5479, "step": 8744 }, { "crossentropy": 2.5004066228866577, "epoch": 0.4755431088392833, "grad_norm": 0.03613356873393059, "grad_norm_var": 0.00022617227698797164, "learning_rate": 0.002417431092058483, "loss": 2.5004, "step": 8745 }, { "crossentropy": 2.6028802394866943, "epoch": 0.4755974876967835, "grad_norm": 0.03262601047754288, "grad_norm_var": 0.00022568973144524804, "learning_rate": 0.0024163649496305112, "loss": 2.6029, "step": 8746 }, { "crossentropy": 2.6005760431289673, "epoch": 0.4756518665542837, "grad_norm": 0.03261653706431389, "grad_norm_var": 0.0002247233409379543, "learning_rate": 0.002415298967437175, "loss": 2.6006, "step": 8747 }, { "crossentropy": 2.666853666305542, "epoch": 0.4757062454117839, "grad_norm": 0.03191268816590309, "grad_norm_var": 0.00022505872640672797, "learning_rate": 0.0024142331455445846, "loss": 2.6669, "step": 8748 }, { "crossentropy": 2.622381091117859, "epoch": 0.4757606242692841, "grad_norm": 0.03394944220781326, "grad_norm_var": 0.00022356496203523724, "learning_rate": 0.0024131674840188484, "loss": 2.6224, "step": 8749 }, { "crossentropy": 2.566866874694824, "epoch": 0.4758150031267843, "grad_norm": 0.03239788860082626, "grad_norm_var": 3.738684868034338e-06, "learning_rate": 0.0024121019829260503, "loss": 2.5669, "step": 8750 }, { "crossentropy": 2.6918028593063354, "epoch": 0.4758693819842845, "grad_norm": 0.03296235576272011, "grad_norm_var": 2.2879005405563105e-06, "learning_rate": 0.0024110366423322756, "loss": 2.6918, "step": 8751 }, { "crossentropy": 2.4340357780456543, "epoch": 0.4759237608417847, "grad_norm": 0.03666018322110176, "grad_norm_var": 3.055675212765054e-06, "learning_rate": 0.002409971462303594, "loss": 2.434, "step": 8752 }, { "crossentropy": 2.5561596155166626, "epoch": 0.47597813969928493, "grad_norm": 0.03463529050350189, "grad_norm_var": 3.071707641421941e-06, "learning_rate": 0.0024089064429060674, "loss": 2.5562, "step": 8753 }, { "crossentropy": 2.590038299560547, "epoch": 0.47603251855678513, "grad_norm": 0.035199373960494995, "grad_norm_var": 3.2264737726688783e-06, "learning_rate": 0.0024078415842057476, "loss": 2.59, "step": 8754 }, { "crossentropy": 2.6624338626861572, "epoch": 0.47608689741428534, "grad_norm": 0.03384951502084732, "grad_norm_var": 3.2307678910566014e-06, "learning_rate": 0.0024067768862686767, "loss": 2.6624, "step": 8755 }, { "crossentropy": 2.5533427000045776, "epoch": 0.47614127627178554, "grad_norm": 0.03264123201370239, "grad_norm_var": 2.6041198267332903e-06, "learning_rate": 0.002405712349160885, "loss": 2.5533, "step": 8756 }, { "crossentropy": 2.457753539085388, "epoch": 0.47619565512928574, "grad_norm": 0.03276805952191353, "grad_norm_var": 2.5626280255504225e-06, "learning_rate": 0.0024046479729483974, "loss": 2.4578, "step": 8757 }, { "crossentropy": 2.655007243156433, "epoch": 0.47625003398678595, "grad_norm": 0.03291301801800728, "grad_norm_var": 2.5748772494855905e-06, "learning_rate": 0.0024035837576972204, "loss": 2.655, "step": 8758 }, { "crossentropy": 2.6723746061325073, "epoch": 0.47630441284428615, "grad_norm": 0.032308515161275864, "grad_norm_var": 2.380977919746776e-06, "learning_rate": 0.0024025197034733565, "loss": 2.6724, "step": 8759 }, { "crossentropy": 2.527235746383667, "epoch": 0.47635879170178635, "grad_norm": 0.031711477786302567, "grad_norm_var": 2.213017950526639e-06, "learning_rate": 0.0024014558103428027, "loss": 2.5272, "step": 8760 }, { "crossentropy": 2.5904852151870728, "epoch": 0.47641317055928656, "grad_norm": 0.03292820602655411, "grad_norm_var": 1.7105313166729716e-06, "learning_rate": 0.0024003920783715344, "loss": 2.5905, "step": 8761 }, { "crossentropy": 2.5709354877471924, "epoch": 0.47646754941678676, "grad_norm": 0.03128299117088318, "grad_norm_var": 1.9358929856457447e-06, "learning_rate": 0.0023993285076255267, "loss": 2.5709, "step": 8762 }, { "crossentropy": 2.5435320138931274, "epoch": 0.47652192827428697, "grad_norm": 0.032300934195518494, "grad_norm_var": 1.965452357420051e-06, "learning_rate": 0.00239826509817074, "loss": 2.5435, "step": 8763 }, { "crossentropy": 2.561176896095276, "epoch": 0.47657630713178717, "grad_norm": 0.03192941099405289, "grad_norm_var": 1.962708038270172e-06, "learning_rate": 0.002397201850073126, "loss": 2.5612, "step": 8764 }, { "crossentropy": 2.54872727394104, "epoch": 0.4766306859892874, "grad_norm": 0.03578121215105057, "grad_norm_var": 2.3670934746512773e-06, "learning_rate": 0.002396138763398627, "loss": 2.5487, "step": 8765 }, { "crossentropy": 2.7738994359970093, "epoch": 0.4766850648467876, "grad_norm": 0.05829690024256706, "grad_norm_var": 4.128880603710596e-05, "learning_rate": 0.002395075838213174, "loss": 2.7739, "step": 8766 }, { "crossentropy": 2.5355066061019897, "epoch": 0.4767394437042878, "grad_norm": 0.033077117055654526, "grad_norm_var": 4.126020152294726e-05, "learning_rate": 0.0023940130745826894, "loss": 2.5355, "step": 8767 }, { "crossentropy": 2.579948902130127, "epoch": 0.476793822561788, "grad_norm": 0.03162621334195137, "grad_norm_var": 4.1657687123781e-05, "learning_rate": 0.002392950472573084, "loss": 2.5799, "step": 8768 }, { "crossentropy": 2.657760977745056, "epoch": 0.4768482014192882, "grad_norm": 0.032529670745134354, "grad_norm_var": 4.191873075947312e-05, "learning_rate": 0.002391888032250262, "loss": 2.6578, "step": 8769 }, { "crossentropy": 2.5385334491729736, "epoch": 0.4769025802767884, "grad_norm": 0.03474677726626396, "grad_norm_var": 4.1886099817327735e-05, "learning_rate": 0.002390825753680109, "loss": 2.5385, "step": 8770 }, { "crossentropy": 2.5150654315948486, "epoch": 0.4769569591342886, "grad_norm": 0.0329609140753746, "grad_norm_var": 4.200282878838672e-05, "learning_rate": 0.002389763636928513, "loss": 2.5151, "step": 8771 }, { "crossentropy": 2.6793015003204346, "epoch": 0.4770113379917888, "grad_norm": 0.03488743677735329, "grad_norm_var": 4.180260954904438e-05, "learning_rate": 0.0023887016820613415, "loss": 2.6793, "step": 8772 }, { "crossentropy": 2.6583685874938965, "epoch": 0.477065716849289, "grad_norm": 0.03460514172911644, "grad_norm_var": 4.158856213573592e-05, "learning_rate": 0.0023876398891444573, "loss": 2.6584, "step": 8773 }, { "crossentropy": 2.63930344581604, "epoch": 0.4771200957067892, "grad_norm": 0.03266270086169243, "grad_norm_var": 4.1649378831413315e-05, "learning_rate": 0.002386578258243712, "loss": 2.6393, "step": 8774 }, { "crossentropy": 2.5753999948501587, "epoch": 0.4771744745642894, "grad_norm": 0.03302428871393204, "grad_norm_var": 4.1462495872115976e-05, "learning_rate": 0.002385516789424946, "loss": 2.5754, "step": 8775 }, { "crossentropy": 2.5379743576049805, "epoch": 0.4772288534217896, "grad_norm": 0.032991740852594376, "grad_norm_var": 4.106384568555854e-05, "learning_rate": 0.0023844554827539915, "loss": 2.538, "step": 8776 }, { "crossentropy": 2.506746530532837, "epoch": 0.4772832322792898, "grad_norm": 0.033323753625154495, "grad_norm_var": 4.0978757619516046e-05, "learning_rate": 0.0023833943382966694, "loss": 2.5067, "step": 8777 }, { "crossentropy": 2.4245612621307373, "epoch": 0.47733761113679, "grad_norm": 0.03297511115670204, "grad_norm_var": 4.0375115739225453e-05, "learning_rate": 0.002382333356118791, "loss": 2.4246, "step": 8778 }, { "crossentropy": 2.508251428604126, "epoch": 0.4773919899942902, "grad_norm": 0.03311401978135109, "grad_norm_var": 4.0139278674374236e-05, "learning_rate": 0.0023812725362861577, "loss": 2.5083, "step": 8779 }, { "crossentropy": 2.4453768730163574, "epoch": 0.4774463688517904, "grad_norm": 0.03233864903450012, "grad_norm_var": 3.998720394913872e-05, "learning_rate": 0.0023802118788645616, "loss": 2.4454, "step": 8780 }, { "crossentropy": 2.5190504789352417, "epoch": 0.47750074770929063, "grad_norm": 0.03198935464024544, "grad_norm_var": 4.0457431703730336e-05, "learning_rate": 0.0023791513839197787, "loss": 2.5191, "step": 8781 }, { "crossentropy": 2.489630341529846, "epoch": 0.47755512656679083, "grad_norm": 0.03178456053137779, "grad_norm_var": 9.633102385677694e-07, "learning_rate": 0.002378091051517586, "loss": 2.4896, "step": 8782 }, { "crossentropy": 2.644059896469116, "epoch": 0.47760950542429104, "grad_norm": 0.031527817249298096, "grad_norm_var": 1.1056305435510246e-06, "learning_rate": 0.002377030881723743, "loss": 2.6441, "step": 8783 }, { "crossentropy": 2.535805344581604, "epoch": 0.47766388428179124, "grad_norm": 0.03224761039018631, "grad_norm_var": 1.0206635123863675e-06, "learning_rate": 0.0023759708746039975, "loss": 2.5358, "step": 8784 }, { "crossentropy": 2.6212886571884155, "epoch": 0.47771826313929144, "grad_norm": 0.03198545053601265, "grad_norm_var": 1.071985596377313e-06, "learning_rate": 0.0023749110302240927, "loss": 2.6213, "step": 8785 }, { "crossentropy": 2.615757465362549, "epoch": 0.47777264199679165, "grad_norm": 0.030564630404114723, "grad_norm_var": 1.162006043658537e-06, "learning_rate": 0.002373851348649758, "loss": 2.6158, "step": 8786 }, { "crossentropy": 2.6108309030532837, "epoch": 0.47782702085429185, "grad_norm": 0.031299035996198654, "grad_norm_var": 1.2738039161437243e-06, "learning_rate": 0.002372791829946714, "loss": 2.6108, "step": 8787 }, { "crossentropy": 2.5846818685531616, "epoch": 0.47788139971179205, "grad_norm": 0.03361330181360245, "grad_norm_var": 9.837080826268953e-07, "learning_rate": 0.002371732474180672, "loss": 2.5847, "step": 8788 }, { "crossentropy": 2.588340401649475, "epoch": 0.47793577856929226, "grad_norm": 0.03194616734981537, "grad_norm_var": 6.803015455273161e-07, "learning_rate": 0.002370673281417331, "loss": 2.5883, "step": 8789 }, { "crossentropy": 2.6691235303878784, "epoch": 0.47799015742679246, "grad_norm": 0.03287084028124809, "grad_norm_var": 6.920546000902291e-07, "learning_rate": 0.0023696142517223824, "loss": 2.6691, "step": 8790 }, { "crossentropy": 2.582574248313904, "epoch": 0.47804453628429266, "grad_norm": 0.03132924810051918, "grad_norm_var": 7.191825602479897e-07, "learning_rate": 0.0023685553851615073, "loss": 2.5826, "step": 8791 }, { "crossentropy": 2.6108559370040894, "epoch": 0.47809891514179287, "grad_norm": 0.03292878717184067, "grad_norm_var": 7.1315243213899e-07, "learning_rate": 0.002367496681800372, "loss": 2.6109, "step": 8792 }, { "crossentropy": 2.665704846382141, "epoch": 0.47815329399929307, "grad_norm": 0.030238520354032516, "grad_norm_var": 8.622085253971358e-07, "learning_rate": 0.002366438141704636, "loss": 2.6657, "step": 8793 }, { "crossentropy": 2.6568058729171753, "epoch": 0.4782076728567933, "grad_norm": 0.032530754804611206, "grad_norm_var": 8.195651234922518e-07, "learning_rate": 0.002365379764939955, "loss": 2.6568, "step": 8794 }, { "crossentropy": 2.50271213054657, "epoch": 0.4782620517142935, "grad_norm": 0.03231314942240715, "grad_norm_var": 7.427547127198391e-07, "learning_rate": 0.002364321551571963, "loss": 2.5027, "step": 8795 }, { "crossentropy": 2.590876340866089, "epoch": 0.4783164305717937, "grad_norm": 0.033879708498716354, "grad_norm_var": 9.67087412543606e-07, "learning_rate": 0.0023632635016662914, "loss": 2.5909, "step": 8796 }, { "crossentropy": 2.604430913925171, "epoch": 0.4783708094292939, "grad_norm": 0.03325580433011055, "grad_norm_var": 1.0544630416991676e-06, "learning_rate": 0.0023622056152885584, "loss": 2.6044, "step": 8797 }, { "crossentropy": 2.5788443088531494, "epoch": 0.4784251882867941, "grad_norm": 0.030643422156572342, "grad_norm_var": 1.1906479839928249e-06, "learning_rate": 0.0023611478925043743, "loss": 2.5788, "step": 8798 }, { "crossentropy": 2.6275309324264526, "epoch": 0.4784795671442943, "grad_norm": 0.03274720534682274, "grad_norm_var": 1.194877613104013e-06, "learning_rate": 0.0023600903333793398, "loss": 2.6275, "step": 8799 }, { "crossentropy": 2.6117345094680786, "epoch": 0.4785339460017945, "grad_norm": 0.03149079531431198, "grad_norm_var": 1.2207858135901504e-06, "learning_rate": 0.0023590329379790382, "loss": 2.6117, "step": 8800 }, { "crossentropy": 2.745244860649109, "epoch": 0.4785883248592947, "grad_norm": 0.03143015131354332, "grad_norm_var": 1.248709763698484e-06, "learning_rate": 0.002357975706369053, "loss": 2.7452, "step": 8801 }, { "crossentropy": 2.6091222763061523, "epoch": 0.4786427037167949, "grad_norm": 0.031766898930072784, "grad_norm_var": 1.0981214003604375e-06, "learning_rate": 0.0023569186386149534, "loss": 2.6091, "step": 8802 }, { "crossentropy": 2.5088231563568115, "epoch": 0.4786970825742951, "grad_norm": 0.031130054965615273, "grad_norm_var": 1.1189153230224072e-06, "learning_rate": 0.0023558617347822935, "loss": 2.5088, "step": 8803 }, { "crossentropy": 2.479746103286743, "epoch": 0.4787514614317953, "grad_norm": 0.031392667442560196, "grad_norm_var": 9.885777593503318e-07, "learning_rate": 0.0023548049949366225, "loss": 2.4797, "step": 8804 }, { "crossentropy": 2.601642370223999, "epoch": 0.4788058402892955, "grad_norm": 0.03293510153889656, "grad_norm_var": 1.0434760363655133e-06, "learning_rate": 0.0023537484191434826, "loss": 2.6016, "step": 8805 }, { "crossentropy": 2.6372640132904053, "epoch": 0.4788602191467957, "grad_norm": 0.03185168653726578, "grad_norm_var": 9.975573726948937e-07, "learning_rate": 0.002352692007468397, "loss": 2.6373, "step": 8806 }, { "crossentropy": 2.6204166412353516, "epoch": 0.4789145980042959, "grad_norm": 0.03149246796965599, "grad_norm_var": 9.84810121804749e-07, "learning_rate": 0.002351635759976885, "loss": 2.6204, "step": 8807 }, { "crossentropy": 2.514963150024414, "epoch": 0.4789689768617961, "grad_norm": 0.03176988288760185, "grad_norm_var": 9.254970478381746e-07, "learning_rate": 0.0023505796767344547, "loss": 2.515, "step": 8808 }, { "crossentropy": 2.6102561950683594, "epoch": 0.47902335571929633, "grad_norm": 0.03445587679743767, "grad_norm_var": 1.086397180216506e-06, "learning_rate": 0.002349523757806602, "loss": 2.6103, "step": 8809 }, { "crossentropy": 2.6202231645584106, "epoch": 0.47907773457679653, "grad_norm": 0.03522586077451706, "grad_norm_var": 1.6617965415739126e-06, "learning_rate": 0.0023484680032588153, "loss": 2.6202, "step": 8810 }, { "crossentropy": 2.597733497619629, "epoch": 0.47913211343429674, "grad_norm": 0.03308304026722908, "grad_norm_var": 1.6938999496071317e-06, "learning_rate": 0.0023474124131565707, "loss": 2.5977, "step": 8811 }, { "crossentropy": 2.575605034828186, "epoch": 0.47918649229179694, "grad_norm": 0.03327267989516258, "grad_norm_var": 1.5979287456878058e-06, "learning_rate": 0.002346356987565335, "loss": 2.5756, "step": 8812 }, { "crossentropy": 2.4577391147613525, "epoch": 0.47924087114929714, "grad_norm": 0.031782474368810654, "grad_norm_var": 1.5598763393830413e-06, "learning_rate": 0.0023453017265505673, "loss": 2.4577, "step": 8813 }, { "crossentropy": 2.531318187713623, "epoch": 0.47929525000679735, "grad_norm": 0.031524982303380966, "grad_norm_var": 1.416154039307948e-06, "learning_rate": 0.00234424663017771, "loss": 2.5313, "step": 8814 }, { "crossentropy": 2.5658087730407715, "epoch": 0.47934962886429755, "grad_norm": 0.03255434334278107, "grad_norm_var": 1.4078658045427734e-06, "learning_rate": 0.0023431916985121983, "loss": 2.5658, "step": 8815 }, { "crossentropy": 2.557629108428955, "epoch": 0.47940400772179775, "grad_norm": 0.03323712944984436, "grad_norm_var": 1.404828149402963e-06, "learning_rate": 0.002342136931619464, "loss": 2.5576, "step": 8816 }, { "crossentropy": 2.636334180831909, "epoch": 0.47945838657929796, "grad_norm": 0.0326155349612236, "grad_norm_var": 1.3343719047931157e-06, "learning_rate": 0.002341082329564918, "loss": 2.6363, "step": 8817 }, { "crossentropy": 2.5989441871643066, "epoch": 0.47951276543679816, "grad_norm": 0.03192265331745148, "grad_norm_var": 1.3205459231575006e-06, "learning_rate": 0.0023400278924139667, "loss": 2.5989, "step": 8818 }, { "crossentropy": 2.6358141899108887, "epoch": 0.47956714429429836, "grad_norm": 0.03088299185037613, "grad_norm_var": 1.3699966983790035e-06, "learning_rate": 0.0023389736202320053, "loss": 2.6358, "step": 8819 }, { "crossentropy": 2.6175044775009155, "epoch": 0.47962152315179857, "grad_norm": 0.03105471283197403, "grad_norm_var": 1.427030352740601e-06, "learning_rate": 0.00233791951308442, "loss": 2.6175, "step": 8820 }, { "crossentropy": 2.5524879693984985, "epoch": 0.47967590200929877, "grad_norm": 0.0337705984711647, "grad_norm_var": 1.5214862923481598e-06, "learning_rate": 0.002336865571036584, "loss": 2.5525, "step": 8821 }, { "crossentropy": 2.521877646446228, "epoch": 0.479730280866799, "grad_norm": 0.031809449195861816, "grad_norm_var": 1.5254237669514703e-06, "learning_rate": 0.0023358117941538626, "loss": 2.5219, "step": 8822 }, { "crossentropy": 2.6768561601638794, "epoch": 0.4797846597242992, "grad_norm": 0.033178940415382385, "grad_norm_var": 1.470238905581835e-06, "learning_rate": 0.00233475818250161, "loss": 2.6769, "step": 8823 }, { "crossentropy": 2.624660611152649, "epoch": 0.4798390385817994, "grad_norm": 0.03303498029708862, "grad_norm_var": 1.4245394296465962e-06, "learning_rate": 0.00233370473614517, "loss": 2.6247, "step": 8824 }, { "crossentropy": 2.6824748516082764, "epoch": 0.4798934174392996, "grad_norm": 0.03285666182637215, "grad_norm_var": 1.2127278320723036e-06, "learning_rate": 0.0023326514551498785, "loss": 2.6825, "step": 8825 }, { "crossentropy": 2.5931732654571533, "epoch": 0.4799477962967998, "grad_norm": 0.03360993042588234, "grad_norm_var": 8.129566124448484e-07, "learning_rate": 0.002331598339581053, "loss": 2.5932, "step": 8826 }, { "crossentropy": 2.324093222618103, "epoch": 0.4800021751543, "grad_norm": 0.03334498405456543, "grad_norm_var": 8.371910392698222e-07, "learning_rate": 0.002330545389504015, "loss": 2.3241, "step": 8827 }, { "crossentropy": 2.567172884941101, "epoch": 0.4800565540118002, "grad_norm": 0.03281467407941818, "grad_norm_var": 8.048451888321984e-07, "learning_rate": 0.0023294926049840605, "loss": 2.5672, "step": 8828 }, { "crossentropy": 2.53537118434906, "epoch": 0.4801109328693004, "grad_norm": 0.03126974403858185, "grad_norm_var": 8.703077293845465e-07, "learning_rate": 0.0023284399860864853, "loss": 2.5354, "step": 8829 }, { "crossentropy": 2.5550631284713745, "epoch": 0.4801653117268006, "grad_norm": 0.034952472895383835, "grad_norm_var": 1.1737430554191349e-06, "learning_rate": 0.0023273875328765716, "loss": 2.5551, "step": 8830 }, { "crossentropy": 2.554668664932251, "epoch": 0.4802196905843008, "grad_norm": 0.03509622439742088, "grad_norm_var": 1.534347009984662e-06, "learning_rate": 0.0023263352454195915, "loss": 2.5547, "step": 8831 }, { "crossentropy": 2.6866400241851807, "epoch": 0.480274069441801, "grad_norm": 0.030742377042770386, "grad_norm_var": 1.791478112857468e-06, "learning_rate": 0.0023252831237808064, "loss": 2.6866, "step": 8832 }, { "crossentropy": 2.4580122232437134, "epoch": 0.4803284482993012, "grad_norm": 0.03125292807817459, "grad_norm_var": 1.9201073208313196e-06, "learning_rate": 0.0023242311680254685, "loss": 2.458, "step": 8833 }, { "crossentropy": 2.7198050022125244, "epoch": 0.4803828271568014, "grad_norm": 0.033167582005262375, "grad_norm_var": 1.9045985700377165e-06, "learning_rate": 0.0023231793782188194, "loss": 2.7198, "step": 8834 }, { "crossentropy": 2.4535462856292725, "epoch": 0.4804372060143016, "grad_norm": 0.033865462988615036, "grad_norm_var": 1.746953875292103e-06, "learning_rate": 0.0023221277544260887, "loss": 2.4535, "step": 8835 }, { "crossentropy": 2.5302035808563232, "epoch": 0.4804915848718018, "grad_norm": 0.03271166980266571, "grad_norm_var": 1.5188580009419435e-06, "learning_rate": 0.002321076296712501, "loss": 2.5302, "step": 8836 }, { "crossentropy": 2.518784523010254, "epoch": 0.48054596372930203, "grad_norm": 0.03394931182265282, "grad_norm_var": 1.5399927098896349e-06, "learning_rate": 0.00232002500514326, "loss": 2.5188, "step": 8837 }, { "crossentropy": 2.639063596725464, "epoch": 0.48060034258680223, "grad_norm": 0.05636269599199295, "grad_norm_var": 3.5391379072362225e-05, "learning_rate": 0.002318973879783571, "loss": 2.6391, "step": 8838 }, { "crossentropy": 2.4828968048095703, "epoch": 0.48065472144430244, "grad_norm": 0.03215022385120392, "grad_norm_var": 3.564052537507878e-05, "learning_rate": 0.002317922920698624, "loss": 2.4829, "step": 8839 }, { "crossentropy": 2.5919758081436157, "epoch": 0.48070910030180264, "grad_norm": 0.03360619395971298, "grad_norm_var": 3.5553233758041776e-05, "learning_rate": 0.002316872127953596, "loss": 2.592, "step": 8840 }, { "crossentropy": 2.609531879425049, "epoch": 0.48076347915930284, "grad_norm": 0.03535742685198784, "grad_norm_var": 3.5401295395348585e-05, "learning_rate": 0.0023158215016136565, "loss": 2.6095, "step": 8841 }, { "crossentropy": 2.4993748664855957, "epoch": 0.48081785801680305, "grad_norm": 0.03282352536916733, "grad_norm_var": 3.554804548224118e-05, "learning_rate": 0.0023147710417439657, "loss": 2.4994, "step": 8842 }, { "crossentropy": 2.6537840366363525, "epoch": 0.48087223687430325, "grad_norm": 0.0323636569082737, "grad_norm_var": 3.577136042520231e-05, "learning_rate": 0.0023137207484096706, "loss": 2.6538, "step": 8843 }, { "crossentropy": 2.5906906127929688, "epoch": 0.48092661573180345, "grad_norm": 0.03149373084306717, "grad_norm_var": 3.618259712394502e-05, "learning_rate": 0.002312670621675911, "loss": 2.5907, "step": 8844 }, { "crossentropy": 2.5487178564071655, "epoch": 0.48098099458930366, "grad_norm": 0.03255247697234154, "grad_norm_var": 3.574188402446036e-05, "learning_rate": 0.0023116206616078136, "loss": 2.5487, "step": 8845 }, { "crossentropy": 2.5797066688537598, "epoch": 0.48103537344680386, "grad_norm": 0.03260507807135582, "grad_norm_var": 3.595342057777229e-05, "learning_rate": 0.0023105708682704964, "loss": 2.5797, "step": 8846 }, { "crossentropy": 2.476075053215027, "epoch": 0.48108975230430406, "grad_norm": 0.03231995925307274, "grad_norm_var": 3.6170500524679294e-05, "learning_rate": 0.0023095212417290683, "loss": 2.4761, "step": 8847 }, { "crossentropy": 2.5181000232696533, "epoch": 0.48114413116180427, "grad_norm": 0.03254538029432297, "grad_norm_var": 3.554059515401345e-05, "learning_rate": 0.002308471782048621, "loss": 2.5181, "step": 8848 }, { "crossentropy": 2.621568202972412, "epoch": 0.48119851001930447, "grad_norm": 0.032646503299474716, "grad_norm_var": 3.509199586190174e-05, "learning_rate": 0.0023074224892942466, "loss": 2.6216, "step": 8849 }, { "crossentropy": 2.686271905899048, "epoch": 0.4812528888768047, "grad_norm": 0.03244265541434288, "grad_norm_var": 3.524469268113556e-05, "learning_rate": 0.002306373363531021, "loss": 2.6863, "step": 8850 }, { "crossentropy": 2.57254421710968, "epoch": 0.4813072677343049, "grad_norm": 0.0331767238676548, "grad_norm_var": 3.531996088903481e-05, "learning_rate": 0.0023053244048240056, "loss": 2.5725, "step": 8851 }, { "crossentropy": 2.51213002204895, "epoch": 0.4813616465918051, "grad_norm": 0.03175603225827217, "grad_norm_var": 3.558186751197248e-05, "learning_rate": 0.0023042756132382568, "loss": 2.5121, "step": 8852 }, { "crossentropy": 2.56387996673584, "epoch": 0.4814160254493053, "grad_norm": 0.032211970537900925, "grad_norm_var": 3.584236222741154e-05, "learning_rate": 0.0023032269888388253, "loss": 2.5639, "step": 8853 }, { "crossentropy": 2.582181215286255, "epoch": 0.4814704043068055, "grad_norm": 0.03237605467438698, "grad_norm_var": 7.640796369963339e-07, "learning_rate": 0.0023021785316907394, "loss": 2.5822, "step": 8854 }, { "crossentropy": 2.6956013441085815, "epoch": 0.4815247831643057, "grad_norm": 0.03183316811919212, "grad_norm_var": 7.915628965728579e-07, "learning_rate": 0.0023011302418590265, "loss": 2.6956, "step": 8855 }, { "crossentropy": 2.594281554222107, "epoch": 0.4815791620218059, "grad_norm": 0.03192732855677605, "grad_norm_var": 7.496321679104426e-07, "learning_rate": 0.0023000821194086992, "loss": 2.5943, "step": 8856 }, { "crossentropy": 2.4336217641830444, "epoch": 0.4816335408793061, "grad_norm": 0.03301510587334633, "grad_norm_var": 2.0856088215239164e-07, "learning_rate": 0.002299034164404762, "loss": 2.4336, "step": 8857 }, { "crossentropy": 2.5816218852996826, "epoch": 0.4816879197368063, "grad_norm": 0.031142042949795723, "grad_norm_var": 2.859659993520071e-07, "learning_rate": 0.0022979863769122095, "loss": 2.5816, "step": 8858 }, { "crossentropy": 2.6026906967163086, "epoch": 0.4817422985943065, "grad_norm": 0.03129400312900543, "grad_norm_var": 3.4490178957266987e-07, "learning_rate": 0.0022969387569960194, "loss": 2.6027, "step": 8859 }, { "crossentropy": 2.6664355993270874, "epoch": 0.48179667745180677, "grad_norm": 0.033616241067647934, "grad_norm_var": 4.2414758473715135e-07, "learning_rate": 0.002295891304721169, "loss": 2.6664, "step": 8860 }, { "crossentropy": 2.544631838798523, "epoch": 0.48185105630930697, "grad_norm": 0.03150329366326332, "grad_norm_var": 4.634042436013937e-07, "learning_rate": 0.002294844020152621, "loss": 2.5446, "step": 8861 }, { "crossentropy": 2.609142303466797, "epoch": 0.4819054351668072, "grad_norm": 0.031242884695529938, "grad_norm_var": 5.19557739151706e-07, "learning_rate": 0.002293796903355323, "loss": 2.6091, "step": 8862 }, { "crossentropy": 2.669127941131592, "epoch": 0.4819598140243074, "grad_norm": 0.03420257940888405, "grad_norm_var": 7.735491087309017e-07, "learning_rate": 0.002292749954394216, "loss": 2.6691, "step": 8863 }, { "crossentropy": 2.553987503051758, "epoch": 0.4820141928818076, "grad_norm": 0.03225523978471756, "grad_norm_var": 7.696368950156484e-07, "learning_rate": 0.0022917031733342363, "loss": 2.554, "step": 8864 }, { "crossentropy": 2.4927386045455933, "epoch": 0.4820685717393078, "grad_norm": 0.03148134797811508, "grad_norm_var": 7.991195912389192e-07, "learning_rate": 0.0022906565602402998, "loss": 2.4927, "step": 8865 }, { "crossentropy": 2.5405198335647583, "epoch": 0.482122950596808, "grad_norm": 0.0313241183757782, "grad_norm_var": 8.437046000626801e-07, "learning_rate": 0.002289610115177317, "loss": 2.5405, "step": 8866 }, { "crossentropy": 2.5767712593078613, "epoch": 0.4821773294543082, "grad_norm": 0.031667351722717285, "grad_norm_var": 7.789379965814399e-07, "learning_rate": 0.0022885638382101883, "loss": 2.5768, "step": 8867 }, { "crossentropy": 2.5445717573165894, "epoch": 0.4822317083118084, "grad_norm": 0.033510949462652206, "grad_norm_var": 9.019230676514938e-07, "learning_rate": 0.002287517729403802, "loss": 2.5446, "step": 8868 }, { "crossentropy": 2.532157778739929, "epoch": 0.4822860871693086, "grad_norm": 0.03338807821273804, "grad_norm_var": 9.960965198518018e-07, "learning_rate": 0.002286471788823039, "loss": 2.5322, "step": 8869 }, { "crossentropy": 2.4842864274978638, "epoch": 0.4823404660268088, "grad_norm": 0.031735748052597046, "grad_norm_var": 1.009784208023815e-06, "learning_rate": 0.0022854260165327627, "loss": 2.4843, "step": 8870 }, { "crossentropy": 2.5591315031051636, "epoch": 0.482394844884309, "grad_norm": 0.03170851245522499, "grad_norm_var": 1.0167895536596865e-06, "learning_rate": 0.0022843804125978356, "loss": 2.5591, "step": 8871 }, { "crossentropy": 2.6594462394714355, "epoch": 0.4824492237418092, "grad_norm": 0.03223089873790741, "grad_norm_var": 1.0119810219202334e-06, "learning_rate": 0.002283334977083105, "loss": 2.6594, "step": 8872 }, { "crossentropy": 2.503114938735962, "epoch": 0.4825036025993094, "grad_norm": 0.032304808497428894, "grad_norm_var": 9.670187950241848e-07, "learning_rate": 0.0022822897100534057, "loss": 2.5031, "step": 8873 }, { "crossentropy": 2.609576106071472, "epoch": 0.4825579814568096, "grad_norm": 0.03197012096643448, "grad_norm_var": 8.971509139633788e-07, "learning_rate": 0.0022812446115735617, "loss": 2.6096, "step": 8874 }, { "crossentropy": 2.53876268863678, "epoch": 0.4826123603143098, "grad_norm": 0.030865570530295372, "grad_norm_var": 9.612207621031611e-07, "learning_rate": 0.002280199681708397, "loss": 2.5388, "step": 8875 }, { "crossentropy": 2.614386796951294, "epoch": 0.48266673917181, "grad_norm": 0.03256732225418091, "grad_norm_var": 8.30235081542456e-07, "learning_rate": 0.0022791549205227104, "loss": 2.6144, "step": 8876 }, { "crossentropy": 2.3205596208572388, "epoch": 0.4827211180293102, "grad_norm": 0.031995777040719986, "grad_norm_var": 8.047388031784252e-07, "learning_rate": 0.002278110328081298, "loss": 2.3206, "step": 8877 }, { "crossentropy": 2.5213228464126587, "epoch": 0.48277549688681043, "grad_norm": 0.031921494752168655, "grad_norm_var": 7.51153617618488e-07, "learning_rate": 0.0022770659044489463, "loss": 2.5213, "step": 8878 }, { "crossentropy": 2.505875587463379, "epoch": 0.48282987574431063, "grad_norm": 0.0358792319893837, "grad_norm_var": 1.3755145407928163e-06, "learning_rate": 0.0022760216496904287, "loss": 2.5059, "step": 8879 }, { "crossentropy": 2.620452046394348, "epoch": 0.48288425460181084, "grad_norm": 0.03181793540716171, "grad_norm_var": 1.3901005261625673e-06, "learning_rate": 0.00227497756387051, "loss": 2.6205, "step": 8880 }, { "crossentropy": 2.52947735786438, "epoch": 0.48293863345931104, "grad_norm": 0.03769376873970032, "grad_norm_var": 3.146427300785533e-06, "learning_rate": 0.002273933647053939, "loss": 2.5295, "step": 8881 }, { "crossentropy": 2.50112247467041, "epoch": 0.48299301231681124, "grad_norm": 0.03248222544789314, "grad_norm_var": 3.0237645551853812e-06, "learning_rate": 0.0022728898993054636, "loss": 2.5011, "step": 8882 }, { "crossentropy": 2.5651196241378784, "epoch": 0.48304739117431145, "grad_norm": 0.033873192965984344, "grad_norm_var": 3.0142361044332383e-06, "learning_rate": 0.0022718463206898155, "loss": 2.5651, "step": 8883 }, { "crossentropy": 2.651377558708191, "epoch": 0.48310177003181165, "grad_norm": 0.03123888187110424, "grad_norm_var": 3.143193947272432e-06, "learning_rate": 0.0022708029112717136, "loss": 2.6514, "step": 8884 }, { "crossentropy": 2.6394609212875366, "epoch": 0.48315614888931185, "grad_norm": 0.030630867928266525, "grad_norm_var": 3.3762561913809147e-06, "learning_rate": 0.002269759671115869, "loss": 2.6395, "step": 8885 }, { "crossentropy": 2.6365586519241333, "epoch": 0.48321052774681206, "grad_norm": 0.031057795509696007, "grad_norm_var": 3.4792430179357196e-06, "learning_rate": 0.002268716600286988, "loss": 2.6366, "step": 8886 }, { "crossentropy": 2.689265012741089, "epoch": 0.48326490660431226, "grad_norm": 0.03137921914458275, "grad_norm_var": 3.5214252320940464e-06, "learning_rate": 0.002267673698849755, "loss": 2.6893, "step": 8887 }, { "crossentropy": 2.507866859436035, "epoch": 0.48331928546181246, "grad_norm": 0.03180505335330963, "grad_norm_var": 3.5477161183135176e-06, "learning_rate": 0.002266630966868852, "loss": 2.5079, "step": 8888 }, { "crossentropy": 2.634261727333069, "epoch": 0.48337366431931267, "grad_norm": 0.03221152722835541, "grad_norm_var": 3.55028597060282e-06, "learning_rate": 0.002265588404408948, "loss": 2.6343, "step": 8889 }, { "crossentropy": 2.5400220155715942, "epoch": 0.48342804317681287, "grad_norm": 0.03268350288271904, "grad_norm_var": 3.535318710938294e-06, "learning_rate": 0.0022645460115347017, "loss": 2.54, "step": 8890 }, { "crossentropy": 2.607455253601074, "epoch": 0.4834824220343131, "grad_norm": 0.03261579945683479, "grad_norm_var": 3.3438506437828233e-06, "learning_rate": 0.0022635037883107618, "loss": 2.6075, "step": 8891 }, { "crossentropy": 2.570339798927307, "epoch": 0.4835368008918133, "grad_norm": 0.034337908029556274, "grad_norm_var": 3.5283302392677406e-06, "learning_rate": 0.0022624617348017658, "loss": 2.5703, "step": 8892 }, { "crossentropy": 2.6080485582351685, "epoch": 0.4835911797493135, "grad_norm": 0.032545026391744614, "grad_norm_var": 3.4936708706738428e-06, "learning_rate": 0.0022614198510723404, "loss": 2.608, "step": 8893 }, { "crossentropy": 2.5286840200424194, "epoch": 0.4836455586068137, "grad_norm": 0.03404480591416359, "grad_norm_var": 3.5378236868807237e-06, "learning_rate": 0.0022603781371871045, "loss": 2.5287, "step": 8894 }, { "crossentropy": 2.5129843950271606, "epoch": 0.4836999374643139, "grad_norm": 0.039425477385520935, "grad_norm_var": 5.73554455336143e-06, "learning_rate": 0.0022593365932106614, "loss": 2.513, "step": 8895 }, { "crossentropy": 2.5511651039123535, "epoch": 0.4837543163218141, "grad_norm": 0.03257499635219574, "grad_norm_var": 5.640419441509809e-06, "learning_rate": 0.002258295219207605, "loss": 2.5512, "step": 8896 }, { "crossentropy": 2.4549375772476196, "epoch": 0.4838086951793143, "grad_norm": 0.03484785556793213, "grad_norm_var": 4.427208969986357e-06, "learning_rate": 0.002257254015242527, "loss": 2.4549, "step": 8897 }, { "crossentropy": 2.580819606781006, "epoch": 0.4838630740368145, "grad_norm": 0.030938006937503815, "grad_norm_var": 4.679690846649762e-06, "learning_rate": 0.002256212981379996, "loss": 2.5808, "step": 8898 }, { "crossentropy": 2.51665997505188, "epoch": 0.4839174528943147, "grad_norm": 0.0329158753156662, "grad_norm_var": 4.611232358174971e-06, "learning_rate": 0.0022551721176845775, "loss": 2.5167, "step": 8899 }, { "crossentropy": 2.660783052444458, "epoch": 0.4839718317518149, "grad_norm": 0.03172672912478447, "grad_norm_var": 4.522722088940107e-06, "learning_rate": 0.0022541314242208255, "loss": 2.6608, "step": 8900 }, { "crossentropy": 2.639854073524475, "epoch": 0.4840262106093151, "grad_norm": 0.04457899183034897, "grad_norm_var": 1.253875185511324e-05, "learning_rate": 0.002253090901053283, "loss": 2.6399, "step": 8901 }, { "crossentropy": 2.53008234500885, "epoch": 0.4840805894668153, "grad_norm": 0.032895468175411224, "grad_norm_var": 1.209493403992364e-05, "learning_rate": 0.0022520505482464814, "loss": 2.5301, "step": 8902 }, { "crossentropy": 2.5692983865737915, "epoch": 0.4841349683243155, "grad_norm": 0.030394133180379868, "grad_norm_var": 1.2479502418598816e-05, "learning_rate": 0.002251010365864943, "loss": 2.5693, "step": 8903 }, { "crossentropy": 2.5654077529907227, "epoch": 0.4841893471818157, "grad_norm": 0.03208860754966736, "grad_norm_var": 1.240971583909197e-05, "learning_rate": 0.0022499703539731785, "loss": 2.5654, "step": 8904 }, { "crossentropy": 2.58783495426178, "epoch": 0.4842437260393159, "grad_norm": 0.03336074948310852, "grad_norm_var": 1.2248622567968363e-05, "learning_rate": 0.0022489305126356897, "loss": 2.5878, "step": 8905 }, { "crossentropy": 2.6213423013687134, "epoch": 0.48429810489681613, "grad_norm": 0.03029659576714039, "grad_norm_var": 1.2983385991865169e-05, "learning_rate": 0.002247890841916967, "loss": 2.6213, "step": 8906 }, { "crossentropy": 2.557698965072632, "epoch": 0.48435248375431633, "grad_norm": 0.03462453559041023, "grad_norm_var": 1.2938713137324596e-05, "learning_rate": 0.0022468513418814847, "loss": 2.5577, "step": 8907 }, { "crossentropy": 2.5976775884628296, "epoch": 0.48440686261181654, "grad_norm": 0.037679173052310944, "grad_norm_var": 1.385394837456707e-05, "learning_rate": 0.0022458120125937193, "loss": 2.5977, "step": 8908 }, { "crossentropy": 2.5769166946411133, "epoch": 0.48446124146931674, "grad_norm": 0.03232121840119362, "grad_norm_var": 1.3902244582118676e-05, "learning_rate": 0.0022447728541181234, "loss": 2.5769, "step": 8909 }, { "crossentropy": 2.6036185026168823, "epoch": 0.48451562032681694, "grad_norm": 0.032655179500579834, "grad_norm_var": 1.4022893372905838e-05, "learning_rate": 0.002243733866519146, "loss": 2.6036, "step": 8910 }, { "crossentropy": 2.5834646224975586, "epoch": 0.48456999918431715, "grad_norm": 0.0332377664744854, "grad_norm_var": 1.1904828938610875e-05, "learning_rate": 0.002242695049861225, "loss": 2.5835, "step": 8911 }, { "crossentropy": 2.5401415824890137, "epoch": 0.48462437804181735, "grad_norm": 0.033531975001096725, "grad_norm_var": 1.1834980655602344e-05, "learning_rate": 0.0022416564042087857, "loss": 2.5401, "step": 8912 }, { "crossentropy": 2.592150568962097, "epoch": 0.48467875689931755, "grad_norm": 0.034389711916446686, "grad_norm_var": 1.1773754526052378e-05, "learning_rate": 0.002240617929626245, "loss": 2.5922, "step": 8913 }, { "crossentropy": 2.592483162879944, "epoch": 0.48473313575681776, "grad_norm": 0.03428433835506439, "grad_norm_var": 1.1284935799251448e-05, "learning_rate": 0.0022395796261780077, "loss": 2.5925, "step": 8914 }, { "crossentropy": 2.5418479442596436, "epoch": 0.48478751461431796, "grad_norm": 0.031694021075963974, "grad_norm_var": 1.1524123444422634e-05, "learning_rate": 0.0022385414939284686, "loss": 2.5418, "step": 8915 }, { "crossentropy": 2.6538738012313843, "epoch": 0.48484189347181816, "grad_norm": 0.038275349885225296, "grad_norm_var": 1.2450923940253418e-05, "learning_rate": 0.0022375035329420118, "loss": 2.6539, "step": 8916 }, { "crossentropy": 2.5763165950775146, "epoch": 0.48489627232931837, "grad_norm": 0.03410110995173454, "grad_norm_var": 4.734667704795595e-06, "learning_rate": 0.0022364657432830115, "loss": 2.5763, "step": 8917 }, { "crossentropy": 2.5403181314468384, "epoch": 0.48495065118681857, "grad_norm": 0.03265517204999924, "grad_norm_var": 4.7573049284351745e-06, "learning_rate": 0.0022354281250158265, "loss": 2.5403, "step": 8918 }, { "crossentropy": 2.6029239892959595, "epoch": 0.4850050300443188, "grad_norm": 0.03290550038218498, "grad_norm_var": 4.12008207621114e-06, "learning_rate": 0.0022343906782048128, "loss": 2.6029, "step": 8919 }, { "crossentropy": 2.613836884498596, "epoch": 0.485059408901819, "grad_norm": 0.03255858272314072, "grad_norm_var": 4.037215780095304e-06, "learning_rate": 0.002233353402914313, "loss": 2.6138, "step": 8920 }, { "crossentropy": 2.5383834838867188, "epoch": 0.4851137877593192, "grad_norm": 0.03244078904390335, "grad_norm_var": 4.1269018890196e-06, "learning_rate": 0.002232316299208654, "loss": 2.5384, "step": 8921 }, { "crossentropy": 2.648759365081787, "epoch": 0.4851681666168194, "grad_norm": 0.03203349560499191, "grad_norm_var": 3.5496904410049768e-06, "learning_rate": 0.0022312793671521586, "loss": 2.6488, "step": 8922 }, { "crossentropy": 2.5921722650527954, "epoch": 0.4852225454743196, "grad_norm": 0.036788854748010635, "grad_norm_var": 4.105867152274373e-06, "learning_rate": 0.0022302426068091345, "loss": 2.5922, "step": 8923 }, { "crossentropy": 2.5315228700637817, "epoch": 0.4852769243318198, "grad_norm": 0.033015090972185135, "grad_norm_var": 3.082337640806544e-06, "learning_rate": 0.002229206018243882, "loss": 2.5315, "step": 8924 }, { "crossentropy": 2.622273564338684, "epoch": 0.48533130318932, "grad_norm": 0.032922208309173584, "grad_norm_var": 3.006005725499546e-06, "learning_rate": 0.002228169601520689, "loss": 2.6223, "step": 8925 }, { "crossentropy": 2.5353972911834717, "epoch": 0.4853856820468202, "grad_norm": 0.03791920095682144, "grad_norm_var": 4.079597941778618e-06, "learning_rate": 0.0022271333567038336, "loss": 2.5354, "step": 8926 }, { "crossentropy": 2.496115565299988, "epoch": 0.4854400609043204, "grad_norm": 0.03122882731258869, "grad_norm_var": 4.515135074387152e-06, "learning_rate": 0.0022260972838575818, "loss": 2.4961, "step": 8927 }, { "crossentropy": 2.6655391454696655, "epoch": 0.4854944397618206, "grad_norm": 0.03152520954608917, "grad_norm_var": 4.837611733834535e-06, "learning_rate": 0.0022250613830461924, "loss": 2.6655, "step": 8928 }, { "crossentropy": 2.575199604034424, "epoch": 0.4855488186193208, "grad_norm": 0.032830070704221725, "grad_norm_var": 4.840203094111705e-06, "learning_rate": 0.002224025654333905, "loss": 2.5752, "step": 8929 }, { "crossentropy": 2.5356284379959106, "epoch": 0.485603197476821, "grad_norm": 0.034474801272153854, "grad_norm_var": 4.86051924008291e-06, "learning_rate": 0.0022229900977849605, "loss": 2.5356, "step": 8930 }, { "crossentropy": 2.5228904485702515, "epoch": 0.4856575763343212, "grad_norm": 0.03209477290511131, "grad_norm_var": 4.769487431477412e-06, "learning_rate": 0.0022219547134635833, "loss": 2.5229, "step": 8931 }, { "crossentropy": 2.466104507446289, "epoch": 0.4857119551918214, "grad_norm": 0.03275492787361145, "grad_norm_var": 3.2406343088701848e-06, "learning_rate": 0.0022209195014339825, "loss": 2.4661, "step": 8932 }, { "crossentropy": 2.5565344095230103, "epoch": 0.4857663340493216, "grad_norm": 0.03214510530233383, "grad_norm_var": 3.2618388662515684e-06, "learning_rate": 0.002219884461760363, "loss": 2.5565, "step": 8933 }, { "crossentropy": 2.4676129817962646, "epoch": 0.4858207129068218, "grad_norm": 0.03089093416929245, "grad_norm_var": 3.571192700795829e-06, "learning_rate": 0.0022188495945069183, "loss": 2.4676, "step": 8934 }, { "crossentropy": 2.656467080116272, "epoch": 0.48587509176432203, "grad_norm": 0.031951386481523514, "grad_norm_var": 3.644311378320205e-06, "learning_rate": 0.002217814899737828, "loss": 2.6565, "step": 8935 }, { "crossentropy": 2.52437686920166, "epoch": 0.48592947062182223, "grad_norm": 0.03252122178673744, "grad_norm_var": 3.6464649689794815e-06, "learning_rate": 0.0022167803775172655, "loss": 2.5244, "step": 8936 }, { "crossentropy": 2.3827531337738037, "epoch": 0.48598384947932244, "grad_norm": 0.030984530225396156, "grad_norm_var": 3.881968880930601e-06, "learning_rate": 0.002215746027909386, "loss": 2.3828, "step": 8937 }, { "crossentropy": 2.5096163749694824, "epoch": 0.48603822833682264, "grad_norm": 0.03247855231165886, "grad_norm_var": 3.8441139033275225e-06, "learning_rate": 0.002214711850978344, "loss": 2.5096, "step": 8938 }, { "crossentropy": 2.536756157875061, "epoch": 0.48609260719432285, "grad_norm": 0.030932767316699028, "grad_norm_var": 2.957144783496248e-06, "learning_rate": 0.002213677846788278, "loss": 2.5368, "step": 8939 }, { "crossentropy": 2.6653246879577637, "epoch": 0.48614698605182305, "grad_norm": 0.03306221216917038, "grad_norm_var": 2.960256846832445e-06, "learning_rate": 0.0022126440154033125, "loss": 2.6653, "step": 8940 }, { "crossentropy": 2.5342209339141846, "epoch": 0.48620136490932325, "grad_norm": 0.0348099060356617, "grad_norm_var": 3.2779616794624633e-06, "learning_rate": 0.0022116103568875643, "loss": 2.5342, "step": 8941 }, { "crossentropy": 2.5609610080718994, "epoch": 0.48625574376682346, "grad_norm": 0.03287110850214958, "grad_norm_var": 1.3326752333510007e-06, "learning_rate": 0.0022105768713051466, "loss": 2.561, "step": 8942 }, { "crossentropy": 2.5882550477981567, "epoch": 0.48631012262432366, "grad_norm": 0.03317217901349068, "grad_norm_var": 1.2789098310747015e-06, "learning_rate": 0.0022095435587201483, "loss": 2.5883, "step": 8943 }, { "crossentropy": 2.578435182571411, "epoch": 0.48636450148182386, "grad_norm": 0.03325718268752098, "grad_norm_var": 1.248505990584838e-06, "learning_rate": 0.002208510419196657, "loss": 2.5784, "step": 8944 }, { "crossentropy": 2.61476993560791, "epoch": 0.48641888033932407, "grad_norm": 0.03523828461766243, "grad_norm_var": 1.6922410150790878e-06, "learning_rate": 0.0022074774527987475, "loss": 2.6148, "step": 8945 }, { "crossentropy": 2.6975148916244507, "epoch": 0.48647325919682427, "grad_norm": 0.033326685428619385, "grad_norm_var": 1.5071448513550556e-06, "learning_rate": 0.0022064446595904817, "loss": 2.6975, "step": 8946 }, { "crossentropy": 2.603940963745117, "epoch": 0.4865276380543245, "grad_norm": 0.033014535903930664, "grad_norm_var": 1.49122400023687e-06, "learning_rate": 0.002205412039635915, "loss": 2.6039, "step": 8947 }, { "crossentropy": 2.5095605850219727, "epoch": 0.4865820169118247, "grad_norm": 0.031152451410889626, "grad_norm_var": 1.6428082223183022e-06, "learning_rate": 0.002204379592999087, "loss": 2.5096, "step": 8948 }, { "crossentropy": 2.625312924385071, "epoch": 0.4866363957693249, "grad_norm": 0.03157033026218414, "grad_norm_var": 1.699319008967405e-06, "learning_rate": 0.002203347319744031, "loss": 2.6253, "step": 8949 }, { "crossentropy": 2.4919543266296387, "epoch": 0.4866907746268251, "grad_norm": 0.030819999054074287, "grad_norm_var": 1.7155816732713468e-06, "learning_rate": 0.002202315219934769, "loss": 2.492, "step": 8950 }, { "crossentropy": 2.537740111351013, "epoch": 0.4867451534843253, "grad_norm": 0.03202527016401291, "grad_norm_var": 1.7098021087344946e-06, "learning_rate": 0.0022012832936353067, "loss": 2.5377, "step": 8951 }, { "crossentropy": 2.578463912010193, "epoch": 0.4867995323418255, "grad_norm": 0.03307788446545601, "grad_norm_var": 1.7250050386716695e-06, "learning_rate": 0.0022002515409096433, "loss": 2.5785, "step": 8952 }, { "crossentropy": 2.5752689838409424, "epoch": 0.4868539111993257, "grad_norm": 0.03289166837930679, "grad_norm_var": 1.5384573696306982e-06, "learning_rate": 0.0021992199618217734, "loss": 2.5753, "step": 8953 }, { "crossentropy": 2.7137622833251953, "epoch": 0.4869082900568259, "grad_norm": 0.03190484270453453, "grad_norm_var": 1.5783636670553634e-06, "learning_rate": 0.002198188556435668, "loss": 2.7138, "step": 8954 }, { "crossentropy": 2.5663082599639893, "epoch": 0.4869626689143261, "grad_norm": 0.03222225233912468, "grad_norm_var": 1.3792253424663517e-06, "learning_rate": 0.002197157324815296, "loss": 2.5663, "step": 8955 }, { "crossentropy": 2.6606485843658447, "epoch": 0.4870170477718263, "grad_norm": 0.03185006231069565, "grad_norm_var": 1.4248074537305417e-06, "learning_rate": 0.0021961262670246134, "loss": 2.6606, "step": 8956 }, { "crossentropy": 2.5571982860565186, "epoch": 0.4870714266293265, "grad_norm": 0.03052942082285881, "grad_norm_var": 1.3659431415577808e-06, "learning_rate": 0.0021950953831275668, "loss": 2.5572, "step": 8957 }, { "crossentropy": 2.6357736587524414, "epoch": 0.4871258054868267, "grad_norm": 0.03207562491297722, "grad_norm_var": 1.3589995572243473e-06, "learning_rate": 0.0021940646731880885, "loss": 2.6358, "step": 8958 }, { "crossentropy": 2.686068296432495, "epoch": 0.4871801843443269, "grad_norm": 0.03166363015770912, "grad_norm_var": 1.3425051680092278e-06, "learning_rate": 0.0021930341372701037, "loss": 2.6861, "step": 8959 }, { "crossentropy": 2.624301791191101, "epoch": 0.4872345632018271, "grad_norm": 0.03234321251511574, "grad_norm_var": 1.276699154998245e-06, "learning_rate": 0.002192003775437524, "loss": 2.6243, "step": 8960 }, { "crossentropy": 2.559479832649231, "epoch": 0.4872889420593273, "grad_norm": 0.03164161741733551, "grad_norm_var": 6.433442112721566e-07, "learning_rate": 0.002190973587754253, "loss": 2.5595, "step": 8961 }, { "crossentropy": 2.5650153160095215, "epoch": 0.4873433209168275, "grad_norm": 0.03229403495788574, "grad_norm_var": 5.282673632214796e-07, "learning_rate": 0.002189943574284183, "loss": 2.565, "step": 8962 }, { "crossentropy": 2.6327613592147827, "epoch": 0.48739769977432773, "grad_norm": 0.03269844502210617, "grad_norm_var": 4.893221874277079e-07, "learning_rate": 0.002188913735091189, "loss": 2.6328, "step": 8963 }, { "crossentropy": 2.5122904777526855, "epoch": 0.48745207863182793, "grad_norm": 0.031995560973882675, "grad_norm_var": 4.471793376570419e-07, "learning_rate": 0.002187884070239148, "loss": 2.5123, "step": 8964 }, { "crossentropy": 2.5768582820892334, "epoch": 0.48750645748932814, "grad_norm": 0.030949773266911507, "grad_norm_var": 5.047502186642308e-07, "learning_rate": 0.002186854579791913, "loss": 2.5769, "step": 8965 }, { "crossentropy": 2.5766249895095825, "epoch": 0.48756083634682834, "grad_norm": 0.036211710423231125, "grad_norm_var": 1.5190443832396033e-06, "learning_rate": 0.002185825263813335, "loss": 2.5766, "step": 8966 }, { "crossentropy": 2.5275824069976807, "epoch": 0.48761521520432854, "grad_norm": 0.031323689967393875, "grad_norm_var": 1.573022441627098e-06, "learning_rate": 0.00218479612236725, "loss": 2.5276, "step": 8967 }, { "crossentropy": 2.504051089286804, "epoch": 0.48766959406182875, "grad_norm": 0.0351799912750721, "grad_norm_var": 2.0869616635779078e-06, "learning_rate": 0.0021837671555174853, "loss": 2.5041, "step": 8968 }, { "crossentropy": 2.5190253257751465, "epoch": 0.48772397291932895, "grad_norm": 0.03337297588586807, "grad_norm_var": 2.1354973660026352e-06, "learning_rate": 0.0021827383633278557, "loss": 2.519, "step": 8969 }, { "crossentropy": 2.465010643005371, "epoch": 0.48777835177682916, "grad_norm": 0.03533773869276047, "grad_norm_var": 2.6494979659366964e-06, "learning_rate": 0.0021817097458621667, "loss": 2.465, "step": 8970 }, { "crossentropy": 2.5539770126342773, "epoch": 0.48783273063432936, "grad_norm": 0.03128691390156746, "grad_norm_var": 2.7519856626690886e-06, "learning_rate": 0.002180681303184211, "loss": 2.554, "step": 8971 }, { "crossentropy": 2.597717523574829, "epoch": 0.48788710949182956, "grad_norm": 0.033262137323617935, "grad_norm_var": 2.7453625256260017e-06, "learning_rate": 0.0021796530353577724, "loss": 2.5977, "step": 8972 }, { "crossentropy": 2.5660074949264526, "epoch": 0.48794148834932977, "grad_norm": 0.032845236361026764, "grad_norm_var": 2.430274056576054e-06, "learning_rate": 0.002178624942446626, "loss": 2.566, "step": 8973 }, { "crossentropy": 2.4505032300949097, "epoch": 0.48799586720682997, "grad_norm": 0.030801065266132355, "grad_norm_var": 2.651532213488701e-06, "learning_rate": 0.002177597024514526, "loss": 2.4505, "step": 8974 }, { "crossentropy": 2.631343960762024, "epoch": 0.4880502460643302, "grad_norm": 0.03477006033062935, "grad_norm_var": 2.8251965558145996e-06, "learning_rate": 0.0021765692816252294, "loss": 2.6313, "step": 8975 }, { "crossentropy": 2.5751073360443115, "epoch": 0.4881046249218304, "grad_norm": 0.03437952697277069, "grad_norm_var": 2.934641586942741e-06, "learning_rate": 0.0021755417138424747, "loss": 2.5751, "step": 8976 }, { "crossentropy": 2.560078978538513, "epoch": 0.4881590037793306, "grad_norm": 0.03293613716959953, "grad_norm_var": 2.8011366729768177e-06, "learning_rate": 0.0021745143212299886, "loss": 2.5601, "step": 8977 }, { "crossentropy": 2.538415789604187, "epoch": 0.4882133826368308, "grad_norm": 0.03632654622197151, "grad_norm_var": 3.3826045008343662e-06, "learning_rate": 0.00217348710385149, "loss": 2.5384, "step": 8978 }, { "crossentropy": 2.5829015970230103, "epoch": 0.488267761494331, "grad_norm": 0.03229169175028801, "grad_norm_var": 3.4285440228925725e-06, "learning_rate": 0.002172460061770686, "loss": 2.5829, "step": 8979 }, { "crossentropy": 2.5827747583389282, "epoch": 0.4883221403518312, "grad_norm": 0.03297669440507889, "grad_norm_var": 3.314215164871995e-06, "learning_rate": 0.0021714331950512724, "loss": 2.5828, "step": 8980 }, { "crossentropy": 2.519593119621277, "epoch": 0.4883765192093314, "grad_norm": 0.0306907519698143, "grad_norm_var": 3.4027101723727374e-06, "learning_rate": 0.002170406503756935, "loss": 2.5196, "step": 8981 }, { "crossentropy": 2.5048388242721558, "epoch": 0.4884308980668316, "grad_norm": 0.03209969401359558, "grad_norm_var": 2.903978238621985e-06, "learning_rate": 0.0021693799879513487, "loss": 2.5048, "step": 8982 }, { "crossentropy": 2.7200011014938354, "epoch": 0.4884852769243318, "grad_norm": 0.03261888027191162, "grad_norm_var": 2.6990372034134286e-06, "learning_rate": 0.0021683536476981764, "loss": 2.72, "step": 8983 }, { "crossentropy": 2.580474853515625, "epoch": 0.488539655781832, "grad_norm": 0.034912701696157455, "grad_norm_var": 2.6328849331777013e-06, "learning_rate": 0.002167327483061073, "loss": 2.5805, "step": 8984 }, { "crossentropy": 2.6913145780563354, "epoch": 0.4885940346393322, "grad_norm": 0.032192014157772064, "grad_norm_var": 2.689948521846358e-06, "learning_rate": 0.002166301494103674, "loss": 2.6913, "step": 8985 }, { "crossentropy": 2.6936373710632324, "epoch": 0.4886484134968324, "grad_norm": 0.034950681030750275, "grad_norm_var": 2.5842395388557424e-06, "learning_rate": 0.0021652756808896167, "loss": 2.6936, "step": 8986 }, { "crossentropy": 2.6907877922058105, "epoch": 0.4887027923543326, "grad_norm": 0.032445840537548065, "grad_norm_var": 2.3905234001602806e-06, "learning_rate": 0.002164250043482521, "loss": 2.6908, "step": 8987 }, { "crossentropy": 2.6241064071655273, "epoch": 0.4887571712118328, "grad_norm": 0.031011279672384262, "grad_norm_var": 2.675386217662612e-06, "learning_rate": 0.002163224581945991, "loss": 2.6241, "step": 8988 }, { "crossentropy": 2.5998045206069946, "epoch": 0.488811550069333, "grad_norm": 0.03204364702105522, "grad_norm_var": 2.7337482014022987e-06, "learning_rate": 0.002162199296343629, "loss": 2.5998, "step": 8989 }, { "crossentropy": 2.503326416015625, "epoch": 0.4888659289268332, "grad_norm": 0.031505730003118515, "grad_norm_var": 2.5614272393149895e-06, "learning_rate": 0.00216117418673902, "loss": 2.5033, "step": 8990 }, { "crossentropy": 2.6589267253875732, "epoch": 0.48892030778433343, "grad_norm": 0.03219462186098099, "grad_norm_var": 2.3714178374230227e-06, "learning_rate": 0.002160149253195742, "loss": 2.6589, "step": 8991 }, { "crossentropy": 2.4975075721740723, "epoch": 0.48897468664183363, "grad_norm": 0.03142111375927925, "grad_norm_var": 2.3145203104054287e-06, "learning_rate": 0.0021591244957773585, "loss": 2.4975, "step": 8992 }, { "crossentropy": 2.501024603843689, "epoch": 0.48902906549933384, "grad_norm": 0.030840547755360603, "grad_norm_var": 2.5128460444578574e-06, "learning_rate": 0.0021580999145474266, "loss": 2.501, "step": 8993 }, { "crossentropy": 2.601157546043396, "epoch": 0.48908344435683404, "grad_norm": 0.03262009471654892, "grad_norm_var": 1.4965396011669957e-06, "learning_rate": 0.002157075509569488, "loss": 2.6012, "step": 8994 }, { "crossentropy": 2.6510186195373535, "epoch": 0.48913782321433424, "grad_norm": 0.03445814549922943, "grad_norm_var": 1.78719620853375e-06, "learning_rate": 0.0021560512809070775, "loss": 2.651, "step": 8995 }, { "crossentropy": 2.5569413900375366, "epoch": 0.48919220207183445, "grad_norm": 0.031554654240608215, "grad_norm_var": 1.8111413399520543e-06, "learning_rate": 0.0021550272286237114, "loss": 2.5569, "step": 8996 }, { "crossentropy": 2.6279337406158447, "epoch": 0.48924658092933465, "grad_norm": 0.030979352071881294, "grad_norm_var": 1.7525943216999592e-06, "learning_rate": 0.002154003352782906, "loss": 2.6279, "step": 8997 }, { "crossentropy": 2.5533483028411865, "epoch": 0.48930095978683485, "grad_norm": 0.036118034273386, "grad_norm_var": 2.6193388587044652e-06, "learning_rate": 0.0021529796534481616, "loss": 2.5533, "step": 8998 }, { "crossentropy": 2.4906152486801147, "epoch": 0.48935533864433506, "grad_norm": 0.030564308166503906, "grad_norm_var": 2.882573119593277e-06, "learning_rate": 0.0021519561306829627, "loss": 2.4906, "step": 8999 }, { "crossentropy": 2.6034048795700073, "epoch": 0.48940971750183526, "grad_norm": 0.03353828564286232, "grad_norm_var": 2.556351572000938e-06, "learning_rate": 0.002150932784550788, "loss": 2.6034, "step": 9000 }, { "crossentropy": 2.5647743940353394, "epoch": 0.48946409635933547, "grad_norm": 0.03222568705677986, "grad_norm_var": 2.555477878928708e-06, "learning_rate": 0.0021499096151151088, "loss": 2.5648, "step": 9001 }, { "crossentropy": 2.6578203439712524, "epoch": 0.48951847521683567, "grad_norm": 0.03175516426563263, "grad_norm_var": 2.10883790576627e-06, "learning_rate": 0.0021488866224393765, "loss": 2.6578, "step": 9002 }, { "crossentropy": 2.5287898778915405, "epoch": 0.4895728540743359, "grad_norm": 0.033090513199567795, "grad_norm_var": 2.1555336258679537e-06, "learning_rate": 0.0021478638065870383, "loss": 2.5288, "step": 9003 }, { "crossentropy": 2.6696702241897583, "epoch": 0.4896272329318361, "grad_norm": 0.03270319476723671, "grad_norm_var": 2.056114703571863e-06, "learning_rate": 0.0021468411676215274, "loss": 2.6697, "step": 9004 }, { "crossentropy": 2.5765230655670166, "epoch": 0.4896816117893363, "grad_norm": 0.03138428181409836, "grad_norm_var": 2.110292440428198e-06, "learning_rate": 0.0021458187056062673, "loss": 2.5765, "step": 9005 }, { "crossentropy": 2.6635448932647705, "epoch": 0.4897359906468365, "grad_norm": 0.03339777886867523, "grad_norm_var": 2.1312361197742617e-06, "learning_rate": 0.0021447964206046718, "loss": 2.6635, "step": 9006 }, { "crossentropy": 2.5068187713623047, "epoch": 0.4897903695043367, "grad_norm": 0.030604654923081398, "grad_norm_var": 2.3386814939900378e-06, "learning_rate": 0.0021437743126801368, "loss": 2.5068, "step": 9007 }, { "crossentropy": 2.656609058380127, "epoch": 0.4898447483618369, "grad_norm": 0.03241698071360588, "grad_norm_var": 2.280182702969165e-06, "learning_rate": 0.0021427523818960577, "loss": 2.6566, "step": 9008 }, { "crossentropy": 2.5937944650650024, "epoch": 0.48989912721933715, "grad_norm": 0.031990423798561096, "grad_norm_var": 2.125152164449082e-06, "learning_rate": 0.0021417306283158144, "loss": 2.5938, "step": 9009 }, { "crossentropy": 2.576037049293518, "epoch": 0.48995350607683735, "grad_norm": 0.030892057344317436, "grad_norm_var": 2.2754960073035805e-06, "learning_rate": 0.0021407090520027708, "loss": 2.576, "step": 9010 }, { "crossentropy": 2.4734573364257812, "epoch": 0.49000788493433756, "grad_norm": 0.031926561146974564, "grad_norm_var": 1.966011324429116e-06, "learning_rate": 0.0021396876530202836, "loss": 2.4735, "step": 9011 }, { "crossentropy": 2.541312336921692, "epoch": 0.49006226379183776, "grad_norm": 0.03332849219441414, "grad_norm_var": 2.0108941579918218e-06, "learning_rate": 0.0021386664314317063, "loss": 2.5413, "step": 9012 }, { "crossentropy": 2.563523769378662, "epoch": 0.49011664264933796, "grad_norm": 0.031789086759090424, "grad_norm_var": 1.9085090979959383e-06, "learning_rate": 0.0021376453873003663, "loss": 2.5635, "step": 9013 }, { "crossentropy": 2.5820512771606445, "epoch": 0.49017102150683817, "grad_norm": 0.032555270940065384, "grad_norm_var": 9.156168820174023e-07, "learning_rate": 0.0021366245206895914, "loss": 2.5821, "step": 9014 }, { "crossentropy": 2.6198935508728027, "epoch": 0.49022540036433837, "grad_norm": 0.03213432803750038, "grad_norm_var": 7.408388433451905e-07, "learning_rate": 0.0021356038316626942, "loss": 2.6199, "step": 9015 }, { "crossentropy": 2.6943897008895874, "epoch": 0.4902797792218386, "grad_norm": 0.059647828340530396, "grad_norm_var": 4.789062141702036e-05, "learning_rate": 0.0021345833202829766, "loss": 2.6944, "step": 9016 }, { "crossentropy": 2.445944309234619, "epoch": 0.4903341580793388, "grad_norm": 0.03432921692728996, "grad_norm_var": 4.7707354230265565e-05, "learning_rate": 0.0021335629866137324, "loss": 2.4459, "step": 9017 }, { "crossentropy": 2.5525834560394287, "epoch": 0.490388536936839, "grad_norm": 0.03113294392824173, "grad_norm_var": 4.791750841458946e-05, "learning_rate": 0.0021325428307182354, "loss": 2.5526, "step": 9018 }, { "crossentropy": 2.5789287090301514, "epoch": 0.4904429157943392, "grad_norm": 0.03196018934249878, "grad_norm_var": 4.812805790331964e-05, "learning_rate": 0.002131522852659761, "loss": 2.5789, "step": 9019 }, { "crossentropy": 2.4541982412338257, "epoch": 0.4904972946518394, "grad_norm": 0.030674247071146965, "grad_norm_var": 4.870561952894958e-05, "learning_rate": 0.002130503052501566, "loss": 2.4542, "step": 9020 }, { "crossentropy": 2.5887794494628906, "epoch": 0.4905516735093396, "grad_norm": 0.030512981116771698, "grad_norm_var": 4.902909418477926e-05, "learning_rate": 0.002129483430306895, "loss": 2.5888, "step": 9021 }, { "crossentropy": 2.5333446264266968, "epoch": 0.4906060523668398, "grad_norm": 0.032213907688856125, "grad_norm_var": 4.9165314481191164e-05, "learning_rate": 0.0021284639861389835, "loss": 2.5333, "step": 9022 }, { "crossentropy": 2.58456289768219, "epoch": 0.49066043122434, "grad_norm": 0.03281116485595703, "grad_norm_var": 4.857901046227894e-05, "learning_rate": 0.0021274447200610625, "loss": 2.5846, "step": 9023 }, { "crossentropy": 2.5576001405715942, "epoch": 0.4907148100818402, "grad_norm": 0.033251699060201645, "grad_norm_var": 4.8472002344371556e-05, "learning_rate": 0.00212642563213634, "loss": 2.5576, "step": 9024 }, { "crossentropy": 2.5013173818588257, "epoch": 0.4907691889393404, "grad_norm": 0.033489711582660675, "grad_norm_var": 4.824637253425477e-05, "learning_rate": 0.0021254067224280204, "loss": 2.5013, "step": 9025 }, { "crossentropy": 2.5253634452819824, "epoch": 0.4908235677968406, "grad_norm": 0.03318945690989494, "grad_norm_var": 4.765007726793117e-05, "learning_rate": 0.0021243879909992962, "loss": 2.5254, "step": 9026 }, { "crossentropy": 2.507636785507202, "epoch": 0.4908779466543408, "grad_norm": 0.03194664418697357, "grad_norm_var": 4.764439184553891e-05, "learning_rate": 0.002123369437913348, "loss": 2.5076, "step": 9027 }, { "crossentropy": 2.63283371925354, "epoch": 0.490932325511841, "grad_norm": 0.032727304846048355, "grad_norm_var": 4.772565333214921e-05, "learning_rate": 0.0021223510632333454, "loss": 2.6328, "step": 9028 }, { "crossentropy": 2.5540192127227783, "epoch": 0.4909867043693412, "grad_norm": 0.03143201023340225, "grad_norm_var": 4.783997336776194e-05, "learning_rate": 0.0021213328670224465, "loss": 2.554, "step": 9029 }, { "crossentropy": 2.541032910346985, "epoch": 0.4910410832268414, "grad_norm": 0.03280354291200638, "grad_norm_var": 4.7795982615174414e-05, "learning_rate": 0.0021203148493437996, "loss": 2.541, "step": 9030 }, { "crossentropy": 2.5299142599105835, "epoch": 0.4910954620843416, "grad_norm": 0.032920610159635544, "grad_norm_var": 4.763734487309144e-05, "learning_rate": 0.002119297010260543, "loss": 2.5299, "step": 9031 }, { "crossentropy": 2.513936758041382, "epoch": 0.49114984094184183, "grad_norm": 0.032281022518873215, "grad_norm_var": 1.0976398213811822e-06, "learning_rate": 0.002118279349835798, "loss": 2.5139, "step": 9032 }, { "crossentropy": 2.5769882202148438, "epoch": 0.49120421979934203, "grad_norm": 0.03412909805774689, "grad_norm_var": 1.0474601382922382e-06, "learning_rate": 0.00211726186813268, "loss": 2.577, "step": 9033 }, { "crossentropy": 2.410733699798584, "epoch": 0.49125859865684224, "grad_norm": 0.03384929150342941, "grad_norm_var": 1.0706209671218115e-06, "learning_rate": 0.0021162445652142966, "loss": 2.4107, "step": 9034 }, { "crossentropy": 2.5770766735076904, "epoch": 0.49131297751434244, "grad_norm": 0.032809916883707047, "grad_norm_var": 1.0532235320961085e-06, "learning_rate": 0.0021152274411437343, "loss": 2.5771, "step": 9035 }, { "crossentropy": 2.5013257265090942, "epoch": 0.49136735637184265, "grad_norm": 0.03189466521143913, "grad_norm_var": 8.386178797630909e-07, "learning_rate": 0.0021142104959840776, "loss": 2.5013, "step": 9036 }, { "crossentropy": 2.5915592908859253, "epoch": 0.49142173522934285, "grad_norm": 0.031678151339292526, "grad_norm_var": 5.928004304775817e-07, "learning_rate": 0.002113193729798395, "loss": 2.5916, "step": 9037 }, { "crossentropy": 2.55587899684906, "epoch": 0.49147611408684305, "grad_norm": 0.03198205307126045, "grad_norm_var": 6.116281548682392e-07, "learning_rate": 0.002112177142649746, "loss": 2.5559, "step": 9038 }, { "crossentropy": 2.512642502784729, "epoch": 0.49153049294434326, "grad_norm": 0.031528789550065994, "grad_norm_var": 6.953621374518326e-07, "learning_rate": 0.0021111607346011784, "loss": 2.5126, "step": 9039 }, { "crossentropy": 2.6311269998550415, "epoch": 0.49158487180184346, "grad_norm": 0.03357424587011337, "grad_norm_var": 7.290476307453622e-07, "learning_rate": 0.002110144505715729, "loss": 2.6311, "step": 9040 }, { "crossentropy": 2.567082405090332, "epoch": 0.49163925065934366, "grad_norm": 0.03204222768545151, "grad_norm_var": 6.95963682381615e-07, "learning_rate": 0.0021091284560564227, "loss": 2.5671, "step": 9041 }, { "crossentropy": 2.5840184688568115, "epoch": 0.49169362951684387, "grad_norm": 0.031320396810770035, "grad_norm_var": 7.547716510994742e-07, "learning_rate": 0.0021081125856862743, "loss": 2.584, "step": 9042 }, { "crossentropy": 2.583624243736267, "epoch": 0.49174800837434407, "grad_norm": 0.030710585415363312, "grad_norm_var": 9.30334276426301e-07, "learning_rate": 0.0021070968946682897, "loss": 2.5836, "step": 9043 }, { "crossentropy": 2.5929824113845825, "epoch": 0.4918023872318443, "grad_norm": 0.03219752758741379, "grad_norm_var": 9.215945585244282e-07, "learning_rate": 0.0021060813830654545, "loss": 2.593, "step": 9044 }, { "crossentropy": 2.5999163389205933, "epoch": 0.4918567660893445, "grad_norm": 0.03333093598484993, "grad_norm_var": 9.215941163963233e-07, "learning_rate": 0.0021050660509407577, "loss": 2.5999, "step": 9045 }, { "crossentropy": 2.572848320007324, "epoch": 0.4919111449468447, "grad_norm": 0.032269202172756195, "grad_norm_var": 9.135965028013333e-07, "learning_rate": 0.002104050898357164, "loss": 2.5728, "step": 9046 }, { "crossentropy": 2.4980229139328003, "epoch": 0.4919655238043449, "grad_norm": 0.03347662463784218, "grad_norm_var": 9.709639977178418e-07, "learning_rate": 0.0021030359253776327, "loss": 2.498, "step": 9047 }, { "crossentropy": 2.6694300174713135, "epoch": 0.4920199026618451, "grad_norm": 0.03315321356058121, "grad_norm_var": 9.997685343485682e-07, "learning_rate": 0.0021020211320651135, "loss": 2.6694, "step": 9048 }, { "crossentropy": 2.678620934486389, "epoch": 0.4920742815193453, "grad_norm": 0.03258901089429855, "grad_norm_var": 8.128020828945815e-07, "learning_rate": 0.002101006518482541, "loss": 2.6786, "step": 9049 }, { "crossentropy": 2.6336885690689087, "epoch": 0.4921286603768455, "grad_norm": 0.03336561098694801, "grad_norm_var": 7.339854479143995e-07, "learning_rate": 0.0020999920846928423, "loss": 2.6337, "step": 9050 }, { "crossentropy": 2.5258634090423584, "epoch": 0.4921830392343457, "grad_norm": 0.0348113588988781, "grad_norm_var": 1.1016891672538733e-06, "learning_rate": 0.00209897783075893, "loss": 2.5259, "step": 9051 }, { "crossentropy": 2.5795180797576904, "epoch": 0.4922374180918459, "grad_norm": 0.03260999545454979, "grad_norm_var": 1.0763844822791446e-06, "learning_rate": 0.002097963756743709, "loss": 2.5795, "step": 9052 }, { "crossentropy": 2.5709691047668457, "epoch": 0.4922917969493461, "grad_norm": 0.03581879660487175, "grad_norm_var": 1.6721312411473062e-06, "learning_rate": 0.00209694986271007, "loss": 2.571, "step": 9053 }, { "crossentropy": 2.629469871520996, "epoch": 0.4923461758068463, "grad_norm": 0.03285770118236542, "grad_norm_var": 1.6246976368150534e-06, "learning_rate": 0.0020959361487208955, "loss": 2.6295, "step": 9054 }, { "crossentropy": 2.6032745838165283, "epoch": 0.4924005546643465, "grad_norm": 0.032839663326740265, "grad_norm_var": 1.500557484109856e-06, "learning_rate": 0.00209492261483905, "loss": 2.6033, "step": 9055 }, { "crossentropy": 2.487039089202881, "epoch": 0.4924549335218467, "grad_norm": 0.032484330236911774, "grad_norm_var": 1.4819701573695956e-06, "learning_rate": 0.0020939092611273976, "loss": 2.487, "step": 9056 }, { "crossentropy": 2.6231085062026978, "epoch": 0.4925093123793469, "grad_norm": 0.03493880853056908, "grad_norm_var": 1.6876954154881423e-06, "learning_rate": 0.0020928960876487853, "loss": 2.6231, "step": 9057 }, { "crossentropy": 2.5061224699020386, "epoch": 0.4925636912368471, "grad_norm": 0.038653213530778885, "grad_norm_var": 3.3588878593206196e-06, "learning_rate": 0.002091883094466045, "loss": 2.5061, "step": 9058 }, { "crossentropy": 2.6109213829040527, "epoch": 0.4926180700943473, "grad_norm": 0.03193672001361847, "grad_norm_var": 2.995735371769115e-06, "learning_rate": 0.0020908702816420043, "loss": 2.6109, "step": 9059 }, { "crossentropy": 2.551607847213745, "epoch": 0.49267244895184753, "grad_norm": 0.032753635197877884, "grad_norm_var": 2.9123124404644683e-06, "learning_rate": 0.0020898576492394765, "loss": 2.5516, "step": 9060 }, { "crossentropy": 2.5486987829208374, "epoch": 0.49272682780934773, "grad_norm": 0.032703250646591187, "grad_norm_var": 2.960965824368398e-06, "learning_rate": 0.002088845197321264, "loss": 2.5487, "step": 9061 }, { "crossentropy": 2.46751070022583, "epoch": 0.49278120666684794, "grad_norm": 0.03250402584671974, "grad_norm_var": 2.9234082755425287e-06, "learning_rate": 0.002087832925950159, "loss": 2.4675, "step": 9062 }, { "crossentropy": 2.4629249572753906, "epoch": 0.49283558552434814, "grad_norm": 0.03168836981058121, "grad_norm_var": 3.151140688114016e-06, "learning_rate": 0.0020868208351889402, "loss": 2.4629, "step": 9063 }, { "crossentropy": 2.535094380378723, "epoch": 0.49288996438184834, "grad_norm": 0.0311665590852499, "grad_norm_var": 3.4848356682968256e-06, "learning_rate": 0.0020858089251003776, "loss": 2.5351, "step": 9064 }, { "crossentropy": 2.597061276435852, "epoch": 0.49294434323934855, "grad_norm": 0.03239862248301506, "grad_norm_var": 3.50661100810821e-06, "learning_rate": 0.00208479719574723, "loss": 2.5971, "step": 9065 }, { "crossentropy": 2.491230010986328, "epoch": 0.49299872209684875, "grad_norm": 0.03388087451457977, "grad_norm_var": 3.52457477105759e-06, "learning_rate": 0.002083785647192239, "loss": 2.4912, "step": 9066 }, { "crossentropy": 2.529437780380249, "epoch": 0.49305310095434896, "grad_norm": 0.030827954411506653, "grad_norm_var": 3.7549389243363237e-06, "learning_rate": 0.002082774279498145, "loss": 2.5294, "step": 9067 }, { "crossentropy": 2.592999577522278, "epoch": 0.49310747981184916, "grad_norm": 0.03417379781603813, "grad_norm_var": 3.7995844927486785e-06, "learning_rate": 0.002081763092727673, "loss": 2.593, "step": 9068 }, { "crossentropy": 2.545720338821411, "epoch": 0.49316185866934936, "grad_norm": 0.031765103340148926, "grad_norm_var": 3.425573086335504e-06, "learning_rate": 0.002080752086943531, "loss": 2.5457, "step": 9069 }, { "crossentropy": 2.5524107217788696, "epoch": 0.49321623752684957, "grad_norm": 0.031337447464466095, "grad_norm_var": 3.5934510803677186e-06, "learning_rate": 0.0020797412622084233, "loss": 2.5524, "step": 9070 }, { "crossentropy": 2.636363983154297, "epoch": 0.49327061638434977, "grad_norm": 0.03428234905004501, "grad_norm_var": 3.71610797888001e-06, "learning_rate": 0.0020787306185850395, "loss": 2.6364, "step": 9071 }, { "crossentropy": 2.4813815355300903, "epoch": 0.49332499524185, "grad_norm": 0.030029280111193657, "grad_norm_var": 4.2512813714876484e-06, "learning_rate": 0.0020777201561360597, "loss": 2.4814, "step": 9072 }, { "crossentropy": 2.5874541997909546, "epoch": 0.4933793740993502, "grad_norm": 0.031742118299007416, "grad_norm_var": 3.984737400074961e-06, "learning_rate": 0.002076709874924151, "loss": 2.5875, "step": 9073 }, { "crossentropy": 2.4875046014785767, "epoch": 0.4934337529568504, "grad_norm": 0.03179130330681801, "grad_norm_var": 1.4033000292394683e-06, "learning_rate": 0.0020756997750119707, "loss": 2.4875, "step": 9074 }, { "crossentropy": 2.542291283607483, "epoch": 0.4934881318143506, "grad_norm": 0.0339440256357193, "grad_norm_var": 1.5883217686985437e-06, "learning_rate": 0.0020746898564621636, "loss": 2.5423, "step": 9075 }, { "crossentropy": 2.5968059301376343, "epoch": 0.4935425106718508, "grad_norm": 0.03181213513016701, "grad_norm_var": 1.5882574670789592e-06, "learning_rate": 0.002073680119337367, "loss": 2.5968, "step": 9076 }, { "crossentropy": 2.5583478212356567, "epoch": 0.493596889529351, "grad_norm": 0.03150591254234314, "grad_norm_var": 1.6059705117100106e-06, "learning_rate": 0.0020726705637001985, "loss": 2.5583, "step": 9077 }, { "crossentropy": 2.4774014949798584, "epoch": 0.4936512683868512, "grad_norm": 0.03423645719885826, "grad_norm_var": 1.8688347727516263e-06, "learning_rate": 0.002071661189613271, "loss": 2.4774, "step": 9078 }, { "crossentropy": 2.6195929050445557, "epoch": 0.4937056472443514, "grad_norm": 0.03138980269432068, "grad_norm_var": 1.898212897717457e-06, "learning_rate": 0.00207065199713919, "loss": 2.6196, "step": 9079 }, { "crossentropy": 2.5185635089874268, "epoch": 0.4937600261018516, "grad_norm": 0.03302542865276337, "grad_norm_var": 1.8412497612240678e-06, "learning_rate": 0.0020696429863405396, "loss": 2.5186, "step": 9080 }, { "crossentropy": 2.66785991191864, "epoch": 0.4938144049593518, "grad_norm": 0.03172948956489563, "grad_norm_var": 1.8679211194679112e-06, "learning_rate": 0.002068634157279899, "loss": 2.6679, "step": 9081 }, { "crossentropy": 2.809871554374695, "epoch": 0.493868783816852, "grad_norm": 0.033577390015125275, "grad_norm_var": 1.81141134778516e-06, "learning_rate": 0.002067625510019835, "loss": 2.8099, "step": 9082 }, { "crossentropy": 2.595139265060425, "epoch": 0.4939231626743522, "grad_norm": 0.03389432653784752, "grad_norm_var": 1.787776423437072e-06, "learning_rate": 0.0020666170446229032, "loss": 2.5951, "step": 9083 }, { "crossentropy": 2.5058082342147827, "epoch": 0.4939775415318524, "grad_norm": 0.032099928706884384, "grad_norm_var": 1.597838021017668e-06, "learning_rate": 0.0020656087611516474, "loss": 2.5058, "step": 9084 }, { "crossentropy": 2.544735312461853, "epoch": 0.4940319203893526, "grad_norm": 0.04394300654530525, "grad_norm_var": 9.859878527420035e-06, "learning_rate": 0.0020646006596686008, "loss": 2.5447, "step": 9085 }, { "crossentropy": 2.5663074254989624, "epoch": 0.4940862992468528, "grad_norm": 0.032010678201913834, "grad_norm_var": 9.725838223113023e-06, "learning_rate": 0.002063592740236284, "loss": 2.5663, "step": 9086 }, { "crossentropy": 2.5965503454208374, "epoch": 0.494140678104353, "grad_norm": 0.03155072405934334, "grad_norm_var": 9.793747213070986e-06, "learning_rate": 0.00206258500291721, "loss": 2.5966, "step": 9087 }, { "crossentropy": 2.524359345436096, "epoch": 0.49419505696185323, "grad_norm": 0.03435578942298889, "grad_norm_var": 9.239784364852243e-06, "learning_rate": 0.0020615774477738737, "loss": 2.5244, "step": 9088 }, { "crossentropy": 2.5074849128723145, "epoch": 0.49424943581935343, "grad_norm": 0.03379238769412041, "grad_norm_var": 9.079904301605563e-06, "learning_rate": 0.002060570074868763, "loss": 2.5075, "step": 9089 }, { "crossentropy": 2.5589325428009033, "epoch": 0.49430381467685364, "grad_norm": 0.0315278135240078, "grad_norm_var": 9.141328395039817e-06, "learning_rate": 0.002059562884264359, "loss": 2.5589, "step": 9090 }, { "crossentropy": 2.641839623451233, "epoch": 0.49435819353435384, "grad_norm": 0.032268792390823364, "grad_norm_var": 9.195147151743316e-06, "learning_rate": 0.002058555876023122, "loss": 2.6418, "step": 9091 }, { "crossentropy": 2.519999384880066, "epoch": 0.49441257239185404, "grad_norm": 0.038744378834962845, "grad_norm_var": 1.0828032938538177e-05, "learning_rate": 0.002057549050207507, "loss": 2.52, "step": 9092 }, { "crossentropy": 2.6031296253204346, "epoch": 0.49446695124935425, "grad_norm": 0.03245354816317558, "grad_norm_var": 1.0603360861951201e-05, "learning_rate": 0.002056542406879957, "loss": 2.6031, "step": 9093 }, { "crossentropy": 2.5669039487838745, "epoch": 0.49452133010685445, "grad_norm": 0.031864654272794724, "grad_norm_var": 1.081297188995911e-05, "learning_rate": 0.002055535946102903, "loss": 2.5669, "step": 9094 }, { "crossentropy": 2.5664992332458496, "epoch": 0.49457570896435465, "grad_norm": 0.031744442880153656, "grad_norm_var": 1.0714466160773271e-05, "learning_rate": 0.0020545296679387644, "loss": 2.5665, "step": 9095 }, { "crossentropy": 2.540157198905945, "epoch": 0.49463008782185486, "grad_norm": 0.03531123697757721, "grad_norm_var": 1.0847188600328581e-05, "learning_rate": 0.0020535235724499492, "loss": 2.5402, "step": 9096 }, { "crossentropy": 2.5650410652160645, "epoch": 0.49468446667935506, "grad_norm": 0.0325661264359951, "grad_norm_var": 1.0659489270144611e-05, "learning_rate": 0.0020525176596988552, "loss": 2.565, "step": 9097 }, { "crossentropy": 2.68534255027771, "epoch": 0.49473884553685527, "grad_norm": 0.03276912495493889, "grad_norm_var": 1.073040760754702e-05, "learning_rate": 0.0020515119297478683, "loss": 2.6853, "step": 9098 }, { "crossentropy": 2.5067564249038696, "epoch": 0.49479322439435547, "grad_norm": 0.03218859061598778, "grad_norm_var": 1.089217895435604e-05, "learning_rate": 0.002050506382659364, "loss": 2.5068, "step": 9099 }, { "crossentropy": 2.5652670860290527, "epoch": 0.49484760325185567, "grad_norm": 0.033962056040763855, "grad_norm_var": 1.0711763498717255e-05, "learning_rate": 0.0020495010184957006, "loss": 2.5653, "step": 9100 }, { "crossentropy": 2.6141140460968018, "epoch": 0.4949019821093559, "grad_norm": 0.03266372159123421, "grad_norm_var": 3.4328539862617132e-06, "learning_rate": 0.0020484958373192374, "loss": 2.6141, "step": 9101 }, { "crossentropy": 2.5783830881118774, "epoch": 0.4949563609668561, "grad_norm": 0.033557794988155365, "grad_norm_var": 3.35550022352397e-06, "learning_rate": 0.0020474908391923084, "loss": 2.5784, "step": 9102 }, { "crossentropy": 2.497478723526001, "epoch": 0.4950107398243563, "grad_norm": 0.0312521755695343, "grad_norm_var": 3.4270242603865256e-06, "learning_rate": 0.0020464860241772454, "loss": 2.4975, "step": 9103 }, { "crossentropy": 2.691732406616211, "epoch": 0.4950651186818565, "grad_norm": 0.03388362377882004, "grad_norm_var": 3.36749694442532e-06, "learning_rate": 0.0020454813923363653, "loss": 2.6917, "step": 9104 }, { "crossentropy": 2.424458622932434, "epoch": 0.4951194975393567, "grad_norm": 0.034918393939733505, "grad_norm_var": 3.5417725083335937e-06, "learning_rate": 0.0020444769437319744, "loss": 2.4245, "step": 9105 }, { "crossentropy": 2.649134874343872, "epoch": 0.4951738763968569, "grad_norm": 0.03159381076693535, "grad_norm_var": 3.527068059031429e-06, "learning_rate": 0.0020434726784263686, "loss": 2.6491, "step": 9106 }, { "crossentropy": 2.6167486906051636, "epoch": 0.4952282552543571, "grad_norm": 0.0329643189907074, "grad_norm_var": 3.4678014002193295e-06, "learning_rate": 0.00204246859648183, "loss": 2.6167, "step": 9107 }, { "crossentropy": 2.637595534324646, "epoch": 0.4952826341118573, "grad_norm": 0.03140143305063248, "grad_norm_var": 1.4852079246413055e-06, "learning_rate": 0.0020414646979606326, "loss": 2.6376, "step": 9108 }, { "crossentropy": 2.577862024307251, "epoch": 0.4953370129693575, "grad_norm": 0.0342794805765152, "grad_norm_var": 1.6047488392386654e-06, "learning_rate": 0.0020404609829250365, "loss": 2.5779, "step": 9109 }, { "crossentropy": 2.4293285608291626, "epoch": 0.4953913918268577, "grad_norm": 0.030580485239624977, "grad_norm_var": 1.8906667716863385e-06, "learning_rate": 0.0020394574514372927, "loss": 2.4293, "step": 9110 }, { "crossentropy": 2.390061140060425, "epoch": 0.4954457706843579, "grad_norm": 0.03394243121147156, "grad_norm_var": 1.8679392457566913e-06, "learning_rate": 0.002038454103559634, "loss": 2.3901, "step": 9111 }, { "crossentropy": 2.559261679649353, "epoch": 0.4955001495418581, "grad_norm": 0.037496745586395264, "grad_norm_var": 2.842972977140631e-06, "learning_rate": 0.002037450939354294, "loss": 2.5593, "step": 9112 }, { "crossentropy": 2.5878063440322876, "epoch": 0.4955545283993583, "grad_norm": 0.03293098509311676, "grad_norm_var": 2.824043348570087e-06, "learning_rate": 0.0020364479588834833, "loss": 2.5878, "step": 9113 }, { "crossentropy": 2.4959787130355835, "epoch": 0.4956089072568585, "grad_norm": 0.03281761333346367, "grad_norm_var": 2.8217338835170944e-06, "learning_rate": 0.0020354451622094072, "loss": 2.496, "step": 9114 }, { "crossentropy": 2.573214054107666, "epoch": 0.4956632861143587, "grad_norm": 0.03225858137011528, "grad_norm_var": 2.813048450955276e-06, "learning_rate": 0.002034442549394258, "loss": 2.5732, "step": 9115 }, { "crossentropy": 2.558703660964966, "epoch": 0.49571766497185893, "grad_norm": 0.09874438494443893, "grad_norm_var": 0.00027206822661179236, "learning_rate": 0.0020334401205002175, "loss": 2.5587, "step": 9116 }, { "crossentropy": 2.6605974435806274, "epoch": 0.49577204382935913, "grad_norm": 0.031890906393527985, "grad_norm_var": 0.00027257353536156944, "learning_rate": 0.0020324378755894545, "loss": 2.6606, "step": 9117 }, { "crossentropy": 2.448285460472107, "epoch": 0.49582642268685934, "grad_norm": 0.03429795056581497, "grad_norm_var": 0.00027225257134741324, "learning_rate": 0.0020314358147241287, "loss": 2.4483, "step": 9118 }, { "crossentropy": 2.5762611627578735, "epoch": 0.49588080154435954, "grad_norm": 0.032602496445178986, "grad_norm_var": 0.0002712950688886642, "learning_rate": 0.0020304339379663854, "loss": 2.5763, "step": 9119 }, { "crossentropy": 2.558022618293762, "epoch": 0.49593518040185974, "grad_norm": 0.03431012108922005, "grad_norm_var": 0.00027111285882047383, "learning_rate": 0.0020294322453783607, "loss": 2.558, "step": 9120 }, { "crossentropy": 2.5386457443237305, "epoch": 0.49598955925935995, "grad_norm": 0.03272701054811478, "grad_norm_var": 0.00027211306483814477, "learning_rate": 0.002028430737022181, "loss": 2.5386, "step": 9121 }, { "crossentropy": 2.645013213157654, "epoch": 0.49604393811686015, "grad_norm": 0.03393540903925896, "grad_norm_var": 0.0002707124808410191, "learning_rate": 0.0020274294129599526, "loss": 2.645, "step": 9122 }, { "crossentropy": 2.5140219926834106, "epoch": 0.49609831697436035, "grad_norm": 0.03152814880013466, "grad_norm_var": 0.00027167618127094107, "learning_rate": 0.0020264282732537827, "loss": 2.514, "step": 9123 }, { "crossentropy": 2.508588433265686, "epoch": 0.49615269583186056, "grad_norm": 0.03204495832324028, "grad_norm_var": 0.0002712016091884101, "learning_rate": 0.0020254273179657607, "loss": 2.5086, "step": 9124 }, { "crossentropy": 2.4966601133346558, "epoch": 0.49620707468936076, "grad_norm": 0.03229660540819168, "grad_norm_var": 0.00027223910868344096, "learning_rate": 0.0020244265471579615, "loss": 2.4967, "step": 9125 }, { "crossentropy": 2.5102691650390625, "epoch": 0.49626145354686096, "grad_norm": 0.031401026993989944, "grad_norm_var": 0.00027156241469270486, "learning_rate": 0.0020234259608924534, "loss": 2.5103, "step": 9126 }, { "crossentropy": 2.616497278213501, "epoch": 0.49631583240436117, "grad_norm": 0.0344836600124836, "grad_norm_var": 0.000271345529609864, "learning_rate": 0.0020224255592312924, "loss": 2.6165, "step": 9127 }, { "crossentropy": 2.5405750274658203, "epoch": 0.49637021126186137, "grad_norm": 0.03249487653374672, "grad_norm_var": 0.00027273491096697444, "learning_rate": 0.0020214253422365224, "loss": 2.5406, "step": 9128 }, { "crossentropy": 2.571035861968994, "epoch": 0.4964245901193616, "grad_norm": 0.0447385199368, "grad_norm_var": 0.000275164069002274, "learning_rate": 0.002020425309970175, "loss": 2.571, "step": 9129 }, { "crossentropy": 2.5451327562332153, "epoch": 0.4964789689768618, "grad_norm": 0.032130345702171326, "grad_norm_var": 0.000275637395738916, "learning_rate": 0.002019425462494273, "loss": 2.5451, "step": 9130 }, { "crossentropy": 2.568213701248169, "epoch": 0.496533347834362, "grad_norm": 0.03170314431190491, "grad_norm_var": 0.0002760535730537583, "learning_rate": 0.002018425799870825, "loss": 2.5682, "step": 9131 }, { "crossentropy": 2.5708765983581543, "epoch": 0.4965877266918622, "grad_norm": 0.032699499279260635, "grad_norm_var": 1.0088643636327738e-05, "learning_rate": 0.0020174263221618303, "loss": 2.5709, "step": 9132 }, { "crossentropy": 2.57615065574646, "epoch": 0.4966421055493624, "grad_norm": 0.033455152064561844, "grad_norm_var": 9.915294784179398e-06, "learning_rate": 0.0020164270294292714, "loss": 2.5762, "step": 9133 }, { "crossentropy": 2.4941869974136353, "epoch": 0.4966964844068626, "grad_norm": 0.03439006581902504, "grad_norm_var": 9.924973908324805e-06, "learning_rate": 0.002015427921735128, "loss": 2.4942, "step": 9134 }, { "crossentropy": 2.511602282524109, "epoch": 0.4967508632643628, "grad_norm": 0.03567207604646683, "grad_norm_var": 1.0122469376409812e-05, "learning_rate": 0.0020144289991413647, "loss": 2.5116, "step": 9135 }, { "crossentropy": 2.5243782997131348, "epoch": 0.496805242121863, "grad_norm": 0.032263658940792084, "grad_norm_var": 1.0231565396876669e-05, "learning_rate": 0.0020134302617099285, "loss": 2.5244, "step": 9136 }, { "crossentropy": 2.5713027715682983, "epoch": 0.4968596209793632, "grad_norm": 0.03205370530486107, "grad_norm_var": 1.0340314171567294e-05, "learning_rate": 0.0020124317095027624, "loss": 2.5713, "step": 9137 }, { "crossentropy": 2.5841652154922485, "epoch": 0.4969139998368634, "grad_norm": 0.03232604265213013, "grad_norm_var": 1.042607403117235e-05, "learning_rate": 0.0020114333425817993, "loss": 2.5842, "step": 9138 }, { "crossentropy": 2.5822960138320923, "epoch": 0.4969683786943636, "grad_norm": 0.032031167298555374, "grad_norm_var": 1.0310973059232675e-05, "learning_rate": 0.002010435161008953, "loss": 2.5823, "step": 9139 }, { "crossentropy": 2.755781054496765, "epoch": 0.4970227575518638, "grad_norm": 0.03331308811903, "grad_norm_var": 1.0163508607192485e-05, "learning_rate": 0.00200943716484613, "loss": 2.7558, "step": 9140 }, { "crossentropy": 2.494473457336426, "epoch": 0.497077136409364, "grad_norm": 0.03140333294868469, "grad_norm_var": 1.0367520806567737e-05, "learning_rate": 0.0020084393541552263, "loss": 2.4945, "step": 9141 }, { "crossentropy": 2.6580426692962646, "epoch": 0.4971315152668642, "grad_norm": 0.03411489352583885, "grad_norm_var": 1.0055676485354268e-05, "learning_rate": 0.002007441728998125, "loss": 2.658, "step": 9142 }, { "crossentropy": 2.606170654296875, "epoch": 0.4971858941243644, "grad_norm": 0.03658647835254669, "grad_norm_var": 1.0550477863735299e-05, "learning_rate": 0.002006444289436699, "loss": 2.6062, "step": 9143 }, { "crossentropy": 2.6295851469039917, "epoch": 0.49724027298186463, "grad_norm": 0.032717835158109665, "grad_norm_var": 1.0513716013556736e-05, "learning_rate": 0.002005447035532804, "loss": 2.6296, "step": 9144 }, { "crossentropy": 2.6638879776000977, "epoch": 0.49729465183936483, "grad_norm": 0.03242120519280434, "grad_norm_var": 2.1135687984850476e-06, "learning_rate": 0.002004449967348294, "loss": 2.6639, "step": 9145 }, { "crossentropy": 2.5234698057174683, "epoch": 0.49734903069686504, "grad_norm": 0.033148717135190964, "grad_norm_var": 2.0494251993921054e-06, "learning_rate": 0.002003453084945006, "loss": 2.5235, "step": 9146 }, { "crossentropy": 2.6156126260757446, "epoch": 0.49740340955436524, "grad_norm": 0.03197235241532326, "grad_norm_var": 2.002244925973969e-06, "learning_rate": 0.0020024563883847624, "loss": 2.6156, "step": 9147 }, { "crossentropy": 2.5658706426620483, "epoch": 0.49745778841186544, "grad_norm": 0.03503264859318733, "grad_norm_var": 2.1990332007467326e-06, "learning_rate": 0.0020014598777293767, "loss": 2.5659, "step": 9148 }, { "crossentropy": 2.586089611053467, "epoch": 0.49751216726936565, "grad_norm": 0.03143982216715813, "grad_norm_var": 2.4129094271738614e-06, "learning_rate": 0.002000463553040658, "loss": 2.5861, "step": 9149 }, { "crossentropy": 2.522156238555908, "epoch": 0.49756654612686585, "grad_norm": 0.03400254249572754, "grad_norm_var": 2.3597943873118177e-06, "learning_rate": 0.001999467414380391, "loss": 2.5222, "step": 9150 }, { "crossentropy": 2.5468196868896484, "epoch": 0.49762092498436605, "grad_norm": 0.033224742859601974, "grad_norm_var": 1.913183582705896e-06, "learning_rate": 0.0019984714618103577, "loss": 2.5468, "step": 9151 }, { "crossentropy": 2.51406466960907, "epoch": 0.49767530384186626, "grad_norm": 0.0321383997797966, "grad_norm_var": 1.9265165162464513e-06, "learning_rate": 0.001997475695392326, "loss": 2.5141, "step": 9152 }, { "crossentropy": 2.555240750312805, "epoch": 0.49772968269936646, "grad_norm": 0.03237288072705269, "grad_norm_var": 1.892806608658204e-06, "learning_rate": 0.0019964801151880524, "loss": 2.5552, "step": 9153 }, { "crossentropy": 2.634568929672241, "epoch": 0.49778406155686666, "grad_norm": 0.03229512646794319, "grad_norm_var": 1.8957079220846898e-06, "learning_rate": 0.001995484721259284, "loss": 2.6346, "step": 9154 }, { "crossentropy": 2.4240245819091797, "epoch": 0.49783844041436687, "grad_norm": 0.03162713721394539, "grad_norm_var": 1.9588267883821965e-06, "learning_rate": 0.0019944895136677484, "loss": 2.424, "step": 9155 }, { "crossentropy": 2.630405068397522, "epoch": 0.49789281927186707, "grad_norm": 0.0328260138630867, "grad_norm_var": 1.952555099172098e-06, "learning_rate": 0.0019934944924751724, "loss": 2.6304, "step": 9156 }, { "crossentropy": 2.6287319660186768, "epoch": 0.4979471981293673, "grad_norm": 0.0322851724922657, "grad_norm_var": 1.8183905235056216e-06, "learning_rate": 0.0019924996577432674, "loss": 2.6287, "step": 9157 }, { "crossentropy": 2.5964527130126953, "epoch": 0.49800157698686753, "grad_norm": 0.03156832978129387, "grad_norm_var": 1.8495201454857742e-06, "learning_rate": 0.0019915050095337277, "loss": 2.5965, "step": 9158 }, { "crossentropy": 2.612382173538208, "epoch": 0.49805595584436774, "grad_norm": 0.03095409646630287, "grad_norm_var": 1.0290048530883199e-06, "learning_rate": 0.001990510547908241, "loss": 2.6124, "step": 9159 }, { "crossentropy": 2.5612716674804688, "epoch": 0.49811033470186794, "grad_norm": 0.031370293349027634, "grad_norm_var": 1.103661183006419e-06, "learning_rate": 0.0019895162729284873, "loss": 2.5613, "step": 9160 }, { "crossentropy": 2.583733558654785, "epoch": 0.49816471355936814, "grad_norm": 0.034922413527965546, "grad_norm_var": 1.4959103579477902e-06, "learning_rate": 0.0019885221846561263, "loss": 2.5837, "step": 9161 }, { "crossentropy": 2.610752820968628, "epoch": 0.49821909241686835, "grad_norm": 0.03189534693956375, "grad_norm_var": 1.498014888072901e-06, "learning_rate": 0.0019875282831528117, "loss": 2.6108, "step": 9162 }, { "crossentropy": 2.618778705596924, "epoch": 0.49827347127436855, "grad_norm": 0.03298015892505646, "grad_norm_var": 1.491202689919933e-06, "learning_rate": 0.0019865345684801842, "loss": 2.6188, "step": 9163 }, { "crossentropy": 2.5752631425857544, "epoch": 0.49832785013186875, "grad_norm": 0.032937344163656235, "grad_norm_var": 1.0743686128827922e-06, "learning_rate": 0.001985541040699874, "loss": 2.5753, "step": 9164 }, { "crossentropy": 2.60605251789093, "epoch": 0.49838222898936896, "grad_norm": 0.03188291937112808, "grad_norm_var": 1.0282885806321579e-06, "learning_rate": 0.001984547699873498, "loss": 2.6061, "step": 9165 }, { "crossentropy": 2.5383670330047607, "epoch": 0.49843660784686916, "grad_norm": 0.03446720913052559, "grad_norm_var": 1.1376508209352358e-06, "learning_rate": 0.001983554546062662, "loss": 2.5384, "step": 9166 }, { "crossentropy": 2.5182886123657227, "epoch": 0.49849098670436937, "grad_norm": 0.03315960243344307, "grad_norm_var": 1.1314843309922589e-06, "learning_rate": 0.001982561579328961, "loss": 2.5183, "step": 9167 }, { "crossentropy": 2.526467204093933, "epoch": 0.49854536556186957, "grad_norm": 0.031474221497774124, "grad_norm_var": 1.189319787301725e-06, "learning_rate": 0.0019815687997339787, "loss": 2.5265, "step": 9168 }, { "crossentropy": 2.5136196613311768, "epoch": 0.4985997444193698, "grad_norm": 0.03127625584602356, "grad_norm_var": 1.274096761857968e-06, "learning_rate": 0.0019805762073392846, "loss": 2.5136, "step": 9169 }, { "crossentropy": 2.6302292346954346, "epoch": 0.49865412327687, "grad_norm": 0.03290288895368576, "grad_norm_var": 1.2911070242449876e-06, "learning_rate": 0.0019795838022064367, "loss": 2.6302, "step": 9170 }, { "crossentropy": 2.457655906677246, "epoch": 0.4987085021343702, "grad_norm": 0.03152789920568466, "grad_norm_var": 1.302055865700045e-06, "learning_rate": 0.001978591584396989, "loss": 2.4577, "step": 9171 }, { "crossentropy": 2.594797968864441, "epoch": 0.4987628809918704, "grad_norm": 0.03223051503300667, "grad_norm_var": 1.2905438081790728e-06, "learning_rate": 0.0019775995539724734, "loss": 2.5948, "step": 9172 }, { "crossentropy": 2.5901821851730347, "epoch": 0.4988172598493706, "grad_norm": 0.03408370167016983, "grad_norm_var": 1.4736499893898413e-06, "learning_rate": 0.0019766077109944153, "loss": 2.5902, "step": 9173 }, { "crossentropy": 2.5220608711242676, "epoch": 0.4988716387068708, "grad_norm": 0.030040457844734192, "grad_norm_var": 1.804675658735079e-06, "learning_rate": 0.001975616055524328, "loss": 2.5221, "step": 9174 }, { "crossentropy": 2.615153431892395, "epoch": 0.498926017564371, "grad_norm": 0.03312402218580246, "grad_norm_var": 1.6859565271948318e-06, "learning_rate": 0.0019746245876237135, "loss": 2.6152, "step": 9175 }, { "crossentropy": 2.4774752855300903, "epoch": 0.4989803964218712, "grad_norm": 0.03280849754810333, "grad_norm_var": 1.595301422890362e-06, "learning_rate": 0.0019736333073540613, "loss": 2.4775, "step": 9176 }, { "crossentropy": 2.524554967880249, "epoch": 0.4990347752793714, "grad_norm": 0.03237432613968849, "grad_norm_var": 1.2144790021595408e-06, "learning_rate": 0.00197264221477685, "loss": 2.5246, "step": 9177 }, { "crossentropy": 2.5211559534072876, "epoch": 0.4990891541368716, "grad_norm": 0.033455390483140945, "grad_norm_var": 1.251666679516888e-06, "learning_rate": 0.001971651309953546, "loss": 2.5212, "step": 9178 }, { "crossentropy": 2.6134259700775146, "epoch": 0.4991435329943718, "grad_norm": 0.031087912619113922, "grad_norm_var": 1.3657488677829357e-06, "learning_rate": 0.001970660592945605, "loss": 2.6134, "step": 9179 }, { "crossentropy": 2.556410074234009, "epoch": 0.499197911851872, "grad_norm": 0.0308504868298769, "grad_norm_var": 1.495952896782846e-06, "learning_rate": 0.001969670063814471, "loss": 2.5564, "step": 9180 }, { "crossentropy": 2.6321334838867188, "epoch": 0.4992522907093722, "grad_norm": 0.03406715765595436, "grad_norm_var": 1.6736441447699843e-06, "learning_rate": 0.001968679722621571, "loss": 2.6321, "step": 9181 }, { "crossentropy": 2.4838839769363403, "epoch": 0.4993066695668724, "grad_norm": 0.03083411417901516, "grad_norm_var": 1.5132857255760467e-06, "learning_rate": 0.001967689569428332, "loss": 2.4839, "step": 9182 }, { "crossentropy": 2.613719940185547, "epoch": 0.4993610484243726, "grad_norm": 0.031102720648050308, "grad_norm_var": 1.5162069190390963e-06, "learning_rate": 0.0019666996042961575, "loss": 2.6137, "step": 9183 }, { "crossentropy": 2.633225202560425, "epoch": 0.4994154272818728, "grad_norm": 0.03581053391098976, "grad_norm_var": 2.342611174566365e-06, "learning_rate": 0.0019657098272864455, "loss": 2.6332, "step": 9184 }, { "crossentropy": 2.5862903594970703, "epoch": 0.49946980613937303, "grad_norm": 0.03139045462012291, "grad_norm_var": 2.3270988923365075e-06, "learning_rate": 0.0019647202384605812, "loss": 2.5863, "step": 9185 }, { "crossentropy": 2.611141562461853, "epoch": 0.49952418499687323, "grad_norm": 0.03195076063275337, "grad_norm_var": 2.3142913277156747e-06, "learning_rate": 0.001963730837879937, "loss": 2.6111, "step": 9186 }, { "crossentropy": 2.570374011993408, "epoch": 0.49957856385437344, "grad_norm": 0.03139936923980713, "grad_norm_var": 2.328490181561828e-06, "learning_rate": 0.0019627416256058766, "loss": 2.5704, "step": 9187 }, { "crossentropy": 2.582580804824829, "epoch": 0.49963294271187364, "grad_norm": 0.032381799072027206, "grad_norm_var": 2.32875801681696e-06, "learning_rate": 0.0019617526016997484, "loss": 2.5826, "step": 9188 }, { "crossentropy": 2.6379441022872925, "epoch": 0.49968732156937384, "grad_norm": 0.057332273572683334, "grad_norm_var": 4.1646318669894394e-05, "learning_rate": 0.0019607637662228915, "loss": 2.6379, "step": 9189 }, { "crossentropy": 2.5831700563430786, "epoch": 0.49974170042687405, "grad_norm": 0.03257524594664574, "grad_norm_var": 4.079395306018039e-05, "learning_rate": 0.0019597751192366327, "loss": 2.5832, "step": 9190 }, { "crossentropy": 2.56622314453125, "epoch": 0.49979607928437425, "grad_norm": 0.03571751341223717, "grad_norm_var": 4.094287292328667e-05, "learning_rate": 0.0019587866608022876, "loss": 2.5662, "step": 9191 }, { "crossentropy": 2.4988248348236084, "epoch": 0.49985045814187445, "grad_norm": 0.03219327703118324, "grad_norm_var": 4.107010436935274e-05, "learning_rate": 0.001957798390981155, "loss": 2.4988, "step": 9192 }, { "crossentropy": 2.5436757802963257, "epoch": 0.49990483699937466, "grad_norm": 0.0333581417798996, "grad_norm_var": 4.0913058694808047e-05, "learning_rate": 0.0019568103098345324, "loss": 2.5437, "step": 9193 }, { "crossentropy": 2.6292282342910767, "epoch": 0.49995921585687486, "grad_norm": 0.03191686421632767, "grad_norm_var": 4.1192042879627015e-05, "learning_rate": 0.0019558224174236988, "loss": 2.6292, "step": 9194 }, { "crossentropy": 2.558867931365967, "epoch": 0.500013594714375, "grad_norm": 0.033767957240343094, "grad_norm_var": 4.060105532344791e-05, "learning_rate": 0.0019548347138099186, "loss": 2.5589, "step": 9195 }, { "crossentropy": 2.4900201559066772, "epoch": 0.5000679735718753, "grad_norm": 0.030808236449956894, "grad_norm_var": 4.0619841869871326e-05, "learning_rate": 0.0019538471990544507, "loss": 2.49, "step": 9196 }, { "crossentropy": 2.604884147644043, "epoch": 0.5001223524293754, "grad_norm": 0.0317804291844368, "grad_norm_var": 4.09758542533003e-05, "learning_rate": 0.00195285987321854, "loss": 2.6049, "step": 9197 }, { "crossentropy": 2.599042534828186, "epoch": 0.5001767312868757, "grad_norm": 0.03217405453324318, "grad_norm_var": 4.051888644827824e-05, "learning_rate": 0.0019518727363634187, "loss": 2.599, "step": 9198 }, { "crossentropy": 2.6090694665908813, "epoch": 0.5002311101443758, "grad_norm": 0.03190912306308746, "grad_norm_var": 4.0236860085836264e-05, "learning_rate": 0.0019508857885503085, "loss": 2.6091, "step": 9199 }, { "crossentropy": 2.6608203649520874, "epoch": 0.5002854890018761, "grad_norm": 0.03267965093255043, "grad_norm_var": 4.015804307253608e-05, "learning_rate": 0.001949899029840419, "loss": 2.6608, "step": 9200 }, { "crossentropy": 2.596789836883545, "epoch": 0.5003398678593762, "grad_norm": 0.03176335617899895, "grad_norm_var": 4.0039052934272695e-05, "learning_rate": 0.0019489124602949481, "loss": 2.5968, "step": 9201 }, { "crossentropy": 2.5625803470611572, "epoch": 0.5003942467168765, "grad_norm": 0.03227163106203079, "grad_norm_var": 3.9958596401102704e-05, "learning_rate": 0.0019479260799750836, "loss": 2.5626, "step": 9202 }, { "crossentropy": 2.5146299600601196, "epoch": 0.5004486255743766, "grad_norm": 0.033328454941511154, "grad_norm_var": 3.952180522370965e-05, "learning_rate": 0.0019469398889419948, "loss": 2.5146, "step": 9203 }, { "crossentropy": 2.5802804231643677, "epoch": 0.5005030044318769, "grad_norm": 0.03116035833954811, "grad_norm_var": 3.989851819770901e-05, "learning_rate": 0.0019459538872568494, "loss": 2.5803, "step": 9204 }, { "crossentropy": 2.5004199743270874, "epoch": 0.500557383289377, "grad_norm": 0.03297123312950134, "grad_norm_var": 1.3528547303518398e-06, "learning_rate": 0.0019449680749807985, "loss": 2.5004, "step": 9205 }, { "crossentropy": 2.537633180618286, "epoch": 0.5006117621468773, "grad_norm": 0.034234508872032166, "grad_norm_var": 1.5363813729222033e-06, "learning_rate": 0.001943982452174977, "loss": 2.5376, "step": 9206 }, { "crossentropy": 2.5578352212905884, "epoch": 0.5006661410043775, "grad_norm": 0.03481126204133034, "grad_norm_var": 1.2142955655223066e-06, "learning_rate": 0.0019429970189005137, "loss": 2.5578, "step": 9207 }, { "crossentropy": 2.438826084136963, "epoch": 0.5007205198618777, "grad_norm": 0.03541180491447449, "grad_norm_var": 1.6998333650986019e-06, "learning_rate": 0.0019420117752185285, "loss": 2.4388, "step": 9208 }, { "crossentropy": 2.510999917984009, "epoch": 0.5007748987193779, "grad_norm": 0.0406254380941391, "grad_norm_var": 5.568937317509034e-06, "learning_rate": 0.0019410267211901206, "loss": 2.511, "step": 9209 }, { "crossentropy": 2.545786738395691, "epoch": 0.5008292775768781, "grad_norm": 0.03125286474823952, "grad_norm_var": 5.712386278240202e-06, "learning_rate": 0.0019400418568763828, "loss": 2.5458, "step": 9210 }, { "crossentropy": 2.6086037158966064, "epoch": 0.5008836564343783, "grad_norm": 0.034166380763053894, "grad_norm_var": 5.753308120997937e-06, "learning_rate": 0.0019390571823383969, "loss": 2.6086, "step": 9211 }, { "crossentropy": 2.499766707420349, "epoch": 0.5009380352918785, "grad_norm": 0.03213421627879143, "grad_norm_var": 5.438695590544394e-06, "learning_rate": 0.0019380726976372304, "loss": 2.4998, "step": 9212 }, { "crossentropy": 2.584237813949585, "epoch": 0.5009924141493787, "grad_norm": 0.031141862273216248, "grad_norm_var": 5.592894344070657e-06, "learning_rate": 0.001937088402833943, "loss": 2.5842, "step": 9213 }, { "crossentropy": 2.5261365175247192, "epoch": 0.5010467930068789, "grad_norm": 0.034202419221401215, "grad_norm_var": 5.558435935861377e-06, "learning_rate": 0.0019361042979895732, "loss": 2.5261, "step": 9214 }, { "crossentropy": 2.6233359575271606, "epoch": 0.5011011718643791, "grad_norm": 0.034870196133852005, "grad_norm_var": 5.526097370533769e-06, "learning_rate": 0.0019351203831651604, "loss": 2.6233, "step": 9215 }, { "crossentropy": 2.5986099243164062, "epoch": 0.5011555507218793, "grad_norm": 0.03132561221718788, "grad_norm_var": 5.800363705010571e-06, "learning_rate": 0.0019341366584217257, "loss": 2.5986, "step": 9216 }, { "crossentropy": 2.6443523168563843, "epoch": 0.5012099295793795, "grad_norm": 0.03316454589366913, "grad_norm_var": 5.602457597289091e-06, "learning_rate": 0.0019331531238202754, "loss": 2.6444, "step": 9217 }, { "crossentropy": 2.5493987798690796, "epoch": 0.5012643084368797, "grad_norm": 0.03419665992259979, "grad_norm_var": 5.501570436480287e-06, "learning_rate": 0.001932169779421809, "loss": 2.5494, "step": 9218 }, { "crossentropy": 2.474705457687378, "epoch": 0.5013186872943799, "grad_norm": 0.03252486139535904, "grad_norm_var": 5.580386156979015e-06, "learning_rate": 0.001931186625287313, "loss": 2.4747, "step": 9219 }, { "crossentropy": 2.551435708999634, "epoch": 0.5013730661518802, "grad_norm": 0.03209155797958374, "grad_norm_var": 5.32706498530122e-06, "learning_rate": 0.0019302036614777614, "loss": 2.5514, "step": 9220 }, { "crossentropy": 2.6831839084625244, "epoch": 0.5014274450093803, "grad_norm": 0.032995712012052536, "grad_norm_var": 5.3247390626322276e-06, "learning_rate": 0.0019292208880541163, "loss": 2.6832, "step": 9221 }, { "crossentropy": 2.5386688709259033, "epoch": 0.5014818238668806, "grad_norm": 0.030794067308306694, "grad_norm_var": 5.817899770089877e-06, "learning_rate": 0.0019282383050773294, "loss": 2.5387, "step": 9222 }, { "crossentropy": 2.49507212638855, "epoch": 0.5015362027243807, "grad_norm": 0.031071415171027184, "grad_norm_var": 6.029142529298768e-06, "learning_rate": 0.0019272559126083388, "loss": 2.4951, "step": 9223 }, { "crossentropy": 2.4545971155166626, "epoch": 0.501590581581881, "grad_norm": 0.03261541575193405, "grad_norm_var": 5.711138421767014e-06, "learning_rate": 0.0019262737107080736, "loss": 2.4546, "step": 9224 }, { "crossentropy": 2.6132892370224, "epoch": 0.5016449604393811, "grad_norm": 0.03286000341176987, "grad_norm_var": 1.6606214119023466e-06, "learning_rate": 0.0019252916994374452, "loss": 2.6133, "step": 9225 }, { "crossentropy": 2.6252483129501343, "epoch": 0.5016993392968814, "grad_norm": 0.03160868212580681, "grad_norm_var": 1.6051929986757308e-06, "learning_rate": 0.0019243098788573577, "loss": 2.6252, "step": 9226 }, { "crossentropy": 2.6737254858016968, "epoch": 0.5017537181543815, "grad_norm": 0.03276342526078224, "grad_norm_var": 1.4371152107103247e-06, "learning_rate": 0.0019233282490287074, "loss": 2.6737, "step": 9227 }, { "crossentropy": 2.5195152759552, "epoch": 0.5018080970118818, "grad_norm": 0.032195914536714554, "grad_norm_var": 1.4341586019316876e-06, "learning_rate": 0.0019223468100123687, "loss": 2.5195, "step": 9228 }, { "crossentropy": 2.6255643367767334, "epoch": 0.5018624758693819, "grad_norm": 0.03252238780260086, "grad_norm_var": 1.298422884056458e-06, "learning_rate": 0.0019213655618692117, "loss": 2.6256, "step": 9229 }, { "crossentropy": 2.613856554031372, "epoch": 0.5019168547268822, "grad_norm": 0.035688064992427826, "grad_norm_var": 1.7512746384390834e-06, "learning_rate": 0.0019203845046600926, "loss": 2.6139, "step": 9230 }, { "crossentropy": 2.547357439994812, "epoch": 0.5019712335843823, "grad_norm": 0.03202953189611435, "grad_norm_var": 1.4357328395712965e-06, "learning_rate": 0.0019194036384458553, "loss": 2.5474, "step": 9231 }, { "crossentropy": 2.5547447204589844, "epoch": 0.5020256124418826, "grad_norm": 0.032267820090055466, "grad_norm_var": 1.3401654554161721e-06, "learning_rate": 0.0019184229632873324, "loss": 2.5547, "step": 9232 }, { "crossentropy": 2.509127974510193, "epoch": 0.5020799912993829, "grad_norm": 0.031819529831409454, "grad_norm_var": 1.3496360690324247e-06, "learning_rate": 0.0019174424792453438, "loss": 2.5091, "step": 9233 }, { "crossentropy": 2.535598397254944, "epoch": 0.502134370156883, "grad_norm": 0.032688822597265244, "grad_norm_var": 1.1511955127218564e-06, "learning_rate": 0.0019164621863806991, "loss": 2.5356, "step": 9234 }, { "crossentropy": 2.4979724884033203, "epoch": 0.5021887490143833, "grad_norm": 0.049511536955833435, "grad_norm_var": 1.9448766355926957e-05, "learning_rate": 0.001915482084754197, "loss": 2.498, "step": 9235 }, { "crossentropy": 2.589115619659424, "epoch": 0.5022431278718834, "grad_norm": 0.032435085624456406, "grad_norm_var": 1.9392993203647818e-05, "learning_rate": 0.0019145021744266183, "loss": 2.5891, "step": 9236 }, { "crossentropy": 2.5114283561706543, "epoch": 0.5022975067293837, "grad_norm": 0.03179292008280754, "grad_norm_var": 1.956295735209765e-05, "learning_rate": 0.001913522455458736, "loss": 2.5114, "step": 9237 }, { "crossentropy": 2.55535626411438, "epoch": 0.5023518855868838, "grad_norm": 0.0332273505628109, "grad_norm_var": 1.9082182715383098e-05, "learning_rate": 0.0019125429279113172, "loss": 2.5554, "step": 9238 }, { "crossentropy": 2.5580259561538696, "epoch": 0.5024062644443841, "grad_norm": 0.03238517418503761, "grad_norm_var": 1.8752625485554155e-05, "learning_rate": 0.0019115635918451064, "loss": 2.558, "step": 9239 }, { "crossentropy": 2.592129111289978, "epoch": 0.5024606433018842, "grad_norm": 0.03251413255929947, "grad_norm_var": 1.8767247940633795e-05, "learning_rate": 0.0019105844473208417, "loss": 2.5921, "step": 9240 }, { "crossentropy": 2.6110754013061523, "epoch": 0.5025150221593845, "grad_norm": 0.03286254033446312, "grad_norm_var": 1.8766983016117965e-05, "learning_rate": 0.0019096054943992492, "loss": 2.6111, "step": 9241 }, { "crossentropy": 2.5791701078414917, "epoch": 0.5025694010168846, "grad_norm": 0.032108139246702194, "grad_norm_var": 1.8646996446547207e-05, "learning_rate": 0.001908626733141043, "loss": 2.5792, "step": 9242 }, { "crossentropy": 2.665654420852661, "epoch": 0.5026237798743849, "grad_norm": 0.03133771941065788, "grad_norm_var": 1.8947468305978896e-05, "learning_rate": 0.0019076481636069247, "loss": 2.6657, "step": 9243 }, { "crossentropy": 2.5853785276412964, "epoch": 0.502678158731885, "grad_norm": 0.030961062759160995, "grad_norm_var": 1.9271755061580028e-05, "learning_rate": 0.0019066697858575842, "loss": 2.5854, "step": 9244 }, { "crossentropy": 2.6385433673858643, "epoch": 0.5027325375893853, "grad_norm": 0.037702467292547226, "grad_norm_var": 2.0267063394906137e-05, "learning_rate": 0.0019056915999537, "loss": 2.6385, "step": 9245 }, { "crossentropy": 2.5541584491729736, "epoch": 0.5027869164468854, "grad_norm": 0.04075311869382858, "grad_norm_var": 2.3123122417184095e-05, "learning_rate": 0.0019047136059559377, "loss": 2.5542, "step": 9246 }, { "crossentropy": 2.6171168088912964, "epoch": 0.5028412953043857, "grad_norm": 0.032168198376894, "grad_norm_var": 2.308512266711744e-05, "learning_rate": 0.0019037358039249537, "loss": 2.6171, "step": 9247 }, { "crossentropy": 2.5988134145736694, "epoch": 0.5028956741618859, "grad_norm": 0.03268223628401756, "grad_norm_var": 2.299138733348541e-05, "learning_rate": 0.001902758193921385, "loss": 2.5988, "step": 9248 }, { "crossentropy": 2.6037453413009644, "epoch": 0.5029500530193861, "grad_norm": 0.03229234740138054, "grad_norm_var": 2.2856274095321184e-05, "learning_rate": 0.0019017807760058686, "loss": 2.6037, "step": 9249 }, { "crossentropy": 2.5604969263076782, "epoch": 0.5030044318768863, "grad_norm": 0.03258635848760605, "grad_norm_var": 2.2877766088874366e-05, "learning_rate": 0.0019008035502390186, "loss": 2.5605, "step": 9250 }, { "crossentropy": 2.556623339653015, "epoch": 0.5030588107343865, "grad_norm": 0.0325084924697876, "grad_norm_var": 6.251428390541038e-06, "learning_rate": 0.0018998265166814431, "loss": 2.5566, "step": 9251 }, { "crossentropy": 2.6649011373519897, "epoch": 0.5031131895918867, "grad_norm": 0.03309517353773117, "grad_norm_var": 6.2161944685713726e-06, "learning_rate": 0.0018988496753937368, "loss": 2.6649, "step": 9252 }, { "crossentropy": 2.565609335899353, "epoch": 0.5031675684493869, "grad_norm": 0.03466959297657013, "grad_norm_var": 6.1990383832216205e-06, "learning_rate": 0.0018978730264364824, "loss": 2.5656, "step": 9253 }, { "crossentropy": 2.6212438344955444, "epoch": 0.5032219473068871, "grad_norm": 0.031652312725782394, "grad_norm_var": 6.383177098258595e-06, "learning_rate": 0.0018968965698702505, "loss": 2.6212, "step": 9254 }, { "crossentropy": 2.5982418060302734, "epoch": 0.5032763261643873, "grad_norm": 0.03325990214943886, "grad_norm_var": 6.32809970584324e-06, "learning_rate": 0.0018959203057556007, "loss": 2.5982, "step": 9255 }, { "crossentropy": 2.569687008857727, "epoch": 0.5033307050218875, "grad_norm": 0.03185121342539787, "grad_norm_var": 6.426982752727404e-06, "learning_rate": 0.0018949442341530793, "loss": 2.5697, "step": 9256 }, { "crossentropy": 2.618261694908142, "epoch": 0.5033850838793877, "grad_norm": 0.031488846987485886, "grad_norm_var": 6.621508380479033e-06, "learning_rate": 0.0018939683551232218, "loss": 2.6183, "step": 9257 }, { "crossentropy": 2.6016347408294678, "epoch": 0.5034394627368879, "grad_norm": 0.0319226048886776, "grad_norm_var": 6.6505421295205584e-06, "learning_rate": 0.001892992668726553, "loss": 2.6016, "step": 9258 }, { "crossentropy": 2.550470471382141, "epoch": 0.5034938415943881, "grad_norm": 0.03294425457715988, "grad_norm_var": 6.4165352406958065e-06, "learning_rate": 0.0018920171750235787, "loss": 2.5505, "step": 9259 }, { "crossentropy": 2.6305785179138184, "epoch": 0.5035482204518883, "grad_norm": 0.033321600407361984, "grad_norm_var": 6.0337908498647265e-06, "learning_rate": 0.0018910418740748032, "loss": 2.6306, "step": 9260 }, { "crossentropy": 2.528104066848755, "epoch": 0.5036025993093886, "grad_norm": 0.0310811810195446, "grad_norm_var": 5.003016097652329e-06, "learning_rate": 0.0018900667659407144, "loss": 2.5281, "step": 9261 }, { "crossentropy": 2.6153323650360107, "epoch": 0.5036569781668887, "grad_norm": 0.032150302082300186, "grad_norm_var": 7.552793233162394e-07, "learning_rate": 0.0018890918506817833, "loss": 2.6153, "step": 9262 }, { "crossentropy": 2.4890247583389282, "epoch": 0.503711357024389, "grad_norm": 0.032242968678474426, "grad_norm_var": 7.525236224131815e-07, "learning_rate": 0.0018881171283584752, "loss": 2.489, "step": 9263 }, { "crossentropy": 2.4245526790618896, "epoch": 0.5037657358818891, "grad_norm": 0.03081943839788437, "grad_norm_var": 9.20246710580698e-07, "learning_rate": 0.0018871425990312418, "loss": 2.4246, "step": 9264 }, { "crossentropy": 2.4782310724258423, "epoch": 0.5038201147393894, "grad_norm": 0.030929602682590485, "grad_norm_var": 1.0500437976788032e-06, "learning_rate": 0.001886168262760522, "loss": 2.4782, "step": 9265 }, { "crossentropy": 2.646709680557251, "epoch": 0.5038744935968895, "grad_norm": 0.032360371202230453, "grad_norm_var": 1.044087176472972e-06, "learning_rate": 0.001885194119606744, "loss": 2.6467, "step": 9266 }, { "crossentropy": 2.5655605792999268, "epoch": 0.5039288724543898, "grad_norm": 0.03280993923544884, "grad_norm_var": 1.0594078869217057e-06, "learning_rate": 0.0018842201696303224, "loss": 2.5656, "step": 9267 }, { "crossentropy": 2.559072732925415, "epoch": 0.5039832513118899, "grad_norm": 0.031882110983133316, "grad_norm_var": 1.0207364385963458e-06, "learning_rate": 0.001883246412891661, "loss": 2.5591, "step": 9268 }, { "crossentropy": 2.6538432836532593, "epoch": 0.5040376301693902, "grad_norm": 0.03314284235239029, "grad_norm_var": 6.660644373318939e-07, "learning_rate": 0.0018822728494511532, "loss": 2.6538, "step": 9269 }, { "crossentropy": 2.6155163049697876, "epoch": 0.5040920090268903, "grad_norm": 0.031560976058244705, "grad_norm_var": 6.722353809713103e-07, "learning_rate": 0.0018812994793691734, "loss": 2.6155, "step": 9270 }, { "crossentropy": 2.63319730758667, "epoch": 0.5041463878843906, "grad_norm": 0.033527616411447525, "grad_norm_var": 7.177426475272914e-07, "learning_rate": 0.0018803263027060936, "loss": 2.6332, "step": 9271 }, { "crossentropy": 2.6362876892089844, "epoch": 0.5042007667418907, "grad_norm": 0.03266745060682297, "grad_norm_var": 7.293422232064935e-07, "learning_rate": 0.00187935331952227, "loss": 2.6363, "step": 9272 }, { "crossentropy": 2.536580801010132, "epoch": 0.504255145599391, "grad_norm": 0.03226233646273613, "grad_norm_var": 6.95634946044447e-07, "learning_rate": 0.0018783805298780427, "loss": 2.5366, "step": 9273 }, { "crossentropy": 2.5546001195907593, "epoch": 0.5043095244568911, "grad_norm": 0.032907404005527496, "grad_norm_var": 7.163327608199094e-07, "learning_rate": 0.0018774079338337425, "loss": 2.5546, "step": 9274 }, { "crossentropy": 2.631974697113037, "epoch": 0.5043639033143914, "grad_norm": 0.03273484483361244, "grad_norm_var": 7.007542377569787e-07, "learning_rate": 0.0018764355314496944, "loss": 2.632, "step": 9275 }, { "crossentropy": 2.610399603843689, "epoch": 0.5044182821718916, "grad_norm": 0.03238263353705406, "grad_norm_var": 6.248358745036357e-07, "learning_rate": 0.0018754633227862012, "loss": 2.6104, "step": 9276 }, { "crossentropy": 2.562039613723755, "epoch": 0.5044726610293918, "grad_norm": 0.0319734551012516, "grad_norm_var": 5.395414125197728e-07, "learning_rate": 0.0018744913079035597, "loss": 2.562, "step": 9277 }, { "crossentropy": 2.5163707733154297, "epoch": 0.504527039886892, "grad_norm": 0.03214507922530174, "grad_norm_var": 5.396279653030004e-07, "learning_rate": 0.001873519486862053, "loss": 2.5164, "step": 9278 }, { "crossentropy": 2.5344886779785156, "epoch": 0.5045814187443922, "grad_norm": 0.03128334879875183, "grad_norm_var": 6.008734651450262e-07, "learning_rate": 0.0018725478597219536, "loss": 2.5345, "step": 9279 }, { "crossentropy": 2.5607569217681885, "epoch": 0.5046357976018924, "grad_norm": 0.03179171681404114, "grad_norm_var": 4.794492808037528e-07, "learning_rate": 0.0018715764265435214, "loss": 2.5608, "step": 9280 }, { "crossentropy": 2.535524606704712, "epoch": 0.5046901764593926, "grad_norm": 0.03399735316634178, "grad_norm_var": 5.18308558985064e-07, "learning_rate": 0.0018706051873869994, "loss": 2.5355, "step": 9281 }, { "crossentropy": 2.5516971349716187, "epoch": 0.5047445553168928, "grad_norm": 0.03588185831904411, "grad_norm_var": 1.2445452525687457e-06, "learning_rate": 0.0018696341423126272, "loss": 2.5517, "step": 9282 }, { "crossentropy": 2.4733211994171143, "epoch": 0.504798934174393, "grad_norm": 0.02988363616168499, "grad_norm_var": 1.7307800443586238e-06, "learning_rate": 0.0018686632913806295, "loss": 2.4733, "step": 9283 }, { "crossentropy": 2.5470082759857178, "epoch": 0.5048533130318932, "grad_norm": 0.036873240023851395, "grad_norm_var": 2.8755196426749245e-06, "learning_rate": 0.0018676926346512135, "loss": 2.547, "step": 9284 }, { "crossentropy": 2.519698143005371, "epoch": 0.5049076918893934, "grad_norm": 0.03228176385164261, "grad_norm_var": 2.884047211588432e-06, "learning_rate": 0.001866722172184578, "loss": 2.5197, "step": 9285 }, { "crossentropy": 2.572647452354431, "epoch": 0.5049620707468936, "grad_norm": 0.03255739063024521, "grad_norm_var": 2.7868470767440268e-06, "learning_rate": 0.0018657519040409154, "loss": 2.5726, "step": 9286 }, { "crossentropy": 2.6120699644088745, "epoch": 0.5050164496043938, "grad_norm": 0.03199047967791557, "grad_norm_var": 2.789893052390852e-06, "learning_rate": 0.0018647818302803966, "loss": 2.6121, "step": 9287 }, { "crossentropy": 2.513134717941284, "epoch": 0.505070828461894, "grad_norm": 0.0320814773440361, "grad_norm_var": 2.8159179853467447e-06, "learning_rate": 0.001863811950963185, "loss": 2.5131, "step": 9288 }, { "crossentropy": 2.6550636291503906, "epoch": 0.5051252073193943, "grad_norm": 0.03274819999933243, "grad_norm_var": 2.8030156441980245e-06, "learning_rate": 0.0018628422661494332, "loss": 2.6551, "step": 9289 }, { "crossentropy": 2.536185145378113, "epoch": 0.5051795861768944, "grad_norm": 0.03194364160299301, "grad_norm_var": 2.8369371526980644e-06, "learning_rate": 0.0018618727758992787, "loss": 2.5362, "step": 9290 }, { "crossentropy": 2.6040594577789307, "epoch": 0.5052339650343947, "grad_norm": 0.03141550347208977, "grad_norm_var": 2.932453750695618e-06, "learning_rate": 0.001860903480272851, "loss": 2.6041, "step": 9291 }, { "crossentropy": 2.5962761640548706, "epoch": 0.5052883438918948, "grad_norm": 0.033633578568696976, "grad_norm_var": 2.9978515578732288e-06, "learning_rate": 0.0018599343793302592, "loss": 2.5963, "step": 9292 }, { "crossentropy": 2.474920630455017, "epoch": 0.5053427227493951, "grad_norm": 0.03139722719788551, "grad_norm_var": 3.0709756010757614e-06, "learning_rate": 0.0018589654731316113, "loss": 2.4749, "step": 9293 }, { "crossentropy": 2.6357266902923584, "epoch": 0.5053971016068952, "grad_norm": 0.03220837190747261, "grad_norm_var": 3.0672257564285833e-06, "learning_rate": 0.0018579967617369981, "loss": 2.6357, "step": 9294 }, { "crossentropy": 2.5330138206481934, "epoch": 0.5054514804643955, "grad_norm": 0.03215012699365616, "grad_norm_var": 2.9593525353814916e-06, "learning_rate": 0.0018570282452064952, "loss": 2.533, "step": 9295 }, { "crossentropy": 2.55718195438385, "epoch": 0.5055058593218956, "grad_norm": 0.03105115331709385, "grad_norm_var": 3.081066139043696e-06, "learning_rate": 0.0018560599236001679, "loss": 2.5572, "step": 9296 }, { "crossentropy": 2.5077013969421387, "epoch": 0.5055602381793959, "grad_norm": 0.030910955742001534, "grad_norm_var": 3.1141248570558244e-06, "learning_rate": 0.0018550917969780772, "loss": 2.5077, "step": 9297 }, { "crossentropy": 2.5807583332061768, "epoch": 0.505614617036896, "grad_norm": 0.03138197213411331, "grad_norm_var": 2.3134457869802e-06, "learning_rate": 0.001854123865400259, "loss": 2.5808, "step": 9298 }, { "crossentropy": 2.4945263862609863, "epoch": 0.5056689958943963, "grad_norm": 0.03269534930586815, "grad_norm_var": 1.9553579165935975e-06, "learning_rate": 0.0018531561289267461, "loss": 2.4945, "step": 9299 }, { "crossentropy": 2.5384846925735474, "epoch": 0.5057233747518964, "grad_norm": 0.031440503895282745, "grad_norm_var": 5.108890120781098e-07, "learning_rate": 0.001852188587617556, "loss": 2.5385, "step": 9300 }, { "crossentropy": 2.5780270099639893, "epoch": 0.5057777536093967, "grad_norm": 0.031954120844602585, "grad_norm_var": 5.049826914307887e-07, "learning_rate": 0.001851221241532695, "loss": 2.578, "step": 9301 }, { "crossentropy": 2.5564855337142944, "epoch": 0.5058321324668968, "grad_norm": 0.03234093636274338, "grad_norm_var": 4.910307880981457e-07, "learning_rate": 0.0018502540907321574, "loss": 2.5565, "step": 9302 }, { "crossentropy": 2.7118102312088013, "epoch": 0.5058865113243971, "grad_norm": 0.034997422248125076, "grad_norm_var": 1.0687683396230773e-06, "learning_rate": 0.0018492871352759244, "loss": 2.7118, "step": 9303 }, { "crossentropy": 2.4713295698165894, "epoch": 0.5059408901818973, "grad_norm": 0.03254995867609978, "grad_norm_var": 1.0783983894114072e-06, "learning_rate": 0.0018483203752239657, "loss": 2.4713, "step": 9304 }, { "crossentropy": 2.5076977014541626, "epoch": 0.5059952690393975, "grad_norm": 0.032544150948524475, "grad_norm_var": 1.0654381972169738e-06, "learning_rate": 0.0018473538106362408, "loss": 2.5077, "step": 9305 }, { "crossentropy": 2.60614550113678, "epoch": 0.5060496478968977, "grad_norm": 0.032200708985328674, "grad_norm_var": 1.0620348345808176e-06, "learning_rate": 0.0018463874415726917, "loss": 2.6061, "step": 9306 }, { "crossentropy": 2.5924190282821655, "epoch": 0.5061040267543979, "grad_norm": 0.03874693065881729, "grad_norm_var": 3.6745717359114303e-06, "learning_rate": 0.001845421268093252, "loss": 2.5924, "step": 9307 }, { "crossentropy": 2.4443936347961426, "epoch": 0.5061584056118981, "grad_norm": 0.03193581476807594, "grad_norm_var": 3.629290100229619e-06, "learning_rate": 0.001844455290257847, "loss": 2.4444, "step": 9308 }, { "crossentropy": 2.5241395235061646, "epoch": 0.5062127844693983, "grad_norm": 0.03212478756904602, "grad_norm_var": 3.552330182693813e-06, "learning_rate": 0.001843489508126382, "loss": 2.5241, "step": 9309 }, { "crossentropy": 2.5569615364074707, "epoch": 0.5062671633268985, "grad_norm": 0.032258640974760056, "grad_norm_var": 3.550016843853881e-06, "learning_rate": 0.0018425239217587542, "loss": 2.557, "step": 9310 }, { "crossentropy": 2.569725751876831, "epoch": 0.5063215421843987, "grad_norm": 0.03200121223926544, "grad_norm_var": 3.5599424616880014e-06, "learning_rate": 0.001841558531214849, "loss": 2.5697, "step": 9311 }, { "crossentropy": 2.4849990606307983, "epoch": 0.5063759210418989, "grad_norm": 0.03972966969013214, "grad_norm_var": 6.508664186868545e-06, "learning_rate": 0.0018405933365545396, "loss": 2.485, "step": 9312 }, { "crossentropy": 2.634031295776367, "epoch": 0.5064302998993991, "grad_norm": 0.032925207167863846, "grad_norm_var": 6.1707574385218174e-06, "learning_rate": 0.001839628337837686, "loss": 2.634, "step": 9313 }, { "crossentropy": 2.5382591485977173, "epoch": 0.5064846787568993, "grad_norm": 0.033257171511650085, "grad_norm_var": 5.926171480326719e-06, "learning_rate": 0.0018386635351241366, "loss": 2.5383, "step": 9314 }, { "crossentropy": 2.522379517555237, "epoch": 0.5065390576143995, "grad_norm": 0.03240017965435982, "grad_norm_var": 5.9576335380617214e-06, "learning_rate": 0.0018376989284737267, "loss": 2.5224, "step": 9315 }, { "crossentropy": 2.6702020168304443, "epoch": 0.5065934364718997, "grad_norm": 0.03247342258691788, "grad_norm_var": 5.762993234711909e-06, "learning_rate": 0.0018367345179462814, "loss": 2.6702, "step": 9316 }, { "crossentropy": 2.5455474853515625, "epoch": 0.5066478153294, "grad_norm": 0.03265790641307831, "grad_norm_var": 5.658035290576994e-06, "learning_rate": 0.0018357703036016137, "loss": 2.5455, "step": 9317 }, { "crossentropy": 2.5739662647247314, "epoch": 0.5067021941869001, "grad_norm": 0.031173327937722206, "grad_norm_var": 5.915358691214534e-06, "learning_rate": 0.001834806285499519, "loss": 2.574, "step": 9318 }, { "crossentropy": 2.6153472661972046, "epoch": 0.5067565730444004, "grad_norm": 0.03206019103527069, "grad_norm_var": 5.818601158731141e-06, "learning_rate": 0.0018338424636997908, "loss": 2.6153, "step": 9319 }, { "crossentropy": 2.569001793861389, "epoch": 0.5068109519019005, "grad_norm": 0.032825496047735214, "grad_norm_var": 5.799833822227535e-06, "learning_rate": 0.001832878838262199, "loss": 2.569, "step": 9320 }, { "crossentropy": 2.520227551460266, "epoch": 0.5068653307594008, "grad_norm": 0.03220716863870621, "grad_norm_var": 5.836721507469185e-06, "learning_rate": 0.001831915409246509, "loss": 2.5202, "step": 9321 }, { "crossentropy": 2.4705477952957153, "epoch": 0.5069197096169009, "grad_norm": 0.03524196147918701, "grad_norm_var": 6.015215191019227e-06, "learning_rate": 0.001830952176712472, "loss": 2.4705, "step": 9322 }, { "crossentropy": 2.543851613998413, "epoch": 0.5069740884744012, "grad_norm": 0.03199021518230438, "grad_norm_var": 4.0300676292912785e-06, "learning_rate": 0.0018299891407198266, "loss": 2.5439, "step": 9323 }, { "crossentropy": 2.5744357109069824, "epoch": 0.5070284673319013, "grad_norm": 0.03274758160114288, "grad_norm_var": 3.9610601063278975e-06, "learning_rate": 0.0018290263013282992, "loss": 2.5744, "step": 9324 }, { "crossentropy": 2.5462534427642822, "epoch": 0.5070828461894016, "grad_norm": 0.032137349247932434, "grad_norm_var": 3.959596322566335e-06, "learning_rate": 0.001828063658597604, "loss": 2.5463, "step": 9325 }, { "crossentropy": 2.610248565673828, "epoch": 0.5071372250469017, "grad_norm": 0.03328036516904831, "grad_norm_var": 3.923107875056564e-06, "learning_rate": 0.0018271012125874436, "loss": 2.6102, "step": 9326 }, { "crossentropy": 2.5023542642593384, "epoch": 0.507191603904402, "grad_norm": 0.03158504143357277, "grad_norm_var": 3.993199052855833e-06, "learning_rate": 0.001826138963357508, "loss": 2.5024, "step": 9327 }, { "crossentropy": 2.4986588954925537, "epoch": 0.5072459827619021, "grad_norm": 0.03149506449699402, "grad_norm_var": 8.899248648493724e-07, "learning_rate": 0.001825176910967477, "loss": 2.4987, "step": 9328 }, { "crossentropy": 2.5875173807144165, "epoch": 0.5073003616194024, "grad_norm": 0.032990358769893646, "grad_norm_var": 8.936354126028023e-07, "learning_rate": 0.0018242150554770098, "loss": 2.5875, "step": 9329 }, { "crossentropy": 2.60615074634552, "epoch": 0.5073547404769025, "grad_norm": 0.033126793801784515, "grad_norm_var": 8.821033844818299e-07, "learning_rate": 0.0018232533969457666, "loss": 2.6062, "step": 9330 }, { "crossentropy": 2.582970380783081, "epoch": 0.5074091193344028, "grad_norm": 0.03118128515779972, "grad_norm_var": 9.95168624576675e-07, "learning_rate": 0.0018222919354333872, "loss": 2.583, "step": 9331 }, { "crossentropy": 2.6642929315567017, "epoch": 0.507463498191903, "grad_norm": 0.03283590078353882, "grad_norm_var": 1.0045925105468695e-06, "learning_rate": 0.001821330670999498, "loss": 2.6643, "step": 9332 }, { "crossentropy": 2.5806963443756104, "epoch": 0.5075178770494032, "grad_norm": 0.031889408826828, "grad_norm_var": 1.0223527234097413e-06, "learning_rate": 0.001820369603703717, "loss": 2.5807, "step": 9333 }, { "crossentropy": 2.589316248893738, "epoch": 0.5075722559069034, "grad_norm": 0.030804023146629333, "grad_norm_var": 1.0924099916610839e-06, "learning_rate": 0.001819408733605649, "loss": 2.5893, "step": 9334 }, { "crossentropy": 2.446404457092285, "epoch": 0.5076266347644036, "grad_norm": 0.032996002584695816, "grad_norm_var": 1.1047583242553785e-06, "learning_rate": 0.0018184480607648862, "loss": 2.4464, "step": 9335 }, { "crossentropy": 2.521654963493347, "epoch": 0.5076810136219038, "grad_norm": 0.031252216547727585, "grad_norm_var": 1.1824478604630876e-06, "learning_rate": 0.0018174875852410082, "loss": 2.5217, "step": 9336 }, { "crossentropy": 2.4873056411743164, "epoch": 0.507735392479404, "grad_norm": 0.03219953179359436, "grad_norm_var": 1.1826071723699254e-06, "learning_rate": 0.0018165273070935828, "loss": 2.4873, "step": 9337 }, { "crossentropy": 2.550102710723877, "epoch": 0.5077897713369042, "grad_norm": 0.03082127682864666, "grad_norm_var": 7.050572326259066e-07, "learning_rate": 0.0018155672263821666, "loss": 2.5501, "step": 9338 }, { "crossentropy": 2.5951744318008423, "epoch": 0.5078441501944044, "grad_norm": 0.03187430277466774, "grad_norm_var": 7.073352162011159e-07, "learning_rate": 0.0018146073431663035, "loss": 2.5952, "step": 9339 }, { "crossentropy": 2.478369355201721, "epoch": 0.5078985290519046, "grad_norm": 0.03136715665459633, "grad_norm_var": 7.02830250524864e-07, "learning_rate": 0.0018136476575055195, "loss": 2.4784, "step": 9340 }, { "crossentropy": 2.55411434173584, "epoch": 0.5079529079094048, "grad_norm": 0.03145744279026985, "grad_norm_var": 7.183422511052492e-07, "learning_rate": 0.0018126881694593394, "loss": 2.5541, "step": 9341 }, { "crossentropy": 2.585011124610901, "epoch": 0.508007286766905, "grad_norm": 0.032603535801172256, "grad_norm_var": 6.266688103670828e-07, "learning_rate": 0.0018117288790872688, "loss": 2.585, "step": 9342 }, { "crossentropy": 2.6166279315948486, "epoch": 0.5080616656244052, "grad_norm": 0.032222259789705276, "grad_norm_var": 6.248657926157785e-07, "learning_rate": 0.0018107697864487994, "loss": 2.6166, "step": 9343 }, { "crossentropy": 2.42246413230896, "epoch": 0.5081160444819054, "grad_norm": 0.032338134944438934, "grad_norm_var": 6.18735966212367e-07, "learning_rate": 0.0018098108916034146, "loss": 2.4225, "step": 9344 }, { "crossentropy": 2.597006916999817, "epoch": 0.5081704233394057, "grad_norm": 0.031993649899959564, "grad_norm_var": 5.488767013394601e-07, "learning_rate": 0.0018088521946105834, "loss": 2.597, "step": 9345 }, { "crossentropy": 2.571867346763611, "epoch": 0.5082248021969058, "grad_norm": 0.030747728422284126, "grad_norm_var": 5.246342559661465e-07, "learning_rate": 0.0018078936955297643, "loss": 2.5719, "step": 9346 }, { "crossentropy": 2.659275770187378, "epoch": 0.5082791810544061, "grad_norm": 0.031997211277484894, "grad_norm_var": 5.004022815140984e-07, "learning_rate": 0.0018069353944204026, "loss": 2.6593, "step": 9347 }, { "crossentropy": 2.5711774826049805, "epoch": 0.5083335599119062, "grad_norm": 0.033650439232587814, "grad_norm_var": 6.503022541240004e-07, "learning_rate": 0.0018059772913419302, "loss": 2.5712, "step": 9348 }, { "crossentropy": 2.5923287868499756, "epoch": 0.5083879387694065, "grad_norm": 0.033288128674030304, "grad_norm_var": 7.727673891053111e-07, "learning_rate": 0.001805019386353769, "loss": 2.5923, "step": 9349 }, { "crossentropy": 2.6083686351776123, "epoch": 0.5084423176269066, "grad_norm": 0.03237738460302353, "grad_norm_var": 6.816637309579876e-07, "learning_rate": 0.0018040616795153282, "loss": 2.6084, "step": 9350 }, { "crossentropy": 2.4179913997650146, "epoch": 0.5084966964844069, "grad_norm": 0.03194490075111389, "grad_norm_var": 6.215199031567656e-07, "learning_rate": 0.0018031041708859992, "loss": 2.418, "step": 9351 }, { "crossentropy": 2.50998592376709, "epoch": 0.508551075341907, "grad_norm": 0.032009307295084, "grad_norm_var": 5.810051040023092e-07, "learning_rate": 0.0018021468605251706, "loss": 2.51, "step": 9352 }, { "crossentropy": 2.587233781814575, "epoch": 0.5086054541994073, "grad_norm": 0.03191760554909706, "grad_norm_var": 5.805688928425812e-07, "learning_rate": 0.001801189748492214, "loss": 2.5872, "step": 9353 }, { "crossentropy": 2.5769214630126953, "epoch": 0.5086598330569074, "grad_norm": 0.03312724083662033, "grad_norm_var": 5.387674155660766e-07, "learning_rate": 0.0018002328348464853, "loss": 2.5769, "step": 9354 }, { "crossentropy": 2.422472357749939, "epoch": 0.5087142119144077, "grad_norm": 0.03380521386861801, "grad_norm_var": 6.925041159240442e-07, "learning_rate": 0.0017992761196473318, "loss": 2.4225, "step": 9355 }, { "crossentropy": 2.6520798206329346, "epoch": 0.5087685907719078, "grad_norm": 0.03089972957968712, "grad_norm_var": 7.64482186660697e-07, "learning_rate": 0.001798319602954092, "loss": 2.6521, "step": 9356 }, { "crossentropy": 2.494821310043335, "epoch": 0.5088229696294081, "grad_norm": 0.032799381762742996, "grad_norm_var": 7.309752411406665e-07, "learning_rate": 0.0017973632848260841, "loss": 2.4948, "step": 9357 }, { "crossentropy": 2.5534181594848633, "epoch": 0.5088773484869082, "grad_norm": 0.032388728111982346, "grad_norm_var": 7.26815735724986e-07, "learning_rate": 0.0017964071653226194, "loss": 2.5534, "step": 9358 }, { "crossentropy": 2.573305606842041, "epoch": 0.5089317273444085, "grad_norm": 0.03540102019906044, "grad_norm_var": 1.306669715976635e-06, "learning_rate": 0.0017954512445029957, "loss": 2.5733, "step": 9359 }, { "crossentropy": 2.5659823417663574, "epoch": 0.5089861062019087, "grad_norm": 0.03295547142624855, "grad_norm_var": 1.3136372724326192e-06, "learning_rate": 0.0017944955224264987, "loss": 2.566, "step": 9360 }, { "crossentropy": 2.4854071140289307, "epoch": 0.5090404850594089, "grad_norm": 0.03269930183887482, "grad_norm_var": 1.2894548536414626e-06, "learning_rate": 0.0017935399991524021, "loss": 2.4854, "step": 9361 }, { "crossentropy": 2.6293972730636597, "epoch": 0.5090948639169091, "grad_norm": 0.03230109438300133, "grad_norm_var": 1.0513381691519436e-06, "learning_rate": 0.0017925846747399632, "loss": 2.6294, "step": 9362 }, { "crossentropy": 2.538857936859131, "epoch": 0.5091492427744093, "grad_norm": 0.03262314945459366, "grad_norm_var": 1.015282864601635e-06, "learning_rate": 0.0017916295492484313, "loss": 2.5389, "step": 9363 }, { "crossentropy": 2.529055118560791, "epoch": 0.5092036216319095, "grad_norm": 0.030550949275493622, "grad_norm_var": 1.2484483124332312e-06, "learning_rate": 0.0017906746227370474, "loss": 2.5291, "step": 9364 }, { "crossentropy": 2.561095356941223, "epoch": 0.5092580004894097, "grad_norm": 0.03251931816339493, "grad_norm_var": 1.2115750561682293e-06, "learning_rate": 0.001789719895265029, "loss": 2.5611, "step": 9365 }, { "crossentropy": 2.6942495107650757, "epoch": 0.5093123793469099, "grad_norm": 0.03304267302155495, "grad_norm_var": 1.2265885073020968e-06, "learning_rate": 0.00178876536689159, "loss": 2.6942, "step": 9366 }, { "crossentropy": 2.591156005859375, "epoch": 0.5093667582044101, "grad_norm": 0.03491245582699776, "grad_norm_var": 1.5329883109517494e-06, "learning_rate": 0.0017878110376759293, "loss": 2.5912, "step": 9367 }, { "crossentropy": 2.4989781379699707, "epoch": 0.5094211370619103, "grad_norm": 0.0322360135614872, "grad_norm_var": 1.5139007260816609e-06, "learning_rate": 0.001786856907677233, "loss": 2.499, "step": 9368 }, { "crossentropy": 2.476415514945984, "epoch": 0.5094755159194105, "grad_norm": 0.030949139967560768, "grad_norm_var": 1.6814545522947683e-06, "learning_rate": 0.0017859029769546758, "loss": 2.4764, "step": 9369 }, { "crossentropy": 2.4426504373550415, "epoch": 0.5095298947769107, "grad_norm": 0.0322154238820076, "grad_norm_var": 1.6815583098577265e-06, "learning_rate": 0.0017849492455674193, "loss": 2.4427, "step": 9370 }, { "crossentropy": 2.5085537433624268, "epoch": 0.509584273634411, "grad_norm": 0.03131549060344696, "grad_norm_var": 1.683395869081144e-06, "learning_rate": 0.0017839957135746137, "loss": 2.5086, "step": 9371 }, { "crossentropy": 2.414395332336426, "epoch": 0.5096386524919111, "grad_norm": 0.031022166833281517, "grad_norm_var": 1.658402968421809e-06, "learning_rate": 0.001783042381035397, "loss": 2.4144, "step": 9372 }, { "crossentropy": 2.5220178365707397, "epoch": 0.5096930313494114, "grad_norm": 0.030574113130569458, "grad_norm_var": 1.877799339473849e-06, "learning_rate": 0.0017820892480088907, "loss": 2.522, "step": 9373 }, { "crossentropy": 2.549509286880493, "epoch": 0.5097474102069115, "grad_norm": 0.03274988755583763, "grad_norm_var": 1.8874959767794496e-06, "learning_rate": 0.0017811363145542082, "loss": 2.5495, "step": 9374 }, { "crossentropy": 2.6334840059280396, "epoch": 0.5098017890644118, "grad_norm": 0.033911459147930145, "grad_norm_var": 1.426018218222506e-06, "learning_rate": 0.0017801835807304534, "loss": 2.6335, "step": 9375 }, { "crossentropy": 2.479520320892334, "epoch": 0.5098561679219119, "grad_norm": 0.030918460339307785, "grad_norm_var": 1.5035629594431036e-06, "learning_rate": 0.0017792310465967094, "loss": 2.4795, "step": 9376 }, { "crossentropy": 2.595108151435852, "epoch": 0.5099105467794122, "grad_norm": 0.030893921852111816, "grad_norm_var": 1.5771715732140294e-06, "learning_rate": 0.0017782787122120531, "loss": 2.5951, "step": 9377 }, { "crossentropy": 2.5300673246383667, "epoch": 0.5099649256369123, "grad_norm": 0.031367022544145584, "grad_norm_var": 1.5999298106042503e-06, "learning_rate": 0.001777326577635548, "loss": 2.5301, "step": 9378 }, { "crossentropy": 2.522353172302246, "epoch": 0.5100193044944126, "grad_norm": 0.03155110403895378, "grad_norm_var": 1.5809152493115666e-06, "learning_rate": 0.0017763746429262433, "loss": 2.5224, "step": 9379 }, { "crossentropy": 2.551806926727295, "epoch": 0.5100736833519127, "grad_norm": 0.031697485595941544, "grad_norm_var": 1.4536937740417903e-06, "learning_rate": 0.0017754229081431783, "loss": 2.5518, "step": 9380 }, { "crossentropy": 2.581291437149048, "epoch": 0.510128062209413, "grad_norm": 0.03152075409889221, "grad_norm_var": 1.4458406995490151e-06, "learning_rate": 0.0017744713733453782, "loss": 2.5813, "step": 9381 }, { "crossentropy": 2.4059295654296875, "epoch": 0.5101824410669132, "grad_norm": 0.03305257856845856, "grad_norm_var": 1.4473165837695807e-06, "learning_rate": 0.0017735200385918565, "loss": 2.4059, "step": 9382 }, { "crossentropy": 2.564141631126404, "epoch": 0.5102368199244134, "grad_norm": 0.031916141510009766, "grad_norm_var": 8.171053314349568e-07, "learning_rate": 0.0017725689039416138, "loss": 2.5641, "step": 9383 }, { "crossentropy": 2.544608473777771, "epoch": 0.5102911987819136, "grad_norm": 0.030991626903414726, "grad_norm_var": 8.321193353635989e-07, "learning_rate": 0.0017716179694536405, "loss": 2.5446, "step": 9384 }, { "crossentropy": 2.4585952758789062, "epoch": 0.5103455776394138, "grad_norm": 0.030378196388483047, "grad_norm_var": 9.070205382261456e-07, "learning_rate": 0.0017706672351869073, "loss": 2.4586, "step": 9385 }, { "crossentropy": 2.6579941511154175, "epoch": 0.510399956496914, "grad_norm": 0.030553149059414864, "grad_norm_var": 9.499087807997615e-07, "learning_rate": 0.0017697167012003857, "loss": 2.658, "step": 9386 }, { "crossentropy": 2.5704407691955566, "epoch": 0.5104543353544142, "grad_norm": 0.031724367290735245, "grad_norm_var": 9.488895419876616e-07, "learning_rate": 0.0017687663675530203, "loss": 2.5704, "step": 9387 }, { "crossentropy": 2.4887964725494385, "epoch": 0.5105087142119145, "grad_norm": 0.032733004540205, "grad_norm_var": 1.0111001244227092e-06, "learning_rate": 0.0017678162343037524, "loss": 2.4888, "step": 9388 }, { "crossentropy": 2.561874747276306, "epoch": 0.5105630930694146, "grad_norm": 0.03414454311132431, "grad_norm_var": 1.2916990972911916e-06, "learning_rate": 0.0017668663015115082, "loss": 2.5619, "step": 9389 }, { "crossentropy": 2.542031407356262, "epoch": 0.5106174719269149, "grad_norm": 0.031492382287979126, "grad_norm_var": 1.24492819206645e-06, "learning_rate": 0.001765916569235202, "loss": 2.542, "step": 9390 }, { "crossentropy": 2.5542794466018677, "epoch": 0.510671850784415, "grad_norm": 0.03193120285868645, "grad_norm_var": 9.332816138780027e-07, "learning_rate": 0.0017649670375337345, "loss": 2.5543, "step": 9391 }, { "crossentropy": 2.5916038751602173, "epoch": 0.5107262296419153, "grad_norm": 0.0318140983581543, "grad_norm_var": 8.925801619947989e-07, "learning_rate": 0.0017640177064659956, "loss": 2.5916, "step": 9392 }, { "crossentropy": 2.551993250846863, "epoch": 0.5107806084994154, "grad_norm": 0.03186222165822983, "grad_norm_var": 8.425789285480707e-07, "learning_rate": 0.001763068576090862, "loss": 2.552, "step": 9393 }, { "crossentropy": 2.531903862953186, "epoch": 0.5108349873569157, "grad_norm": 0.03257080167531967, "grad_norm_var": 8.643555476012717e-07, "learning_rate": 0.0017621196464671974, "loss": 2.5319, "step": 9394 }, { "crossentropy": 2.5634233951568604, "epoch": 0.5108893662144158, "grad_norm": 0.0313151516020298, "grad_norm_var": 8.778945703932783e-07, "learning_rate": 0.0017611709176538549, "loss": 2.5634, "step": 9395 }, { "crossentropy": 2.5464982986450195, "epoch": 0.5109437450719161, "grad_norm": 0.030324673280119896, "grad_norm_var": 1.024717171792935e-06, "learning_rate": 0.0017602223897096691, "loss": 2.5465, "step": 9396 }, { "crossentropy": 2.462088942527771, "epoch": 0.5109981239294162, "grad_norm": 0.03275078907608986, "grad_norm_var": 1.0783511528034226e-06, "learning_rate": 0.0017592740626934727, "loss": 2.4621, "step": 9397 }, { "crossentropy": 2.6126251220703125, "epoch": 0.5110525027869165, "grad_norm": 0.03325023874640465, "grad_norm_var": 1.1125608261548185e-06, "learning_rate": 0.0017583259366640786, "loss": 2.6126, "step": 9398 }, { "crossentropy": 2.525835871696472, "epoch": 0.5111068816444166, "grad_norm": 0.032271429896354675, "grad_norm_var": 1.1231316584830402e-06, "learning_rate": 0.0017573780116802863, "loss": 2.5258, "step": 9399 }, { "crossentropy": 2.595182776451111, "epoch": 0.5111612605019169, "grad_norm": 0.03473425656557083, "grad_norm_var": 1.554403488360628e-06, "learning_rate": 0.001756430287800887, "loss": 2.5952, "step": 9400 }, { "crossentropy": 2.450569748878479, "epoch": 0.511215639359417, "grad_norm": 0.030434008687734604, "grad_norm_var": 1.5416686230525107e-06, "learning_rate": 0.0017554827650846567, "loss": 2.4506, "step": 9401 }, { "crossentropy": 2.5512943267822266, "epoch": 0.5112700182169173, "grad_norm": 0.03154009208083153, "grad_norm_var": 1.3964739980773839e-06, "learning_rate": 0.0017545354435903604, "loss": 2.5513, "step": 9402 }, { "crossentropy": 2.626810312271118, "epoch": 0.5113243970744175, "grad_norm": 0.03313887119293213, "grad_norm_var": 1.435436439724833e-06, "learning_rate": 0.0017535883233767509, "loss": 2.6268, "step": 9403 }, { "crossentropy": 2.5489574670791626, "epoch": 0.5113787759319177, "grad_norm": 0.03235633298754692, "grad_norm_var": 1.4210122094826614e-06, "learning_rate": 0.0017526414045025663, "loss": 2.549, "step": 9404 }, { "crossentropy": 2.5565625429153442, "epoch": 0.5114331547894179, "grad_norm": 0.03360837697982788, "grad_norm_var": 1.3032328199873012e-06, "learning_rate": 0.0017516946870265344, "loss": 2.5566, "step": 9405 }, { "crossentropy": 2.728625178337097, "epoch": 0.5114875336469181, "grad_norm": 0.035854749381542206, "grad_norm_var": 2.073951913241972e-06, "learning_rate": 0.001750748171007371, "loss": 2.7286, "step": 9406 }, { "crossentropy": 2.5576030015945435, "epoch": 0.5115419125044183, "grad_norm": 0.031241482123732567, "grad_norm_var": 2.1545972503847834e-06, "learning_rate": 0.0017498018565037738, "loss": 2.5576, "step": 9407 }, { "crossentropy": 2.6345688104629517, "epoch": 0.5115962913619185, "grad_norm": 0.03283470869064331, "grad_norm_var": 2.1342920116221643e-06, "learning_rate": 0.0017488557435744367, "loss": 2.6346, "step": 9408 }, { "crossentropy": 2.4755325317382812, "epoch": 0.5116506702194187, "grad_norm": 0.031630534678697586, "grad_norm_var": 2.15751919207946e-06, "learning_rate": 0.0017479098322780373, "loss": 2.4755, "step": 9409 }, { "crossentropy": 2.5955923795700073, "epoch": 0.5117050490769189, "grad_norm": 0.03275596350431442, "grad_norm_var": 2.161631393858554e-06, "learning_rate": 0.0017469641226732363, "loss": 2.5956, "step": 9410 }, { "crossentropy": 2.555577278137207, "epoch": 0.5117594279344191, "grad_norm": 0.04326276108622551, "grad_norm_var": 9.191588614353846e-06, "learning_rate": 0.0017460186148186862, "loss": 2.5556, "step": 9411 }, { "crossentropy": 2.558356285095215, "epoch": 0.5118138067919193, "grad_norm": 0.031540386378765106, "grad_norm_var": 8.809888657899352e-06, "learning_rate": 0.0017450733087730314, "loss": 2.5584, "step": 9412 }, { "crossentropy": 2.530758500099182, "epoch": 0.5118681856494195, "grad_norm": 0.03437512740492821, "grad_norm_var": 8.850364199580956e-06, "learning_rate": 0.001744128204594893, "loss": 2.5308, "step": 9413 }, { "crossentropy": 2.669989228248596, "epoch": 0.5119225645069198, "grad_norm": 0.03247184678912163, "grad_norm_var": 8.906560467200142e-06, "learning_rate": 0.0017431833023428883, "loss": 2.67, "step": 9414 }, { "crossentropy": 2.571052312850952, "epoch": 0.5119769433644199, "grad_norm": 0.03431427478790283, "grad_norm_var": 8.865929756552089e-06, "learning_rate": 0.0017422386020756193, "loss": 2.5711, "step": 9415 }, { "crossentropy": 2.6046199798583984, "epoch": 0.5120313222219202, "grad_norm": 0.03201831504702568, "grad_norm_var": 8.882117439298999e-06, "learning_rate": 0.0017412941038516744, "loss": 2.6046, "step": 9416 }, { "crossentropy": 2.5415133237838745, "epoch": 0.5120857010794203, "grad_norm": 0.03173235431313515, "grad_norm_var": 8.485082258523795e-06, "learning_rate": 0.0017403498077296332, "loss": 2.5415, "step": 9417 }, { "crossentropy": 2.5440279245376587, "epoch": 0.5121400799369206, "grad_norm": 0.0365871824324131, "grad_norm_var": 8.813920110032587e-06, "learning_rate": 0.0017394057137680547, "loss": 2.544, "step": 9418 }, { "crossentropy": 2.6351345777511597, "epoch": 0.5121944587944207, "grad_norm": 0.03366486728191376, "grad_norm_var": 8.789564921783779e-06, "learning_rate": 0.001738461822025495, "loss": 2.6351, "step": 9419 }, { "crossentropy": 2.6229796409606934, "epoch": 0.512248837651921, "grad_norm": 0.033291012048721313, "grad_norm_var": 8.668540783375227e-06, "learning_rate": 0.0017375181325604944, "loss": 2.623, "step": 9420 }, { "crossentropy": 2.6126322746276855, "epoch": 0.5123032165094211, "grad_norm": 0.034799687564373016, "grad_norm_var": 8.722992791929567e-06, "learning_rate": 0.001736574645431576, "loss": 2.6126, "step": 9421 }, { "crossentropy": 2.456623077392578, "epoch": 0.5123575953669214, "grad_norm": 0.03146606683731079, "grad_norm_var": 8.782034560728226e-06, "learning_rate": 0.001735631360697254, "loss": 2.4566, "step": 9422 }, { "crossentropy": 2.5083913803100586, "epoch": 0.5124119742244215, "grad_norm": 0.03091147541999817, "grad_norm_var": 8.893681073693776e-06, "learning_rate": 0.0017346882784160344, "loss": 2.5084, "step": 9423 }, { "crossentropy": 2.6515166759490967, "epoch": 0.5124663530819218, "grad_norm": 0.03242403268814087, "grad_norm_var": 8.946320480693731e-06, "learning_rate": 0.0017337453986464024, "loss": 2.6515, "step": 9424 }, { "crossentropy": 2.4942795038223267, "epoch": 0.5125207319394219, "grad_norm": 0.03978295996785164, "grad_norm_var": 1.0983465921779861e-05, "learning_rate": 0.0017328027214468355, "loss": 2.4943, "step": 9425 }, { "crossentropy": 2.352970838546753, "epoch": 0.5125751107969222, "grad_norm": 0.031717777252197266, "grad_norm_var": 1.1235133467637065e-05, "learning_rate": 0.0017318602468757982, "loss": 2.353, "step": 9426 }, { "crossentropy": 2.5285838842391968, "epoch": 0.5126294896544223, "grad_norm": 0.03153933584690094, "grad_norm_var": 5.381396608289402e-06, "learning_rate": 0.0017309179749917414, "loss": 2.5286, "step": 9427 }, { "crossentropy": 2.5969202518463135, "epoch": 0.5126838685119226, "grad_norm": 0.03253119811415672, "grad_norm_var": 5.2116422209248664e-06, "learning_rate": 0.0017299759058531051, "loss": 2.5969, "step": 9428 }, { "crossentropy": 2.588368058204651, "epoch": 0.5127382473694228, "grad_norm": 0.03253748640418053, "grad_norm_var": 5.171945849528383e-06, "learning_rate": 0.0017290340395183118, "loss": 2.5884, "step": 9429 }, { "crossentropy": 2.4305062294006348, "epoch": 0.512792626226923, "grad_norm": 0.03229358792304993, "grad_norm_var": 5.192114749376596e-06, "learning_rate": 0.001728092376045779, "loss": 2.4305, "step": 9430 }, { "crossentropy": 2.576274275779724, "epoch": 0.5128470050844232, "grad_norm": 0.032394763082265854, "grad_norm_var": 5.143799921770015e-06, "learning_rate": 0.0017271509154939085, "loss": 2.5763, "step": 9431 }, { "crossentropy": 2.567407965660095, "epoch": 0.5129013839419234, "grad_norm": 0.03538314998149872, "grad_norm_var": 5.3635573064071684e-06, "learning_rate": 0.0017262096579210857, "loss": 2.5674, "step": 9432 }, { "crossentropy": 2.5285468101501465, "epoch": 0.5129557627994236, "grad_norm": 0.032634437084198, "grad_norm_var": 5.223932579042668e-06, "learning_rate": 0.0017252686033856857, "loss": 2.5285, "step": 9433 }, { "crossentropy": 2.564997673034668, "epoch": 0.5130101416569238, "grad_norm": 0.03289119526743889, "grad_norm_var": 4.493482461905412e-06, "learning_rate": 0.0017243277519460771, "loss": 2.565, "step": 9434 }, { "crossentropy": 2.5974165201187134, "epoch": 0.513064520514424, "grad_norm": 0.03356538340449333, "grad_norm_var": 4.487158010949704e-06, "learning_rate": 0.0017233871036606058, "loss": 2.5974, "step": 9435 }, { "crossentropy": 2.4680837392807007, "epoch": 0.5131188993719242, "grad_norm": 0.03262628614902496, "grad_norm_var": 4.500966585605642e-06, "learning_rate": 0.0017224466585876114, "loss": 2.4681, "step": 9436 }, { "crossentropy": 2.4049824476242065, "epoch": 0.5131732782294244, "grad_norm": 0.031001929193735123, "grad_norm_var": 4.538533053641674e-06, "learning_rate": 0.00172150641678542, "loss": 2.405, "step": 9437 }, { "crossentropy": 2.492303490638733, "epoch": 0.5132276570869246, "grad_norm": 0.03157417103648186, "grad_norm_var": 4.519224550262694e-06, "learning_rate": 0.0017205663783123437, "loss": 2.4923, "step": 9438 }, { "crossentropy": 2.586240530014038, "epoch": 0.5132820359444248, "grad_norm": 0.03303616866469383, "grad_norm_var": 4.248496796379191e-06, "learning_rate": 0.0017196265432266838, "loss": 2.5862, "step": 9439 }, { "crossentropy": 2.6644413471221924, "epoch": 0.513336414801925, "grad_norm": 0.03300948813557625, "grad_norm_var": 4.2252814232100295e-06, "learning_rate": 0.001718686911586727, "loss": 2.6644, "step": 9440 }, { "crossentropy": 2.4337249994277954, "epoch": 0.5133907936594252, "grad_norm": 0.03241400420665741, "grad_norm_var": 9.865717097938426e-07, "learning_rate": 0.001717747483450749, "loss": 2.4337, "step": 9441 }, { "crossentropy": 2.6246092319488525, "epoch": 0.5134451725169255, "grad_norm": 0.03709622845053673, "grad_norm_var": 2.1820426696107786e-06, "learning_rate": 0.0017168082588770134, "loss": 2.6246, "step": 9442 }, { "crossentropy": 2.7148901224136353, "epoch": 0.5134995513744256, "grad_norm": 0.03200466185808182, "grad_norm_var": 2.110655864982306e-06, "learning_rate": 0.0017158692379237667, "loss": 2.7149, "step": 9443 }, { "crossentropy": 2.544859766960144, "epoch": 0.5135539302319259, "grad_norm": 0.0344441682100296, "grad_norm_var": 2.2358327623932133e-06, "learning_rate": 0.0017149304206492471, "loss": 2.5449, "step": 9444 }, { "crossentropy": 2.5870752334594727, "epoch": 0.513608309089426, "grad_norm": 0.035438187420368195, "grad_norm_var": 2.5609029892496397e-06, "learning_rate": 0.0017139918071116833, "loss": 2.5871, "step": 9445 }, { "crossentropy": 2.6017900705337524, "epoch": 0.5136626879469263, "grad_norm": 0.03172174096107483, "grad_norm_var": 2.65334803366916e-06, "learning_rate": 0.001713053397369282, "loss": 2.6018, "step": 9446 }, { "crossentropy": 2.546104907989502, "epoch": 0.5137170668044264, "grad_norm": 0.03395586833357811, "grad_norm_var": 2.637587838298187e-06, "learning_rate": 0.0017121151914802452, "loss": 2.5461, "step": 9447 }, { "crossentropy": 2.5325095653533936, "epoch": 0.5137714456619267, "grad_norm": 0.032142143696546555, "grad_norm_var": 2.393815977054919e-06, "learning_rate": 0.0017111771895027585, "loss": 2.5325, "step": 9448 }, { "crossentropy": 2.6032450199127197, "epoch": 0.5138258245194268, "grad_norm": 0.03484344109892845, "grad_norm_var": 2.5624819269263096e-06, "learning_rate": 0.0017102393914949965, "loss": 2.6032, "step": 9449 }, { "crossentropy": 2.4681005477905273, "epoch": 0.5138802033769271, "grad_norm": 0.03391587734222412, "grad_norm_var": 2.5810899292779665e-06, "learning_rate": 0.0017093017975151204, "loss": 2.4681, "step": 9450 }, { "crossentropy": 2.439635157585144, "epoch": 0.5139345822344272, "grad_norm": 0.03255435451865196, "grad_norm_var": 2.6091150630402326e-06, "learning_rate": 0.0017083644076212785, "loss": 2.4396, "step": 9451 }, { "crossentropy": 2.710688352584839, "epoch": 0.5139889610919275, "grad_norm": 0.03205163776874542, "grad_norm_var": 2.676483026083492e-06, "learning_rate": 0.0017074272218716075, "loss": 2.7107, "step": 9452 }, { "crossentropy": 2.606308937072754, "epoch": 0.5140433399494276, "grad_norm": 0.03101467527449131, "grad_norm_var": 2.6727571758607445e-06, "learning_rate": 0.001706490240324231, "loss": 2.6063, "step": 9453 }, { "crossentropy": 2.6041810512542725, "epoch": 0.5140977188069279, "grad_norm": 0.03142335265874863, "grad_norm_var": 2.7068939331115523e-06, "learning_rate": 0.0017055534630372605, "loss": 2.6042, "step": 9454 }, { "crossentropy": 2.406777024269104, "epoch": 0.514152097664428, "grad_norm": 0.031693484634160995, "grad_norm_var": 2.8473994413152297e-06, "learning_rate": 0.0017046168900687892, "loss": 2.4068, "step": 9455 }, { "crossentropy": 2.4576176404953003, "epoch": 0.5142064765219283, "grad_norm": 0.032192278653383255, "grad_norm_var": 2.8998409690628486e-06, "learning_rate": 0.0017036805214769102, "loss": 2.4576, "step": 9456 }, { "crossentropy": 2.456951379776001, "epoch": 0.5142608553794284, "grad_norm": 0.03410828486084938, "grad_norm_var": 2.9340805050459404e-06, "learning_rate": 0.0017027443573196899, "loss": 2.457, "step": 9457 }, { "crossentropy": 2.6350187063217163, "epoch": 0.5143152342369287, "grad_norm": 0.03369483724236488, "grad_norm_var": 1.8731629076697836e-06, "learning_rate": 0.0017018083976551906, "loss": 2.635, "step": 9458 }, { "crossentropy": 2.539870023727417, "epoch": 0.5143696130944289, "grad_norm": 0.03423690423369408, "grad_norm_var": 1.9032500565789908e-06, "learning_rate": 0.0017008726425414595, "loss": 2.5399, "step": 9459 }, { "crossentropy": 2.534728527069092, "epoch": 0.5144239919519291, "grad_norm": 0.03492175415158272, "grad_norm_var": 2.0037713466633376e-06, "learning_rate": 0.0016999370920365315, "loss": 2.5347, "step": 9460 }, { "crossentropy": 2.515139937400818, "epoch": 0.5144783708094293, "grad_norm": 0.03194824978709221, "grad_norm_var": 1.6859647238579572e-06, "learning_rate": 0.001699001746198428, "loss": 2.5151, "step": 9461 }, { "crossentropy": 2.5520530939102173, "epoch": 0.5145327496669295, "grad_norm": 0.0323810875415802, "grad_norm_var": 1.6094479351435777e-06, "learning_rate": 0.0016980666050851595, "loss": 2.5521, "step": 9462 }, { "crossentropy": 2.5058469772338867, "epoch": 0.5145871285244297, "grad_norm": 0.03134268894791603, "grad_norm_var": 1.6831218209729793e-06, "learning_rate": 0.001697131668754721, "loss": 2.5058, "step": 9463 }, { "crossentropy": 2.5097678899765015, "epoch": 0.5146415073819299, "grad_norm": 0.031188197433948517, "grad_norm_var": 1.8210095878963175e-06, "learning_rate": 0.0016961969372650976, "loss": 2.5098, "step": 9464 }, { "crossentropy": 2.53393018245697, "epoch": 0.5146958862394301, "grad_norm": 0.03202953189611435, "grad_norm_var": 1.5189919530888731e-06, "learning_rate": 0.0016952624106742614, "loss": 2.5339, "step": 9465 }, { "crossentropy": 2.5017805099487305, "epoch": 0.5147502650969303, "grad_norm": 0.031054625287652016, "grad_norm_var": 1.50713086825525e-06, "learning_rate": 0.0016943280890401662, "loss": 2.5018, "step": 9466 }, { "crossentropy": 2.476540207862854, "epoch": 0.5148046439544305, "grad_norm": 0.031798966228961945, "grad_norm_var": 1.5236970670369382e-06, "learning_rate": 0.0016933939724207649, "loss": 2.4765, "step": 9467 }, { "crossentropy": 2.5547053813934326, "epoch": 0.5148590228119307, "grad_norm": 0.03226018324494362, "grad_norm_var": 1.5190217186262604e-06, "learning_rate": 0.0016924600608739843, "loss": 2.5547, "step": 9468 }, { "crossentropy": 2.4990814924240112, "epoch": 0.5149134016694309, "grad_norm": 0.031088832765817642, "grad_norm_var": 1.506354311154552e-06, "learning_rate": 0.0016915263544577475, "loss": 2.4991, "step": 9469 }, { "crossentropy": 2.5804269313812256, "epoch": 0.5149677805269312, "grad_norm": 0.030929867178201675, "grad_norm_var": 1.5815728408778793e-06, "learning_rate": 0.0016905928532299614, "loss": 2.5804, "step": 9470 }, { "crossentropy": 2.572116732597351, "epoch": 0.5150221593844313, "grad_norm": 0.03346572071313858, "grad_norm_var": 1.6335252127105352e-06, "learning_rate": 0.0016896595572485212, "loss": 2.5721, "step": 9471 }, { "crossentropy": 2.6002033948898315, "epoch": 0.5150765382419316, "grad_norm": 0.031204644590616226, "grad_norm_var": 1.7238345305351736e-06, "learning_rate": 0.0016887264665713098, "loss": 2.6002, "step": 9472 }, { "crossentropy": 2.5712993144989014, "epoch": 0.5151309170994317, "grad_norm": 0.03197290375828743, "grad_norm_var": 1.5091784877710775e-06, "learning_rate": 0.001687793581256195, "loss": 2.5713, "step": 9473 }, { "crossentropy": 2.5398412942886353, "epoch": 0.515185295956932, "grad_norm": 0.03282502293586731, "grad_norm_var": 1.3854126566549408e-06, "learning_rate": 0.0016868609013610347, "loss": 2.5398, "step": 9474 }, { "crossentropy": 2.4728416204452515, "epoch": 0.5152396748144321, "grad_norm": 0.03109298273921013, "grad_norm_var": 1.1348977652763144e-06, "learning_rate": 0.001685928426943672, "loss": 2.4728, "step": 9475 }, { "crossentropy": 2.592036008834839, "epoch": 0.5152940536719324, "grad_norm": 0.032197799533605576, "grad_norm_var": 5.262494552838442e-07, "learning_rate": 0.0016849961580619405, "loss": 2.592, "step": 9476 }, { "crossentropy": 2.4876238107681274, "epoch": 0.5153484325294325, "grad_norm": 0.030755557119846344, "grad_norm_var": 5.913953524352833e-07, "learning_rate": 0.0016840640947736534, "loss": 2.4876, "step": 9477 }, { "crossentropy": 2.4767173528671265, "epoch": 0.5154028113869328, "grad_norm": 0.030249979346990585, "grad_norm_var": 6.886186875054567e-07, "learning_rate": 0.0016831322371366215, "loss": 2.4767, "step": 9478 }, { "crossentropy": 2.6109893321990967, "epoch": 0.5154571902444329, "grad_norm": 0.03187129646539688, "grad_norm_var": 6.88574970900121e-07, "learning_rate": 0.0016822005852086374, "loss": 2.611, "step": 9479 }, { "crossentropy": 2.5198980569839478, "epoch": 0.5155115691019332, "grad_norm": 0.03167752921581268, "grad_norm_var": 6.750981051537129e-07, "learning_rate": 0.0016812691390474788, "loss": 2.5199, "step": 9480 }, { "crossentropy": 2.510693311691284, "epoch": 0.5155659479594333, "grad_norm": 0.030809445306658745, "grad_norm_var": 7.071617987644552e-07, "learning_rate": 0.0016803378987109135, "loss": 2.5107, "step": 9481 }, { "crossentropy": 2.5616344213485718, "epoch": 0.5156203268169336, "grad_norm": 0.03293044865131378, "grad_norm_var": 7.960652496672432e-07, "learning_rate": 0.0016794068642566968, "loss": 2.5616, "step": 9482 }, { "crossentropy": 2.5507465600967407, "epoch": 0.5156747056744337, "grad_norm": 0.03148815780878067, "grad_norm_var": 7.978233459982603e-07, "learning_rate": 0.0016784760357425705, "loss": 2.5507, "step": 9483 }, { "crossentropy": 2.443807601928711, "epoch": 0.515729084531934, "grad_norm": 0.03340073674917221, "grad_norm_var": 9.679246575622908e-07, "learning_rate": 0.001677545413226264, "loss": 2.4438, "step": 9484 }, { "crossentropy": 2.655112862586975, "epoch": 0.5157834633894341, "grad_norm": 0.04715171083807945, "grad_norm_var": 1.5683125240501653e-05, "learning_rate": 0.0016766149967654936, "loss": 2.6551, "step": 9485 }, { "crossentropy": 2.635727882385254, "epoch": 0.5158378422469344, "grad_norm": 0.034322209656238556, "grad_norm_var": 1.5578433042231694e-05, "learning_rate": 0.0016756847864179624, "loss": 2.6357, "step": 9486 }, { "crossentropy": 2.490434408187866, "epoch": 0.5158922211044346, "grad_norm": 0.031258635222911835, "grad_norm_var": 1.5735094824742426e-05, "learning_rate": 0.0016747547822413634, "loss": 2.4904, "step": 9487 }, { "crossentropy": 2.5858298540115356, "epoch": 0.5159465999619348, "grad_norm": 0.030829792842268944, "grad_norm_var": 1.582489098152252e-05, "learning_rate": 0.0016738249842933694, "loss": 2.5858, "step": 9488 }, { "crossentropy": 2.5157456398010254, "epoch": 0.516000978819435, "grad_norm": 0.031047891825437546, "grad_norm_var": 1.59806424677301e-05, "learning_rate": 0.0016728953926316503, "loss": 2.5157, "step": 9489 }, { "crossentropy": 2.6994961500167847, "epoch": 0.5160553576769352, "grad_norm": 0.032458752393722534, "grad_norm_var": 1.5985086117955662e-05, "learning_rate": 0.0016719660073138594, "loss": 2.6995, "step": 9490 }, { "crossentropy": 2.521745204925537, "epoch": 0.5161097365344354, "grad_norm": 0.03177421912550926, "grad_norm_var": 1.5866176709844006e-05, "learning_rate": 0.0016710368283976312, "loss": 2.5217, "step": 9491 }, { "crossentropy": 2.6142122745513916, "epoch": 0.5161641153919356, "grad_norm": 0.03372303768992424, "grad_norm_var": 1.58964262066764e-05, "learning_rate": 0.0016701078559405948, "loss": 2.6142, "step": 9492 }, { "crossentropy": 2.531327247619629, "epoch": 0.5162184942494358, "grad_norm": 0.032333336770534515, "grad_norm_var": 1.5609439409306388e-05, "learning_rate": 0.0016691790900003672, "loss": 2.5313, "step": 9493 }, { "crossentropy": 2.4526089429855347, "epoch": 0.516272873106936, "grad_norm": 0.03272911533713341, "grad_norm_var": 1.5098448360498692e-05, "learning_rate": 0.0016682505306345459, "loss": 2.4526, "step": 9494 }, { "crossentropy": 2.4643890857696533, "epoch": 0.5163272519644362, "grad_norm": 0.03112071193754673, "grad_norm_var": 1.5257916025670582e-05, "learning_rate": 0.0016673221779007204, "loss": 2.4644, "step": 9495 }, { "crossentropy": 2.5340601205825806, "epoch": 0.5163816308219364, "grad_norm": 0.032010696828365326, "grad_norm_var": 1.520317517978395e-05, "learning_rate": 0.0016663940318564663, "loss": 2.5341, "step": 9496 }, { "crossentropy": 2.654139995574951, "epoch": 0.5164360096794366, "grad_norm": 0.03283655270934105, "grad_norm_var": 1.4844470609752293e-05, "learning_rate": 0.0016654660925593463, "loss": 2.6541, "step": 9497 }, { "crossentropy": 2.5710339546203613, "epoch": 0.5164903885369369, "grad_norm": 0.03466916084289551, "grad_norm_var": 1.4967796211768035e-05, "learning_rate": 0.0016645383600669122, "loss": 2.571, "step": 9498 }, { "crossentropy": 2.5049076080322266, "epoch": 0.516544767394437, "grad_norm": 0.03121018223464489, "grad_norm_var": 1.504060035528904e-05, "learning_rate": 0.0016636108344366962, "loss": 2.5049, "step": 9499 }, { "crossentropy": 2.5989545583724976, "epoch": 0.5165991462519373, "grad_norm": 0.032161932438611984, "grad_norm_var": 1.5120668276598114e-05, "learning_rate": 0.0016626835157262277, "loss": 2.599, "step": 9500 }, { "crossentropy": 2.5309815406799316, "epoch": 0.5166535251094374, "grad_norm": 0.030702563002705574, "grad_norm_var": 1.4924360767672173e-06, "learning_rate": 0.0016617564039930171, "loss": 2.531, "step": 9501 }, { "crossentropy": 2.7015631198883057, "epoch": 0.5167079039669377, "grad_norm": 0.033241353929042816, "grad_norm_var": 1.2595103553251126e-06, "learning_rate": 0.0016608294992945612, "loss": 2.7016, "step": 9502 }, { "crossentropy": 2.467995285987854, "epoch": 0.5167622828244378, "grad_norm": 0.031869664788246155, "grad_norm_var": 1.211712313877363e-06, "learning_rate": 0.0016599028016883461, "loss": 2.468, "step": 9503 }, { "crossentropy": 2.640327215194702, "epoch": 0.5168166616819381, "grad_norm": 0.03178900480270386, "grad_norm_var": 1.0978203655553533e-06, "learning_rate": 0.0016589763112318451, "loss": 2.6403, "step": 9504 }, { "crossentropy": 2.5910180807113647, "epoch": 0.5168710405394382, "grad_norm": 0.0317780040204525, "grad_norm_var": 1.0160717353057397e-06, "learning_rate": 0.001658050027982519, "loss": 2.591, "step": 9505 }, { "crossentropy": 2.663912296295166, "epoch": 0.5169254193969385, "grad_norm": 0.033643852919340134, "grad_norm_var": 1.1328041705612462e-06, "learning_rate": 0.0016571239519978143, "loss": 2.6639, "step": 9506 }, { "crossentropy": 2.5811914205551147, "epoch": 0.5169797982544386, "grad_norm": 0.03279625624418259, "grad_norm_var": 1.11968287494836e-06, "learning_rate": 0.0016561980833351654, "loss": 2.5812, "step": 9507 }, { "crossentropy": 2.5583988428115845, "epoch": 0.5170341771119389, "grad_norm": 0.031131619587540627, "grad_norm_var": 1.0869113529811865e-06, "learning_rate": 0.0016552724220519938, "loss": 2.5584, "step": 9508 }, { "crossentropy": 2.519698977470398, "epoch": 0.517088555969439, "grad_norm": 0.032002732157707214, "grad_norm_var": 1.090135174053088e-06, "learning_rate": 0.0016543469682057106, "loss": 2.5197, "step": 9509 }, { "crossentropy": 2.5500303506851196, "epoch": 0.5171429348269393, "grad_norm": 0.032097745686769485, "grad_norm_var": 1.0731030873141942e-06, "learning_rate": 0.0016534217218537068, "loss": 2.55, "step": 9510 }, { "crossentropy": 2.4050978422164917, "epoch": 0.5171973136844394, "grad_norm": 0.032611772418022156, "grad_norm_var": 9.992000472972545e-07, "learning_rate": 0.0016524966830533667, "loss": 2.4051, "step": 9511 }, { "crossentropy": 2.558112144470215, "epoch": 0.5172516925419397, "grad_norm": 0.03140215948224068, "grad_norm_var": 1.0445663819677453e-06, "learning_rate": 0.0016515718518620647, "loss": 2.5581, "step": 9512 }, { "crossentropy": 2.561373710632324, "epoch": 0.5173060713994398, "grad_norm": 0.03144753351807594, "grad_norm_var": 1.0558794553815748e-06, "learning_rate": 0.0016506472283371527, "loss": 2.5614, "step": 9513 }, { "crossentropy": 2.4965254068374634, "epoch": 0.5173604502569401, "grad_norm": 0.03143259882926941, "grad_norm_var": 6.27660176085e-07, "learning_rate": 0.0016497228125359775, "loss": 2.4965, "step": 9514 }, { "crossentropy": 2.6118478775024414, "epoch": 0.5174148291144403, "grad_norm": 0.03175722062587738, "grad_norm_var": 5.918598303871735e-07, "learning_rate": 0.0016487986045158698, "loss": 2.6118, "step": 9515 }, { "crossentropy": 2.542288303375244, "epoch": 0.5174692079719405, "grad_norm": 0.03243650868535042, "grad_norm_var": 6.028067881644686e-07, "learning_rate": 0.001647874604334148, "loss": 2.5423, "step": 9516 }, { "crossentropy": 2.545250415802002, "epoch": 0.5175235868294407, "grad_norm": 0.03262747451663017, "grad_norm_var": 4.991383317972612e-07, "learning_rate": 0.0016469508120481185, "loss": 2.5453, "step": 9517 }, { "crossentropy": 2.552629590034485, "epoch": 0.5175779656869409, "grad_norm": 0.03252355381846428, "grad_norm_var": 4.2488992684588425e-07, "learning_rate": 0.0016460272277150734, "loss": 2.5526, "step": 9518 }, { "crossentropy": 2.475993514060974, "epoch": 0.5176323445444411, "grad_norm": 0.031613267958164215, "grad_norm_var": 4.3633386115752607e-07, "learning_rate": 0.0016451038513922928, "loss": 2.476, "step": 9519 }, { "crossentropy": 2.6599512100219727, "epoch": 0.5176867234019413, "grad_norm": 0.03213520720601082, "grad_norm_var": 4.309368241798972e-07, "learning_rate": 0.0016441806831370438, "loss": 2.66, "step": 9520 }, { "crossentropy": 2.560814619064331, "epoch": 0.5177411022594415, "grad_norm": 0.03174956887960434, "grad_norm_var": 4.321696550431033e-07, "learning_rate": 0.0016432577230065814, "loss": 2.5608, "step": 9521 }, { "crossentropy": 2.5853697061538696, "epoch": 0.5177954811169417, "grad_norm": 0.03232566639780998, "grad_norm_var": 2.673285052625368e-07, "learning_rate": 0.001642334971058142, "loss": 2.5854, "step": 9522 }, { "crossentropy": 2.5535316467285156, "epoch": 0.5178498599744419, "grad_norm": 0.03208962827920914, "grad_norm_var": 2.240504622070255e-07, "learning_rate": 0.0016414124273489612, "loss": 2.5535, "step": 9523 }, { "crossentropy": 2.503181576728821, "epoch": 0.5179042388319421, "grad_norm": 0.031385622918605804, "grad_norm_var": 1.9997661987620578e-07, "learning_rate": 0.0016404900919362486, "loss": 2.5032, "step": 9524 }, { "crossentropy": 2.476382613182068, "epoch": 0.5179586176894423, "grad_norm": 0.031873226165771484, "grad_norm_var": 2.0058728573297638e-07, "learning_rate": 0.001639567964877209, "loss": 2.4764, "step": 9525 }, { "crossentropy": 2.59803569316864, "epoch": 0.5180129965469426, "grad_norm": 0.0321609266102314, "grad_norm_var": 2.0191884082138937e-07, "learning_rate": 0.0016386460462290303, "loss": 2.598, "step": 9526 }, { "crossentropy": 2.5614819526672363, "epoch": 0.5180673754044427, "grad_norm": 0.033273931592702866, "grad_norm_var": 2.856964125320822e-07, "learning_rate": 0.0016377243360488909, "loss": 2.5615, "step": 9527 }, { "crossentropy": 2.5849777460098267, "epoch": 0.518121754261943, "grad_norm": 0.032823581248521805, "grad_norm_var": 2.9589652895487676e-07, "learning_rate": 0.001636802834393953, "loss": 2.585, "step": 9528 }, { "crossentropy": 2.5767682790756226, "epoch": 0.5181761331194431, "grad_norm": 0.042302004992961884, "grad_norm_var": 6.710306204136401e-06, "learning_rate": 0.001635881541321368, "loss": 2.5768, "step": 9529 }, { "crossentropy": 2.5668057203292847, "epoch": 0.5182305119769434, "grad_norm": 0.031291380524635315, "grad_norm_var": 6.736958269740884e-06, "learning_rate": 0.001634960456888273, "loss": 2.5668, "step": 9530 }, { "crossentropy": 2.5788060426712036, "epoch": 0.5182848908344436, "grad_norm": 0.03495138883590698, "grad_norm_var": 6.9419978142265065e-06, "learning_rate": 0.001634039581151794, "loss": 2.5788, "step": 9531 }, { "crossentropy": 2.5649887323379517, "epoch": 0.5183392696919438, "grad_norm": 0.035711366683244705, "grad_norm_var": 7.37817165674832e-06, "learning_rate": 0.0016331189141690434, "loss": 2.565, "step": 9532 }, { "crossentropy": 2.5317516326904297, "epoch": 0.518393648549444, "grad_norm": 0.03225483372807503, "grad_norm_var": 7.414171894174851e-06, "learning_rate": 0.0016321984559971154, "loss": 2.5318, "step": 9533 }, { "crossentropy": 2.502625346183777, "epoch": 0.5184480274069442, "grad_norm": 0.03166408836841583, "grad_norm_var": 7.532593954437336e-06, "learning_rate": 0.001631278206693101, "loss": 2.5026, "step": 9534 }, { "crossentropy": 2.618248701095581, "epoch": 0.5185024062644444, "grad_norm": 0.03201275318861008, "grad_norm_var": 7.463358956652915e-06, "learning_rate": 0.0016303581663140731, "loss": 2.6182, "step": 9535 }, { "crossentropy": 2.590814709663391, "epoch": 0.5185567851219446, "grad_norm": 0.03231626749038696, "grad_norm_var": 7.441505118852924e-06, "learning_rate": 0.0016294383349170888, "loss": 2.5908, "step": 9536 }, { "crossentropy": 2.608260989189148, "epoch": 0.5186111639794448, "grad_norm": 0.03265928849577904, "grad_norm_var": 7.324983403518964e-06, "learning_rate": 0.0016285187125591966, "loss": 2.6083, "step": 9537 }, { "crossentropy": 2.7005268335342407, "epoch": 0.518665542836945, "grad_norm": 0.03489881008863449, "grad_norm_var": 7.441059727876046e-06, "learning_rate": 0.001627599299297431, "loss": 2.7005, "step": 9538 }, { "crossentropy": 2.6631863117218018, "epoch": 0.5187199216944453, "grad_norm": 0.033679284155368805, "grad_norm_var": 7.330941249318138e-06, "learning_rate": 0.0016266800951888123, "loss": 2.6632, "step": 9539 }, { "crossentropy": 2.6515614986419678, "epoch": 0.5187743005519454, "grad_norm": 0.030651722103357315, "grad_norm_var": 7.566970139524729e-06, "learning_rate": 0.0016257611002903488, "loss": 2.6516, "step": 9540 }, { "crossentropy": 2.5137014389038086, "epoch": 0.5188286794094457, "grad_norm": 0.03286545351147652, "grad_norm_var": 7.425482401889524e-06, "learning_rate": 0.0016248423146590363, "loss": 2.5137, "step": 9541 }, { "crossentropy": 2.5992056131362915, "epoch": 0.5188830582669458, "grad_norm": 0.03339923918247223, "grad_norm_var": 7.305212205918101e-06, "learning_rate": 0.0016239237383518573, "loss": 2.5992, "step": 9542 }, { "crossentropy": 2.549374580383301, "epoch": 0.5189374371244461, "grad_norm": 0.05287202447652817, "grad_norm_var": 3.059643633344817e-05, "learning_rate": 0.001623005371425782, "loss": 2.5494, "step": 9543 }, { "crossentropy": 2.636587381362915, "epoch": 0.5189918159819462, "grad_norm": 0.032971397042274475, "grad_norm_var": 3.055939915406937e-05, "learning_rate": 0.0016220872139377613, "loss": 2.6366, "step": 9544 }, { "crossentropy": 2.573811650276184, "epoch": 0.5190461948394465, "grad_norm": 0.030536770820617676, "grad_norm_var": 2.74130303516024e-05, "learning_rate": 0.0016211692659447447, "loss": 2.5738, "step": 9545 }, { "crossentropy": 2.5954326391220093, "epoch": 0.5191005736969466, "grad_norm": 0.034066300839185715, "grad_norm_var": 2.6875110246258465e-05, "learning_rate": 0.0016202515275036617, "loss": 2.5954, "step": 9546 }, { "crossentropy": 2.5208845138549805, "epoch": 0.5191549525544469, "grad_norm": 0.0324786975979805, "grad_norm_var": 2.7015929021539007e-05, "learning_rate": 0.001619333998671426, "loss": 2.5209, "step": 9547 }, { "crossentropy": 2.5558888912200928, "epoch": 0.519209331411947, "grad_norm": 0.032303594052791595, "grad_norm_var": 2.699362871559126e-05, "learning_rate": 0.0016184166795049427, "loss": 2.5559, "step": 9548 }, { "crossentropy": 2.538857579231262, "epoch": 0.5192637102694473, "grad_norm": 0.035907160490751266, "grad_norm_var": 2.7049608695702865e-05, "learning_rate": 0.0016174995700611074, "loss": 2.5389, "step": 9549 }, { "crossentropy": 2.54072105884552, "epoch": 0.5193180891269474, "grad_norm": 0.03530319407582283, "grad_norm_var": 2.670498098175933e-05, "learning_rate": 0.0016165826703967935, "loss": 2.5407, "step": 9550 }, { "crossentropy": 2.4818273782730103, "epoch": 0.5193724679844477, "grad_norm": 0.031823161989450455, "grad_norm_var": 2.6765239132699586e-05, "learning_rate": 0.001615665980568868, "loss": 2.4818, "step": 9551 }, { "crossentropy": 2.6154589653015137, "epoch": 0.5194268468419478, "grad_norm": 0.0336860753595829, "grad_norm_var": 2.6520973537499012e-05, "learning_rate": 0.0016147495006341834, "loss": 2.6155, "step": 9552 }, { "crossentropy": 2.623923182487488, "epoch": 0.5194812256994481, "grad_norm": 0.03244585543870926, "grad_norm_var": 2.657282764346221e-05, "learning_rate": 0.0016138332306495785, "loss": 2.6239, "step": 9553 }, { "crossentropy": 2.5325703620910645, "epoch": 0.5195356045569482, "grad_norm": 0.03290162235498428, "grad_norm_var": 2.668078708783396e-05, "learning_rate": 0.0016129171706718815, "loss": 2.5326, "step": 9554 }, { "crossentropy": 2.48028302192688, "epoch": 0.5195899834144485, "grad_norm": 0.03307824581861496, "grad_norm_var": 2.6748558139833415e-05, "learning_rate": 0.0016120013207579003, "loss": 2.4803, "step": 9555 }, { "crossentropy": 2.5434130430221558, "epoch": 0.5196443622719487, "grad_norm": 0.03152623772621155, "grad_norm_var": 2.6381960503294583e-05, "learning_rate": 0.0016110856809644404, "loss": 2.5434, "step": 9556 }, { "crossentropy": 2.6073912382125854, "epoch": 0.5196987411294489, "grad_norm": 0.03076513297855854, "grad_norm_var": 2.7048290318177358e-05, "learning_rate": 0.0016101702513482885, "loss": 2.6074, "step": 9557 }, { "crossentropy": 2.53957736492157, "epoch": 0.5197531199869491, "grad_norm": 0.034036409109830856, "grad_norm_var": 2.7011663088781717e-05, "learning_rate": 0.0016092550319662159, "loss": 2.5396, "step": 9558 }, { "crossentropy": 2.5301411151885986, "epoch": 0.5198074988444493, "grad_norm": 0.0323776938021183, "grad_norm_var": 2.154937982494021e-06, "learning_rate": 0.0016083400228749834, "loss": 2.5301, "step": 9559 }, { "crossentropy": 2.625627279281616, "epoch": 0.5198618777019495, "grad_norm": 0.03547419235110283, "grad_norm_var": 2.5742764976488156e-06, "learning_rate": 0.0016074252241313437, "loss": 2.6256, "step": 9560 }, { "crossentropy": 2.6051785945892334, "epoch": 0.5199162565594497, "grad_norm": 0.03228968009352684, "grad_norm_var": 2.180234294127956e-06, "learning_rate": 0.0016065106357920273, "loss": 2.6052, "step": 9561 }, { "crossentropy": 2.5640079975128174, "epoch": 0.5199706354169499, "grad_norm": 0.0313405804336071, "grad_norm_var": 2.3130073989996535e-06, "learning_rate": 0.0016055962579137573, "loss": 2.564, "step": 9562 }, { "crossentropy": 2.5993974208831787, "epoch": 0.5200250142744501, "grad_norm": 0.03210236504673958, "grad_norm_var": 2.3471936556907716e-06, "learning_rate": 0.0016046820905532427, "loss": 2.5994, "step": 9563 }, { "crossentropy": 2.6603819131851196, "epoch": 0.5200793931319503, "grad_norm": 0.03172209858894348, "grad_norm_var": 2.4192259810883737e-06, "learning_rate": 0.00160376813376718, "loss": 2.6604, "step": 9564 }, { "crossentropy": 2.556322932243347, "epoch": 0.5201337719894505, "grad_norm": 0.0323118194937706, "grad_norm_var": 1.7969382277957612e-06, "learning_rate": 0.0016028543876122525, "loss": 2.5563, "step": 9565 }, { "crossentropy": 2.530843496322632, "epoch": 0.5201881508469507, "grad_norm": 0.03346496820449829, "grad_norm_var": 1.3698563711236755e-06, "learning_rate": 0.0016019408521451256, "loss": 2.5308, "step": 9566 }, { "crossentropy": 2.5865821838378906, "epoch": 0.520242529704451, "grad_norm": 0.031658586114645004, "grad_norm_var": 1.3882475413843347e-06, "learning_rate": 0.0016010275274224606, "loss": 2.5866, "step": 9567 }, { "crossentropy": 2.5262731313705444, "epoch": 0.5202969085619511, "grad_norm": 0.0340406596660614, "grad_norm_var": 1.4486894655677996e-06, "learning_rate": 0.001600114413500901, "loss": 2.5263, "step": 9568 }, { "crossentropy": 2.5155014991760254, "epoch": 0.5203512874194514, "grad_norm": 0.031164957210421562, "grad_norm_var": 1.576877457661457e-06, "learning_rate": 0.0015992015104370743, "loss": 2.5155, "step": 9569 }, { "crossentropy": 2.6090656518936157, "epoch": 0.5204056662769515, "grad_norm": 0.03373156487941742, "grad_norm_var": 1.6626055243153132e-06, "learning_rate": 0.0015982888182875976, "loss": 2.6091, "step": 9570 }, { "crossentropy": 2.434427499771118, "epoch": 0.5204600451344518, "grad_norm": 0.034015629440546036, "grad_norm_var": 1.7813182760770262e-06, "learning_rate": 0.0015973763371090804, "loss": 2.4344, "step": 9571 }, { "crossentropy": 2.693307042121887, "epoch": 0.5205144239919519, "grad_norm": 0.033507317304611206, "grad_norm_var": 1.7360064513846641e-06, "learning_rate": 0.0015964640669581087, "loss": 2.6933, "step": 9572 }, { "crossentropy": 2.577852249145508, "epoch": 0.5205688028494522, "grad_norm": 0.032575253397226334, "grad_norm_var": 1.46168837483102e-06, "learning_rate": 0.0015955520078912628, "loss": 2.5779, "step": 9573 }, { "crossentropy": 2.5989574193954468, "epoch": 0.5206231817069523, "grad_norm": 0.033675022423267365, "grad_norm_var": 1.4133276973633205e-06, "learning_rate": 0.0015946401599651072, "loss": 2.599, "step": 9574 }, { "crossentropy": 2.5562938451766968, "epoch": 0.5206775605644526, "grad_norm": 0.030871089547872543, "grad_norm_var": 1.6482175897462493e-06, "learning_rate": 0.0015937285232361937, "loss": 2.5563, "step": 9575 }, { "crossentropy": 2.5431829690933228, "epoch": 0.5207319394219527, "grad_norm": 0.03187328949570656, "grad_norm_var": 1.1490567748358015e-06, "learning_rate": 0.0015928170977610634, "loss": 2.5432, "step": 9576 }, { "crossentropy": 2.5614490509033203, "epoch": 0.520786318279453, "grad_norm": 0.0348125696182251, "grad_norm_var": 1.4688681657490694e-06, "learning_rate": 0.0015919058835962374, "loss": 2.5614, "step": 9577 }, { "crossentropy": 2.515169858932495, "epoch": 0.5208406971369531, "grad_norm": 0.03443574905395508, "grad_norm_var": 1.5151739704491087e-06, "learning_rate": 0.0015909948807982322, "loss": 2.5152, "step": 9578 }, { "crossentropy": 2.560463309288025, "epoch": 0.5208950759944534, "grad_norm": 0.03234758973121643, "grad_norm_var": 1.4937455966564876e-06, "learning_rate": 0.001590084089423548, "loss": 2.5605, "step": 9579 }, { "crossentropy": 2.5582858324050903, "epoch": 0.5209494548519535, "grad_norm": 0.03155141696333885, "grad_norm_var": 1.5220996534973425e-06, "learning_rate": 0.001589173509528668, "loss": 2.5583, "step": 9580 }, { "crossentropy": 2.420480728149414, "epoch": 0.5210038337094538, "grad_norm": 0.03172535449266434, "grad_norm_var": 1.5878172508425644e-06, "learning_rate": 0.0015882631411700654, "loss": 2.4205, "step": 9581 }, { "crossentropy": 2.622344136238098, "epoch": 0.521058212566954, "grad_norm": 0.03265907242894173, "grad_norm_var": 1.561328427726359e-06, "learning_rate": 0.001587352984404205, "loss": 2.6223, "step": 9582 }, { "crossentropy": 2.499503493309021, "epoch": 0.5211125914244542, "grad_norm": 0.03403148055076599, "grad_norm_var": 1.5551779711104787e-06, "learning_rate": 0.0015864430392875295, "loss": 2.4995, "step": 9583 }, { "crossentropy": 2.5338321924209595, "epoch": 0.5211669702819544, "grad_norm": 0.03160018101334572, "grad_norm_var": 1.5688253604487773e-06, "learning_rate": 0.0015855333058764748, "loss": 2.5338, "step": 9584 }, { "crossentropy": 2.467833161354065, "epoch": 0.5212213491394546, "grad_norm": 0.033165764063596725, "grad_norm_var": 1.3865490085434677e-06, "learning_rate": 0.0015846237842274602, "loss": 2.4678, "step": 9585 }, { "crossentropy": 2.5565602779388428, "epoch": 0.5212757279969548, "grad_norm": 0.031859323382377625, "grad_norm_var": 1.4008266841593341e-06, "learning_rate": 0.001583714474396895, "loss": 2.5566, "step": 9586 }, { "crossentropy": 2.5138884782791138, "epoch": 0.521330106854455, "grad_norm": 0.03172573447227478, "grad_norm_var": 1.3556059169638734e-06, "learning_rate": 0.0015828053764411726, "loss": 2.5139, "step": 9587 }, { "crossentropy": 2.5201187133789062, "epoch": 0.5213844857119552, "grad_norm": 0.03223014622926712, "grad_norm_var": 1.311734176610788e-06, "learning_rate": 0.0015818964904166755, "loss": 2.5201, "step": 9588 }, { "crossentropy": 2.577358603477478, "epoch": 0.5214388645694554, "grad_norm": 0.03247592970728874, "grad_norm_var": 1.31229693651243e-06, "learning_rate": 0.001580987816379771, "loss": 2.5774, "step": 9589 }, { "crossentropy": 2.5878958702087402, "epoch": 0.5214932434269556, "grad_norm": 0.03223207965493202, "grad_norm_var": 1.2288638833318766e-06, "learning_rate": 0.0015800793543868164, "loss": 2.5879, "step": 9590 }, { "crossentropy": 2.472055196762085, "epoch": 0.5215476222844558, "grad_norm": 0.03351392596960068, "grad_norm_var": 1.100288471400125e-06, "learning_rate": 0.0015791711044941498, "loss": 2.4721, "step": 9591 }, { "crossentropy": 2.5025479793548584, "epoch": 0.521602001141956, "grad_norm": 0.030225487425923347, "grad_norm_var": 1.4384379220900294e-06, "learning_rate": 0.001578263066758101, "loss": 2.5025, "step": 9592 }, { "crossentropy": 2.509647846221924, "epoch": 0.5216563799994562, "grad_norm": 0.034285422414541245, "grad_norm_var": 1.295863470503764e-06, "learning_rate": 0.0015773552412349896, "loss": 2.5096, "step": 9593 }, { "crossentropy": 2.5954595804214478, "epoch": 0.5217107588569564, "grad_norm": 0.0327068492770195, "grad_norm_var": 1.037384612166435e-06, "learning_rate": 0.0015764476279811134, "loss": 2.5955, "step": 9594 }, { "crossentropy": 2.680287480354309, "epoch": 0.5217651377144567, "grad_norm": 0.034357860684394836, "grad_norm_var": 1.276987300565417e-06, "learning_rate": 0.001575540227052764, "loss": 2.6803, "step": 9595 }, { "crossentropy": 2.523194193840027, "epoch": 0.5218195165719568, "grad_norm": 0.03337777405977249, "grad_norm_var": 1.24920111749537e-06, "learning_rate": 0.0015746330385062169, "loss": 2.5232, "step": 9596 }, { "crossentropy": 2.5834347009658813, "epoch": 0.5218738954294571, "grad_norm": 0.03248300403356552, "grad_norm_var": 1.1931076347373627e-06, "learning_rate": 0.0015737260623977355, "loss": 2.5834, "step": 9597 }, { "crossentropy": 2.612333297729492, "epoch": 0.5219282742869572, "grad_norm": 0.0320037379860878, "grad_norm_var": 1.222050943650701e-06, "learning_rate": 0.0015728192987835693, "loss": 2.6123, "step": 9598 }, { "crossentropy": 2.6115998029708862, "epoch": 0.5219826531444575, "grad_norm": 0.04072294384241104, "grad_norm_var": 5.2600681422789446e-06, "learning_rate": 0.0015719127477199547, "loss": 2.6116, "step": 9599 }, { "crossentropy": 2.6403976678848267, "epoch": 0.5220370320019576, "grad_norm": 0.031240230426192284, "grad_norm_var": 5.338246102482245e-06, "learning_rate": 0.001571006409263116, "loss": 2.6404, "step": 9600 }, { "crossentropy": 2.5320382118225098, "epoch": 0.5220914108594579, "grad_norm": 0.03229103982448578, "grad_norm_var": 5.371153377097387e-06, "learning_rate": 0.0015701002834692624, "loss": 2.532, "step": 9601 }, { "crossentropy": 2.63472580909729, "epoch": 0.522145789716958, "grad_norm": 0.03200970217585564, "grad_norm_var": 5.350032081154893e-06, "learning_rate": 0.0015691943703945938, "loss": 2.6347, "step": 9602 }, { "crossentropy": 2.597443699836731, "epoch": 0.5222001685744583, "grad_norm": 0.03310687467455864, "grad_norm_var": 5.235954905812428e-06, "learning_rate": 0.001568288670095288, "loss": 2.5974, "step": 9603 }, { "crossentropy": 2.47499942779541, "epoch": 0.5222545474319584, "grad_norm": 0.03464988246560097, "grad_norm_var": 5.328053153237644e-06, "learning_rate": 0.001567383182627523, "loss": 2.475, "step": 9604 }, { "crossentropy": 2.626254677772522, "epoch": 0.5223089262894587, "grad_norm": 0.032731812447309494, "grad_norm_var": 5.306412406243244e-06, "learning_rate": 0.0015664779080474512, "loss": 2.6263, "step": 9605 }, { "crossentropy": 2.522402286529541, "epoch": 0.5223633051469588, "grad_norm": 0.031907759606838226, "grad_norm_var": 5.356838101488494e-06, "learning_rate": 0.001565572846411218, "loss": 2.5224, "step": 9606 }, { "crossentropy": 2.62082302570343, "epoch": 0.5224176840044591, "grad_norm": 0.03499405086040497, "grad_norm_var": 5.550604271391199e-06, "learning_rate": 0.0015646679977749556, "loss": 2.6208, "step": 9607 }, { "crossentropy": 2.5048506259918213, "epoch": 0.5224720628619592, "grad_norm": 0.034838948398828506, "grad_norm_var": 4.97831689045615e-06, "learning_rate": 0.0015637633621947806, "loss": 2.5049, "step": 9608 }, { "crossentropy": 2.620584011077881, "epoch": 0.5225264417194595, "grad_norm": 0.032946109771728516, "grad_norm_var": 4.96923165607097e-06, "learning_rate": 0.0015628589397267988, "loss": 2.6206, "step": 9609 }, { "crossentropy": 2.6113061904907227, "epoch": 0.5225808205769596, "grad_norm": 0.03239649906754494, "grad_norm_var": 5.009025325475299e-06, "learning_rate": 0.001561954730427101, "loss": 2.6113, "step": 9610 }, { "crossentropy": 2.5534428358078003, "epoch": 0.5226351994344599, "grad_norm": 0.0318266935646534, "grad_norm_var": 5.121160516920172e-06, "learning_rate": 0.0015610507343517654, "loss": 2.5534, "step": 9611 }, { "crossentropy": 2.5039191246032715, "epoch": 0.52268957829196, "grad_norm": 0.03188152238726616, "grad_norm_var": 5.254633220850531e-06, "learning_rate": 0.0015601469515568573, "loss": 2.5039, "step": 9612 }, { "crossentropy": 2.411560893058777, "epoch": 0.5227439571494603, "grad_norm": 0.033170510083436966, "grad_norm_var": 5.21368964813879e-06, "learning_rate": 0.0015592433820984298, "loss": 2.4116, "step": 9613 }, { "crossentropy": 2.5490909814834595, "epoch": 0.5227983360069605, "grad_norm": 0.03126322850584984, "grad_norm_var": 5.37544364427379e-06, "learning_rate": 0.0015583400260325165, "loss": 2.5491, "step": 9614 }, { "crossentropy": 2.6120561361312866, "epoch": 0.5228527148644607, "grad_norm": 0.034549564123153687, "grad_norm_var": 1.6051062302797759e-06, "learning_rate": 0.001557436883415148, "loss": 2.6121, "step": 9615 }, { "crossentropy": 2.536659836769104, "epoch": 0.5229070937219609, "grad_norm": 0.031091760843992233, "grad_norm_var": 1.6386037684463946e-06, "learning_rate": 0.001556533954302336, "loss": 2.5367, "step": 9616 }, { "crossentropy": 2.5302882194519043, "epoch": 0.5229614725794611, "grad_norm": 0.03236376866698265, "grad_norm_var": 1.6334801091968018e-06, "learning_rate": 0.0015556312387500766, "loss": 2.5303, "step": 9617 }, { "crossentropy": 2.641229510307312, "epoch": 0.5230158514369613, "grad_norm": 0.03245675563812256, "grad_norm_var": 1.5954040000570302e-06, "learning_rate": 0.0015547287368143559, "loss": 2.6412, "step": 9618 }, { "crossentropy": 2.5086151361465454, "epoch": 0.5230702302944615, "grad_norm": 0.03238539770245552, "grad_norm_var": 1.606688041815886e-06, "learning_rate": 0.001553826448551147, "loss": 2.5086, "step": 9619 }, { "crossentropy": 2.5036767721176147, "epoch": 0.5231246091519617, "grad_norm": 0.0321480967104435, "grad_norm_var": 1.3944435818808234e-06, "learning_rate": 0.0015529243740164094, "loss": 2.5037, "step": 9620 }, { "crossentropy": 2.5782177448272705, "epoch": 0.5231789880094619, "grad_norm": 0.031498249620199203, "grad_norm_var": 1.481771610660892e-06, "learning_rate": 0.0015520225132660882, "loss": 2.5782, "step": 9621 }, { "crossentropy": 2.5472633838653564, "epoch": 0.5232333668669621, "grad_norm": 0.03264298662543297, "grad_norm_var": 1.4469674389376112e-06, "learning_rate": 0.001551120866356116, "loss": 2.5473, "step": 9622 }, { "crossentropy": 2.575073480606079, "epoch": 0.5232877457244623, "grad_norm": 0.032016489654779434, "grad_norm_var": 1.0718204791171706e-06, "learning_rate": 0.001550219433342413, "loss": 2.5751, "step": 9623 }, { "crossentropy": 2.563977360725403, "epoch": 0.5233421245819625, "grad_norm": 0.033422596752643585, "grad_norm_var": 7.493177325047784e-07, "learning_rate": 0.0015493182142808854, "loss": 2.564, "step": 9624 }, { "crossentropy": 2.5800551176071167, "epoch": 0.5233965034394628, "grad_norm": 0.03208693489432335, "grad_norm_var": 7.304608841393034e-07, "learning_rate": 0.0015484172092274217, "loss": 2.5801, "step": 9625 }, { "crossentropy": 2.5199891328811646, "epoch": 0.5234508822969629, "grad_norm": 0.0318172425031662, "grad_norm_var": 7.459149375684444e-07, "learning_rate": 0.001547516418237907, "loss": 2.52, "step": 9626 }, { "crossentropy": 2.5990185737609863, "epoch": 0.5235052611544632, "grad_norm": 0.03933756798505783, "grad_norm_var": 3.808903160774403e-06, "learning_rate": 0.0015466158413682063, "loss": 2.599, "step": 9627 }, { "crossentropy": 2.62093448638916, "epoch": 0.5235596400119633, "grad_norm": 0.0322473905980587, "grad_norm_var": 3.7744984297792434e-06, "learning_rate": 0.0015457154786741706, "loss": 2.6209, "step": 9628 }, { "crossentropy": 2.4957088232040405, "epoch": 0.5236140188694636, "grad_norm": 0.03134097531437874, "grad_norm_var": 3.8887207119868315e-06, "learning_rate": 0.0015448153302116386, "loss": 2.4957, "step": 9629 }, { "crossentropy": 2.606823205947876, "epoch": 0.5236683977269637, "grad_norm": 0.03235810995101929, "grad_norm_var": 3.7587424159025995e-06, "learning_rate": 0.0015439153960364416, "loss": 2.6068, "step": 9630 }, { "crossentropy": 2.555362582206726, "epoch": 0.523722776584464, "grad_norm": 0.031855709850788116, "grad_norm_var": 3.5606266895749084e-06, "learning_rate": 0.0015430156762043879, "loss": 2.5554, "step": 9631 }, { "crossentropy": 2.557352900505066, "epoch": 0.5237771554419641, "grad_norm": 0.03218033164739609, "grad_norm_var": 3.4205858767598236e-06, "learning_rate": 0.0015421161707712793, "loss": 2.5574, "step": 9632 }, { "crossentropy": 2.559744954109192, "epoch": 0.5238315342994644, "grad_norm": 0.032391902059316635, "grad_norm_var": 3.419618251002526e-06, "learning_rate": 0.0015412168797929021, "loss": 2.5597, "step": 9633 }, { "crossentropy": 2.4949010610580444, "epoch": 0.5238859131569645, "grad_norm": 0.0315808542072773, "grad_norm_var": 3.488580219011365e-06, "learning_rate": 0.0015403178033250291, "loss": 2.4949, "step": 9634 }, { "crossentropy": 2.5173720121383667, "epoch": 0.5239402920144648, "grad_norm": 0.031041689217090607, "grad_norm_var": 3.6366377068112905e-06, "learning_rate": 0.0015394189414234222, "loss": 2.5174, "step": 9635 }, { "crossentropy": 2.5784629583358765, "epoch": 0.5239946708719649, "grad_norm": 0.0319070965051651, "grad_norm_var": 3.651509592021244e-06, "learning_rate": 0.001538520294143823, "loss": 2.5785, "step": 9636 }, { "crossentropy": 2.5474884510040283, "epoch": 0.5240490497294652, "grad_norm": 0.03138294070959091, "grad_norm_var": 3.6674788674685593e-06, "learning_rate": 0.0015376218615419696, "loss": 2.5475, "step": 9637 }, { "crossentropy": 2.518220543861389, "epoch": 0.5241034285869653, "grad_norm": 0.030219290405511856, "grad_norm_var": 3.980554860482227e-06, "learning_rate": 0.0015367236436735821, "loss": 2.5182, "step": 9638 }, { "crossentropy": 2.464453101158142, "epoch": 0.5241578074444656, "grad_norm": 0.0351884588599205, "grad_norm_var": 4.479254006852183e-06, "learning_rate": 0.0015358256405943638, "loss": 2.4645, "step": 9639 }, { "crossentropy": 2.541829228401184, "epoch": 0.5242121863019658, "grad_norm": 0.033122118562459946, "grad_norm_var": 4.44883341921039e-06, "learning_rate": 0.001534927852360008, "loss": 2.5418, "step": 9640 }, { "crossentropy": 2.437538981437683, "epoch": 0.524266565159466, "grad_norm": 0.031013790518045425, "grad_norm_var": 4.580438817687547e-06, "learning_rate": 0.0015340302790261995, "loss": 2.4375, "step": 9641 }, { "crossentropy": 2.5078643560409546, "epoch": 0.5243209440169662, "grad_norm": 0.030844533815979958, "grad_norm_var": 4.7199001475255044e-06, "learning_rate": 0.0015331329206486005, "loss": 2.5079, "step": 9642 }, { "crossentropy": 2.5725903511047363, "epoch": 0.5243753228744664, "grad_norm": 0.030554495751857758, "grad_norm_var": 1.3885331230346764e-06, "learning_rate": 0.0015322357772828655, "loss": 2.5726, "step": 9643 }, { "crossentropy": 2.5269769430160522, "epoch": 0.5244297017319666, "grad_norm": 0.03328394517302513, "grad_norm_var": 1.5138069705842993e-06, "learning_rate": 0.0015313388489846343, "loss": 2.527, "step": 9644 }, { "crossentropy": 2.491502523422241, "epoch": 0.5244840805894668, "grad_norm": 0.030373942106962204, "grad_norm_var": 1.6432555392593134e-06, "learning_rate": 0.0015304421358095339, "loss": 2.4915, "step": 9645 }, { "crossentropy": 2.568275213241577, "epoch": 0.524538459446967, "grad_norm": 0.03140788525342941, "grad_norm_var": 1.6329308360420498e-06, "learning_rate": 0.001529545637813179, "loss": 2.5683, "step": 9646 }, { "crossentropy": 2.548171639442444, "epoch": 0.5245928383044672, "grad_norm": 0.031364090740680695, "grad_norm_var": 1.6425369520113904e-06, "learning_rate": 0.0015286493550511655, "loss": 2.5482, "step": 9647 }, { "crossentropy": 2.502424955368042, "epoch": 0.5246472171619674, "grad_norm": 0.03200298547744751, "grad_norm_var": 1.6341161944454964e-06, "learning_rate": 0.001527753287579084, "loss": 2.5024, "step": 9648 }, { "crossentropy": 2.581589937210083, "epoch": 0.5247015960194676, "grad_norm": 0.03328802064061165, "grad_norm_var": 1.763391029740976e-06, "learning_rate": 0.0015268574354525078, "loss": 2.5816, "step": 9649 }, { "crossentropy": 2.550511956214905, "epoch": 0.5247559748769678, "grad_norm": 0.03235041722655296, "grad_norm_var": 1.7793546655504027e-06, "learning_rate": 0.0015259617987269936, "loss": 2.5505, "step": 9650 }, { "crossentropy": 2.421599864959717, "epoch": 0.524810353734468, "grad_norm": 0.03281629830598831, "grad_norm_var": 1.788684594099587e-06, "learning_rate": 0.0015250663774580902, "loss": 2.4216, "step": 9651 }, { "crossentropy": 2.54938280582428, "epoch": 0.5248647325919682, "grad_norm": 0.03217329457402229, "grad_norm_var": 1.791767432927554e-06, "learning_rate": 0.0015241711717013306, "loss": 2.5494, "step": 9652 }, { "crossentropy": 2.447230339050293, "epoch": 0.5249191114494685, "grad_norm": 0.03181201592087746, "grad_norm_var": 1.7701656675746795e-06, "learning_rate": 0.0015232761815122343, "loss": 2.4472, "step": 9653 }, { "crossentropy": 2.4716293811798096, "epoch": 0.5249734903069686, "grad_norm": 0.03157130256295204, "grad_norm_var": 1.5654840325698132e-06, "learning_rate": 0.0015223814069463078, "loss": 2.4716, "step": 9654 }, { "crossentropy": 2.6241865158081055, "epoch": 0.5250278691644689, "grad_norm": 0.03258760645985603, "grad_norm_var": 9.078725203464592e-07, "learning_rate": 0.0015214868480590443, "loss": 2.6242, "step": 9655 }, { "crossentropy": 2.5805002450942993, "epoch": 0.525082248021969, "grad_norm": 0.03144983574748039, "grad_norm_var": 8.124822644034324e-07, "learning_rate": 0.0015205925049059234, "loss": 2.5805, "step": 9656 }, { "crossentropy": 2.6016108989715576, "epoch": 0.5251366268794693, "grad_norm": 0.03260837122797966, "grad_norm_var": 8.029884409687377e-07, "learning_rate": 0.0015196983775424117, "loss": 2.6016, "step": 9657 }, { "crossentropy": 2.6033542156219482, "epoch": 0.5251910057369694, "grad_norm": 0.031016945838928223, "grad_norm_var": 7.804550376096694e-07, "learning_rate": 0.001518804466023963, "loss": 2.6034, "step": 9658 }, { "crossentropy": 2.572229266166687, "epoch": 0.5252453845944697, "grad_norm": 0.031671732664108276, "grad_norm_var": 6.556015396746533e-07, "learning_rate": 0.001517910770406013, "loss": 2.5722, "step": 9659 }, { "crossentropy": 2.505414843559265, "epoch": 0.5252997634519698, "grad_norm": 0.03293781355023384, "grad_norm_var": 6.031959264611124e-07, "learning_rate": 0.0015170172907439938, "loss": 2.5054, "step": 9660 }, { "crossentropy": 2.626587748527527, "epoch": 0.5253541423094701, "grad_norm": 0.03321187198162079, "grad_norm_var": 5.046958614427291e-07, "learning_rate": 0.0015161240270933135, "loss": 2.6266, "step": 9661 }, { "crossentropy": 2.6718310117721558, "epoch": 0.5254085211669702, "grad_norm": 0.03677236661314964, "grad_norm_var": 1.7782812291753288e-06, "learning_rate": 0.0015152309795093734, "loss": 2.6718, "step": 9662 }, { "crossentropy": 2.5593191385269165, "epoch": 0.5254629000244705, "grad_norm": 0.0321076475083828, "grad_norm_var": 1.7024827925028625e-06, "learning_rate": 0.0015143381480475582, "loss": 2.5593, "step": 9663 }, { "crossentropy": 2.664471983909607, "epoch": 0.5255172788819706, "grad_norm": 0.032142821699380875, "grad_norm_var": 1.693997079588172e-06, "learning_rate": 0.0015134455327632424, "loss": 2.6645, "step": 9664 }, { "crossentropy": 2.5641242265701294, "epoch": 0.5255716577394709, "grad_norm": 0.03073415905237198, "grad_norm_var": 1.844334274641625e-06, "learning_rate": 0.0015125531337117832, "loss": 2.5641, "step": 9665 }, { "crossentropy": 2.5623979568481445, "epoch": 0.525626036596971, "grad_norm": 0.032256465405225754, "grad_norm_var": 1.8451661115164696e-06, "learning_rate": 0.001511660950948528, "loss": 2.5624, "step": 9666 }, { "crossentropy": 2.4422730207443237, "epoch": 0.5256804154544713, "grad_norm": 0.03221868351101875, "grad_norm_var": 1.8316793757809184e-06, "learning_rate": 0.001510768984528808, "loss": 2.4423, "step": 9667 }, { "crossentropy": 2.5499775409698486, "epoch": 0.5257347943119715, "grad_norm": 0.03446692228317261, "grad_norm_var": 2.1126867427907642e-06, "learning_rate": 0.0015098772345079432, "loss": 2.55, "step": 9668 }, { "crossentropy": 2.517624258995056, "epoch": 0.5257891731694717, "grad_norm": 0.03218911215662956, "grad_norm_var": 2.088344912014722e-06, "learning_rate": 0.0015089857009412395, "loss": 2.5176, "step": 9669 }, { "crossentropy": 2.5289974212646484, "epoch": 0.5258435520269719, "grad_norm": 0.03290758281946182, "grad_norm_var": 2.035108449436468e-06, "learning_rate": 0.0015080943838839851, "loss": 2.529, "step": 9670 }, { "crossentropy": 2.5739521980285645, "epoch": 0.5258979308844721, "grad_norm": 0.03167947009205818, "grad_norm_var": 2.085731436024298e-06, "learning_rate": 0.0015072032833914624, "loss": 2.574, "step": 9671 }, { "crossentropy": 2.413965940475464, "epoch": 0.5259523097419723, "grad_norm": 0.036545347422361374, "grad_norm_var": 2.9792254749081333e-06, "learning_rate": 0.0015063123995189382, "loss": 2.414, "step": 9672 }, { "crossentropy": 2.541030764579773, "epoch": 0.5260066885994725, "grad_norm": 0.031079957261681557, "grad_norm_var": 3.1727797301457264e-06, "learning_rate": 0.0015054217323216595, "loss": 2.541, "step": 9673 }, { "crossentropy": 2.559691309928894, "epoch": 0.5260610674569727, "grad_norm": 0.03503536060452461, "grad_norm_var": 3.255503630408005e-06, "learning_rate": 0.0015045312818548673, "loss": 2.5597, "step": 9674 }, { "crossentropy": 2.6031538248062134, "epoch": 0.5261154463144729, "grad_norm": 0.03224216401576996, "grad_norm_var": 3.1750188239049733e-06, "learning_rate": 0.0015036410481737861, "loss": 2.6032, "step": 9675 }, { "crossentropy": 2.5727570056915283, "epoch": 0.5261698251719731, "grad_norm": 0.03244850039482117, "grad_norm_var": 3.196192129297281e-06, "learning_rate": 0.0015027510313336278, "loss": 2.5728, "step": 9676 }, { "crossentropy": 2.3942538499832153, "epoch": 0.5262242040294733, "grad_norm": 0.03418901190161705, "grad_norm_var": 3.2831581382043583e-06, "learning_rate": 0.0015018612313895897, "loss": 2.3943, "step": 9677 }, { "crossentropy": 2.608741879463196, "epoch": 0.5262785828869735, "grad_norm": 0.031965650618076324, "grad_norm_var": 2.3501776436354728e-06, "learning_rate": 0.0015009716483968573, "loss": 2.6087, "step": 9678 }, { "crossentropy": 2.646406412124634, "epoch": 0.5263329617444737, "grad_norm": 0.03194504231214523, "grad_norm_var": 2.3660398287465943e-06, "learning_rate": 0.0015000822824106004, "loss": 2.6464, "step": 9679 }, { "crossentropy": 2.494257688522339, "epoch": 0.526387340601974, "grad_norm": 0.03577050566673279, "grad_norm_var": 2.893460499256067e-06, "learning_rate": 0.0014991931334859792, "loss": 2.4943, "step": 9680 }, { "crossentropy": 2.6262937784194946, "epoch": 0.5264417194594742, "grad_norm": 0.03158632665872574, "grad_norm_var": 2.6837126967712283e-06, "learning_rate": 0.001498304201678133, "loss": 2.6263, "step": 9681 }, { "crossentropy": 2.5272884368896484, "epoch": 0.5264960983169744, "grad_norm": 0.03134064003825188, "grad_norm_var": 2.8309419038332334e-06, "learning_rate": 0.001497415487042197, "loss": 2.5273, "step": 9682 }, { "crossentropy": 2.5945377349853516, "epoch": 0.5265504771744746, "grad_norm": 0.031239906325936317, "grad_norm_var": 2.9896030759604095e-06, "learning_rate": 0.0014965269896332883, "loss": 2.5945, "step": 9683 }, { "crossentropy": 2.5464714765548706, "epoch": 0.5266048560319748, "grad_norm": 0.03193129971623421, "grad_norm_var": 2.8665812889576886e-06, "learning_rate": 0.0014956387095065083, "loss": 2.5465, "step": 9684 }, { "crossentropy": 2.467004656791687, "epoch": 0.526659234889475, "grad_norm": 0.03274564445018768, "grad_norm_var": 2.843874347964864e-06, "learning_rate": 0.001494750646716947, "loss": 2.467, "step": 9685 }, { "crossentropy": 2.576253652572632, "epoch": 0.5267136137469752, "grad_norm": 0.030964065343141556, "grad_norm_var": 3.049684204258866e-06, "learning_rate": 0.0014938628013196854, "loss": 2.5763, "step": 9686 }, { "crossentropy": 2.5371077060699463, "epoch": 0.5267679926044754, "grad_norm": 0.03064814954996109, "grad_norm_var": 3.2522723030882377e-06, "learning_rate": 0.0014929751733697826, "loss": 2.5371, "step": 9687 }, { "crossentropy": 2.5053977966308594, "epoch": 0.5268223714619756, "grad_norm": 0.03165600821375847, "grad_norm_var": 2.1775164023999163e-06, "learning_rate": 0.00149208776292229, "loss": 2.5054, "step": 9688 }, { "crossentropy": 2.6731228828430176, "epoch": 0.5268767503194758, "grad_norm": 0.03288276866078377, "grad_norm_var": 2.087558646664021e-06, "learning_rate": 0.0014912005700322445, "loss": 2.6731, "step": 9689 }, { "crossentropy": 2.486404299736023, "epoch": 0.526931129176976, "grad_norm": 0.032960373908281326, "grad_norm_var": 1.630848455412776e-06, "learning_rate": 0.0014903135947546674, "loss": 2.4864, "step": 9690 }, { "crossentropy": 2.585096597671509, "epoch": 0.5269855080344762, "grad_norm": 0.032671961933374405, "grad_norm_var": 1.640096456102325e-06, "learning_rate": 0.0014894268371445712, "loss": 2.5851, "step": 9691 }, { "crossentropy": 2.5559319257736206, "epoch": 0.5270398868919765, "grad_norm": 0.03438510373234749, "grad_norm_var": 1.9104894627328206e-06, "learning_rate": 0.0014885402972569467, "loss": 2.5559, "step": 9692 }, { "crossentropy": 2.4379384517669678, "epoch": 0.5270942657494766, "grad_norm": 0.030131559818983078, "grad_norm_var": 1.9878907448879577e-06, "learning_rate": 0.0014876539751467804, "loss": 2.4379, "step": 9693 }, { "crossentropy": 2.602773070335388, "epoch": 0.5271486446069769, "grad_norm": 0.03371640667319298, "grad_norm_var": 2.1302282824013728e-06, "learning_rate": 0.0014867678708690418, "loss": 2.6028, "step": 9694 }, { "crossentropy": 2.4921674728393555, "epoch": 0.527203023464477, "grad_norm": 0.03141074255108833, "grad_norm_var": 2.1723593066551856e-06, "learning_rate": 0.0014858819844786825, "loss": 2.4922, "step": 9695 }, { "crossentropy": 2.5547136068344116, "epoch": 0.5272574023219773, "grad_norm": 0.03192074969410896, "grad_norm_var": 1.2928999545775877e-06, "learning_rate": 0.0014849963160306452, "loss": 2.5547, "step": 9696 }, { "crossentropy": 2.51849627494812, "epoch": 0.5273117811794774, "grad_norm": 0.033860936760902405, "grad_norm_var": 1.4871724138329808e-06, "learning_rate": 0.0014841108655798624, "loss": 2.5185, "step": 9697 }, { "crossentropy": 2.548882246017456, "epoch": 0.5273661600369777, "grad_norm": 0.03127119317650795, "grad_norm_var": 1.495006557021662e-06, "learning_rate": 0.0014832256331812443, "loss": 2.5489, "step": 9698 }, { "crossentropy": 2.504385232925415, "epoch": 0.5274205388944778, "grad_norm": 0.03125849738717079, "grad_norm_var": 1.492772695952232e-06, "learning_rate": 0.0014823406188896938, "loss": 2.5044, "step": 9699 }, { "crossentropy": 2.683089256286621, "epoch": 0.5274749177519781, "grad_norm": 0.031758084893226624, "grad_norm_var": 1.4997211757859323e-06, "learning_rate": 0.0014814558227600983, "loss": 2.6831, "step": 9700 }, { "crossentropy": 2.669519066810608, "epoch": 0.5275292966094782, "grad_norm": 0.03132211044430733, "grad_norm_var": 1.511446832369069e-06, "learning_rate": 0.0014805712448473325, "loss": 2.6695, "step": 9701 }, { "crossentropy": 2.5922465324401855, "epoch": 0.5275836754669785, "grad_norm": 0.03178602457046509, "grad_norm_var": 1.4345321822983869e-06, "learning_rate": 0.0014796868852062584, "loss": 2.5922, "step": 9702 }, { "crossentropy": 2.520575523376465, "epoch": 0.5276380543244786, "grad_norm": 0.03044903464615345, "grad_norm_var": 1.4756222655085225e-06, "learning_rate": 0.0014788027438917184, "loss": 2.5206, "step": 9703 }, { "crossentropy": 2.59709095954895, "epoch": 0.5276924331819789, "grad_norm": 0.03241223096847534, "grad_norm_var": 1.4675952446506808e-06, "learning_rate": 0.0014779188209585515, "loss": 2.5971, "step": 9704 }, { "crossentropy": 2.5650393962860107, "epoch": 0.527746812039479, "grad_norm": 0.052744898945093155, "grad_norm_var": 2.8098159536695196e-05, "learning_rate": 0.001477035116461577, "loss": 2.565, "step": 9705 }, { "crossentropy": 2.493469715118408, "epoch": 0.5278011908969793, "grad_norm": 0.03252160921692848, "grad_norm_var": 2.81346671822997e-05, "learning_rate": 0.0014761516304555983, "loss": 2.4935, "step": 9706 }, { "crossentropy": 2.4649258852005005, "epoch": 0.5278555697544794, "grad_norm": 0.03233844414353371, "grad_norm_var": 2.8171829785306176e-05, "learning_rate": 0.001475268362995409, "loss": 2.4649, "step": 9707 }, { "crossentropy": 2.4299079179763794, "epoch": 0.5279099486119797, "grad_norm": 0.032004546374082565, "grad_norm_var": 2.81912739415761e-05, "learning_rate": 0.0014743853141357926, "loss": 2.4299, "step": 9708 }, { "crossentropy": 2.502739191055298, "epoch": 0.5279643274694799, "grad_norm": 0.03118552453815937, "grad_norm_var": 2.7832070664301986e-05, "learning_rate": 0.001473502483931511, "loss": 2.5027, "step": 9709 }, { "crossentropy": 2.5172754526138306, "epoch": 0.5280187063269801, "grad_norm": 0.031085200607776642, "grad_norm_var": 2.81002908486587e-05, "learning_rate": 0.0014726198724373174, "loss": 2.5173, "step": 9710 }, { "crossentropy": 2.4836281538009644, "epoch": 0.5280730851844803, "grad_norm": 0.03337499126791954, "grad_norm_var": 2.790343903566056e-05, "learning_rate": 0.0014717374797079503, "loss": 2.4836, "step": 9711 }, { "crossentropy": 2.610029101371765, "epoch": 0.5281274640419805, "grad_norm": 0.03183479979634285, "grad_norm_var": 2.7918628321312032e-05, "learning_rate": 0.0014708553057981356, "loss": 2.61, "step": 9712 }, { "crossentropy": 2.4747363328933716, "epoch": 0.5281818428994807, "grad_norm": 0.03172915801405907, "grad_norm_var": 2.801493995622789e-05, "learning_rate": 0.0014699733507625861, "loss": 2.4747, "step": 9713 }, { "crossentropy": 2.5930423736572266, "epoch": 0.5282362217569809, "grad_norm": 0.0328119620680809, "grad_norm_var": 2.7794334059710418e-05, "learning_rate": 0.0014690916146559956, "loss": 2.593, "step": 9714 }, { "crossentropy": 2.5261857509613037, "epoch": 0.5282906006144811, "grad_norm": 0.030886787921190262, "grad_norm_var": 2.7897387351735286e-05, "learning_rate": 0.0014682100975330526, "loss": 2.5262, "step": 9715 }, { "crossentropy": 2.543661594390869, "epoch": 0.5283449794719813, "grad_norm": 0.03273024410009384, "grad_norm_var": 2.7777286349055936e-05, "learning_rate": 0.001467328799448428, "loss": 2.5437, "step": 9716 }, { "crossentropy": 2.5238847732543945, "epoch": 0.5283993583294815, "grad_norm": 0.03208205848932266, "grad_norm_var": 2.762299035994984e-05, "learning_rate": 0.0014664477204567768, "loss": 2.5239, "step": 9717 }, { "crossentropy": 2.620179772377014, "epoch": 0.5284537371869817, "grad_norm": 0.03182583302259445, "grad_norm_var": 2.761532638395018e-05, "learning_rate": 0.0014655668606127414, "loss": 2.6202, "step": 9718 }, { "crossentropy": 2.5397026538848877, "epoch": 0.5285081160444819, "grad_norm": 0.0318235345184803, "grad_norm_var": 2.7219882533134952e-05, "learning_rate": 0.0014646862199709577, "loss": 2.5397, "step": 9719 }, { "crossentropy": 2.4913625717163086, "epoch": 0.5285624949019821, "grad_norm": 0.03192592039704323, "grad_norm_var": 2.7294626270578945e-05, "learning_rate": 0.001463805798586037, "loss": 2.4914, "step": 9720 }, { "crossentropy": 2.5882010459899902, "epoch": 0.5286168737594823, "grad_norm": 0.031226731836795807, "grad_norm_var": 4.6387180976125745e-07, "learning_rate": 0.0014629255965125838, "loss": 2.5882, "step": 9721 }, { "crossentropy": 2.4630101919174194, "epoch": 0.5286712526169826, "grad_norm": 0.031650640070438385, "grad_norm_var": 4.4626275472075575e-07, "learning_rate": 0.0014620456138051874, "loss": 2.463, "step": 9722 }, { "crossentropy": 2.5393073558807373, "epoch": 0.5287256314744827, "grad_norm": 0.03241078555583954, "grad_norm_var": 4.5074869978686293e-07, "learning_rate": 0.0014611658505184239, "loss": 2.5393, "step": 9723 }, { "crossentropy": 2.5765331983566284, "epoch": 0.528780010331983, "grad_norm": 0.031820353120565414, "grad_norm_var": 4.5059125445485956e-07, "learning_rate": 0.001460286306706855, "loss": 2.5765, "step": 9724 }, { "crossentropy": 2.567945122718811, "epoch": 0.5288343891894831, "grad_norm": 0.03142790123820305, "grad_norm_var": 4.3116414092913187e-07, "learning_rate": 0.0014594069824250288, "loss": 2.5679, "step": 9725 }, { "crossentropy": 2.6427026987075806, "epoch": 0.5288887680469834, "grad_norm": 0.03232034668326378, "grad_norm_var": 3.897857517267655e-07, "learning_rate": 0.0014585278777274812, "loss": 2.6427, "step": 9726 }, { "crossentropy": 2.4617176055908203, "epoch": 0.5289431469044835, "grad_norm": 0.03289855271577835, "grad_norm_var": 3.161580374775227e-07, "learning_rate": 0.001457648992668734, "loss": 2.4617, "step": 9727 }, { "crossentropy": 2.533020853996277, "epoch": 0.5289975257619838, "grad_norm": 0.031549517065286636, "grad_norm_var": 3.261154348748833e-07, "learning_rate": 0.0014567703273032928, "loss": 2.533, "step": 9728 }, { "crossentropy": 2.460314631462097, "epoch": 0.5290519046194839, "grad_norm": 0.03154228627681732, "grad_norm_var": 3.3367647811934307e-07, "learning_rate": 0.0014558918816856509, "loss": 2.4603, "step": 9729 }, { "crossentropy": 2.515602707862854, "epoch": 0.5291062834769842, "grad_norm": 0.03322301432490349, "grad_norm_var": 3.9239128502658687e-07, "learning_rate": 0.0014550136558702932, "loss": 2.5156, "step": 9730 }, { "crossentropy": 2.5130999088287354, "epoch": 0.5291606623344843, "grad_norm": 0.032595112919807434, "grad_norm_var": 3.305575632943254e-07, "learning_rate": 0.001454135649911682, "loss": 2.5131, "step": 9731 }, { "crossentropy": 2.6003823280334473, "epoch": 0.5292150411919846, "grad_norm": 0.03149205446243286, "grad_norm_var": 3.1668312234078613e-07, "learning_rate": 0.0014532578638642723, "loss": 2.6004, "step": 9732 }, { "crossentropy": 2.529475450515747, "epoch": 0.5292694200494847, "grad_norm": 0.0312013179063797, "grad_norm_var": 3.541678910256562e-07, "learning_rate": 0.001452380297782503, "loss": 2.5295, "step": 9733 }, { "crossentropy": 2.439850687980652, "epoch": 0.529323798906985, "grad_norm": 0.03284144401550293, "grad_norm_var": 4.0407254976813217e-07, "learning_rate": 0.0014515029517208, "loss": 2.4399, "step": 9734 }, { "crossentropy": 2.5439528226852417, "epoch": 0.5293781777644851, "grad_norm": 0.03130945935845375, "grad_norm_var": 4.3246888142874317e-07, "learning_rate": 0.0014506258257335764, "loss": 2.544, "step": 9735 }, { "crossentropy": 2.61457896232605, "epoch": 0.5294325566219854, "grad_norm": 0.03271974250674248, "grad_norm_var": 4.6774735622991347e-07, "learning_rate": 0.0014497489198752294, "loss": 2.6146, "step": 9736 }, { "crossentropy": 2.5551549196243286, "epoch": 0.5294869354794856, "grad_norm": 0.03548385202884674, "grad_norm_var": 1.1533867451451439e-06, "learning_rate": 0.0014488722342001442, "loss": 2.5552, "step": 9737 }, { "crossentropy": 2.5949994325637817, "epoch": 0.5295413143369858, "grad_norm": 0.03252892941236496, "grad_norm_var": 1.1278507089970113e-06, "learning_rate": 0.0014479957687626931, "loss": 2.595, "step": 9738 }, { "crossentropy": 2.5950348377227783, "epoch": 0.529595693194486, "grad_norm": 0.03168817609548569, "grad_norm_var": 1.1532123238177626e-06, "learning_rate": 0.0014471195236172342, "loss": 2.595, "step": 9739 }, { "crossentropy": 2.4454673528671265, "epoch": 0.5296500720519862, "grad_norm": 0.030761823058128357, "grad_norm_var": 1.289545576803793e-06, "learning_rate": 0.0014462434988181067, "loss": 2.4455, "step": 9740 }, { "crossentropy": 2.526175379753113, "epoch": 0.5297044509094864, "grad_norm": 0.03183044120669365, "grad_norm_var": 1.2569463454302024e-06, "learning_rate": 0.0014453676944196476, "loss": 2.5262, "step": 9741 }, { "crossentropy": 2.3318439722061157, "epoch": 0.5297588297669866, "grad_norm": 0.03222495689988136, "grad_norm_var": 1.256609258985103e-06, "learning_rate": 0.0014444921104761683, "loss": 2.3318, "step": 9742 }, { "crossentropy": 2.5549986362457275, "epoch": 0.5298132086244868, "grad_norm": 0.03417545184493065, "grad_norm_var": 1.4700951594193504e-06, "learning_rate": 0.0014436167470419736, "loss": 2.555, "step": 9743 }, { "crossentropy": 2.5536766052246094, "epoch": 0.529867587481987, "grad_norm": 0.03389322757720947, "grad_norm_var": 1.5717052150437487e-06, "learning_rate": 0.0014427416041713525, "loss": 2.5537, "step": 9744 }, { "crossentropy": 2.5466901063919067, "epoch": 0.5299219663394872, "grad_norm": 0.03163641318678856, "grad_norm_var": 1.56062274588997e-06, "learning_rate": 0.001441866681918581, "loss": 2.5467, "step": 9745 }, { "crossentropy": 2.581247925758362, "epoch": 0.5299763451969874, "grad_norm": 0.03233761340379715, "grad_norm_var": 1.5213529610619282e-06, "learning_rate": 0.0014409919803379206, "loss": 2.5812, "step": 9746 }, { "crossentropy": 2.6807137727737427, "epoch": 0.5300307240544876, "grad_norm": 0.03178338333964348, "grad_norm_var": 1.54358204721098e-06, "learning_rate": 0.0014401174994836197, "loss": 2.6807, "step": 9747 }, { "crossentropy": 2.4794628620147705, "epoch": 0.5300851029119878, "grad_norm": 0.03252723813056946, "grad_norm_var": 1.4894804381802735e-06, "learning_rate": 0.001439243239409912, "loss": 2.4795, "step": 9748 }, { "crossentropy": 2.6278090476989746, "epoch": 0.530139481769488, "grad_norm": 0.03185361623764038, "grad_norm_var": 1.408866442527817e-06, "learning_rate": 0.0014383692001710196, "loss": 2.6278, "step": 9749 }, { "crossentropy": 2.6090112924575806, "epoch": 0.5301938606269883, "grad_norm": 0.0316246822476387, "grad_norm_var": 1.4419053503651631e-06, "learning_rate": 0.00143749538182115, "loss": 2.609, "step": 9750 }, { "crossentropy": 2.4871262311935425, "epoch": 0.5302482394844884, "grad_norm": 0.03197697550058365, "grad_norm_var": 1.3728102775887225e-06, "learning_rate": 0.0014366217844144925, "loss": 2.4871, "step": 9751 }, { "crossentropy": 2.548198103904724, "epoch": 0.5303026183419887, "grad_norm": 0.03146030753850937, "grad_norm_var": 1.4250390964221441e-06, "learning_rate": 0.0014357484080052313, "loss": 2.5482, "step": 9752 }, { "crossentropy": 2.613092064857483, "epoch": 0.5303569971994888, "grad_norm": 0.032517705112695694, "grad_norm_var": 7.401448263812116e-07, "learning_rate": 0.0014348752526475328, "loss": 2.6131, "step": 9753 }, { "crossentropy": 2.481371521949768, "epoch": 0.5304113760569891, "grad_norm": 0.0315210297703743, "grad_norm_var": 7.562486899775438e-07, "learning_rate": 0.0014340023183955454, "loss": 2.4814, "step": 9754 }, { "crossentropy": 2.5565727949142456, "epoch": 0.5304657549144892, "grad_norm": 0.031627003103494644, "grad_norm_var": 7.599501765051877e-07, "learning_rate": 0.0014331296053034098, "loss": 2.5566, "step": 9755 }, { "crossentropy": 2.566246271133423, "epoch": 0.5305201337719895, "grad_norm": 0.03399632126092911, "grad_norm_var": 8.326195746486201e-07, "learning_rate": 0.0014322571134252505, "loss": 2.5662, "step": 9756 }, { "crossentropy": 2.614659309387207, "epoch": 0.5305745126294896, "grad_norm": 0.033855877816677094, "grad_norm_var": 9.590653461379051e-07, "learning_rate": 0.0014313848428151787, "loss": 2.6147, "step": 9757 }, { "crossentropy": 2.5801384449005127, "epoch": 0.5306288914869899, "grad_norm": 0.03135937079787254, "grad_norm_var": 1.0305078535925253e-06, "learning_rate": 0.0014305127935272926, "loss": 2.5801, "step": 9758 }, { "crossentropy": 2.5308789014816284, "epoch": 0.53068327034449, "grad_norm": 0.03396778926253319, "grad_norm_var": 9.836045886761162e-07, "learning_rate": 0.0014296409656156745, "loss": 2.5309, "step": 9759 }, { "crossentropy": 2.5737942457199097, "epoch": 0.5307376492019903, "grad_norm": 0.03193476051092148, "grad_norm_var": 8.258731771938755e-07, "learning_rate": 0.001428769359134396, "loss": 2.5738, "step": 9760 }, { "crossentropy": 2.5717501640319824, "epoch": 0.5307920280594904, "grad_norm": 0.03104676678776741, "grad_norm_var": 8.957454117673273e-07, "learning_rate": 0.001427897974137513, "loss": 2.5718, "step": 9761 }, { "crossentropy": 2.5550018548965454, "epoch": 0.5308464069169907, "grad_norm": 0.052578557282686234, "grad_norm_var": 2.6841000988943725e-05, "learning_rate": 0.0014270268106790645, "loss": 2.555, "step": 9762 }, { "crossentropy": 2.5601621866226196, "epoch": 0.5309007857744908, "grad_norm": 0.03201693668961525, "grad_norm_var": 2.6791671405163435e-05, "learning_rate": 0.0014261558688130839, "loss": 2.5602, "step": 9763 }, { "crossentropy": 2.526657223701477, "epoch": 0.5309551646319911, "grad_norm": 0.033231645822525024, "grad_norm_var": 2.673211332230903e-05, "learning_rate": 0.0014252851485935859, "loss": 2.5267, "step": 9764 }, { "crossentropy": 2.584436774253845, "epoch": 0.5310095434894913, "grad_norm": 0.03157318755984306, "grad_norm_var": 2.6799917941015363e-05, "learning_rate": 0.0014244146500745685, "loss": 2.5844, "step": 9765 }, { "crossentropy": 2.549330472946167, "epoch": 0.5310639223469915, "grad_norm": 0.03180515021085739, "grad_norm_var": 2.6756394344902106e-05, "learning_rate": 0.0014235443733100195, "loss": 2.5493, "step": 9766 }, { "crossentropy": 2.457746386528015, "epoch": 0.5311183012044917, "grad_norm": 0.031652167439460754, "grad_norm_var": 2.6830217363821064e-05, "learning_rate": 0.0014226743183539175, "loss": 2.4577, "step": 9767 }, { "crossentropy": 2.6195261478424072, "epoch": 0.5311726800619919, "grad_norm": 0.035098571330308914, "grad_norm_var": 2.666368560409049e-05, "learning_rate": 0.0014218044852602175, "loss": 2.6195, "step": 9768 }, { "crossentropy": 2.623052954673767, "epoch": 0.5312270589194921, "grad_norm": 0.03191983699798584, "grad_norm_var": 2.6783177368005138e-05, "learning_rate": 0.001420934874082867, "loss": 2.6231, "step": 9769 }, { "crossentropy": 2.550071358680725, "epoch": 0.5312814377769923, "grad_norm": 0.032801151275634766, "grad_norm_var": 2.651384423008214e-05, "learning_rate": 0.0014200654848757993, "loss": 2.5501, "step": 9770 }, { "crossentropy": 2.4821399450302124, "epoch": 0.5313358166344925, "grad_norm": 0.03171965852379799, "grad_norm_var": 2.6487794059182105e-05, "learning_rate": 0.001419196317692933, "loss": 2.4821, "step": 9771 }, { "crossentropy": 2.5548932552337646, "epoch": 0.5313901954919927, "grad_norm": 0.031235121190547943, "grad_norm_var": 2.6886456353658374e-05, "learning_rate": 0.0014183273725881735, "loss": 2.5549, "step": 9772 }, { "crossentropy": 2.6067962646484375, "epoch": 0.5314445743494929, "grad_norm": 0.0323205292224884, "grad_norm_var": 2.698392052408923e-05, "learning_rate": 0.0014174586496154085, "loss": 2.6068, "step": 9773 }, { "crossentropy": 2.4915865659713745, "epoch": 0.5314989532069931, "grad_norm": 0.032832443714141846, "grad_norm_var": 2.669589522595045e-05, "learning_rate": 0.0014165901488285193, "loss": 2.4916, "step": 9774 }, { "crossentropy": 2.588394284248352, "epoch": 0.5315533320644933, "grad_norm": 0.03200296312570572, "grad_norm_var": 2.6843025363692172e-05, "learning_rate": 0.0014157218702813702, "loss": 2.5884, "step": 9775 }, { "crossentropy": 2.6032098531723022, "epoch": 0.5316077109219935, "grad_norm": 0.036146700382232666, "grad_norm_var": 2.7080869028628154e-05, "learning_rate": 0.0014148538140278072, "loss": 2.6032, "step": 9776 }, { "crossentropy": 2.5308626890182495, "epoch": 0.5316620897794937, "grad_norm": 0.032725121825933456, "grad_norm_var": 2.6652252730698563e-05, "learning_rate": 0.0014139859801216665, "loss": 2.5309, "step": 9777 }, { "crossentropy": 2.609661340713501, "epoch": 0.531716468636994, "grad_norm": 0.03331511840224266, "grad_norm_var": 1.750825501659753e-06, "learning_rate": 0.0014131183686167754, "loss": 2.6097, "step": 9778 }, { "crossentropy": 2.4877545833587646, "epoch": 0.5317708474944941, "grad_norm": 0.032578323036432266, "grad_norm_var": 1.723154211550476e-06, "learning_rate": 0.0014122509795669374, "loss": 2.4878, "step": 9779 }, { "crossentropy": 2.478149175643921, "epoch": 0.5318252263519944, "grad_norm": 0.0319330096244812, "grad_norm_var": 1.7338801715317819e-06, "learning_rate": 0.0014113838130259493, "loss": 2.4781, "step": 9780 }, { "crossentropy": 2.6015607118606567, "epoch": 0.5318796052094945, "grad_norm": 0.031464867293834686, "grad_norm_var": 1.7494967538384323e-06, "learning_rate": 0.0014105168690475912, "loss": 2.6016, "step": 9781 }, { "crossentropy": 2.511888861656189, "epoch": 0.5319339840669948, "grad_norm": 0.03356355428695679, "grad_norm_var": 1.7571121139295724e-06, "learning_rate": 0.00140965014768563, "loss": 2.5119, "step": 9782 }, { "crossentropy": 2.481590151786804, "epoch": 0.5319883629244949, "grad_norm": 0.030695293098688126, "grad_norm_var": 1.9488937828299193e-06, "learning_rate": 0.001408783648993821, "loss": 2.4816, "step": 9783 }, { "crossentropy": 2.485211968421936, "epoch": 0.5320427417819952, "grad_norm": 0.03274870663881302, "grad_norm_var": 1.5259006560074928e-06, "learning_rate": 0.0014079173730258993, "loss": 2.4852, "step": 9784 }, { "crossentropy": 2.5089458227157593, "epoch": 0.5320971206394953, "grad_norm": 0.03214051201939583, "grad_norm_var": 1.5118695055232426e-06, "learning_rate": 0.0014070513198355945, "loss": 2.5089, "step": 9785 }, { "crossentropy": 2.4448176622390747, "epoch": 0.5321514994969956, "grad_norm": 0.03248406946659088, "grad_norm_var": 1.5060108029557133e-06, "learning_rate": 0.001406185489476619, "loss": 2.4448, "step": 9786 }, { "crossentropy": 2.487525224685669, "epoch": 0.5322058783544957, "grad_norm": 0.033676087856292725, "grad_norm_var": 1.5432117282979454e-06, "learning_rate": 0.001405319882002667, "loss": 2.4875, "step": 9787 }, { "crossentropy": 2.5260263681411743, "epoch": 0.532260257211996, "grad_norm": 0.03431016951799393, "grad_norm_var": 1.5678731115718764e-06, "learning_rate": 0.0014044544974674245, "loss": 2.526, "step": 9788 }, { "crossentropy": 2.58260178565979, "epoch": 0.5323146360694961, "grad_norm": 0.03260388597846031, "grad_norm_var": 1.5544518567639498e-06, "learning_rate": 0.0014035893359245623, "loss": 2.5826, "step": 9789 }, { "crossentropy": 2.5953463315963745, "epoch": 0.5323690149269964, "grad_norm": 0.03360101208090782, "grad_norm_var": 1.5919998538351776e-06, "learning_rate": 0.0014027243974277365, "loss": 2.5953, "step": 9790 }, { "crossentropy": 2.5764577388763428, "epoch": 0.5324233937844965, "grad_norm": 0.032580241560935974, "grad_norm_var": 1.545757944425239e-06, "learning_rate": 0.0014018596820305896, "loss": 2.5765, "step": 9791 }, { "crossentropy": 2.5226573944091797, "epoch": 0.5324777726419968, "grad_norm": 0.03156851604580879, "grad_norm_var": 8.802368053948313e-07, "learning_rate": 0.0014009951897867511, "loss": 2.5227, "step": 9792 }, { "crossentropy": 2.4588558673858643, "epoch": 0.532532151499497, "grad_norm": 0.03081103414297104, "grad_norm_var": 1.0834841604078748e-06, "learning_rate": 0.0014001309207498352, "loss": 2.4589, "step": 9793 }, { "crossentropy": 2.6049104928970337, "epoch": 0.5325865303569972, "grad_norm": 0.032788701355457306, "grad_norm_var": 1.0439179290046748e-06, "learning_rate": 0.0013992668749734439, "loss": 2.6049, "step": 9794 }, { "crossentropy": 2.5268301963806152, "epoch": 0.5326409092144974, "grad_norm": 0.04082528129220009, "grad_norm_var": 5.411876175532663e-06, "learning_rate": 0.0013984030525111652, "loss": 2.5268, "step": 9795 }, { "crossentropy": 2.4841160774230957, "epoch": 0.5326952880719976, "grad_norm": 0.03180554509162903, "grad_norm_var": 5.430807603372852e-06, "learning_rate": 0.0013975394534165681, "loss": 2.4841, "step": 9796 }, { "crossentropy": 2.5773415565490723, "epoch": 0.5327496669294978, "grad_norm": 0.03257936239242554, "grad_norm_var": 5.283407381306606e-06, "learning_rate": 0.001396676077743219, "loss": 2.5773, "step": 9797 }, { "crossentropy": 2.4751083850860596, "epoch": 0.532804045786998, "grad_norm": 0.03192431107163429, "grad_norm_var": 5.338860634468365e-06, "learning_rate": 0.0013958129255446582, "loss": 2.4751, "step": 9798 }, { "crossentropy": 2.515089750289917, "epoch": 0.5328584246444982, "grad_norm": 0.03147164359688759, "grad_norm_var": 5.14350877945189e-06, "learning_rate": 0.0013949499968744206, "loss": 2.5151, "step": 9799 }, { "crossentropy": 2.578788995742798, "epoch": 0.5329128035019984, "grad_norm": 0.03150840476155281, "grad_norm_var": 5.280376484027155e-06, "learning_rate": 0.001394087291786022, "loss": 2.5788, "step": 9800 }, { "crossentropy": 2.6436121463775635, "epoch": 0.5329671823594986, "grad_norm": 0.03235062211751938, "grad_norm_var": 5.261370694630875e-06, "learning_rate": 0.0013932248103329687, "loss": 2.6436, "step": 9801 }, { "crossentropy": 2.62883985042572, "epoch": 0.5330215612169988, "grad_norm": 0.032035570591688156, "grad_norm_var": 5.300642446229992e-06, "learning_rate": 0.0013923625525687495, "loss": 2.6288, "step": 9802 }, { "crossentropy": 2.530522346496582, "epoch": 0.533075940074499, "grad_norm": 0.03220129385590553, "grad_norm_var": 5.284468119347239e-06, "learning_rate": 0.0013915005185468415, "loss": 2.5305, "step": 9803 }, { "crossentropy": 2.422611355781555, "epoch": 0.5331303189319992, "grad_norm": 0.030327294021844864, "grad_norm_var": 5.479444790327387e-06, "learning_rate": 0.0013906387083207072, "loss": 2.4226, "step": 9804 }, { "crossentropy": 2.6126075983047485, "epoch": 0.5331846977894994, "grad_norm": 0.032829973846673965, "grad_norm_var": 5.483919662171988e-06, "learning_rate": 0.0013897771219437954, "loss": 2.6126, "step": 9805 }, { "crossentropy": 2.4761215448379517, "epoch": 0.5332390766469997, "grad_norm": 0.03199811652302742, "grad_norm_var": 5.4253382537318425e-06, "learning_rate": 0.0013889157594695423, "loss": 2.4761, "step": 9806 }, { "crossentropy": 2.4000003337860107, "epoch": 0.5332934555044998, "grad_norm": 0.03173084929585457, "grad_norm_var": 5.4585529545611e-06, "learning_rate": 0.001388054620951364, "loss": 2.4, "step": 9807 }, { "crossentropy": 2.585394501686096, "epoch": 0.5333478343620001, "grad_norm": 0.03199581056833267, "grad_norm_var": 5.421322945359242e-06, "learning_rate": 0.0013871937064426731, "loss": 2.5854, "step": 9808 }, { "crossentropy": 2.49735951423645, "epoch": 0.5334022132195002, "grad_norm": 0.031613048166036606, "grad_norm_var": 5.286369629240763e-06, "learning_rate": 0.0013863330159968597, "loss": 2.4974, "step": 9809 }, { "crossentropy": 2.543380618095398, "epoch": 0.5334565920770005, "grad_norm": 0.03087134286761284, "grad_norm_var": 5.442103805240045e-06, "learning_rate": 0.0013854725496673032, "loss": 2.5434, "step": 9810 }, { "crossentropy": 2.554770827293396, "epoch": 0.5335109709345006, "grad_norm": 0.032052554190158844, "grad_norm_var": 3.728874537288369e-07, "learning_rate": 0.001384612307507369, "loss": 2.5548, "step": 9811 }, { "crossentropy": 2.6517502069473267, "epoch": 0.5335653497920009, "grad_norm": 0.03240194916725159, "grad_norm_var": 3.93095656924837e-07, "learning_rate": 0.0013837522895704096, "loss": 2.6518, "step": 9812 }, { "crossentropy": 2.6159138679504395, "epoch": 0.533619728649501, "grad_norm": 0.0313371904194355, "grad_norm_var": 3.7175761179453205e-07, "learning_rate": 0.001382892495909761, "loss": 2.6159, "step": 9813 }, { "crossentropy": 2.5885090827941895, "epoch": 0.5336741075070013, "grad_norm": 0.031844716519117355, "grad_norm_var": 3.707347933125382e-07, "learning_rate": 0.0013820329265787485, "loss": 2.5885, "step": 9814 }, { "crossentropy": 2.506574034690857, "epoch": 0.5337284863645014, "grad_norm": 0.030571836978197098, "grad_norm_var": 4.590105660782125e-07, "learning_rate": 0.0013811735816306803, "loss": 2.5066, "step": 9815 }, { "crossentropy": 2.592359781265259, "epoch": 0.5337828652220017, "grad_norm": 0.03315132111310959, "grad_norm_var": 5.792963435427914e-07, "learning_rate": 0.0013803144611188528, "loss": 2.5924, "step": 9816 }, { "crossentropy": 2.4969476461410522, "epoch": 0.5338372440795018, "grad_norm": 0.032628536224365234, "grad_norm_var": 6.033378115674961e-07, "learning_rate": 0.0013794555650965491, "loss": 2.4969, "step": 9817 }, { "crossentropy": 2.5036468505859375, "epoch": 0.5338916229370021, "grad_norm": 0.03232419118285179, "grad_norm_var": 6.157061167868493e-07, "learning_rate": 0.001378596893617033, "loss": 2.5036, "step": 9818 }, { "crossentropy": 2.586059331893921, "epoch": 0.5339460017945022, "grad_norm": 0.030963944271206856, "grad_norm_var": 6.563267067551896e-07, "learning_rate": 0.001377738446733563, "loss": 2.5861, "step": 9819 }, { "crossentropy": 2.6415016651153564, "epoch": 0.5340003806520025, "grad_norm": 0.032900121062994, "grad_norm_var": 5.682123505056641e-07, "learning_rate": 0.001376880224499379, "loss": 2.6415, "step": 9820 }, { "crossentropy": 2.607011079788208, "epoch": 0.5340547595095027, "grad_norm": 0.032594479620456696, "grad_norm_var": 5.440783675251937e-07, "learning_rate": 0.0013760222269677037, "loss": 2.607, "step": 9821 }, { "crossentropy": 2.575593113899231, "epoch": 0.5341091383670029, "grad_norm": 0.03259372338652611, "grad_norm_var": 5.711631313961291e-07, "learning_rate": 0.0013751644541917518, "loss": 2.5756, "step": 9822 }, { "crossentropy": 2.517789363861084, "epoch": 0.5341635172245031, "grad_norm": 0.0318690650165081, "grad_norm_var": 5.678857966306353e-07, "learning_rate": 0.001374306906224721, "loss": 2.5178, "step": 9823 }, { "crossentropy": 2.538332939147949, "epoch": 0.5342178960820033, "grad_norm": 0.03239841014146805, "grad_norm_var": 5.787514059970525e-07, "learning_rate": 0.0013734495831197952, "loss": 2.5383, "step": 9824 }, { "crossentropy": 2.496643900871277, "epoch": 0.5342722749395035, "grad_norm": 0.0318005345761776, "grad_norm_var": 5.710933492208362e-07, "learning_rate": 0.0013725924849301453, "loss": 2.4966, "step": 9825 }, { "crossentropy": 2.405751347541809, "epoch": 0.5343266537970037, "grad_norm": 0.0316694900393486, "grad_norm_var": 4.887755955083981e-07, "learning_rate": 0.001371735611708928, "loss": 2.4058, "step": 9826 }, { "crossentropy": 2.431877613067627, "epoch": 0.5343810326545039, "grad_norm": 0.03305019065737724, "grad_norm_var": 5.488090070596922e-07, "learning_rate": 0.001370878963509285, "loss": 2.4319, "step": 9827 }, { "crossentropy": 2.601907730102539, "epoch": 0.5344354115120041, "grad_norm": 0.03225702792406082, "grad_norm_var": 5.448906059851335e-07, "learning_rate": 0.001370022540384347, "loss": 2.6019, "step": 9828 }, { "crossentropy": 2.462906837463379, "epoch": 0.5344897903695044, "grad_norm": 0.03387615829706192, "grad_norm_var": 6.820483306143779e-07, "learning_rate": 0.001369166342387224, "loss": 2.4629, "step": 9829 }, { "crossentropy": 2.517135977745056, "epoch": 0.5345441692270045, "grad_norm": 0.03361692279577255, "grad_norm_var": 7.752850534696684e-07, "learning_rate": 0.0013683103695710209, "loss": 2.5171, "step": 9830 }, { "crossentropy": 2.662383556365967, "epoch": 0.5345985480845048, "grad_norm": 0.03268290311098099, "grad_norm_var": 5.415976739182957e-07, "learning_rate": 0.0013674546219888239, "loss": 2.6624, "step": 9831 }, { "crossentropy": 2.5569337606430054, "epoch": 0.534652926942005, "grad_norm": 0.034761786460876465, "grad_norm_var": 8.384951271501409e-07, "learning_rate": 0.0013665990996937034, "loss": 2.5569, "step": 9832 }, { "crossentropy": 2.592353582382202, "epoch": 0.5347073057995052, "grad_norm": 0.03211516514420509, "grad_norm_var": 8.546713997441366e-07, "learning_rate": 0.0013657438027387182, "loss": 2.5924, "step": 9833 }, { "crossentropy": 2.506598711013794, "epoch": 0.5347616846570054, "grad_norm": 0.03221222013235092, "grad_norm_var": 8.594552113808887e-07, "learning_rate": 0.0013648887311769164, "loss": 2.5066, "step": 9834 }, { "crossentropy": 2.6046801805496216, "epoch": 0.5348160635145056, "grad_norm": 0.03269355371594429, "grad_norm_var": 6.725570240945569e-07, "learning_rate": 0.0013640338850613254, "loss": 2.6047, "step": 9835 }, { "crossentropy": 2.5953729152679443, "epoch": 0.5348704423720058, "grad_norm": 0.031979843974113464, "grad_norm_var": 7.001031662214052e-07, "learning_rate": 0.001363179264444963, "loss": 2.5954, "step": 9836 }, { "crossentropy": 2.4690524339675903, "epoch": 0.534924821229506, "grad_norm": 0.03212761506438255, "grad_norm_var": 7.162928044266228e-07, "learning_rate": 0.0013623248693808321, "loss": 2.4691, "step": 9837 }, { "crossentropy": 2.4995769262313843, "epoch": 0.5349792000870062, "grad_norm": 0.04456643387675285, "grad_norm_var": 9.65494808715054e-06, "learning_rate": 0.0013614706999219212, "loss": 2.4996, "step": 9838 }, { "crossentropy": 2.5623754262924194, "epoch": 0.5350335789445064, "grad_norm": 0.03218492493033409, "grad_norm_var": 9.598610967291992e-06, "learning_rate": 0.001360616756121207, "loss": 2.5624, "step": 9839 }, { "crossentropy": 2.6898564100265503, "epoch": 0.5350879578020066, "grad_norm": 0.03297344967722893, "grad_norm_var": 9.544433511721184e-06, "learning_rate": 0.0013597630380316455, "loss": 2.6899, "step": 9840 }, { "crossentropy": 2.4972339868545532, "epoch": 0.5351423366595068, "grad_norm": 0.03276769071817398, "grad_norm_var": 9.39528194684406e-06, "learning_rate": 0.0013589095457061885, "loss": 2.4972, "step": 9841 }, { "crossentropy": 2.58042049407959, "epoch": 0.535196715517007, "grad_norm": 0.03148701786994934, "grad_norm_var": 9.441192061994338e-06, "learning_rate": 0.001358056279197768, "loss": 2.5804, "step": 9842 }, { "crossentropy": 2.557324767112732, "epoch": 0.5352510943745072, "grad_norm": 0.0329999178647995, "grad_norm_var": 9.444094016957995e-06, "learning_rate": 0.0013572032385592997, "loss": 2.5573, "step": 9843 }, { "crossentropy": 2.5231122970581055, "epoch": 0.5353054732320074, "grad_norm": 0.0319172739982605, "grad_norm_var": 9.505641401031753e-06, "learning_rate": 0.0013563504238436896, "loss": 2.5231, "step": 9844 }, { "crossentropy": 2.6098737716674805, "epoch": 0.5353598520895076, "grad_norm": 0.0337015800178051, "grad_norm_var": 9.497281548421071e-06, "learning_rate": 0.0013554978351038316, "loss": 2.6099, "step": 9845 }, { "crossentropy": 2.431278705596924, "epoch": 0.5354142309470078, "grad_norm": 0.03192119672894478, "grad_norm_var": 9.633441005147774e-06, "learning_rate": 0.0013546454723925995, "loss": 2.4313, "step": 9846 }, { "crossentropy": 2.6220898628234863, "epoch": 0.5354686098045081, "grad_norm": 0.031688839197158813, "grad_norm_var": 9.779416001063496e-06, "learning_rate": 0.0013537933357628557, "loss": 2.6221, "step": 9847 }, { "crossentropy": 2.6413882970809937, "epoch": 0.5355229886620082, "grad_norm": 0.032326530665159225, "grad_norm_var": 9.661191322687425e-06, "learning_rate": 0.0013529414252674505, "loss": 2.6414, "step": 9848 }, { "crossentropy": 2.6042628288269043, "epoch": 0.5355773675195085, "grad_norm": 0.03109959326684475, "grad_norm_var": 9.859544361623728e-06, "learning_rate": 0.0013520897409592175, "loss": 2.6043, "step": 9849 }, { "crossentropy": 2.530107259750366, "epoch": 0.5356317463770086, "grad_norm": 0.031667791306972504, "grad_norm_var": 9.93819334719422e-06, "learning_rate": 0.0013512382828909798, "loss": 2.5301, "step": 9850 }, { "crossentropy": 2.542561650276184, "epoch": 0.5356861252345089, "grad_norm": 0.032929059118032455, "grad_norm_var": 9.931834499387925e-06, "learning_rate": 0.0013503870511155387, "loss": 2.5426, "step": 9851 }, { "crossentropy": 2.5309152603149414, "epoch": 0.535740504092009, "grad_norm": 0.034334827214479446, "grad_norm_var": 9.951481331985769e-06, "learning_rate": 0.001349536045685692, "loss": 2.5309, "step": 9852 }, { "crossentropy": 2.5643545389175415, "epoch": 0.5357948829495093, "grad_norm": 0.031158076599240303, "grad_norm_var": 1.0144770451181682e-05, "learning_rate": 0.0013486852666542177, "loss": 2.5644, "step": 9853 }, { "crossentropy": 2.4965118169784546, "epoch": 0.5358492618070094, "grad_norm": 0.03303757309913635, "grad_norm_var": 8.378782378683109e-07, "learning_rate": 0.0013478347140738773, "loss": 2.4965, "step": 9854 }, { "crossentropy": 2.635343551635742, "epoch": 0.5359036406645097, "grad_norm": 0.03283314034342766, "grad_norm_var": 8.466565449897929e-07, "learning_rate": 0.001346984387997422, "loss": 2.6353, "step": 9855 }, { "crossentropy": 2.5298746824264526, "epoch": 0.5359580195220098, "grad_norm": 0.03085586614906788, "grad_norm_var": 9.72833453220786e-07, "learning_rate": 0.0013461342884775918, "loss": 2.5299, "step": 9856 }, { "crossentropy": 2.512735962867737, "epoch": 0.5360123983795101, "grad_norm": 0.03185134381055832, "grad_norm_var": 9.676066446225822e-07, "learning_rate": 0.0013452844155671052, "loss": 2.5127, "step": 9857 }, { "crossentropy": 2.5455793142318726, "epoch": 0.5360667772370102, "grad_norm": 0.03235772252082825, "grad_norm_var": 9.277932648740165e-07, "learning_rate": 0.001344434769318672, "loss": 2.5456, "step": 9858 }, { "crossentropy": 2.451973795890808, "epoch": 0.5361211560945105, "grad_norm": 0.030651740729808807, "grad_norm_var": 1.050935089967952e-06, "learning_rate": 0.0013435853497849866, "loss": 2.452, "step": 9859 }, { "crossentropy": 2.422284722328186, "epoch": 0.5361755349520106, "grad_norm": 0.031353697180747986, "grad_norm_var": 1.0879554973735322e-06, "learning_rate": 0.0013427361570187285, "loss": 2.4223, "step": 9860 }, { "crossentropy": 2.50519859790802, "epoch": 0.5362299138095109, "grad_norm": 0.03204847127199173, "grad_norm_var": 9.080643862803637e-07, "learning_rate": 0.0013418871910725644, "loss": 2.5052, "step": 9861 }, { "crossentropy": 2.5309441089630127, "epoch": 0.536284292667011, "grad_norm": 0.03211801499128342, "grad_norm_var": 9.082280987732314e-07, "learning_rate": 0.001341038451999146, "loss": 2.5309, "step": 9862 }, { "crossentropy": 2.6022220849990845, "epoch": 0.5363386715245113, "grad_norm": 0.03264480084180832, "grad_norm_var": 9.231956860362381e-07, "learning_rate": 0.001340189939851112, "loss": 2.6022, "step": 9863 }, { "crossentropy": 2.4724299907684326, "epoch": 0.5363930503820115, "grad_norm": 0.029757514595985413, "grad_norm_var": 1.2509888414973056e-06, "learning_rate": 0.001339341654681087, "loss": 2.4724, "step": 9864 }, { "crossentropy": 2.5957071781158447, "epoch": 0.5364474292395117, "grad_norm": 0.0297621488571167, "grad_norm_var": 1.5088545193928104e-06, "learning_rate": 0.0013384935965416783, "loss": 2.5957, "step": 9865 }, { "crossentropy": 2.6070550680160522, "epoch": 0.5365018080970119, "grad_norm": 0.03337811678647995, "grad_norm_var": 1.6535240160121348e-06, "learning_rate": 0.0013376457654854818, "loss": 2.6071, "step": 9866 }, { "crossentropy": 2.50738263130188, "epoch": 0.5365561869545121, "grad_norm": 0.0327322743833065, "grad_norm_var": 1.6300460454000502e-06, "learning_rate": 0.0013367981615650837, "loss": 2.5074, "step": 9867 }, { "crossentropy": 2.4535235166549683, "epoch": 0.5366105658120123, "grad_norm": 0.032168470323085785, "grad_norm_var": 1.2286520785451464e-06, "learning_rate": 0.0013359507848330465, "loss": 2.4535, "step": 9868 }, { "crossentropy": 2.4623074531555176, "epoch": 0.5366649446695125, "grad_norm": 0.031130652874708176, "grad_norm_var": 1.2310254704592094e-06, "learning_rate": 0.0013351036353419254, "loss": 2.4623, "step": 9869 }, { "crossentropy": 2.5512664318084717, "epoch": 0.5367193235270127, "grad_norm": 0.0312921367585659, "grad_norm_var": 1.1316978042994469e-06, "learning_rate": 0.0013342567131442607, "loss": 2.5513, "step": 9870 }, { "crossentropy": 2.5862146615982056, "epoch": 0.5367737023845129, "grad_norm": 0.03204996511340141, "grad_norm_var": 1.0499844409991478e-06, "learning_rate": 0.0013334100182925768, "loss": 2.5862, "step": 9871 }, { "crossentropy": 2.580047607421875, "epoch": 0.5368280812420131, "grad_norm": 0.03156930208206177, "grad_norm_var": 1.0077234136501393e-06, "learning_rate": 0.001332563550839385, "loss": 2.58, "step": 9872 }, { "crossentropy": 2.527826428413391, "epoch": 0.5368824600995133, "grad_norm": 0.03190078213810921, "grad_norm_var": 1.009011247189197e-06, "learning_rate": 0.0013317173108371833, "loss": 2.5278, "step": 9873 }, { "crossentropy": 2.4892395734786987, "epoch": 0.5369368389570135, "grad_norm": 0.03230556473135948, "grad_norm_var": 1.0044837052858302e-06, "learning_rate": 0.0013308712983384535, "loss": 2.4892, "step": 9874 }, { "crossentropy": 2.663829803466797, "epoch": 0.5369912178145138, "grad_norm": 0.03334549441933632, "grad_norm_var": 1.0890529954283093e-06, "learning_rate": 0.001330025513395665, "loss": 2.6638, "step": 9875 }, { "crossentropy": 2.4168413877487183, "epoch": 0.5370455966720139, "grad_norm": 0.030819855630397797, "grad_norm_var": 1.1420014649575367e-06, "learning_rate": 0.0013291799560612744, "loss": 2.4168, "step": 9876 }, { "crossentropy": 2.4304096698760986, "epoch": 0.5370999755295142, "grad_norm": 0.031598299741744995, "grad_norm_var": 1.1405920976568945e-06, "learning_rate": 0.0013283346263877178, "loss": 2.4304, "step": 9877 }, { "crossentropy": 2.414063572883606, "epoch": 0.5371543543870143, "grad_norm": 0.032921336591243744, "grad_norm_var": 1.2165043660865177e-06, "learning_rate": 0.001327489524427427, "loss": 2.4141, "step": 9878 }, { "crossentropy": 2.576021909713745, "epoch": 0.5372087332445146, "grad_norm": 0.030823487788438797, "grad_norm_var": 1.2274284545585786e-06, "learning_rate": 0.001326644650232811, "loss": 2.576, "step": 9879 }, { "crossentropy": 2.5022202730178833, "epoch": 0.5372631121020147, "grad_norm": 0.033782701939344406, "grad_norm_var": 1.1856247303755503e-06, "learning_rate": 0.001325800003856269, "loss": 2.5022, "step": 9880 }, { "crossentropy": 2.498100996017456, "epoch": 0.537317490959515, "grad_norm": 0.0326680950820446, "grad_norm_var": 8.564872897215414e-07, "learning_rate": 0.0013249555853501855, "loss": 2.4981, "step": 9881 }, { "crossentropy": 2.5010701417922974, "epoch": 0.5373718698170151, "grad_norm": 0.03148988261818886, "grad_norm_var": 7.714919543173766e-07, "learning_rate": 0.0013241113947669299, "loss": 2.5011, "step": 9882 }, { "crossentropy": 2.6460174322128296, "epoch": 0.5374262486745154, "grad_norm": 0.03132222220301628, "grad_norm_var": 7.651150336901794e-07, "learning_rate": 0.001323267432158859, "loss": 2.646, "step": 9883 }, { "crossentropy": 2.4589611291885376, "epoch": 0.5374806275320155, "grad_norm": 0.030306128785014153, "grad_norm_var": 9.274535825660405e-07, "learning_rate": 0.0013224236975783138, "loss": 2.459, "step": 9884 }, { "crossentropy": 2.53317129611969, "epoch": 0.5375350063895158, "grad_norm": 0.03195647895336151, "grad_norm_var": 8.927567275809525e-07, "learning_rate": 0.0013215801910776226, "loss": 2.5332, "step": 9885 }, { "crossentropy": 2.4304698705673218, "epoch": 0.5375893852470159, "grad_norm": 0.03120499476790428, "grad_norm_var": 9.001137710165265e-07, "learning_rate": 0.0013207369127090985, "loss": 2.4305, "step": 9886 }, { "crossentropy": 2.519347667694092, "epoch": 0.5376437641045162, "grad_norm": 0.031017957255244255, "grad_norm_var": 9.431589022587209e-07, "learning_rate": 0.0013198938625250422, "loss": 2.5193, "step": 9887 }, { "crossentropy": 2.517946481704712, "epoch": 0.5376981429620163, "grad_norm": 0.03293893113732338, "grad_norm_var": 1.0156176037223163e-06, "learning_rate": 0.0013190510405777344, "loss": 2.5179, "step": 9888 }, { "crossentropy": 2.624971389770508, "epoch": 0.5377525218195166, "grad_norm": 0.03120366670191288, "grad_norm_var": 1.0459308862076673e-06, "learning_rate": 0.001318208446919451, "loss": 2.625, "step": 9889 }, { "crossentropy": 2.435276985168457, "epoch": 0.5378069006770168, "grad_norm": 0.03153754398226738, "grad_norm_var": 1.036818436901662e-06, "learning_rate": 0.0013173660816024491, "loss": 2.4353, "step": 9890 }, { "crossentropy": 2.4121341705322266, "epoch": 0.537861279534517, "grad_norm": 0.03280220180749893, "grad_norm_var": 9.439328828731435e-07, "learning_rate": 0.0013165239446789672, "loss": 2.4121, "step": 9891 }, { "crossentropy": 2.5943639278411865, "epoch": 0.5379156583920172, "grad_norm": 0.03294815868139267, "grad_norm_var": 9.561028386782887e-07, "learning_rate": 0.0013156820362012368, "loss": 2.5944, "step": 9892 }, { "crossentropy": 2.5638628005981445, "epoch": 0.5379700372495174, "grad_norm": 0.030839750543236732, "grad_norm_var": 1.0233508241484317e-06, "learning_rate": 0.0013148403562214717, "loss": 2.5639, "step": 9893 }, { "crossentropy": 2.558592438697815, "epoch": 0.5380244161070176, "grad_norm": 0.03174887225031853, "grad_norm_var": 9.433851989819215e-07, "learning_rate": 0.0013139989047918717, "loss": 2.5586, "step": 9894 }, { "crossentropy": 2.409327983856201, "epoch": 0.5380787949645178, "grad_norm": 0.0353328213095665, "grad_norm_var": 1.6349941391646342e-06, "learning_rate": 0.0013131576819646236, "loss": 2.4093, "step": 9895 }, { "crossentropy": 2.4719914197921753, "epoch": 0.538133173822018, "grad_norm": 0.032007455825805664, "grad_norm_var": 1.4262773121076836e-06, "learning_rate": 0.0013123166877918979, "loss": 2.472, "step": 9896 }, { "crossentropy": 2.5565359592437744, "epoch": 0.5381875526795182, "grad_norm": 0.03236132115125656, "grad_norm_var": 1.403106792340301e-06, "learning_rate": 0.0013114759223258542, "loss": 2.5565, "step": 9897 }, { "crossentropy": 2.5469616651535034, "epoch": 0.5382419315370184, "grad_norm": 0.03245767205953598, "grad_norm_var": 1.403737103343403e-06, "learning_rate": 0.0013106353856186359, "loss": 2.547, "step": 9898 }, { "crossentropy": 2.604836940765381, "epoch": 0.5382963103945186, "grad_norm": 0.031130626797676086, "grad_norm_var": 1.423323882003095e-06, "learning_rate": 0.0013097950777223688, "loss": 2.6048, "step": 9899 }, { "crossentropy": 2.5083508491516113, "epoch": 0.5383506892520188, "grad_norm": 0.03123835287988186, "grad_norm_var": 1.268692447502449e-06, "learning_rate": 0.001308954998689172, "loss": 2.5084, "step": 9900 }, { "crossentropy": 2.5842026472091675, "epoch": 0.538405068109519, "grad_norm": 0.031501058489084244, "grad_norm_var": 1.2870565047714532e-06, "learning_rate": 0.0013081151485711462, "loss": 2.5842, "step": 9901 }, { "crossentropy": 2.5689207315444946, "epoch": 0.5384594469670192, "grad_norm": 0.032959505915641785, "grad_norm_var": 1.2895035251916006e-06, "learning_rate": 0.0013072755274203757, "loss": 2.5689, "step": 9902 }, { "crossentropy": 2.546513795852661, "epoch": 0.5385138258245195, "grad_norm": 0.03354734554886818, "grad_norm_var": 1.315468359957281e-06, "learning_rate": 0.0013064361352889326, "loss": 2.5465, "step": 9903 }, { "crossentropy": 2.552956461906433, "epoch": 0.5385682046820196, "grad_norm": 0.032273996621370316, "grad_norm_var": 1.2850996688604782e-06, "learning_rate": 0.00130559697222888, "loss": 2.553, "step": 9904 }, { "crossentropy": 2.4441094398498535, "epoch": 0.5386225835395199, "grad_norm": 0.03139270842075348, "grad_norm_var": 1.2611325337034846e-06, "learning_rate": 0.0013047580382922574, "loss": 2.4441, "step": 9905 }, { "crossentropy": 2.653258800506592, "epoch": 0.53867696239702, "grad_norm": 0.03433775156736374, "grad_norm_var": 1.4833492678910375e-06, "learning_rate": 0.0013039193335310961, "loss": 2.6533, "step": 9906 }, { "crossentropy": 2.518749237060547, "epoch": 0.5387313412545203, "grad_norm": 0.032902177423238754, "grad_norm_var": 1.4889357771371547e-06, "learning_rate": 0.001303080857997412, "loss": 2.5187, "step": 9907 }, { "crossentropy": 2.6080949306488037, "epoch": 0.5387857201120204, "grad_norm": 0.033264875411987305, "grad_norm_var": 1.5168235797182716e-06, "learning_rate": 0.0013022426117432063, "loss": 2.6081, "step": 9908 }, { "crossentropy": 2.5511595010757446, "epoch": 0.5388400989695207, "grad_norm": 0.03258368745446205, "grad_norm_var": 1.3310833018482926e-06, "learning_rate": 0.0013014045948204684, "loss": 2.5512, "step": 9909 }, { "crossentropy": 2.5999741554260254, "epoch": 0.5388944778270208, "grad_norm": 0.03147031366825104, "grad_norm_var": 1.3662454324512985e-06, "learning_rate": 0.0013005668072811655, "loss": 2.6, "step": 9910 }, { "crossentropy": 2.507293701171875, "epoch": 0.5389488566845211, "grad_norm": 0.035454634577035904, "grad_norm_var": 1.4124096864613026e-06, "learning_rate": 0.0012997292491772623, "loss": 2.5073, "step": 9911 }, { "crossentropy": 2.5336265563964844, "epoch": 0.5390032355420212, "grad_norm": 0.03214288502931595, "grad_norm_var": 1.4036649420150292e-06, "learning_rate": 0.0012988919205607031, "loss": 2.5336, "step": 9912 }, { "crossentropy": 2.549621105194092, "epoch": 0.5390576143995215, "grad_norm": 0.032649893313646317, "grad_norm_var": 1.401083463518438e-06, "learning_rate": 0.0012980548214834142, "loss": 2.5496, "step": 9913 }, { "crossentropy": 2.5738481283187866, "epoch": 0.5391119932570216, "grad_norm": 0.03307345509529114, "grad_norm_var": 1.4145980565886575e-06, "learning_rate": 0.0012972179519973126, "loss": 2.5738, "step": 9914 }, { "crossentropy": 2.5885133743286133, "epoch": 0.5391663721145219, "grad_norm": 0.03214051574468613, "grad_norm_var": 1.2777659111439746e-06, "learning_rate": 0.0012963813121543049, "loss": 2.5885, "step": 9915 }, { "crossentropy": 2.5902689695358276, "epoch": 0.539220750972022, "grad_norm": 0.03173360973596573, "grad_norm_var": 1.1976784009622038e-06, "learning_rate": 0.0012955449020062737, "loss": 2.5903, "step": 9916 }, { "crossentropy": 2.582340955734253, "epoch": 0.5392751298295223, "grad_norm": 0.032602228224277496, "grad_norm_var": 1.0953365559023913e-06, "learning_rate": 0.001294708721605094, "loss": 2.5823, "step": 9917 }, { "crossentropy": 2.6141902208328247, "epoch": 0.5393295086870225, "grad_norm": 0.03525928780436516, "grad_norm_var": 1.4799918472520209e-06, "learning_rate": 0.001293872771002625, "loss": 2.6142, "step": 9918 }, { "crossentropy": 2.642594814300537, "epoch": 0.5393838875445227, "grad_norm": 0.03250778838992119, "grad_norm_var": 1.4615268479349514e-06, "learning_rate": 0.001293037050250711, "loss": 2.6426, "step": 9919 }, { "crossentropy": 2.565388560295105, "epoch": 0.5394382664020229, "grad_norm": 0.03135205805301666, "grad_norm_var": 1.586913580209117e-06, "learning_rate": 0.0012922015594011855, "loss": 2.5654, "step": 9920 }, { "crossentropy": 2.4343544244766235, "epoch": 0.5394926452595231, "grad_norm": 0.03179978206753731, "grad_norm_var": 1.5206573140177348e-06, "learning_rate": 0.0012913662985058589, "loss": 2.4344, "step": 9921 }, { "crossentropy": 2.452388286590576, "epoch": 0.5395470241170233, "grad_norm": 0.03262413293123245, "grad_norm_var": 1.359620834556705e-06, "learning_rate": 0.0012905312676165387, "loss": 2.4524, "step": 9922 }, { "crossentropy": 2.4809162616729736, "epoch": 0.5396014029745235, "grad_norm": 0.03149804100394249, "grad_norm_var": 1.4492223986000673e-06, "learning_rate": 0.0012896964667850125, "loss": 2.4809, "step": 9923 }, { "crossentropy": 2.5313644409179688, "epoch": 0.5396557818320237, "grad_norm": 0.030814792960882187, "grad_norm_var": 1.6185802706141875e-06, "learning_rate": 0.0012888618960630505, "loss": 2.5314, "step": 9924 }, { "crossentropy": 2.622729539871216, "epoch": 0.5397101606895239, "grad_norm": 0.03284187242388725, "grad_norm_var": 1.6262575738538547e-06, "learning_rate": 0.0012880275555024123, "loss": 2.6227, "step": 9925 }, { "crossentropy": 2.479408025741577, "epoch": 0.5397645395470241, "grad_norm": 0.0332183875143528, "grad_norm_var": 1.577752625526024e-06, "learning_rate": 0.0012871934451548477, "loss": 2.4794, "step": 9926 }, { "crossentropy": 2.5751075744628906, "epoch": 0.5398189184045243, "grad_norm": 0.0313953161239624, "grad_norm_var": 1.0664171949975598e-06, "learning_rate": 0.0012863595650720828, "loss": 2.5751, "step": 9927 }, { "crossentropy": 2.6008743047714233, "epoch": 0.5398732972620245, "grad_norm": 0.03254593908786774, "grad_norm_var": 1.0652584800483386e-06, "learning_rate": 0.0012855259153058353, "loss": 2.6009, "step": 9928 }, { "crossentropy": 2.518204092979431, "epoch": 0.5399276761195247, "grad_norm": 0.030426686629652977, "grad_norm_var": 1.29374590853933e-06, "learning_rate": 0.001284692495907807, "loss": 2.5182, "step": 9929 }, { "crossentropy": 2.6659047603607178, "epoch": 0.5399820549770249, "grad_norm": 0.033040665090084076, "grad_norm_var": 1.290167572789745e-06, "learning_rate": 0.0012838593069296872, "loss": 2.6659, "step": 9930 }, { "crossentropy": 2.5285857915878296, "epoch": 0.5400364338345252, "grad_norm": 0.030869653448462486, "grad_norm_var": 1.4075562662377306e-06, "learning_rate": 0.0012830263484231498, "loss": 2.5286, "step": 9931 }, { "crossentropy": 2.513686180114746, "epoch": 0.5400908126920253, "grad_norm": 0.031195955350995064, "grad_norm_var": 1.4560566955006213e-06, "learning_rate": 0.00128219362043985, "loss": 2.5137, "step": 9932 }, { "crossentropy": 2.4709030389785767, "epoch": 0.5401451915495256, "grad_norm": 0.03121008351445198, "grad_norm_var": 1.4885171114169348e-06, "learning_rate": 0.001281361123031437, "loss": 2.4709, "step": 9933 }, { "crossentropy": 2.6435388326644897, "epoch": 0.5401995704070257, "grad_norm": 0.03110833466053009, "grad_norm_var": 7.823011358493258e-07, "learning_rate": 0.0012805288562495422, "loss": 2.6435, "step": 9934 }, { "crossentropy": 2.6109111309051514, "epoch": 0.540253949264526, "grad_norm": 0.031613294035196304, "grad_norm_var": 7.45280866993681e-07, "learning_rate": 0.0012796968201457782, "loss": 2.6109, "step": 9935 }, { "crossentropy": 2.486551523208618, "epoch": 0.5403083281220261, "grad_norm": 0.032337650656700134, "grad_norm_var": 7.573533783106993e-07, "learning_rate": 0.001278865014771749, "loss": 2.4866, "step": 9936 }, { "crossentropy": 2.5719552040100098, "epoch": 0.5403627069795264, "grad_norm": 0.031114570796489716, "grad_norm_var": 7.852366745792477e-07, "learning_rate": 0.0012780334401790421, "loss": 2.572, "step": 9937 }, { "crossentropy": 2.450224757194519, "epoch": 0.5404170858370265, "grad_norm": 0.032292429357767105, "grad_norm_var": 7.530532075921178e-07, "learning_rate": 0.0012772020964192316, "loss": 2.4502, "step": 9938 }, { "crossentropy": 2.6588598489761353, "epoch": 0.5404714646945268, "grad_norm": 0.03218904882669449, "grad_norm_var": 7.624252434929354e-07, "learning_rate": 0.0012763709835438763, "loss": 2.6589, "step": 9939 }, { "crossentropy": 2.46760356426239, "epoch": 0.5405258435520269, "grad_norm": 0.03271365538239479, "grad_norm_var": 7.47605809991641e-07, "learning_rate": 0.0012755401016045216, "loss": 2.4676, "step": 9940 }, { "crossentropy": 2.5631935596466064, "epoch": 0.5405802224095272, "grad_norm": 0.031426262110471725, "grad_norm_var": 6.916970057850204e-07, "learning_rate": 0.0012747094506526968, "loss": 2.5632, "step": 9941 }, { "crossentropy": 2.5295172929763794, "epoch": 0.5406346012670273, "grad_norm": 0.03205995261669159, "grad_norm_var": 5.55503619252999e-07, "learning_rate": 0.001273879030739919, "loss": 2.5295, "step": 9942 }, { "crossentropy": 2.469259738922119, "epoch": 0.5406889801245276, "grad_norm": 0.032315444201231, "grad_norm_var": 5.684354126657265e-07, "learning_rate": 0.001273048841917691, "loss": 2.4693, "step": 9943 }, { "crossentropy": 2.670642375946045, "epoch": 0.5407433589820277, "grad_norm": 0.031293053179979324, "grad_norm_var": 5.383791461114103e-07, "learning_rate": 0.0012722188842374965, "loss": 2.6706, "step": 9944 }, { "crossentropy": 2.5379897356033325, "epoch": 0.540797737839528, "grad_norm": 0.033344779163599014, "grad_norm_var": 5.749997537464006e-07, "learning_rate": 0.0012713891577508147, "loss": 2.538, "step": 9945 }, { "crossentropy": 2.519742488861084, "epoch": 0.5408521166970282, "grad_norm": 0.0326598584651947, "grad_norm_var": 5.252735189478312e-07, "learning_rate": 0.0012705596625090993, "loss": 2.5197, "step": 9946 }, { "crossentropy": 2.47042715549469, "epoch": 0.5409064955545284, "grad_norm": 0.031277116388082504, "grad_norm_var": 4.819004521779454e-07, "learning_rate": 0.0012697303985637965, "loss": 2.4704, "step": 9947 }, { "crossentropy": 2.5204769372940063, "epoch": 0.5409608744120286, "grad_norm": 0.03367283195257187, "grad_norm_var": 6.379513444495476e-07, "learning_rate": 0.0012689013659663367, "loss": 2.5205, "step": 9948 }, { "crossentropy": 2.7287429571151733, "epoch": 0.5410152532695288, "grad_norm": 0.03266485780477524, "grad_norm_var": 6.093865857701349e-07, "learning_rate": 0.0012680725647681362, "loss": 2.7287, "step": 9949 }, { "crossentropy": 2.5221484899520874, "epoch": 0.541069632127029, "grad_norm": 0.03284702077507973, "grad_norm_var": 5.614330551836332e-07, "learning_rate": 0.0012672439950205956, "loss": 2.5221, "step": 9950 }, { "crossentropy": 2.6024938821792603, "epoch": 0.5411240109845292, "grad_norm": 0.031133998185396194, "grad_norm_var": 6.157685890544861e-07, "learning_rate": 0.0012664156567751023, "loss": 2.6025, "step": 9951 }, { "crossentropy": 2.58814537525177, "epoch": 0.5411783898420294, "grad_norm": 0.032875653356313705, "grad_norm_var": 6.43094198498941e-07, "learning_rate": 0.0012655875500830293, "loss": 2.5881, "step": 9952 }, { "crossentropy": 2.5320483446121216, "epoch": 0.5412327686995296, "grad_norm": 0.031393010169267654, "grad_norm_var": 6.060638397197814e-07, "learning_rate": 0.0012647596749957347, "loss": 2.532, "step": 9953 }, { "crossentropy": 2.5436002016067505, "epoch": 0.5412871475570298, "grad_norm": 0.03213511034846306, "grad_norm_var": 6.069290874289104e-07, "learning_rate": 0.001263932031564564, "loss": 2.5436, "step": 9954 }, { "crossentropy": 2.546096086502075, "epoch": 0.54134152641453, "grad_norm": 0.034469373524188995, "grad_norm_var": 9.133584272897135e-07, "learning_rate": 0.001263104619840843, "loss": 2.5461, "step": 9955 }, { "crossentropy": 2.527728319168091, "epoch": 0.5413959052720302, "grad_norm": 0.030970251187682152, "grad_norm_var": 1.0286994767928163e-06, "learning_rate": 0.0012622774398758914, "loss": 2.5277, "step": 9956 }, { "crossentropy": 2.56202495098114, "epoch": 0.5414502841295304, "grad_norm": 0.031392887234687805, "grad_norm_var": 1.0325845045816429e-06, "learning_rate": 0.001261450491721009, "loss": 2.562, "step": 9957 }, { "crossentropy": 2.526582717895508, "epoch": 0.5415046629870306, "grad_norm": 0.03167002275586128, "grad_norm_var": 1.0536096222473935e-06, "learning_rate": 0.0012606237754274807, "loss": 2.5266, "step": 9958 }, { "crossentropy": 2.527743101119995, "epoch": 0.5415590418445309, "grad_norm": 0.032883722335100174, "grad_norm_var": 1.0782062367484564e-06, "learning_rate": 0.0012597972910465794, "loss": 2.5277, "step": 9959 }, { "crossentropy": 2.558882236480713, "epoch": 0.541613420702031, "grad_norm": 0.03424186632037163, "grad_norm_var": 1.2286301737758719e-06, "learning_rate": 0.0012589710386295633, "loss": 2.5589, "step": 9960 }, { "crossentropy": 2.531691789627075, "epoch": 0.5416677995595313, "grad_norm": 0.03129808232188225, "grad_norm_var": 1.2536360277895826e-06, "learning_rate": 0.0012581450182276755, "loss": 2.5317, "step": 9961 }, { "crossentropy": 2.4339241981506348, "epoch": 0.5417221784170314, "grad_norm": 0.03244710713624954, "grad_norm_var": 1.2476498483433475e-06, "learning_rate": 0.0012573192298921448, "loss": 2.4339, "step": 9962 }, { "crossentropy": 2.5384881496429443, "epoch": 0.5417765572745317, "grad_norm": 0.0328398235142231, "grad_norm_var": 1.1796884168147377e-06, "learning_rate": 0.0012564936736741866, "loss": 2.5385, "step": 9963 }, { "crossentropy": 2.480794310569763, "epoch": 0.5418309361320318, "grad_norm": 0.031000832095742226, "grad_norm_var": 1.1843712436838497e-06, "learning_rate": 0.0012556683496250003, "loss": 2.4808, "step": 9964 }, { "crossentropy": 2.612048864364624, "epoch": 0.5418853149895321, "grad_norm": 0.03399581089615822, "grad_norm_var": 1.365782967184479e-06, "learning_rate": 0.0012548432577957742, "loss": 2.612, "step": 9965 }, { "crossentropy": 2.601175308227539, "epoch": 0.5419396938470322, "grad_norm": 0.030948977917432785, "grad_norm_var": 1.4650753019831408e-06, "learning_rate": 0.0012540183982376741, "loss": 2.6012, "step": 9966 }, { "crossentropy": 2.576050639152527, "epoch": 0.5419940727045325, "grad_norm": 0.03276056423783302, "grad_norm_var": 1.3925126595222431e-06, "learning_rate": 0.0012531937710018625, "loss": 2.5761, "step": 9967 }, { "crossentropy": 2.560228943824768, "epoch": 0.5420484515620326, "grad_norm": 0.03185388818383217, "grad_norm_var": 1.3837925787328503e-06, "learning_rate": 0.001252369376139481, "loss": 2.5602, "step": 9968 }, { "crossentropy": 2.5697124004364014, "epoch": 0.5421028304195329, "grad_norm": 0.031565651297569275, "grad_norm_var": 1.365494979988178e-06, "learning_rate": 0.0012515452137016564, "loss": 2.5697, "step": 9969 }, { "crossentropy": 2.5714482069015503, "epoch": 0.542157209277033, "grad_norm": 0.031725719571113586, "grad_norm_var": 1.3838583280407004e-06, "learning_rate": 0.0012507212837395004, "loss": 2.5714, "step": 9970 }, { "crossentropy": 2.326101541519165, "epoch": 0.5422115881345333, "grad_norm": 0.03163241967558861, "grad_norm_var": 1.0489029226341355e-06, "learning_rate": 0.0012498975863041183, "loss": 2.3261, "step": 9971 }, { "crossentropy": 2.5695470571517944, "epoch": 0.5422659669920334, "grad_norm": 0.030868401750922203, "grad_norm_var": 1.064577108291868e-06, "learning_rate": 0.0012490741214465906, "loss": 2.5695, "step": 9972 }, { "crossentropy": 2.5707632303237915, "epoch": 0.5423203458495337, "grad_norm": 0.034332387149333954, "grad_norm_var": 1.3390937686932428e-06, "learning_rate": 0.0012482508892179883, "loss": 2.5708, "step": 9973 }, { "crossentropy": 2.6080663204193115, "epoch": 0.5423747247070339, "grad_norm": 0.031227413564920425, "grad_norm_var": 1.3858055697815267e-06, "learning_rate": 0.0012474278896693685, "loss": 2.6081, "step": 9974 }, { "crossentropy": 2.4753142595291138, "epoch": 0.5424291035645341, "grad_norm": 0.03387788310647011, "grad_norm_var": 1.5347067878370793e-06, "learning_rate": 0.0012466051228517722, "loss": 2.4753, "step": 9975 }, { "crossentropy": 2.6216018199920654, "epoch": 0.5424834824220343, "grad_norm": 0.03285225108265877, "grad_norm_var": 1.2934821110694808e-06, "learning_rate": 0.0012457825888162288, "loss": 2.6216, "step": 9976 }, { "crossentropy": 2.5826855897903442, "epoch": 0.5425378612795345, "grad_norm": 0.03312480077147484, "grad_norm_var": 1.2819508293816322e-06, "learning_rate": 0.0012449602876137456, "loss": 2.5827, "step": 9977 }, { "crossentropy": 2.5920509099960327, "epoch": 0.5425922401370348, "grad_norm": 0.03097846917808056, "grad_norm_var": 1.391058421104059e-06, "learning_rate": 0.0012441382192953266, "loss": 2.5921, "step": 9978 }, { "crossentropy": 2.649571180343628, "epoch": 0.5426466189945349, "grad_norm": 0.03228839486837387, "grad_norm_var": 1.3647912674081731e-06, "learning_rate": 0.001243316383911955, "loss": 2.6496, "step": 9979 }, { "crossentropy": 2.562945604324341, "epoch": 0.5427009978520352, "grad_norm": 0.03187503293156624, "grad_norm_var": 1.273990608784079e-06, "learning_rate": 0.001242494781514597, "loss": 2.5629, "step": 9980 }, { "crossentropy": 2.47413969039917, "epoch": 0.5427553767095353, "grad_norm": 0.031373485922813416, "grad_norm_var": 1.0913572376392937e-06, "learning_rate": 0.0012416734121542083, "loss": 2.4741, "step": 9981 }, { "crossentropy": 2.5913665294647217, "epoch": 0.5428097555670356, "grad_norm": 0.032770343124866486, "grad_norm_var": 1.0239385624795256e-06, "learning_rate": 0.0012408522758817337, "loss": 2.5914, "step": 9982 }, { "crossentropy": 2.539888858795166, "epoch": 0.5428641344245357, "grad_norm": 0.03163965418934822, "grad_norm_var": 1.0178193648434248e-06, "learning_rate": 0.001240031372748095, "loss": 2.5399, "step": 9983 }, { "crossentropy": 2.4236929416656494, "epoch": 0.542918513282036, "grad_norm": 0.03223438188433647, "grad_norm_var": 1.0131574207154654e-06, "learning_rate": 0.001239210702804205, "loss": 2.4237, "step": 9984 }, { "crossentropy": 2.510231137275696, "epoch": 0.5429728921395361, "grad_norm": 0.033806148916482925, "grad_norm_var": 1.1529544428005856e-06, "learning_rate": 0.0012383902661009612, "loss": 2.5102, "step": 9985 }, { "crossentropy": 2.5109797716140747, "epoch": 0.5430272709970364, "grad_norm": 0.03284171596169472, "grad_norm_var": 1.1471354365438201e-06, "learning_rate": 0.0012375700626892456, "loss": 2.511, "step": 9986 }, { "crossentropy": 2.504401683807373, "epoch": 0.5430816498545366, "grad_norm": 0.03113040328025818, "grad_norm_var": 1.2114336639709733e-06, "learning_rate": 0.0012367500926199289, "loss": 2.5044, "step": 9987 }, { "crossentropy": 2.600279927253723, "epoch": 0.5431360287120368, "grad_norm": 0.034738630056381226, "grad_norm_var": 1.3952686254908025e-06, "learning_rate": 0.00123593035594386, "loss": 2.6003, "step": 9988 }, { "crossentropy": 2.6349772214889526, "epoch": 0.543190407569537, "grad_norm": 0.03280116617679596, "grad_norm_var": 1.1816295988444398e-06, "learning_rate": 0.0012351108527118826, "loss": 2.635, "step": 9989 }, { "crossentropy": 2.520642876625061, "epoch": 0.5432447864270372, "grad_norm": 0.030463550239801407, "grad_norm_var": 1.3449087768813932e-06, "learning_rate": 0.0012342915829748218, "loss": 2.5206, "step": 9990 }, { "crossentropy": 2.551109790802002, "epoch": 0.5432991652845374, "grad_norm": 0.03141544759273529, "grad_norm_var": 1.2467899217468663e-06, "learning_rate": 0.0012334725467834846, "loss": 2.5511, "step": 9991 }, { "crossentropy": 2.540061593055725, "epoch": 0.5433535441420376, "grad_norm": 0.03169860690832138, "grad_norm_var": 1.2405428439695668e-06, "learning_rate": 0.001232653744188667, "loss": 2.5401, "step": 9992 }, { "crossentropy": 2.4948806762695312, "epoch": 0.5434079229995378, "grad_norm": 0.03183140605688095, "grad_norm_var": 1.1853998227828125e-06, "learning_rate": 0.001231835175241155, "loss": 2.4949, "step": 9993 }, { "crossentropy": 2.4678194522857666, "epoch": 0.543462301857038, "grad_norm": 0.031106818467378616, "grad_norm_var": 1.1669295998295929e-06, "learning_rate": 0.0012310168399917104, "loss": 2.4678, "step": 9994 }, { "crossentropy": 2.5250091552734375, "epoch": 0.5435166807145382, "grad_norm": 0.03187103942036629, "grad_norm_var": 1.1687765178301317e-06, "learning_rate": 0.0012301987384910884, "loss": 2.525, "step": 9995 }, { "crossentropy": 2.413507580757141, "epoch": 0.5435710595720384, "grad_norm": 0.030462536960840225, "grad_norm_var": 1.335816219731149e-06, "learning_rate": 0.0012293808707900256, "loss": 2.4135, "step": 9996 }, { "crossentropy": 2.5249452590942383, "epoch": 0.5436254384295386, "grad_norm": 0.03201844543218613, "grad_norm_var": 1.3069415755867046e-06, "learning_rate": 0.0012285632369392458, "loss": 2.5249, "step": 9997 }, { "crossentropy": 2.5438212156295776, "epoch": 0.5436798172870388, "grad_norm": 0.03142194822430611, "grad_norm_var": 1.2914099363875789e-06, "learning_rate": 0.001227745836989458, "loss": 2.5438, "step": 9998 }, { "crossentropy": 2.5476903915405273, "epoch": 0.543734196144539, "grad_norm": 0.032240130007267, "grad_norm_var": 1.2876876663345711e-06, "learning_rate": 0.0012269286709913564, "loss": 2.5477, "step": 9999 }, { "crossentropy": 2.5978163480758667, "epoch": 0.5437885750020393, "grad_norm": 0.03245710954070091, "grad_norm_var": 1.2975956912536038e-06, "learning_rate": 0.0012261117389956212, "loss": 2.5978, "step": 10000 }, { "crossentropy": 2.4417518377304077, "epoch": 0.5438429538595394, "grad_norm": 0.032077375799417496, "grad_norm_var": 1.0724592819604897e-06, "learning_rate": 0.001225295041052919, "loss": 2.4418, "step": 10001 }, { "crossentropy": 2.565797448158264, "epoch": 0.5438973327170397, "grad_norm": 0.03153206408023834, "grad_norm_var": 1.0171402833223167e-06, "learning_rate": 0.0012244785772138973, "loss": 2.5658, "step": 10002 }, { "crossentropy": 2.501757860183716, "epoch": 0.5439517115745398, "grad_norm": 0.03283290937542915, "grad_norm_var": 1.039678200233605e-06, "learning_rate": 0.0012236623475291935, "loss": 2.5018, "step": 10003 }, { "crossentropy": 2.5536351203918457, "epoch": 0.5440060904320401, "grad_norm": 0.03128020465373993, "grad_norm_var": 4.946676488849778e-07, "learning_rate": 0.0012228463520494327, "loss": 2.5536, "step": 10004 }, { "crossentropy": 2.3761266469955444, "epoch": 0.5440604692895402, "grad_norm": 0.03215985372662544, "grad_norm_var": 4.278746551626573e-07, "learning_rate": 0.0012220305908252182, "loss": 2.3761, "step": 10005 }, { "crossentropy": 2.5528879165649414, "epoch": 0.5441148481470405, "grad_norm": 0.03250657394528389, "grad_norm_var": 3.575611875934632e-07, "learning_rate": 0.001221215063907145, "loss": 2.5529, "step": 10006 }, { "crossentropy": 2.4838305711746216, "epoch": 0.5441692270045406, "grad_norm": 0.02993372268974781, "grad_norm_var": 5.721426800807713e-07, "learning_rate": 0.0012203997713457898, "loss": 2.4838, "step": 10007 }, { "crossentropy": 2.5874757766723633, "epoch": 0.5442236058620409, "grad_norm": 0.031850624829530716, "grad_norm_var": 5.732664721243372e-07, "learning_rate": 0.0012195847131917176, "loss": 2.5875, "step": 10008 }, { "crossentropy": 2.4958853721618652, "epoch": 0.544277984719541, "grad_norm": 0.032618582248687744, "grad_norm_var": 6.232754813086423e-07, "learning_rate": 0.0012187698894954768, "loss": 2.4959, "step": 10009 }, { "crossentropy": 2.538087248802185, "epoch": 0.5443323635770413, "grad_norm": 0.03128303587436676, "grad_norm_var": 6.095610482786806e-07, "learning_rate": 0.001217955300307602, "loss": 2.5381, "step": 10010 }, { "crossentropy": 2.6101315021514893, "epoch": 0.5443867424345414, "grad_norm": 0.032594192773103714, "grad_norm_var": 6.506248537943426e-07, "learning_rate": 0.0012171409456786136, "loss": 2.6101, "step": 10011 }, { "crossentropy": 2.5508522987365723, "epoch": 0.5444411212920417, "grad_norm": 0.03224444389343262, "grad_norm_var": 5.243408601098097e-07, "learning_rate": 0.001216326825659017, "loss": 2.5509, "step": 10012 }, { "crossentropy": 2.582515597343445, "epoch": 0.5444955001495418, "grad_norm": 0.0319303534924984, "grad_norm_var": 5.2391271840034e-07, "learning_rate": 0.0012155129402993048, "loss": 2.5825, "step": 10013 }, { "crossentropy": 2.485482335090637, "epoch": 0.5445498790070421, "grad_norm": 0.031526897102594376, "grad_norm_var": 5.17419149130777e-07, "learning_rate": 0.0012146992896499486, "loss": 2.4855, "step": 10014 }, { "crossentropy": 2.5977131128311157, "epoch": 0.5446042578645423, "grad_norm": 0.032746657729148865, "grad_norm_var": 5.536061818269539e-07, "learning_rate": 0.0012138858737614161, "loss": 2.5977, "step": 10015 }, { "crossentropy": 2.531848192214966, "epoch": 0.5446586367220425, "grad_norm": 0.032423388212919235, "grad_norm_var": 5.51502465289177e-07, "learning_rate": 0.0012130726926841513, "loss": 2.5318, "step": 10016 }, { "crossentropy": 2.560384511947632, "epoch": 0.5447130155795427, "grad_norm": 0.03240029513835907, "grad_norm_var": 5.625867453476804e-07, "learning_rate": 0.0012122597464685868, "loss": 2.5604, "step": 10017 }, { "crossentropy": 2.399757981300354, "epoch": 0.5447673944370429, "grad_norm": 0.032728467136621475, "grad_norm_var": 5.787606036226998e-07, "learning_rate": 0.001211447035165142, "loss": 2.3998, "step": 10018 }, { "crossentropy": 2.5077834129333496, "epoch": 0.5448217732945431, "grad_norm": 0.0323234386742115, "grad_norm_var": 5.429052574884199e-07, "learning_rate": 0.0012106345588242197, "loss": 2.5078, "step": 10019 }, { "crossentropy": 2.552794098854065, "epoch": 0.5448761521520433, "grad_norm": 0.030919482931494713, "grad_norm_var": 5.873127184663832e-07, "learning_rate": 0.0012098223174962092, "loss": 2.5528, "step": 10020 }, { "crossentropy": 2.5666158199310303, "epoch": 0.5449305310095435, "grad_norm": 0.03224360942840576, "grad_norm_var": 5.89403691420252e-07, "learning_rate": 0.0012090103112314854, "loss": 2.5666, "step": 10021 }, { "crossentropy": 2.6228461265563965, "epoch": 0.5449849098670437, "grad_norm": 0.03258825093507767, "grad_norm_var": 5.951510251562056e-07, "learning_rate": 0.0012081985400804074, "loss": 2.6228, "step": 10022 }, { "crossentropy": 2.497936487197876, "epoch": 0.5450392887245439, "grad_norm": 0.032721295952796936, "grad_norm_var": 3.045678703733247e-07, "learning_rate": 0.001207387004093321, "loss": 2.4979, "step": 10023 }, { "crossentropy": 2.5611188411712646, "epoch": 0.5450936675820441, "grad_norm": 0.03164728730916977, "grad_norm_var": 3.1652759107240966e-07, "learning_rate": 0.0012065757033205577, "loss": 2.5611, "step": 10024 }, { "crossentropy": 2.5300140380859375, "epoch": 0.5451480464395443, "grad_norm": 0.03245744854211807, "grad_norm_var": 3.088077633484615e-07, "learning_rate": 0.0012057646378124305, "loss": 2.53, "step": 10025 }, { "crossentropy": 2.561172604560852, "epoch": 0.5452024252970445, "grad_norm": 0.032128043472766876, "grad_norm_var": 2.530906793723886e-07, "learning_rate": 0.001204953807619244, "loss": 2.5612, "step": 10026 }, { "crossentropy": 2.4836173057556152, "epoch": 0.5452568041545447, "grad_norm": 0.032576411962509155, "grad_norm_var": 2.522386561839279e-07, "learning_rate": 0.001204143212791286, "loss": 2.4836, "step": 10027 }, { "crossentropy": 2.562575578689575, "epoch": 0.545311183012045, "grad_norm": 0.03273634985089302, "grad_norm_var": 2.686134883980584e-07, "learning_rate": 0.0012033328533788256, "loss": 2.5626, "step": 10028 }, { "crossentropy": 2.5487762689590454, "epoch": 0.5453655618695451, "grad_norm": 0.030975306406617165, "grad_norm_var": 3.671017253690026e-07, "learning_rate": 0.0012025227294321223, "loss": 2.5488, "step": 10029 }, { "crossentropy": 2.467354416847229, "epoch": 0.5454199407270454, "grad_norm": 0.03200941160321236, "grad_norm_var": 3.3857941510306623e-07, "learning_rate": 0.0012017128410014182, "loss": 2.4674, "step": 10030 }, { "crossentropy": 2.55991792678833, "epoch": 0.5454743195845455, "grad_norm": 0.04002102091908455, "grad_norm_var": 4.150291308608241e-06, "learning_rate": 0.001200903188136943, "loss": 2.5599, "step": 10031 }, { "crossentropy": 2.586857318878174, "epoch": 0.5455286984420458, "grad_norm": 0.03226259723305702, "grad_norm_var": 4.157434755292187e-06, "learning_rate": 0.0012000937708889104, "loss": 2.5869, "step": 10032 }, { "crossentropy": 2.4898195266723633, "epoch": 0.5455830772995459, "grad_norm": 0.030374396592378616, "grad_norm_var": 4.4871199355501e-06, "learning_rate": 0.001199284589307519, "loss": 2.4898, "step": 10033 }, { "crossentropy": 2.5345892906188965, "epoch": 0.5456374561570462, "grad_norm": 0.03257782757282257, "grad_norm_var": 4.484844200427357e-06, "learning_rate": 0.0011984756434429544, "loss": 2.5346, "step": 10034 }, { "crossentropy": 2.4877899885177612, "epoch": 0.5456918350145463, "grad_norm": 0.03398711234331131, "grad_norm_var": 4.61087291440624e-06, "learning_rate": 0.001197666933345387, "loss": 2.4878, "step": 10035 }, { "crossentropy": 2.4996243715286255, "epoch": 0.5457462138720466, "grad_norm": 0.03182433173060417, "grad_norm_var": 4.454577173676e-06, "learning_rate": 0.0011968584590649695, "loss": 2.4996, "step": 10036 }, { "crossentropy": 2.5642699003219604, "epoch": 0.5458005927295467, "grad_norm": 0.03274966776371002, "grad_norm_var": 4.440080654075302e-06, "learning_rate": 0.0011960502206518453, "loss": 2.5643, "step": 10037 }, { "crossentropy": 2.745266318321228, "epoch": 0.545854971587047, "grad_norm": 0.033961933106184006, "grad_norm_var": 4.532550882808137e-06, "learning_rate": 0.0011952422181561422, "loss": 2.7453, "step": 10038 }, { "crossentropy": 2.3823635578155518, "epoch": 0.5459093504445471, "grad_norm": 0.03236197680234909, "grad_norm_var": 4.545021056154734e-06, "learning_rate": 0.0011944344516279675, "loss": 2.3824, "step": 10039 }, { "crossentropy": 2.5743842124938965, "epoch": 0.5459637293020474, "grad_norm": 0.030661318451166153, "grad_norm_var": 4.756094744366824e-06, "learning_rate": 0.0011936269211174195, "loss": 2.5744, "step": 10040 }, { "crossentropy": 2.6546040773391724, "epoch": 0.5460181081595475, "grad_norm": 0.03156713768839836, "grad_norm_var": 4.837879508695385e-06, "learning_rate": 0.0011928196266745832, "loss": 2.6546, "step": 10041 }, { "crossentropy": 2.5280802249908447, "epoch": 0.5460724870170478, "grad_norm": 0.031768012791872025, "grad_norm_var": 4.872161564249096e-06, "learning_rate": 0.0011920125683495231, "loss": 2.5281, "step": 10042 }, { "crossentropy": 2.524237275123596, "epoch": 0.546126865874548, "grad_norm": 0.030456827953457832, "grad_norm_var": 5.174009782903978e-06, "learning_rate": 0.0011912057461922938, "loss": 2.5242, "step": 10043 }, { "crossentropy": 2.559045672416687, "epoch": 0.5461812447320482, "grad_norm": 0.0324506051838398, "grad_norm_var": 5.170811148046996e-06, "learning_rate": 0.0011903991602529324, "loss": 2.559, "step": 10044 }, { "crossentropy": 2.4682334661483765, "epoch": 0.5462356235895484, "grad_norm": 0.031219424679875374, "grad_norm_var": 5.124889053517191e-06, "learning_rate": 0.001189592810581464, "loss": 2.4682, "step": 10045 }, { "crossentropy": 2.4693065881729126, "epoch": 0.5462900024470486, "grad_norm": 0.030579397454857826, "grad_norm_var": 5.349259737687803e-06, "learning_rate": 0.0011887866972278983, "loss": 2.4693, "step": 10046 }, { "crossentropy": 2.483546018600464, "epoch": 0.5463443813045488, "grad_norm": 0.034369561821222305, "grad_norm_var": 1.622743972234811e-06, "learning_rate": 0.0011879808202422254, "loss": 2.4835, "step": 10047 }, { "crossentropy": 2.5210492610931396, "epoch": 0.546398760162049, "grad_norm": 0.03581821918487549, "grad_norm_var": 2.5026594018962098e-06, "learning_rate": 0.0011871751796744296, "loss": 2.521, "step": 10048 }, { "crossentropy": 2.550099492073059, "epoch": 0.5464531390195492, "grad_norm": 0.03201849386096001, "grad_norm_var": 2.250473019008376e-06, "learning_rate": 0.0011863697755744756, "loss": 2.5501, "step": 10049 }, { "crossentropy": 2.552030324935913, "epoch": 0.5465075178770494, "grad_norm": 0.03182633966207504, "grad_norm_var": 2.2677745662272407e-06, "learning_rate": 0.0011855646079923115, "loss": 2.552, "step": 10050 }, { "crossentropy": 2.609103798866272, "epoch": 0.5465618967345496, "grad_norm": 0.03120812587440014, "grad_norm_var": 2.1443172008148504e-06, "learning_rate": 0.0011847596769778734, "loss": 2.6091, "step": 10051 }, { "crossentropy": 2.497005581855774, "epoch": 0.5466162755920498, "grad_norm": 0.03059248998761177, "grad_norm_var": 2.2971772576956794e-06, "learning_rate": 0.0011839549825810852, "loss": 2.497, "step": 10052 }, { "crossentropy": 2.5028403997421265, "epoch": 0.54667065444955, "grad_norm": 0.03141249343752861, "grad_norm_var": 2.2932064683215774e-06, "learning_rate": 0.0011831505248518503, "loss": 2.5028, "step": 10053 }, { "crossentropy": 2.45103120803833, "epoch": 0.5467250333070502, "grad_norm": 0.03129562735557556, "grad_norm_var": 2.04610039197291e-06, "learning_rate": 0.001182346303840061, "loss": 2.451, "step": 10054 }, { "crossentropy": 2.5764037370681763, "epoch": 0.5467794121645504, "grad_norm": 0.03136663883924484, "grad_norm_var": 2.0401238692128893e-06, "learning_rate": 0.0011815423195955955, "loss": 2.5764, "step": 10055 }, { "crossentropy": 2.5156409740448, "epoch": 0.5468337910220507, "grad_norm": 0.03169787675142288, "grad_norm_var": 1.9515375949646457e-06, "learning_rate": 0.0011807385721683144, "loss": 2.5156, "step": 10056 }, { "crossentropy": 2.551588296890259, "epoch": 0.5468881698795508, "grad_norm": 0.0332048274576664, "grad_norm_var": 2.0567537549430006e-06, "learning_rate": 0.0011799350616080685, "loss": 2.5516, "step": 10057 }, { "crossentropy": 2.4802311658859253, "epoch": 0.5469425487370511, "grad_norm": 0.03229575231671333, "grad_norm_var": 2.0609813391072225e-06, "learning_rate": 0.0011791317879646856, "loss": 2.4802, "step": 10058 }, { "crossentropy": 2.543696165084839, "epoch": 0.5469969275945512, "grad_norm": 0.03142467886209488, "grad_norm_var": 1.921896544054474e-06, "learning_rate": 0.0011783287512879886, "loss": 2.5437, "step": 10059 }, { "crossentropy": 2.5287548303604126, "epoch": 0.5470513064520515, "grad_norm": 0.031917307525873184, "grad_norm_var": 1.9110999428495103e-06, "learning_rate": 0.001177525951627781, "loss": 2.5288, "step": 10060 }, { "crossentropy": 2.557669162750244, "epoch": 0.5471056853095516, "grad_norm": 0.030673682689666748, "grad_norm_var": 1.987638097295831e-06, "learning_rate": 0.001176723389033848, "loss": 2.5577, "step": 10061 }, { "crossentropy": 2.6434956789016724, "epoch": 0.5471600641670519, "grad_norm": 0.03308050334453583, "grad_norm_var": 1.9110863401603965e-06, "learning_rate": 0.0011759210635559653, "loss": 2.6435, "step": 10062 }, { "crossentropy": 2.5277395248413086, "epoch": 0.547214443024552, "grad_norm": 0.03158877417445183, "grad_norm_var": 1.5668604509788079e-06, "learning_rate": 0.0011751189752438956, "loss": 2.5277, "step": 10063 }, { "crossentropy": 2.567347764968872, "epoch": 0.5472688218820523, "grad_norm": 0.03087160736322403, "grad_norm_var": 5.540380954006102e-07, "learning_rate": 0.00117431712414738, "loss": 2.5673, "step": 10064 }, { "crossentropy": 2.4695838689804077, "epoch": 0.5473232007395524, "grad_norm": 0.03194264695048332, "grad_norm_var": 5.507186361214013e-07, "learning_rate": 0.0011735155103161493, "loss": 2.4696, "step": 10065 }, { "crossentropy": 2.646671772003174, "epoch": 0.5473775795970527, "grad_norm": 0.03257971629500389, "grad_norm_var": 6.039094544612085e-07, "learning_rate": 0.0011727141337999191, "loss": 2.6467, "step": 10066 }, { "crossentropy": 2.457634925842285, "epoch": 0.5474319584545528, "grad_norm": 0.032334815710783005, "grad_norm_var": 6.098005263234392e-07, "learning_rate": 0.0011719129946483903, "loss": 2.4576, "step": 10067 }, { "crossentropy": 2.562687873840332, "epoch": 0.5474863373120531, "grad_norm": 0.03512867912650108, "grad_norm_var": 1.1852092871503585e-06, "learning_rate": 0.0011711120929112507, "loss": 2.5627, "step": 10068 }, { "crossentropy": 2.570467710494995, "epoch": 0.5475407161695532, "grad_norm": 0.033169426023960114, "grad_norm_var": 1.2285654229467665e-06, "learning_rate": 0.001170311428638166, "loss": 2.5705, "step": 10069 }, { "crossentropy": 2.541544198989868, "epoch": 0.5475950950270535, "grad_norm": 0.03151770308613777, "grad_norm_var": 1.2060303712465558e-06, "learning_rate": 0.0011695110018787975, "loss": 2.5415, "step": 10070 }, { "crossentropy": 2.6779705286026, "epoch": 0.5476494738845537, "grad_norm": 0.031051529571413994, "grad_norm_var": 1.2461850986900663e-06, "learning_rate": 0.0011687108126827866, "loss": 2.678, "step": 10071 }, { "crossentropy": 2.6132739782333374, "epoch": 0.5477038527420539, "grad_norm": 0.03131553903222084, "grad_norm_var": 1.2786233703595267e-06, "learning_rate": 0.0011679108610997584, "loss": 2.6133, "step": 10072 }, { "crossentropy": 2.506166458129883, "epoch": 0.5477582315995541, "grad_norm": 0.03177785500884056, "grad_norm_var": 1.201593537445084e-06, "learning_rate": 0.0011671111471793245, "loss": 2.5062, "step": 10073 }, { "crossentropy": 2.4123737812042236, "epoch": 0.5478126104570543, "grad_norm": 0.031155340373516083, "grad_norm_var": 1.2442760164351699e-06, "learning_rate": 0.0011663116709710864, "loss": 2.4124, "step": 10074 }, { "crossentropy": 2.468882918357849, "epoch": 0.5478669893145545, "grad_norm": 0.0318211130797863, "grad_norm_var": 1.2252416017313837e-06, "learning_rate": 0.001165512432524623, "loss": 2.4689, "step": 10075 }, { "crossentropy": 2.467267155647278, "epoch": 0.5479213681720547, "grad_norm": 0.03123108483850956, "grad_norm_var": 1.2618172086225598e-06, "learning_rate": 0.0011647134318895037, "loss": 2.4673, "step": 10076 }, { "crossentropy": 2.4791512489318848, "epoch": 0.5479757470295549, "grad_norm": 0.030754247680306435, "grad_norm_var": 1.2484858133371037e-06, "learning_rate": 0.0011639146691152812, "loss": 2.4792, "step": 10077 }, { "crossentropy": 2.6066783666610718, "epoch": 0.5480301258870551, "grad_norm": 0.032318536192178726, "grad_norm_var": 1.1706844066431405e-06, "learning_rate": 0.0011631161442514954, "loss": 2.6067, "step": 10078 }, { "crossentropy": 2.4792646169662476, "epoch": 0.5480845047445553, "grad_norm": 0.03684572875499725, "grad_norm_var": 2.6728123752798543e-06, "learning_rate": 0.001162317857347669, "loss": 2.4793, "step": 10079 }, { "crossentropy": 2.514072299003601, "epoch": 0.5481388836020555, "grad_norm": 0.03217120096087456, "grad_norm_var": 2.5415220227189046e-06, "learning_rate": 0.0011615198084533125, "loss": 2.5141, "step": 10080 }, { "crossentropy": 2.537107467651367, "epoch": 0.5481932624595557, "grad_norm": 0.036612171679735184, "grad_norm_var": 3.669547827683907e-06, "learning_rate": 0.0011607219976179156, "loss": 2.5371, "step": 10081 }, { "crossentropy": 2.557400703430176, "epoch": 0.548247641317056, "grad_norm": 0.03306104615330696, "grad_norm_var": 3.681985179608072e-06, "learning_rate": 0.001159924424890964, "loss": 2.5574, "step": 10082 }, { "crossentropy": 2.582677483558655, "epoch": 0.5483020201745561, "grad_norm": 0.03294236958026886, "grad_norm_var": 3.6802014425614624e-06, "learning_rate": 0.0011591270903219186, "loss": 2.5827, "step": 10083 }, { "crossentropy": 2.4779168367385864, "epoch": 0.5483563990320564, "grad_norm": 0.03349515050649643, "grad_norm_var": 3.313558254816592e-06, "learning_rate": 0.00115832999396023, "loss": 2.4779, "step": 10084 }, { "crossentropy": 2.53153395652771, "epoch": 0.5484107778895565, "grad_norm": 0.031706731766462326, "grad_norm_var": 3.33183502266332e-06, "learning_rate": 0.0011575331358553338, "loss": 2.5315, "step": 10085 }, { "crossentropy": 2.3966223001480103, "epoch": 0.5484651567470568, "grad_norm": 0.033327773213386536, "grad_norm_var": 3.302895426396376e-06, "learning_rate": 0.0011567365160566495, "loss": 2.3966, "step": 10086 }, { "crossentropy": 2.5516302585601807, "epoch": 0.5485195356045569, "grad_norm": 0.03259497880935669, "grad_norm_var": 3.133282240154409e-06, "learning_rate": 0.001155940134613584, "loss": 2.5516, "step": 10087 }, { "crossentropy": 2.512904167175293, "epoch": 0.5485739144620572, "grad_norm": 0.033220794051885605, "grad_norm_var": 3.0095544954454484e-06, "learning_rate": 0.0011551439915755274, "loss": 2.5129, "step": 10088 }, { "crossentropy": 2.5721110105514526, "epoch": 0.5486282933195573, "grad_norm": 0.03227255120873451, "grad_norm_var": 2.956456192219845e-06, "learning_rate": 0.0011543480869918555, "loss": 2.5721, "step": 10089 }, { "crossentropy": 2.5602540969848633, "epoch": 0.5486826721770576, "grad_norm": 0.032322388142347336, "grad_norm_var": 2.7785542058967235e-06, "learning_rate": 0.0011535524209119303, "loss": 2.5603, "step": 10090 }, { "crossentropy": 2.5416029691696167, "epoch": 0.5487370510345577, "grad_norm": 0.032222531735897064, "grad_norm_var": 2.7298841426598787e-06, "learning_rate": 0.0011527569933850994, "loss": 2.5416, "step": 10091 }, { "crossentropy": 2.5273109674453735, "epoch": 0.548791429892058, "grad_norm": 0.03261976316571236, "grad_norm_var": 2.5333070000142182e-06, "learning_rate": 0.00115196180446069, "loss": 2.5273, "step": 10092 }, { "crossentropy": 2.6025742292404175, "epoch": 0.5488458087495581, "grad_norm": 0.031117869541049004, "grad_norm_var": 2.4312115667066927e-06, "learning_rate": 0.0011511668541880238, "loss": 2.6026, "step": 10093 }, { "crossentropy": 2.5947253704071045, "epoch": 0.5489001876070584, "grad_norm": 0.03236396983265877, "grad_norm_var": 2.426889974069937e-06, "learning_rate": 0.0011503721426164022, "loss": 2.5947, "step": 10094 }, { "crossentropy": 2.515032410621643, "epoch": 0.5489545664645585, "grad_norm": 0.03116273321211338, "grad_norm_var": 1.5738640835519581e-06, "learning_rate": 0.0011495776697951093, "loss": 2.515, "step": 10095 }, { "crossentropy": 2.4894580841064453, "epoch": 0.5490089453220588, "grad_norm": 0.03221911936998367, "grad_norm_var": 1.570623433374413e-06, "learning_rate": 0.00114878343577342, "loss": 2.4895, "step": 10096 }, { "crossentropy": 2.545781970024109, "epoch": 0.5490633241795589, "grad_norm": 0.03149854391813278, "grad_norm_var": 5.402020121382407e-07, "learning_rate": 0.001147989440600592, "loss": 2.5458, "step": 10097 }, { "crossentropy": 2.488843083381653, "epoch": 0.5491177030370592, "grad_norm": 0.031072041019797325, "grad_norm_var": 6.07979255864037e-07, "learning_rate": 0.0011471956843258674, "loss": 2.4888, "step": 10098 }, { "crossentropy": 2.501207113265991, "epoch": 0.5491720818945593, "grad_norm": 0.0325041189789772, "grad_norm_var": 5.801075230405974e-07, "learning_rate": 0.0011464021669984742, "loss": 2.5012, "step": 10099 }, { "crossentropy": 2.5248429775238037, "epoch": 0.5492264607520596, "grad_norm": 0.03155519440770149, "grad_norm_var": 4.887407819483773e-07, "learning_rate": 0.0011456088886676258, "loss": 2.5248, "step": 10100 }, { "crossentropy": 2.5866187810897827, "epoch": 0.5492808396095598, "grad_norm": 0.033241551369428635, "grad_norm_var": 5.531744646433083e-07, "learning_rate": 0.0011448158493825211, "loss": 2.5866, "step": 10101 }, { "crossentropy": 2.7013700008392334, "epoch": 0.54933521846706, "grad_norm": 0.03245190531015396, "grad_norm_var": 4.7026305599940676e-07, "learning_rate": 0.0011440230491923443, "loss": 2.7014, "step": 10102 }, { "crossentropy": 2.5862377882003784, "epoch": 0.5493895973245602, "grad_norm": 0.03563234582543373, "grad_norm_var": 1.2260576588237708e-06, "learning_rate": 0.0011432304881462607, "loss": 2.5862, "step": 10103 }, { "crossentropy": 2.59361732006073, "epoch": 0.5494439761820604, "grad_norm": 0.03460173308849335, "grad_norm_var": 1.5069904476453736e-06, "learning_rate": 0.0011424381662934276, "loss": 2.5936, "step": 10104 }, { "crossentropy": 2.4564905166625977, "epoch": 0.5494983550395606, "grad_norm": 0.0317956916987896, "grad_norm_var": 1.5311274352901854e-06, "learning_rate": 0.0011416460836829844, "loss": 2.4565, "step": 10105 }, { "crossentropy": 2.6215603351593018, "epoch": 0.5495527338970608, "grad_norm": 0.03275720402598381, "grad_norm_var": 1.538511438000387e-06, "learning_rate": 0.0011408542403640526, "loss": 2.6216, "step": 10106 }, { "crossentropy": 2.456638813018799, "epoch": 0.549607112754561, "grad_norm": 0.03324287384748459, "grad_norm_var": 1.5758964082897836e-06, "learning_rate": 0.001140062636385742, "loss": 2.4566, "step": 10107 }, { "crossentropy": 2.5442062616348267, "epoch": 0.5496614916120612, "grad_norm": 0.03241224214434624, "grad_norm_var": 1.5749917128239036e-06, "learning_rate": 0.0011392712717971504, "loss": 2.5442, "step": 10108 }, { "crossentropy": 2.6274189949035645, "epoch": 0.5497158704695614, "grad_norm": 0.032452408224344254, "grad_norm_var": 1.44449401661903e-06, "learning_rate": 0.0011384801466473539, "loss": 2.6274, "step": 10109 }, { "crossentropy": 2.576662302017212, "epoch": 0.5497702493270616, "grad_norm": 0.032749243080616, "grad_norm_var": 1.4436894068579782e-06, "learning_rate": 0.0011376892609854183, "loss": 2.5767, "step": 10110 }, { "crossentropy": 2.4425615072250366, "epoch": 0.5498246281845618, "grad_norm": 0.03204529359936714, "grad_norm_var": 1.3250878881865818e-06, "learning_rate": 0.0011368986148603943, "loss": 2.4426, "step": 10111 }, { "crossentropy": 2.495081305503845, "epoch": 0.549879007042062, "grad_norm": 0.033133767545223236, "grad_norm_var": 1.3261112371525658e-06, "learning_rate": 0.0011361082083213164, "loss": 2.4951, "step": 10112 }, { "crossentropy": 2.491029143333435, "epoch": 0.5499333858995622, "grad_norm": 0.032650724053382874, "grad_norm_var": 1.2250256310841149e-06, "learning_rate": 0.001135318041417207, "loss": 2.491, "step": 10113 }, { "crossentropy": 2.580090284347534, "epoch": 0.5499877647570625, "grad_norm": 0.03210201486945152, "grad_norm_var": 1.058333993512496e-06, "learning_rate": 0.0011345281141970664, "loss": 2.5801, "step": 10114 }, { "crossentropy": 2.4695171117782593, "epoch": 0.5500421436145626, "grad_norm": 0.030900975689291954, "grad_norm_var": 1.2892665583241613e-06, "learning_rate": 0.0011337384267098899, "loss": 2.4695, "step": 10115 }, { "crossentropy": 2.6138076782226562, "epoch": 0.5500965224720629, "grad_norm": 0.03327396139502525, "grad_norm_var": 1.2040256580845745e-06, "learning_rate": 0.0011329489790046527, "loss": 2.6138, "step": 10116 }, { "crossentropy": 2.578964114189148, "epoch": 0.550150901329563, "grad_norm": 0.03228261321783066, "grad_norm_var": 1.210188034324886e-06, "learning_rate": 0.0011321597711303139, "loss": 2.579, "step": 10117 }, { "crossentropy": 2.5672460794448853, "epoch": 0.5502052801870633, "grad_norm": 0.031031738966703415, "grad_norm_var": 1.3984282455223867e-06, "learning_rate": 0.0011313708031358183, "loss": 2.5672, "step": 10118 }, { "crossentropy": 2.5563857555389404, "epoch": 0.5502596590445634, "grad_norm": 0.03222035989165306, "grad_norm_var": 7.881716033823615e-07, "learning_rate": 0.0011305820750701012, "loss": 2.5564, "step": 10119 }, { "crossentropy": 2.466510534286499, "epoch": 0.5503140379020637, "grad_norm": 0.032032739371061325, "grad_norm_var": 4.733107838399923e-07, "learning_rate": 0.0011297935869820752, "loss": 2.4665, "step": 10120 }, { "crossentropy": 2.5596699714660645, "epoch": 0.5503684167595638, "grad_norm": 0.030965594574809074, "grad_norm_var": 5.741572972159381e-07, "learning_rate": 0.0011290053389206428, "loss": 2.5597, "step": 10121 }, { "crossentropy": 2.5762100219726562, "epoch": 0.5504227956170641, "grad_norm": 0.03199835121631622, "grad_norm_var": 5.60433988967303e-07, "learning_rate": 0.0011282173309346904, "loss": 2.5762, "step": 10122 }, { "crossentropy": 2.483335018157959, "epoch": 0.5504771744745642, "grad_norm": 0.03146682679653168, "grad_norm_var": 5.149860254996877e-07, "learning_rate": 0.0011274295630730896, "loss": 2.4833, "step": 10123 }, { "crossentropy": 2.4336841106414795, "epoch": 0.5505315533320645, "grad_norm": 0.03121742233633995, "grad_norm_var": 5.556510095996425e-07, "learning_rate": 0.0011266420353846984, "loss": 2.4337, "step": 10124 }, { "crossentropy": 2.6410863399505615, "epoch": 0.5505859321895646, "grad_norm": 0.033766958862543106, "grad_norm_var": 7.37208270899734e-07, "learning_rate": 0.001125854747918354, "loss": 2.6411, "step": 10125 }, { "crossentropy": 2.4009125232696533, "epoch": 0.5506403110470649, "grad_norm": 0.030875060707330704, "grad_norm_var": 7.982295245246203e-07, "learning_rate": 0.0011250677007228877, "loss": 2.4009, "step": 10126 }, { "crossentropy": 2.5580930709838867, "epoch": 0.5506946899045652, "grad_norm": 0.03157937154173851, "grad_norm_var": 8.088452516240759e-07, "learning_rate": 0.0011242808938471116, "loss": 2.5581, "step": 10127 }, { "crossentropy": 2.633308172225952, "epoch": 0.5507490687620653, "grad_norm": 0.030955372378230095, "grad_norm_var": 7.670230427216645e-07, "learning_rate": 0.0011234943273398196, "loss": 2.6333, "step": 10128 }, { "crossentropy": 2.5741323232650757, "epoch": 0.5508034476195656, "grad_norm": 0.034170687198638916, "grad_norm_var": 1.0772376735277418e-06, "learning_rate": 0.0011227080012497937, "loss": 2.5741, "step": 10129 }, { "crossentropy": 2.474242925643921, "epoch": 0.5508578264770657, "grad_norm": 0.031077586114406586, "grad_norm_var": 1.1189919106933347e-06, "learning_rate": 0.001121921915625806, "loss": 2.4742, "step": 10130 }, { "crossentropy": 2.467893123626709, "epoch": 0.550912205334566, "grad_norm": 0.032388683408498764, "grad_norm_var": 1.0663989467467327e-06, "learning_rate": 0.0011211360705166034, "loss": 2.4679, "step": 10131 }, { "crossentropy": 2.5143423080444336, "epoch": 0.5509665841920661, "grad_norm": 0.03269490227103233, "grad_norm_var": 9.856341402189143e-07, "learning_rate": 0.0011203504659709263, "loss": 2.5143, "step": 10132 }, { "crossentropy": 2.5655665397644043, "epoch": 0.5510209630495664, "grad_norm": 0.032719049602746964, "grad_norm_var": 1.0186244229950491e-06, "learning_rate": 0.0011195651020374957, "loss": 2.5656, "step": 10133 }, { "crossentropy": 2.5545907020568848, "epoch": 0.5510753419070665, "grad_norm": 0.03146177530288696, "grad_norm_var": 9.776720295518367e-07, "learning_rate": 0.0011187799787650205, "loss": 2.5546, "step": 10134 }, { "crossentropy": 2.5586684942245483, "epoch": 0.5511297207645668, "grad_norm": 0.03300810232758522, "grad_norm_var": 1.042287160588536e-06, "learning_rate": 0.0011179950962021917, "loss": 2.5587, "step": 10135 }, { "crossentropy": 2.642109513282776, "epoch": 0.5511840996220669, "grad_norm": 0.03155312314629555, "grad_norm_var": 1.0560832244092822e-06, "learning_rate": 0.0011172104543976885, "loss": 2.6421, "step": 10136 }, { "crossentropy": 2.5478497743606567, "epoch": 0.5512384784795672, "grad_norm": 0.033434223383665085, "grad_norm_var": 1.09857164810842e-06, "learning_rate": 0.0011164260534001725, "loss": 2.5478, "step": 10137 }, { "crossentropy": 2.533526659011841, "epoch": 0.5512928573370673, "grad_norm": 0.031232811510562897, "grad_norm_var": 1.1504715714809177e-06, "learning_rate": 0.001115641893258294, "loss": 2.5335, "step": 10138 }, { "crossentropy": 2.714630126953125, "epoch": 0.5513472361945676, "grad_norm": 0.0338631309568882, "grad_norm_var": 1.3070211848857042e-06, "learning_rate": 0.0011148579740206822, "loss": 2.7146, "step": 10139 }, { "crossentropy": 2.491783022880554, "epoch": 0.5514016150520678, "grad_norm": 0.033077798783779144, "grad_norm_var": 1.2672295882120194e-06, "learning_rate": 0.001114074295735955, "loss": 2.4918, "step": 10140 }, { "crossentropy": 2.5612759590148926, "epoch": 0.551455993909568, "grad_norm": 0.03162562847137451, "grad_norm_var": 1.1538688711864766e-06, "learning_rate": 0.0011132908584527202, "loss": 2.5613, "step": 10141 }, { "crossentropy": 2.5851563215255737, "epoch": 0.5515103727670682, "grad_norm": 0.03151639178395271, "grad_norm_var": 1.0635141262895492e-06, "learning_rate": 0.0011125076622195613, "loss": 2.5852, "step": 10142 }, { "crossentropy": 2.533677816390991, "epoch": 0.5515647516245684, "grad_norm": 0.031167728826403618, "grad_norm_var": 1.1121429000059945e-06, "learning_rate": 0.0011117247070850534, "loss": 2.5337, "step": 10143 }, { "crossentropy": 2.5824825763702393, "epoch": 0.5516191304820686, "grad_norm": 0.03183611109852791, "grad_norm_var": 1.0089827295692618e-06, "learning_rate": 0.0011109419930977532, "loss": 2.5825, "step": 10144 }, { "crossentropy": 2.5044260025024414, "epoch": 0.5516735093395688, "grad_norm": 0.0749683827161789, "grad_norm_var": 0.0001152037646750333, "learning_rate": 0.0011101595203062053, "loss": 2.5044, "step": 10145 }, { "crossentropy": 2.6618306636810303, "epoch": 0.551727888197069, "grad_norm": 0.030544061213731766, "grad_norm_var": 0.00011549002518691554, "learning_rate": 0.0011093772887589377, "loss": 2.6618, "step": 10146 }, { "crossentropy": 2.6136852502822876, "epoch": 0.5517822670545692, "grad_norm": 0.031168505549430847, "grad_norm_var": 0.00011597834345937866, "learning_rate": 0.0011085952985044633, "loss": 2.6137, "step": 10147 }, { "crossentropy": 2.487648844718933, "epoch": 0.5518366459120694, "grad_norm": 0.03417674079537392, "grad_norm_var": 0.00011571112473245369, "learning_rate": 0.0011078135495912805, "loss": 2.4876, "step": 10148 }, { "crossentropy": 2.6212615966796875, "epoch": 0.5518910247695696, "grad_norm": 0.03191468119621277, "grad_norm_var": 0.0001159784534545961, "learning_rate": 0.0011070320420678726, "loss": 2.6213, "step": 10149 }, { "crossentropy": 2.6248233318328857, "epoch": 0.5519454036270698, "grad_norm": 0.03394182771444321, "grad_norm_var": 0.0001152641901112072, "learning_rate": 0.0011062507759827095, "loss": 2.6248, "step": 10150 }, { "crossentropy": 2.522218346595764, "epoch": 0.55199978248457, "grad_norm": 0.03267314285039902, "grad_norm_var": 0.0001153574534599073, "learning_rate": 0.0011054697513842404, "loss": 2.5222, "step": 10151 }, { "crossentropy": 2.53041410446167, "epoch": 0.5520541613420702, "grad_norm": 0.03363484516739845, "grad_norm_var": 0.00011469422715456034, "learning_rate": 0.0011046889683209089, "loss": 2.5304, "step": 10152 }, { "crossentropy": 2.6054847240448, "epoch": 0.5521085401995705, "grad_norm": 0.03845411166548729, "grad_norm_var": 0.00011518871644676728, "learning_rate": 0.0011039084268411347, "loss": 2.6055, "step": 10153 }, { "crossentropy": 2.566983222961426, "epoch": 0.5521629190570706, "grad_norm": 0.03159691393375397, "grad_norm_var": 0.00011499653060212977, "learning_rate": 0.001103128126993328, "loss": 2.567, "step": 10154 }, { "crossentropy": 2.6187511682510376, "epoch": 0.5522172979145709, "grad_norm": 0.0314490981400013, "grad_norm_var": 0.00011585059838455473, "learning_rate": 0.0011023480688258814, "loss": 2.6188, "step": 10155 }, { "crossentropy": 2.4670958518981934, "epoch": 0.552271676772071, "grad_norm": 0.03442300856113434, "grad_norm_var": 0.00011557693657474378, "learning_rate": 0.001101568252387174, "loss": 2.4671, "step": 10156 }, { "crossentropy": 2.502099871635437, "epoch": 0.5523260556295713, "grad_norm": 0.03244797885417938, "grad_norm_var": 0.00011521432462546197, "learning_rate": 0.0011007886777255687, "loss": 2.5021, "step": 10157 }, { "crossentropy": 2.5498801469802856, "epoch": 0.5523804344870714, "grad_norm": 0.034879300743341446, "grad_norm_var": 0.00011419341708616183, "learning_rate": 0.0011000093448894149, "loss": 2.5499, "step": 10158 }, { "crossentropy": 2.5322265625, "epoch": 0.5524348133445717, "grad_norm": 0.03159615769982338, "grad_norm_var": 0.00011395285584016937, "learning_rate": 0.0010992302539270455, "loss": 2.5322, "step": 10159 }, { "crossentropy": 2.588396668434143, "epoch": 0.5524891922020718, "grad_norm": 0.03331100195646286, "grad_norm_var": 0.00011334734666365257, "learning_rate": 0.001098451404886779, "loss": 2.5884, "step": 10160 }, { "crossentropy": 2.5057793855667114, "epoch": 0.5525435710595721, "grad_norm": 0.03344658389687538, "grad_norm_var": 3.694894017950793e-06, "learning_rate": 0.0010976727978169198, "loss": 2.5058, "step": 10161 }, { "crossentropy": 2.3721463680267334, "epoch": 0.5525979499170722, "grad_norm": 0.031522445380687714, "grad_norm_var": 3.420823343845171e-06, "learning_rate": 0.001096894432765753, "loss": 2.3721, "step": 10162 }, { "crossentropy": 2.4747555255889893, "epoch": 0.5526523287745725, "grad_norm": 0.031015994027256966, "grad_norm_var": 3.462870887091388e-06, "learning_rate": 0.0010961163097815579, "loss": 2.4748, "step": 10163 }, { "crossentropy": 2.5274271965026855, "epoch": 0.5527067076320726, "grad_norm": 0.032170675694942474, "grad_norm_var": 3.4411630578734154e-06, "learning_rate": 0.001095338428912588, "loss": 2.5274, "step": 10164 }, { "crossentropy": 2.546507239341736, "epoch": 0.5527610864895729, "grad_norm": 0.03353697061538696, "grad_norm_var": 3.364432855174866e-06, "learning_rate": 0.0010945607902070886, "loss": 2.5465, "step": 10165 }, { "crossentropy": 2.5819932222366333, "epoch": 0.552815465347073, "grad_norm": 0.031516268849372864, "grad_norm_var": 3.4699952985081053e-06, "learning_rate": 0.001093783393713288, "loss": 2.582, "step": 10166 }, { "crossentropy": 2.5648257732391357, "epoch": 0.5528698442045733, "grad_norm": 0.031602274626493454, "grad_norm_var": 3.585432431024585e-06, "learning_rate": 0.0010930062394793989, "loss": 2.5648, "step": 10167 }, { "crossentropy": 2.5372633934020996, "epoch": 0.5529242230620734, "grad_norm": 0.03352690115571022, "grad_norm_var": 3.5757675624903944e-06, "learning_rate": 0.001092229327553621, "loss": 2.5373, "step": 10168 }, { "crossentropy": 2.524200677871704, "epoch": 0.5529786019195737, "grad_norm": 0.03237325698137283, "grad_norm_var": 1.3884998052051626e-06, "learning_rate": 0.0010914526579841367, "loss": 2.5242, "step": 10169 }, { "crossentropy": 2.574341297149658, "epoch": 0.5530329807770739, "grad_norm": 0.034819021821022034, "grad_norm_var": 1.6382561541611648e-06, "learning_rate": 0.0010906762308191147, "loss": 2.5743, "step": 10170 }, { "crossentropy": 2.514135003089905, "epoch": 0.5530873596345741, "grad_norm": 0.031760651618242264, "grad_norm_var": 1.5912252933303658e-06, "learning_rate": 0.0010899000461067083, "loss": 2.5141, "step": 10171 }, { "crossentropy": 2.6065906286239624, "epoch": 0.5531417384920743, "grad_norm": 0.03311259299516678, "grad_norm_var": 1.4056756062087351e-06, "learning_rate": 0.0010891241038950567, "loss": 2.6066, "step": 10172 }, { "crossentropy": 2.5907517671585083, "epoch": 0.5531961173495745, "grad_norm": 0.031389907002449036, "grad_norm_var": 1.5062449313333794e-06, "learning_rate": 0.0010883484042322794, "loss": 2.5908, "step": 10173 }, { "crossentropy": 2.6059125661849976, "epoch": 0.5532504962070747, "grad_norm": 0.030841458588838577, "grad_norm_var": 1.2974551572003291e-06, "learning_rate": 0.001087572947166489, "loss": 2.6059, "step": 10174 }, { "crossentropy": 2.4405338764190674, "epoch": 0.5533048750645749, "grad_norm": 0.03107125125825405, "grad_norm_var": 1.3671821455386752e-06, "learning_rate": 0.001086797732745778, "loss": 2.4405, "step": 10175 }, { "crossentropy": 2.5242717266082764, "epoch": 0.5533592539220751, "grad_norm": 0.030353954061865807, "grad_norm_var": 1.5204332318407745e-06, "learning_rate": 0.0010860227610182223, "loss": 2.5243, "step": 10176 }, { "crossentropy": 2.5575865507125854, "epoch": 0.5534136327795753, "grad_norm": 0.0314149409532547, "grad_norm_var": 1.4214276870597726e-06, "learning_rate": 0.0010852480320318852, "loss": 2.5576, "step": 10177 }, { "crossentropy": 2.5597625970840454, "epoch": 0.5534680116370755, "grad_norm": 0.032442606985569, "grad_norm_var": 1.4155369082947578e-06, "learning_rate": 0.001084473545834816, "loss": 2.5598, "step": 10178 }, { "crossentropy": 2.523857355117798, "epoch": 0.5535223904945757, "grad_norm": 0.03428124263882637, "grad_norm_var": 1.6276839686887405e-06, "learning_rate": 0.001083699302475047, "loss": 2.5239, "step": 10179 }, { "crossentropy": 2.505889892578125, "epoch": 0.5535767693520759, "grad_norm": 0.03311021998524666, "grad_norm_var": 1.6712429445166095e-06, "learning_rate": 0.0010829253020005957, "loss": 2.5059, "step": 10180 }, { "crossentropy": 2.5321052074432373, "epoch": 0.5536311482095762, "grad_norm": 0.032136667519807816, "grad_norm_var": 1.5669701155881603e-06, "learning_rate": 0.001082151544459466, "loss": 2.5321, "step": 10181 }, { "crossentropy": 2.579238772392273, "epoch": 0.5536855270670763, "grad_norm": 0.03980361670255661, "grad_norm_var": 5.065763616450558e-06, "learning_rate": 0.0010813780298996444, "loss": 2.5792, "step": 10182 }, { "crossentropy": 2.412826657295227, "epoch": 0.5537399059245766, "grad_norm": 0.030755696818232536, "grad_norm_var": 5.240395012529259e-06, "learning_rate": 0.0010806047583691053, "loss": 2.4128, "step": 10183 }, { "crossentropy": 2.5245933532714844, "epoch": 0.5537942847820767, "grad_norm": 0.03124074637889862, "grad_norm_var": 5.3148804007974025e-06, "learning_rate": 0.0010798317299158028, "loss": 2.5246, "step": 10184 }, { "crossentropy": 2.545851469039917, "epoch": 0.553848663639577, "grad_norm": 0.03098485618829727, "grad_norm_var": 5.46932525526815e-06, "learning_rate": 0.0010790589445876825, "loss": 2.5459, "step": 10185 }, { "crossentropy": 2.5315818786621094, "epoch": 0.5539030424970771, "grad_norm": 0.031794533133506775, "grad_norm_var": 5.093753009388621e-06, "learning_rate": 0.0010782864024326728, "loss": 2.5316, "step": 10186 }, { "crossentropy": 2.488891363143921, "epoch": 0.5539574213545774, "grad_norm": 0.033057939261198044, "grad_norm_var": 5.108943603206285e-06, "learning_rate": 0.0010775141034986824, "loss": 2.4889, "step": 10187 }, { "crossentropy": 2.6343964338302612, "epoch": 0.5540118002120775, "grad_norm": 0.03243228793144226, "grad_norm_var": 5.069786550645341e-06, "learning_rate": 0.0010767420478336094, "loss": 2.6344, "step": 10188 }, { "crossentropy": 2.6065651178359985, "epoch": 0.5540661790695778, "grad_norm": 0.031300369650125504, "grad_norm_var": 5.081385326345652e-06, "learning_rate": 0.0010759702354853397, "loss": 2.6066, "step": 10189 }, { "crossentropy": 2.5627361536026, "epoch": 0.5541205579270779, "grad_norm": 0.03095472790300846, "grad_norm_var": 5.05994955154257e-06, "learning_rate": 0.001075198666501736, "loss": 2.5627, "step": 10190 }, { "crossentropy": 2.4704174995422363, "epoch": 0.5541749367845782, "grad_norm": 0.03130144253373146, "grad_norm_var": 5.02490445933415e-06, "learning_rate": 0.0010744273409306515, "loss": 2.4704, "step": 10191 }, { "crossentropy": 2.6143778562545776, "epoch": 0.5542293156420783, "grad_norm": 0.030484741553664207, "grad_norm_var": 4.991421033169333e-06, "learning_rate": 0.0010736562588199246, "loss": 2.6144, "step": 10192 }, { "crossentropy": 2.510239005088806, "epoch": 0.5542836944995786, "grad_norm": 0.030639301985502243, "grad_norm_var": 5.1250563436975545e-06, "learning_rate": 0.001072885420217375, "loss": 2.5102, "step": 10193 }, { "crossentropy": 2.5432268381118774, "epoch": 0.5543380733570787, "grad_norm": 0.03201141580939293, "grad_norm_var": 5.128194047225226e-06, "learning_rate": 0.0010721148251708124, "loss": 2.5432, "step": 10194 }, { "crossentropy": 2.6230751276016235, "epoch": 0.554392452214579, "grad_norm": 0.032300785183906555, "grad_norm_var": 4.8417432338231875e-06, "learning_rate": 0.001071344473728023, "loss": 2.6231, "step": 10195 }, { "crossentropy": 2.4927241802215576, "epoch": 0.5544468310720791, "grad_norm": 0.03112432360649109, "grad_norm_var": 4.8324765715766165e-06, "learning_rate": 0.0010705743659367884, "loss": 2.4927, "step": 10196 }, { "crossentropy": 2.5933533906936646, "epoch": 0.5545012099295794, "grad_norm": 0.030709359794855118, "grad_norm_var": 4.937640364249572e-06, "learning_rate": 0.0010698045018448699, "loss": 2.5934, "step": 10197 }, { "crossentropy": 2.617009401321411, "epoch": 0.5545555887870796, "grad_norm": 0.053435277193784714, "grad_norm_var": 3.086042118336378e-05, "learning_rate": 0.00106903488150001, "loss": 2.617, "step": 10198 }, { "crossentropy": 2.512259602546692, "epoch": 0.5546099676445798, "grad_norm": 0.031602147966623306, "grad_norm_var": 3.067640078351942e-05, "learning_rate": 0.0010682655049499413, "loss": 2.5123, "step": 10199 }, { "crossentropy": 2.537665009498596, "epoch": 0.55466434650208, "grad_norm": 0.03293031454086304, "grad_norm_var": 3.049546840960283e-05, "learning_rate": 0.0010674963722423826, "loss": 2.5377, "step": 10200 }, { "crossentropy": 2.4944186210632324, "epoch": 0.5547187253595802, "grad_norm": 0.03296899423003197, "grad_norm_var": 3.022388802851461e-05, "learning_rate": 0.0010667274834250317, "loss": 2.4944, "step": 10201 }, { "crossentropy": 2.6276592016220093, "epoch": 0.5547731042170804, "grad_norm": 0.03399476781487465, "grad_norm_var": 3.015359654383737e-05, "learning_rate": 0.0010659588385455749, "loss": 2.6277, "step": 10202 }, { "crossentropy": 2.5088201761245728, "epoch": 0.5548274830745806, "grad_norm": 0.03193223103880882, "grad_norm_var": 3.0254572387972596e-05, "learning_rate": 0.0010651904376516835, "loss": 2.5088, "step": 10203 }, { "crossentropy": 2.525639533996582, "epoch": 0.5548818619320808, "grad_norm": 0.031797125935554504, "grad_norm_var": 3.033909972946456e-05, "learning_rate": 0.0010644222807910126, "loss": 2.5256, "step": 10204 }, { "crossentropy": 2.4853122234344482, "epoch": 0.554936240789581, "grad_norm": 0.0331156961619854, "grad_norm_var": 3.011117847419795e-05, "learning_rate": 0.0010636543680112042, "loss": 2.4853, "step": 10205 }, { "crossentropy": 2.5271713733673096, "epoch": 0.5549906196470812, "grad_norm": 0.031875308603048325, "grad_norm_var": 2.9887763882904685e-05, "learning_rate": 0.0010628866993598791, "loss": 2.5272, "step": 10206 }, { "crossentropy": 2.526122808456421, "epoch": 0.5550449985045814, "grad_norm": 0.03375379741191864, "grad_norm_var": 2.96219389992801e-05, "learning_rate": 0.001062119274884652, "loss": 2.5261, "step": 10207 }, { "crossentropy": 2.6588655710220337, "epoch": 0.5550993773620816, "grad_norm": 0.032420627772808075, "grad_norm_var": 2.909924040641586e-05, "learning_rate": 0.0010613520946331173, "loss": 2.6589, "step": 10208 }, { "crossentropy": 2.5066596269607544, "epoch": 0.5551537562195819, "grad_norm": 0.032255951315164566, "grad_norm_var": 2.8637717020938795e-05, "learning_rate": 0.0010605851586528525, "loss": 2.5067, "step": 10209 }, { "crossentropy": 2.58783495426178, "epoch": 0.555208135077082, "grad_norm": 0.030702002346515656, "grad_norm_var": 2.902907967146321e-05, "learning_rate": 0.0010598184669914218, "loss": 2.5878, "step": 10210 }, { "crossentropy": 2.5756925344467163, "epoch": 0.5552625139345823, "grad_norm": 0.031846582889556885, "grad_norm_var": 2.911807555873071e-05, "learning_rate": 0.0010590520196963793, "loss": 2.5757, "step": 10211 }, { "crossentropy": 2.482868790626526, "epoch": 0.5553168927920824, "grad_norm": 0.0321350172162056, "grad_norm_var": 2.885786298126297e-05, "learning_rate": 0.0010582858168152554, "loss": 2.4829, "step": 10212 }, { "crossentropy": 2.636616826057434, "epoch": 0.5553712716495827, "grad_norm": 0.032742150127887726, "grad_norm_var": 2.8334766433646177e-05, "learning_rate": 0.0010575198583955697, "loss": 2.6366, "step": 10213 }, { "crossentropy": 2.596203088760376, "epoch": 0.5554256505070828, "grad_norm": 0.03403276950120926, "grad_norm_var": 8.579642065381064e-07, "learning_rate": 0.0010567541444848271, "loss": 2.5962, "step": 10214 }, { "crossentropy": 2.5376909971237183, "epoch": 0.5554800293645831, "grad_norm": 0.032264355570077896, "grad_norm_var": 8.055142719388837e-07, "learning_rate": 0.0010559886751305158, "loss": 2.5377, "step": 10215 }, { "crossentropy": 2.5675368309020996, "epoch": 0.5555344082220832, "grad_norm": 0.03206406906247139, "grad_norm_var": 8.082537798379438e-07, "learning_rate": 0.00105522345038011, "loss": 2.5675, "step": 10216 }, { "crossentropy": 2.5873939990997314, "epoch": 0.5555887870795835, "grad_norm": 0.034243859350681305, "grad_norm_var": 9.906014467042067e-07, "learning_rate": 0.0010544584702810683, "loss": 2.5874, "step": 10217 }, { "crossentropy": 2.637973666191101, "epoch": 0.5556431659370836, "grad_norm": 0.03151744604110718, "grad_norm_var": 9.047197278141036e-07, "learning_rate": 0.001053693734880834, "loss": 2.638, "step": 10218 }, { "crossentropy": 2.4561671018600464, "epoch": 0.5556975447945839, "grad_norm": 0.03246038779616356, "grad_norm_var": 8.878974155449951e-07, "learning_rate": 0.0010529292442268362, "loss": 2.4562, "step": 10219 }, { "crossentropy": 2.4784592390060425, "epoch": 0.555751923652084, "grad_norm": 0.03160939738154411, "grad_norm_var": 9.064842574309113e-07, "learning_rate": 0.0010521649983664856, "loss": 2.4785, "step": 10220 }, { "crossentropy": 2.5311572551727295, "epoch": 0.5558063025095843, "grad_norm": 0.0311563890427351, "grad_norm_var": 9.698855312156717e-07, "learning_rate": 0.001051400997347181, "loss": 2.5312, "step": 10221 }, { "crossentropy": 2.6299595832824707, "epoch": 0.5558606813670844, "grad_norm": 0.03069435991346836, "grad_norm_var": 1.1266789958829769e-06, "learning_rate": 0.0010506372412163057, "loss": 2.63, "step": 10222 }, { "crossentropy": 2.562487840652466, "epoch": 0.5559150602245847, "grad_norm": 0.03318371996283531, "grad_norm_var": 1.032207590099856e-06, "learning_rate": 0.0010498737300212264, "loss": 2.5625, "step": 10223 }, { "crossentropy": 2.4981640577316284, "epoch": 0.5559694390820848, "grad_norm": 0.03309302404522896, "grad_norm_var": 1.0795214885005988e-06, "learning_rate": 0.0010491104638092957, "loss": 2.4982, "step": 10224 }, { "crossentropy": 2.5788369178771973, "epoch": 0.5560238179395851, "grad_norm": 0.03975868225097656, "grad_norm_var": 4.603568057740445e-06, "learning_rate": 0.0010483474426278506, "loss": 2.5788, "step": 10225 }, { "crossentropy": 2.5896044969558716, "epoch": 0.5560781967970853, "grad_norm": 0.033396292477846146, "grad_norm_var": 4.332679660591585e-06, "learning_rate": 0.0010475846665242133, "loss": 2.5896, "step": 10226 }, { "crossentropy": 2.559462547302246, "epoch": 0.5561325756545855, "grad_norm": 0.03296754136681557, "grad_norm_var": 4.25565124436191e-06, "learning_rate": 0.0010468221355456897, "loss": 2.5595, "step": 10227 }, { "crossentropy": 2.5296213626861572, "epoch": 0.5561869545120857, "grad_norm": 0.03360923379659653, "grad_norm_var": 4.2298209883417e-06, "learning_rate": 0.0010460598497395734, "loss": 2.5296, "step": 10228 }, { "crossentropy": 2.5515401363372803, "epoch": 0.5562413333695859, "grad_norm": 0.03380700573325157, "grad_norm_var": 4.257038177599553e-06, "learning_rate": 0.0010452978091531361, "loss": 2.5515, "step": 10229 }, { "crossentropy": 2.549304723739624, "epoch": 0.5562957122270861, "grad_norm": 0.03315586596727371, "grad_norm_var": 4.197927555646812e-06, "learning_rate": 0.001044536013833643, "loss": 2.5493, "step": 10230 }, { "crossentropy": 2.7062872648239136, "epoch": 0.5563500910845863, "grad_norm": 0.03131599724292755, "grad_norm_var": 4.354917434532442e-06, "learning_rate": 0.0010437744638283402, "loss": 2.7063, "step": 10231 }, { "crossentropy": 2.629023313522339, "epoch": 0.5564044699420865, "grad_norm": 0.03296421468257904, "grad_norm_var": 4.292979357453594e-06, "learning_rate": 0.0010430131591844549, "loss": 2.629, "step": 10232 }, { "crossentropy": 2.5621625185012817, "epoch": 0.5564588487995867, "grad_norm": 0.031526871025562286, "grad_norm_var": 4.324883137302114e-06, "learning_rate": 0.0010422520999492052, "loss": 2.5622, "step": 10233 }, { "crossentropy": 2.5197646617889404, "epoch": 0.5565132276570869, "grad_norm": 0.031636301428079605, "grad_norm_var": 4.304038005506957e-06, "learning_rate": 0.00104149128616979, "loss": 2.5198, "step": 10234 }, { "crossentropy": 2.4874101877212524, "epoch": 0.5565676065145871, "grad_norm": 0.031624555587768555, "grad_norm_var": 4.396242981416441e-06, "learning_rate": 0.0010407307178933944, "loss": 2.4874, "step": 10235 }, { "crossentropy": 2.5863393545150757, "epoch": 0.5566219853720873, "grad_norm": 0.032070137560367584, "grad_norm_var": 4.333683900862214e-06, "learning_rate": 0.001039970395167189, "loss": 2.5863, "step": 10236 }, { "crossentropy": 2.4578078985214233, "epoch": 0.5566763642295876, "grad_norm": 0.03073807992041111, "grad_norm_var": 4.44033629384364e-06, "learning_rate": 0.0010392103180383277, "loss": 2.4578, "step": 10237 }, { "crossentropy": 2.5139055252075195, "epoch": 0.5567307430870877, "grad_norm": 0.03480105847120285, "grad_norm_var": 4.316044147487429e-06, "learning_rate": 0.0010384504865539495, "loss": 2.5139, "step": 10238 }, { "crossentropy": 2.5150612592697144, "epoch": 0.556785121944588, "grad_norm": 0.03162838891148567, "grad_norm_var": 4.450503100119014e-06, "learning_rate": 0.00103769090076118, "loss": 2.5151, "step": 10239 }, { "crossentropy": 2.4572386741638184, "epoch": 0.5568395008020881, "grad_norm": 0.032974306493997574, "grad_norm_var": 4.450003741826103e-06, "learning_rate": 0.0010369315607071244, "loss": 2.4572, "step": 10240 }, { "crossentropy": 2.489273190498352, "epoch": 0.5568938796595884, "grad_norm": 0.0371079221367836, "grad_norm_var": 2.4998465449821513e-06, "learning_rate": 0.0010361724664388805, "loss": 2.4893, "step": 10241 }, { "crossentropy": 2.4999457597732544, "epoch": 0.5569482585170885, "grad_norm": 0.032229479402303696, "grad_norm_var": 2.497261968686691e-06, "learning_rate": 0.0010354136180035255, "loss": 2.4999, "step": 10242 }, { "crossentropy": 2.569002866744995, "epoch": 0.5570026373745888, "grad_norm": 0.03318878263235092, "grad_norm_var": 2.5064490336858347e-06, "learning_rate": 0.0010346550154481204, "loss": 2.569, "step": 10243 }, { "crossentropy": 2.4439446926116943, "epoch": 0.5570570162320889, "grad_norm": 0.03315413370728493, "grad_norm_var": 2.468689799929345e-06, "learning_rate": 0.001033896658819713, "loss": 2.4439, "step": 10244 }, { "crossentropy": 2.5041158199310303, "epoch": 0.5571113950895892, "grad_norm": 0.03263198584318161, "grad_norm_var": 2.388628432678262e-06, "learning_rate": 0.00103313854816534, "loss": 2.5041, "step": 10245 }, { "crossentropy": 2.5030676126480103, "epoch": 0.5571657739470893, "grad_norm": 0.03191624954342842, "grad_norm_var": 2.4046540772406825e-06, "learning_rate": 0.001032380683532015, "loss": 2.5031, "step": 10246 }, { "crossentropy": 2.5387978553771973, "epoch": 0.5572201528045896, "grad_norm": 0.0310467891395092, "grad_norm_var": 2.4550668155752146e-06, "learning_rate": 0.0010316230649667403, "loss": 2.5388, "step": 10247 }, { "crossentropy": 2.5316693782806396, "epoch": 0.5572745316620897, "grad_norm": 0.032535579055547714, "grad_norm_var": 2.444445899008963e-06, "learning_rate": 0.0010308656925165033, "loss": 2.5317, "step": 10248 }, { "crossentropy": 2.4674609899520874, "epoch": 0.55732891051959, "grad_norm": 0.030245663598179817, "grad_norm_var": 2.7219313000351533e-06, "learning_rate": 0.0010301085662282761, "loss": 2.4675, "step": 10249 }, { "crossentropy": 2.6594148874282837, "epoch": 0.5573832893770901, "grad_norm": 0.03295363113284111, "grad_norm_var": 2.68385368954917e-06, "learning_rate": 0.001029351686149016, "loss": 2.6594, "step": 10250 }, { "crossentropy": 2.4542516469955444, "epoch": 0.5574376682345904, "grad_norm": 0.03139055520296097, "grad_norm_var": 2.7162410132151755e-06, "learning_rate": 0.0010285950523256592, "loss": 2.4543, "step": 10251 }, { "crossentropy": 2.4830236434936523, "epoch": 0.5574920470920905, "grad_norm": 0.03742571920156479, "grad_norm_var": 4.174580199006681e-06, "learning_rate": 0.0010278386648051368, "loss": 2.483, "step": 10252 }, { "crossentropy": 2.517199993133545, "epoch": 0.5575464259495908, "grad_norm": 0.03198624029755592, "grad_norm_var": 3.9166494917659e-06, "learning_rate": 0.0010270825236343578, "loss": 2.5172, "step": 10253 }, { "crossentropy": 2.5689265727996826, "epoch": 0.557600804807091, "grad_norm": 0.033034369349479675, "grad_norm_var": 3.675933941968398e-06, "learning_rate": 0.0010263266288602158, "loss": 2.5689, "step": 10254 }, { "crossentropy": 2.5193026065826416, "epoch": 0.5576551836645912, "grad_norm": 0.032777026295661926, "grad_norm_var": 3.5727404294086935e-06, "learning_rate": 0.0010255709805295904, "loss": 2.5193, "step": 10255 }, { "crossentropy": 2.395082473754883, "epoch": 0.5577095625220914, "grad_norm": 0.029927635565400124, "grad_norm_var": 4.127731173528733e-06, "learning_rate": 0.0010248155786893498, "loss": 2.3951, "step": 10256 }, { "crossentropy": 2.4941643476486206, "epoch": 0.5577639413795916, "grad_norm": 0.03521876037120819, "grad_norm_var": 3.24602350168073e-06, "learning_rate": 0.00102406042338634, "loss": 2.4942, "step": 10257 }, { "crossentropy": 2.5976929664611816, "epoch": 0.5578183202370918, "grad_norm": 0.03213338926434517, "grad_norm_var": 3.2513978278502404e-06, "learning_rate": 0.0010233055146673953, "loss": 2.5977, "step": 10258 }, { "crossentropy": 2.5309754610061646, "epoch": 0.557872699094592, "grad_norm": 0.03384973481297493, "grad_norm_var": 3.330773521019401e-06, "learning_rate": 0.0010225508525793358, "loss": 2.531, "step": 10259 }, { "crossentropy": 2.5874446630477905, "epoch": 0.5579270779520922, "grad_norm": 0.03303994983434677, "grad_norm_var": 3.3237490263187302e-06, "learning_rate": 0.0010217964371689642, "loss": 2.5874, "step": 10260 }, { "crossentropy": 2.6035670042037964, "epoch": 0.5579814568095924, "grad_norm": 0.03289979696273804, "grad_norm_var": 3.3282283422172524e-06, "learning_rate": 0.0010210422684830695, "loss": 2.6036, "step": 10261 }, { "crossentropy": 2.646269917488098, "epoch": 0.5580358356670926, "grad_norm": 0.031384821981191635, "grad_norm_var": 3.397786912001192e-06, "learning_rate": 0.0010202883465684216, "loss": 2.6463, "step": 10262 }, { "crossentropy": 2.541306495666504, "epoch": 0.5580902145245928, "grad_norm": 0.03220425918698311, "grad_norm_var": 3.2394063261121282e-06, "learning_rate": 0.0010195346714717813, "loss": 2.5413, "step": 10263 }, { "crossentropy": 2.585060477256775, "epoch": 0.558144593382093, "grad_norm": 0.03260481357574463, "grad_norm_var": 3.238299376723206e-06, "learning_rate": 0.0010187812432398908, "loss": 2.5851, "step": 10264 }, { "crossentropy": 2.5094375610351562, "epoch": 0.5581989722395932, "grad_norm": 0.030374212190508842, "grad_norm_var": 3.1973977485298678e-06, "learning_rate": 0.0010180280619194748, "loss": 2.5094, "step": 10265 }, { "crossentropy": 2.5394229888916016, "epoch": 0.5582533510970934, "grad_norm": 0.031202619895339012, "grad_norm_var": 3.3298821972671387e-06, "learning_rate": 0.0010172751275572445, "loss": 2.5394, "step": 10266 }, { "crossentropy": 2.6079704761505127, "epoch": 0.5583077299545937, "grad_norm": 0.031082401052117348, "grad_norm_var": 3.3851346907119603e-06, "learning_rate": 0.0010165224401999008, "loss": 2.608, "step": 10267 }, { "crossentropy": 2.4531362056732178, "epoch": 0.5583621088120938, "grad_norm": 0.033279843628406525, "grad_norm_var": 1.7761310705919145e-06, "learning_rate": 0.0010157699998941195, "loss": 2.4531, "step": 10268 }, { "crossentropy": 2.542572259902954, "epoch": 0.5584164876695941, "grad_norm": 0.03174493461847305, "grad_norm_var": 1.7902672038815995e-06, "learning_rate": 0.001015017806686569, "loss": 2.5426, "step": 10269 }, { "crossentropy": 2.6201566457748413, "epoch": 0.5584708665270942, "grad_norm": 0.0321384072303772, "grad_norm_var": 1.7524006706818215e-06, "learning_rate": 0.0010142658606238986, "loss": 2.6202, "step": 10270 }, { "crossentropy": 2.6436506509780884, "epoch": 0.5585252453845945, "grad_norm": 0.0330582931637764, "grad_norm_var": 1.7774318181969692e-06, "learning_rate": 0.0010135141617527439, "loss": 2.6437, "step": 10271 }, { "crossentropy": 2.548312783241272, "epoch": 0.5585796242420946, "grad_norm": 0.032208431512117386, "grad_norm_var": 1.3935789030684037e-06, "learning_rate": 0.001012762710119724, "loss": 2.5483, "step": 10272 }, { "crossentropy": 2.4919233322143555, "epoch": 0.5586340030995949, "grad_norm": 0.032369498163461685, "grad_norm_var": 8.307064392242773e-07, "learning_rate": 0.0010120115057714436, "loss": 2.4919, "step": 10273 }, { "crossentropy": 2.511996269226074, "epoch": 0.558688381957095, "grad_norm": 0.033730339258909225, "grad_norm_var": 9.709179316612446e-07, "learning_rate": 0.0010112605487544919, "loss": 2.512, "step": 10274 }, { "crossentropy": 2.6047022342681885, "epoch": 0.5587427608145953, "grad_norm": 0.0334913544356823, "grad_norm_var": 9.060046518770721e-07, "learning_rate": 0.0010105098391154433, "loss": 2.6047, "step": 10275 }, { "crossentropy": 2.52029812335968, "epoch": 0.5587971396720955, "grad_norm": 0.03215048089623451, "grad_norm_var": 8.678004516784183e-07, "learning_rate": 0.0010097593769008535, "loss": 2.5203, "step": 10276 }, { "crossentropy": 2.5605238676071167, "epoch": 0.5588515185295957, "grad_norm": 0.0325697585940361, "grad_norm_var": 8.458062660504495e-07, "learning_rate": 0.001009009162157266, "loss": 2.5605, "step": 10277 }, { "crossentropy": 2.5977760553359985, "epoch": 0.558905897387096, "grad_norm": 0.032141003757715225, "grad_norm_var": 7.968689896785946e-07, "learning_rate": 0.001008259194931212, "loss": 2.5978, "step": 10278 }, { "crossentropy": 2.479670286178589, "epoch": 0.5589602762445961, "grad_norm": 0.032012585550546646, "grad_norm_var": 8.008942261965388e-07, "learning_rate": 0.0010075094752691998, "loss": 2.4797, "step": 10279 }, { "crossentropy": 2.600748896598816, "epoch": 0.5590146551020964, "grad_norm": 0.03461811691522598, "grad_norm_var": 1.1468101866793224e-06, "learning_rate": 0.0010067600032177282, "loss": 2.6007, "step": 10280 }, { "crossentropy": 2.63162624835968, "epoch": 0.5590690339595965, "grad_norm": 0.0318572036921978, "grad_norm_var": 8.865149059247845e-07, "learning_rate": 0.0010060107788232781, "loss": 2.6316, "step": 10281 }, { "crossentropy": 2.5361456871032715, "epoch": 0.5591234128170968, "grad_norm": 0.032010018825531006, "grad_norm_var": 7.899105520138648e-07, "learning_rate": 0.0010052618021323162, "loss": 2.5361, "step": 10282 }, { "crossentropy": 2.552605628967285, "epoch": 0.5591777916745969, "grad_norm": 0.03215717896819115, "grad_norm_var": 6.548161619138529e-07, "learning_rate": 0.0010045130731912927, "loss": 2.5526, "step": 10283 }, { "crossentropy": 2.5281871557235718, "epoch": 0.5592321705320972, "grad_norm": 0.033308032900094986, "grad_norm_var": 6.574357600132308e-07, "learning_rate": 0.0010037645920466431, "loss": 2.5282, "step": 10284 }, { "crossentropy": 2.6056612730026245, "epoch": 0.5592865493895973, "grad_norm": 0.03164343535900116, "grad_norm_var": 6.696223777413746e-07, "learning_rate": 0.0010030163587447882, "loss": 2.6057, "step": 10285 }, { "crossentropy": 2.6158485412597656, "epoch": 0.5593409282470976, "grad_norm": 0.03244541585445404, "grad_norm_var": 6.569657966350635e-07, "learning_rate": 0.0010022683733321336, "loss": 2.6158, "step": 10286 }, { "crossentropy": 2.5881245136260986, "epoch": 0.5593953071045977, "grad_norm": 0.03198803961277008, "grad_norm_var": 6.646837415689074e-07, "learning_rate": 0.001001520635855066, "loss": 2.5881, "step": 10287 }, { "crossentropy": 2.462857484817505, "epoch": 0.559449685962098, "grad_norm": 0.03160402923822403, "grad_norm_var": 7.145419285244902e-07, "learning_rate": 0.0010007731463599601, "loss": 2.4629, "step": 10288 }, { "crossentropy": 2.5200960636138916, "epoch": 0.5595040648195981, "grad_norm": 0.03203815594315529, "grad_norm_var": 7.274355258198838e-07, "learning_rate": 0.001000025904893177, "loss": 2.5201, "step": 10289 }, { "crossentropy": 2.5295389890670776, "epoch": 0.5595584436770984, "grad_norm": 0.033506158739328384, "grad_norm_var": 6.933620961666401e-07, "learning_rate": 0.000999278911501057, "loss": 2.5295, "step": 10290 }, { "crossentropy": 2.4706616401672363, "epoch": 0.5596128225345985, "grad_norm": 0.03201430290937424, "grad_norm_var": 6.288295273817098e-07, "learning_rate": 0.000998532166229929, "loss": 2.4707, "step": 10291 }, { "crossentropy": 2.4998821020126343, "epoch": 0.5596672013920988, "grad_norm": 0.03132772818207741, "grad_norm_var": 6.962052186670898e-07, "learning_rate": 0.0009977856691261056, "loss": 2.4999, "step": 10292 }, { "crossentropy": 2.5374999046325684, "epoch": 0.559721580249599, "grad_norm": 0.03471082076430321, "grad_norm_var": 1.0518523943148575e-06, "learning_rate": 0.000997039420235884, "loss": 2.5375, "step": 10293 }, { "crossentropy": 2.592400312423706, "epoch": 0.5597759591070992, "grad_norm": 0.035793036222457886, "grad_norm_var": 1.7294284759707586e-06, "learning_rate": 0.000996293419605545, "loss": 2.5924, "step": 10294 }, { "crossentropy": 2.5078030824661255, "epoch": 0.5598303379645994, "grad_norm": 0.030378976836800575, "grad_norm_var": 2.0436933563002055e-06, "learning_rate": 0.0009955476672813563, "loss": 2.5078, "step": 10295 }, { "crossentropy": 2.5080913305282593, "epoch": 0.5598847168220996, "grad_norm": 0.03212914988398552, "grad_norm_var": 1.7570065226685337e-06, "learning_rate": 0.0009948021633095672, "loss": 2.5081, "step": 10296 }, { "crossentropy": 2.51908540725708, "epoch": 0.5599390956795998, "grad_norm": 0.03150911256670952, "grad_norm_var": 1.7912561028034784e-06, "learning_rate": 0.0009940569077364141, "loss": 2.5191, "step": 10297 }, { "crossentropy": 2.552369475364685, "epoch": 0.5599934745371, "grad_norm": 0.0313439816236496, "grad_norm_var": 1.8545217034360844e-06, "learning_rate": 0.0009933119006081182, "loss": 2.5524, "step": 10298 }, { "crossentropy": 2.598751664161682, "epoch": 0.5600478533946002, "grad_norm": 0.03349967300891876, "grad_norm_var": 1.929321143989447e-06, "learning_rate": 0.0009925671419708803, "loss": 2.5988, "step": 10299 }, { "crossentropy": 2.5537129640579224, "epoch": 0.5601022322521004, "grad_norm": 0.03195129334926605, "grad_norm_var": 1.889603383657086e-06, "learning_rate": 0.0009918226318708944, "loss": 2.5537, "step": 10300 }, { "crossentropy": 2.5570895671844482, "epoch": 0.5601566111096006, "grad_norm": 0.03179478272795677, "grad_norm_var": 1.8764194650544199e-06, "learning_rate": 0.0009910783703543312, "loss": 2.5571, "step": 10301 }, { "crossentropy": 2.519067645072937, "epoch": 0.5602109899671008, "grad_norm": 0.03140971064567566, "grad_norm_var": 1.934037410352427e-06, "learning_rate": 0.0009903343574673496, "loss": 2.5191, "step": 10302 }, { "crossentropy": 2.6030794382095337, "epoch": 0.560265368824601, "grad_norm": 0.03168191388249397, "grad_norm_var": 1.953135220739939e-06, "learning_rate": 0.0009895905932560934, "loss": 2.6031, "step": 10303 }, { "crossentropy": 2.5748109817504883, "epoch": 0.5603197476821012, "grad_norm": 0.03115154057741165, "grad_norm_var": 2.0075169055410915e-06, "learning_rate": 0.000988847077766689, "loss": 2.5748, "step": 10304 }, { "crossentropy": 2.60040283203125, "epoch": 0.5603741265396014, "grad_norm": 0.031544361263513565, "grad_norm_var": 2.0376931217211028e-06, "learning_rate": 0.0009881038110452496, "loss": 2.6004, "step": 10305 }, { "crossentropy": 2.425158977508545, "epoch": 0.5604285053971017, "grad_norm": 0.031891029328107834, "grad_norm_var": 1.926807407166816e-06, "learning_rate": 0.0009873607931378714, "loss": 2.4252, "step": 10306 }, { "crossentropy": 2.4394500255584717, "epoch": 0.5604828842546018, "grad_norm": 0.030897408723831177, "grad_norm_var": 2.0224812482618474e-06, "learning_rate": 0.0009866180240906365, "loss": 2.4395, "step": 10307 }, { "crossentropy": 2.5232667922973633, "epoch": 0.5605372631121021, "grad_norm": 0.0312669463455677, "grad_norm_var": 2.0286742753675696e-06, "learning_rate": 0.0009858755039496092, "loss": 2.5233, "step": 10308 }, { "crossentropy": 2.528661012649536, "epoch": 0.5605916419696022, "grad_norm": 0.03347800299525261, "grad_norm_var": 1.6878693930555497e-06, "learning_rate": 0.0009851332327608425, "loss": 2.5287, "step": 10309 }, { "crossentropy": 2.475454092025757, "epoch": 0.5606460208271025, "grad_norm": 0.0344558022916317, "grad_norm_var": 1.1202313624117937e-06, "learning_rate": 0.0009843912105703674, "loss": 2.4755, "step": 10310 }, { "crossentropy": 2.551538109779358, "epoch": 0.5607003996846026, "grad_norm": 0.03261084109544754, "grad_norm_var": 9.792319481250269e-07, "learning_rate": 0.000983649437424206, "loss": 2.5515, "step": 10311 }, { "crossentropy": 2.596189022064209, "epoch": 0.5607547785421029, "grad_norm": 0.03223879635334015, "grad_norm_var": 9.813090144118741e-07, "learning_rate": 0.0009829079133683643, "loss": 2.5962, "step": 10312 }, { "crossentropy": 2.5881677865982056, "epoch": 0.560809157399603, "grad_norm": 0.03177005797624588, "grad_norm_var": 9.669085133551595e-07, "learning_rate": 0.000982166638448827, "loss": 2.5882, "step": 10313 }, { "crossentropy": 2.5604093074798584, "epoch": 0.5608635362571033, "grad_norm": 0.03403034433722496, "grad_norm_var": 1.1608926485329186e-06, "learning_rate": 0.000981425612711569, "loss": 2.5604, "step": 10314 }, { "crossentropy": 2.430890440940857, "epoch": 0.5609179151146034, "grad_norm": 0.032059188932180405, "grad_norm_var": 1.0466306586441483e-06, "learning_rate": 0.0009806848362025479, "loss": 2.4309, "step": 10315 }, { "crossentropy": 2.546611785888672, "epoch": 0.5609722939721037, "grad_norm": 0.03158660605549812, "grad_norm_var": 1.064094566539479e-06, "learning_rate": 0.0009799443089677058, "loss": 2.5466, "step": 10316 }, { "crossentropy": 2.50103223323822, "epoch": 0.5610266728296038, "grad_norm": 0.03327320143580437, "grad_norm_var": 1.1372434029951102e-06, "learning_rate": 0.0009792040310529703, "loss": 2.501, "step": 10317 }, { "crossentropy": 2.5367168188095093, "epoch": 0.5610810516871041, "grad_norm": 0.030809789896011353, "grad_norm_var": 1.2236809221373897e-06, "learning_rate": 0.0009784640025042519, "loss": 2.5367, "step": 10318 }, { "crossentropy": 2.547165274620056, "epoch": 0.5611354305446042, "grad_norm": 0.030810581520199776, "grad_norm_var": 1.328024439895672e-06, "learning_rate": 0.0009777242233674466, "loss": 2.5472, "step": 10319 }, { "crossentropy": 2.6629016399383545, "epoch": 0.5611898094021045, "grad_norm": 0.032890427857637405, "grad_norm_var": 1.2931279386219624e-06, "learning_rate": 0.000976984693688437, "loss": 2.6629, "step": 10320 }, { "crossentropy": 2.550708293914795, "epoch": 0.5612441882596046, "grad_norm": 0.03305458277463913, "grad_norm_var": 1.2984521634227016e-06, "learning_rate": 0.0009762454135130828, "loss": 2.5507, "step": 10321 }, { "crossentropy": 2.5135799646377563, "epoch": 0.5612985671171049, "grad_norm": 0.03171722963452339, "grad_norm_var": 1.3102859475696277e-06, "learning_rate": 0.0009755063828872379, "loss": 2.5136, "step": 10322 }, { "crossentropy": 2.5253710746765137, "epoch": 0.561352945974605, "grad_norm": 0.032620083540678024, "grad_norm_var": 1.1714497339137952e-06, "learning_rate": 0.000974767601856737, "loss": 2.5254, "step": 10323 }, { "crossentropy": 2.6222985982894897, "epoch": 0.5614073248321053, "grad_norm": 0.032108426094055176, "grad_norm_var": 1.0866689424582672e-06, "learning_rate": 0.0009740290704673949, "loss": 2.6223, "step": 10324 }, { "crossentropy": 2.5146982669830322, "epoch": 0.5614617036896055, "grad_norm": 0.03189033642411232, "grad_norm_var": 1.030748866922522e-06, "learning_rate": 0.0009732907887650156, "loss": 2.5147, "step": 10325 }, { "crossentropy": 2.5099838972091675, "epoch": 0.5615160825471057, "grad_norm": 0.0317516028881073, "grad_norm_var": 7.358774446812224e-07, "learning_rate": 0.0009725527567953896, "loss": 2.51, "step": 10326 }, { "crossentropy": 2.5189393758773804, "epoch": 0.5615704614046059, "grad_norm": 0.03206216171383858, "grad_norm_var": 7.247380399700755e-07, "learning_rate": 0.0009718149746042859, "loss": 2.5189, "step": 10327 }, { "crossentropy": 2.5494918823242188, "epoch": 0.5616248402621061, "grad_norm": 0.0321149006485939, "grad_norm_var": 7.245128528840835e-07, "learning_rate": 0.0009710774422374619, "loss": 2.5495, "step": 10328 }, { "crossentropy": 2.534469962120056, "epoch": 0.5616792191196063, "grad_norm": 0.03236325457692146, "grad_norm_var": 7.157156503857873e-07, "learning_rate": 0.0009703401597406586, "loss": 2.5345, "step": 10329 }, { "crossentropy": 2.4887163639068604, "epoch": 0.5617335979771065, "grad_norm": 0.0327916145324707, "grad_norm_var": 5.087206222299193e-07, "learning_rate": 0.0009696031271596023, "loss": 2.4887, "step": 10330 }, { "crossentropy": 2.356523275375366, "epoch": 0.5617879768346067, "grad_norm": 0.03145729750394821, "grad_norm_var": 5.361626145911618e-07, "learning_rate": 0.0009688663445400031, "loss": 2.3565, "step": 10331 }, { "crossentropy": 2.53377628326416, "epoch": 0.5618423556921069, "grad_norm": 0.032558176666498184, "grad_norm_var": 5.310649404507145e-07, "learning_rate": 0.0009681298119275528, "loss": 2.5338, "step": 10332 }, { "crossentropy": 2.6200486421585083, "epoch": 0.5618967345496071, "grad_norm": 0.03220077231526375, "grad_norm_var": 4.4121024347387216e-07, "learning_rate": 0.0009673935293679342, "loss": 2.62, "step": 10333 }, { "crossentropy": 2.5784846544265747, "epoch": 0.5619511134071073, "grad_norm": 0.031234975904226303, "grad_norm_var": 3.807781834163585e-07, "learning_rate": 0.0009666574969068099, "loss": 2.5785, "step": 10334 }, { "crossentropy": 2.521179676055908, "epoch": 0.5620054922646075, "grad_norm": 0.03138618916273117, "grad_norm_var": 3.023993074538836e-07, "learning_rate": 0.0009659217145898263, "loss": 2.5212, "step": 10335 }, { "crossentropy": 2.6498056650161743, "epoch": 0.5620598711221078, "grad_norm": 0.03203986585140228, "grad_norm_var": 2.62241451010059e-07, "learning_rate": 0.0009651861824626162, "loss": 2.6498, "step": 10336 }, { "crossentropy": 2.5460801124572754, "epoch": 0.5621142499796079, "grad_norm": 0.03292199596762657, "grad_norm_var": 2.461902124948137e-07, "learning_rate": 0.0009644509005707996, "loss": 2.5461, "step": 10337 }, { "crossentropy": 2.6939773559570312, "epoch": 0.5621686288371082, "grad_norm": 0.03507066145539284, "grad_norm_var": 7.885387674983271e-07, "learning_rate": 0.0009637158689599746, "loss": 2.694, "step": 10338 }, { "crossentropy": 2.592559337615967, "epoch": 0.5622230076946083, "grad_norm": 0.031878430396318436, "grad_norm_var": 7.898575329548866e-07, "learning_rate": 0.0009629810876757283, "loss": 2.5926, "step": 10339 }, { "crossentropy": 2.506853938102722, "epoch": 0.5622773865521086, "grad_norm": 0.03732859715819359, "grad_norm_var": 2.4018219264822228e-06, "learning_rate": 0.0009622465567636312, "loss": 2.5069, "step": 10340 }, { "crossentropy": 2.4494959115982056, "epoch": 0.5623317654096087, "grad_norm": 0.031472183763980865, "grad_norm_var": 2.450402886266095e-06, "learning_rate": 0.0009615122762692386, "loss": 2.4495, "step": 10341 }, { "crossentropy": 2.5480817556381226, "epoch": 0.562386144267109, "grad_norm": 0.03099733218550682, "grad_norm_var": 2.5652032930328454e-06, "learning_rate": 0.0009607782462380904, "loss": 2.5481, "step": 10342 }, { "crossentropy": 2.566745638847351, "epoch": 0.5624405231246091, "grad_norm": 0.03576650843024254, "grad_norm_var": 3.210339289947776e-06, "learning_rate": 0.0009600444667157072, "loss": 2.5667, "step": 10343 }, { "crossentropy": 2.495324969291687, "epoch": 0.5624949019821094, "grad_norm": 0.03317943960428238, "grad_norm_var": 3.1947233553282958e-06, "learning_rate": 0.0009593109377476011, "loss": 2.4953, "step": 10344 }, { "crossentropy": 2.514054775238037, "epoch": 0.5625492808396095, "grad_norm": 0.03129659220576286, "grad_norm_var": 3.3265911800933284e-06, "learning_rate": 0.0009585776593792645, "loss": 2.5141, "step": 10345 }, { "crossentropy": 2.5401002168655396, "epoch": 0.5626036596971098, "grad_norm": 0.03189338371157646, "grad_norm_var": 3.3688943467187368e-06, "learning_rate": 0.0009578446316561729, "loss": 2.5401, "step": 10346 }, { "crossentropy": 2.4539066553115845, "epoch": 0.5626580385546099, "grad_norm": 0.03146502375602722, "grad_norm_var": 3.3676512123755863e-06, "learning_rate": 0.0009571118546237867, "loss": 2.4539, "step": 10347 }, { "crossentropy": 2.5159114599227905, "epoch": 0.5627124174121102, "grad_norm": 0.03250408545136452, "grad_norm_var": 3.368627101731023e-06, "learning_rate": 0.0009563793283275573, "loss": 2.5159, "step": 10348 }, { "crossentropy": 2.544972777366638, "epoch": 0.5627667962696103, "grad_norm": 0.030463222414255142, "grad_norm_var": 3.6648113714546694e-06, "learning_rate": 0.0009556470528129107, "loss": 2.545, "step": 10349 }, { "crossentropy": 2.4958842992782593, "epoch": 0.5628211751271106, "grad_norm": 0.03155627101659775, "grad_norm_var": 3.614664809446083e-06, "learning_rate": 0.0009549150281252633, "loss": 2.4959, "step": 10350 }, { "crossentropy": 2.664449453353882, "epoch": 0.5628755539846108, "grad_norm": 0.03310966491699219, "grad_norm_var": 3.5268438418001314e-06, "learning_rate": 0.0009541832543100143, "loss": 2.6644, "step": 10351 }, { "crossentropy": 2.6300594806671143, "epoch": 0.562929932842111, "grad_norm": 0.030536316335201263, "grad_norm_var": 3.7972575484913894e-06, "learning_rate": 0.0009534517314125485, "loss": 2.6301, "step": 10352 }, { "crossentropy": 2.6488455533981323, "epoch": 0.5629843116996112, "grad_norm": 0.03260346129536629, "grad_norm_var": 3.789497999623359e-06, "learning_rate": 0.0009527204594782335, "loss": 2.6488, "step": 10353 }, { "crossentropy": 2.566746711730957, "epoch": 0.5630386905571114, "grad_norm": 0.031881555914878845, "grad_norm_var": 3.361862396256537e-06, "learning_rate": 0.0009519894385524219, "loss": 2.5667, "step": 10354 }, { "crossentropy": 2.578001856803894, "epoch": 0.5630930694146116, "grad_norm": 0.031554654240608215, "grad_norm_var": 3.3896680318240094e-06, "learning_rate": 0.000951258668680452, "loss": 2.578, "step": 10355 }, { "crossentropy": 2.4663997888565063, "epoch": 0.5631474482721118, "grad_norm": 0.03136363625526428, "grad_norm_var": 1.6542593213814474e-06, "learning_rate": 0.0009505281499076457, "loss": 2.4664, "step": 10356 }, { "crossentropy": 2.358597993850708, "epoch": 0.563201827129612, "grad_norm": 0.03153892606496811, "grad_norm_var": 1.6500390810950463e-06, "learning_rate": 0.0009497978822793074, "loss": 2.3586, "step": 10357 }, { "crossentropy": 2.3956029415130615, "epoch": 0.5632562059871122, "grad_norm": 0.0317780077457428, "grad_norm_var": 1.5856483615295236e-06, "learning_rate": 0.0009490678658407264, "loss": 2.3956, "step": 10358 }, { "crossentropy": 2.4995304346084595, "epoch": 0.5633105848446124, "grad_norm": 0.03252704441547394, "grad_norm_var": 6.279168767298407e-07, "learning_rate": 0.0009483381006371833, "loss": 2.4995, "step": 10359 }, { "crossentropy": 2.4652477502822876, "epoch": 0.5633649637021126, "grad_norm": 0.031475573778152466, "grad_norm_var": 5.023880459951886e-07, "learning_rate": 0.0009476085867139317, "loss": 2.4652, "step": 10360 }, { "crossentropy": 2.6399871110916138, "epoch": 0.5634193425596128, "grad_norm": 0.03207292780280113, "grad_norm_var": 4.960516773492088e-07, "learning_rate": 0.0009468793241162177, "loss": 2.64, "step": 10361 }, { "crossentropy": 2.6035298109054565, "epoch": 0.563473721417113, "grad_norm": 0.03183397650718689, "grad_norm_var": 4.952967948362863e-07, "learning_rate": 0.0009461503128892696, "loss": 2.6035, "step": 10362 }, { "crossentropy": 2.4435412883758545, "epoch": 0.5635281002746132, "grad_norm": 0.044373881071805954, "grad_norm_var": 1.0391276518010618e-05, "learning_rate": 0.0009454215530782995, "loss": 2.4435, "step": 10363 }, { "crossentropy": 2.534146785736084, "epoch": 0.5635824791321135, "grad_norm": 0.03293871879577637, "grad_norm_var": 1.0399070624206077e-05, "learning_rate": 0.0009446930447285046, "loss": 2.5341, "step": 10364 }, { "crossentropy": 2.4973623752593994, "epoch": 0.5636368579896136, "grad_norm": 0.03310844674706459, "grad_norm_var": 1.0082589428246867e-05, "learning_rate": 0.000943964787885066, "loss": 2.4974, "step": 10365 }, { "crossentropy": 2.5330519676208496, "epoch": 0.5636912368471139, "grad_norm": 0.03187376260757446, "grad_norm_var": 1.0037686752058701e-05, "learning_rate": 0.0009432367825931504, "loss": 2.5331, "step": 10366 }, { "crossentropy": 2.5584659576416016, "epoch": 0.563745615704614, "grad_norm": 0.031799040734767914, "grad_norm_var": 1.0088425352205767e-05, "learning_rate": 0.0009425090288979065, "loss": 2.5585, "step": 10367 }, { "crossentropy": 2.483667016029358, "epoch": 0.5637999945621143, "grad_norm": 0.032025620341300964, "grad_norm_var": 9.796657183172575e-06, "learning_rate": 0.0009417815268444718, "loss": 2.4837, "step": 10368 }, { "crossentropy": 2.4414607286453247, "epoch": 0.5638543734196144, "grad_norm": 0.03235739842057228, "grad_norm_var": 9.806785387730017e-06, "learning_rate": 0.0009410542764779622, "loss": 2.4415, "step": 10369 }, { "crossentropy": 2.5693020820617676, "epoch": 0.5639087522771147, "grad_norm": 0.03117399662733078, "grad_norm_var": 9.922972350806293e-06, "learning_rate": 0.0009403272778434819, "loss": 2.5693, "step": 10370 }, { "crossentropy": 2.448883891105652, "epoch": 0.5639631311346148, "grad_norm": 0.032433923333883286, "grad_norm_var": 9.832652172803563e-06, "learning_rate": 0.0009396005309861194, "loss": 2.4489, "step": 10371 }, { "crossentropy": 2.4755489826202393, "epoch": 0.5640175099921151, "grad_norm": 0.031120484694838524, "grad_norm_var": 9.882661028341834e-06, "learning_rate": 0.0009388740359509462, "loss": 2.4755, "step": 10372 }, { "crossentropy": 2.5164120197296143, "epoch": 0.5640718888496152, "grad_norm": 0.03226848691701889, "grad_norm_var": 9.795495491553386e-06, "learning_rate": 0.000938147792783019, "loss": 2.5164, "step": 10373 }, { "crossentropy": 2.522936224937439, "epoch": 0.5641262677071155, "grad_norm": 0.03194611519575119, "grad_norm_var": 9.773848350222885e-06, "learning_rate": 0.0009374218015273789, "loss": 2.5229, "step": 10374 }, { "crossentropy": 2.5211524963378906, "epoch": 0.5641806465646156, "grad_norm": 0.03172279894351959, "grad_norm_var": 9.847091845784995e-06, "learning_rate": 0.000936696062229051, "loss": 2.5212, "step": 10375 }, { "crossentropy": 2.647208333015442, "epoch": 0.5642350254221159, "grad_norm": 0.030879173427820206, "grad_norm_var": 9.973275124147566e-06, "learning_rate": 0.0009359705749330466, "loss": 2.6472, "step": 10376 }, { "crossentropy": 2.5631076097488403, "epoch": 0.564289404279616, "grad_norm": 0.03228224441409111, "grad_norm_var": 9.957241414615042e-06, "learning_rate": 0.0009352453396843558, "loss": 2.5631, "step": 10377 }, { "crossentropy": 2.5566693544387817, "epoch": 0.5643437831371163, "grad_norm": 0.03138263151049614, "grad_norm_var": 1.0025618421896628e-05, "learning_rate": 0.0009345203565279602, "loss": 2.5567, "step": 10378 }, { "crossentropy": 2.582908511161804, "epoch": 0.5643981619946165, "grad_norm": 0.03209613636136055, "grad_norm_var": 3.8633157935888336e-07, "learning_rate": 0.0009337956255088237, "loss": 2.5829, "step": 10379 }, { "crossentropy": 2.4445927143096924, "epoch": 0.5644525408521167, "grad_norm": 0.032329458743333817, "grad_norm_var": 3.302742155379997e-07, "learning_rate": 0.00093307114667189, "loss": 2.4446, "step": 10380 }, { "crossentropy": 2.48542320728302, "epoch": 0.5645069197096169, "grad_norm": 0.03252314776182175, "grad_norm_var": 2.5932775242740587e-07, "learning_rate": 0.000932346920062091, "loss": 2.4854, "step": 10381 }, { "crossentropy": 2.437260627746582, "epoch": 0.5645612985671171, "grad_norm": 0.03247913345694542, "grad_norm_var": 2.810507945078296e-07, "learning_rate": 0.0009316229457243464, "loss": 2.4373, "step": 10382 }, { "crossentropy": 2.5862209796905518, "epoch": 0.5646156774246173, "grad_norm": 0.031172236427664757, "grad_norm_var": 3.162363050650147e-07, "learning_rate": 0.0009308992237035524, "loss": 2.5862, "step": 10383 }, { "crossentropy": 2.513144373893738, "epoch": 0.5646700562821175, "grad_norm": 0.03182109445333481, "grad_norm_var": 3.150722208794879e-07, "learning_rate": 0.0009301757540445949, "loss": 2.5131, "step": 10384 }, { "crossentropy": 2.4858914613723755, "epoch": 0.5647244351396177, "grad_norm": 0.03178972750902176, "grad_norm_var": 2.9864580112712313e-07, "learning_rate": 0.0009294525367923429, "loss": 2.4859, "step": 10385 }, { "crossentropy": 2.439104199409485, "epoch": 0.5647788139971179, "grad_norm": 0.034015655517578125, "grad_norm_var": 5.514491993212818e-07, "learning_rate": 0.0009287295719916495, "loss": 2.4391, "step": 10386 }, { "crossentropy": 2.484429359436035, "epoch": 0.5648331928546181, "grad_norm": 0.03235260024666786, "grad_norm_var": 5.473353343862323e-07, "learning_rate": 0.0009280068596873536, "loss": 2.4844, "step": 10387 }, { "crossentropy": 2.488279938697815, "epoch": 0.5648875717121183, "grad_norm": 0.03216062858700752, "grad_norm_var": 4.914077397389817e-07, "learning_rate": 0.0009272843999242736, "loss": 2.4883, "step": 10388 }, { "crossentropy": 2.529256224632263, "epoch": 0.5649419505696185, "grad_norm": 0.030981391668319702, "grad_norm_var": 5.619694457147425e-07, "learning_rate": 0.0009265621927472196, "loss": 2.5293, "step": 10389 }, { "crossentropy": 2.5317304134368896, "epoch": 0.5649963294271187, "grad_norm": 0.03286203369498253, "grad_norm_var": 6.083229944641207e-07, "learning_rate": 0.0009258402382009812, "loss": 2.5317, "step": 10390 }, { "crossentropy": 2.6194010972976685, "epoch": 0.5650507082846189, "grad_norm": 0.03061979077756405, "grad_norm_var": 7.329433468402738e-07, "learning_rate": 0.0009251185363303322, "loss": 2.6194, "step": 10391 }, { "crossentropy": 2.613894462585449, "epoch": 0.5651050871421192, "grad_norm": 0.033430054783821106, "grad_norm_var": 7.637941408237976e-07, "learning_rate": 0.0009243970871800317, "loss": 2.6139, "step": 10392 }, { "crossentropy": 2.534834861755371, "epoch": 0.5651594659996193, "grad_norm": 0.03417911008000374, "grad_norm_var": 1.0237348769148146e-06, "learning_rate": 0.0009236758907948261, "loss": 2.5348, "step": 10393 }, { "crossentropy": 2.573958158493042, "epoch": 0.5652138448571196, "grad_norm": 0.031563956290483475, "grad_norm_var": 1.004525342472989e-06, "learning_rate": 0.0009229549472194404, "loss": 2.574, "step": 10394 }, { "crossentropy": 2.5562714338302612, "epoch": 0.5652682237146197, "grad_norm": 0.03061629645526409, "grad_norm_var": 1.1763936269795627e-06, "learning_rate": 0.0009222342564985875, "loss": 2.5563, "step": 10395 }, { "crossentropy": 2.5234415531158447, "epoch": 0.56532260257212, "grad_norm": 0.030984973534941673, "grad_norm_var": 1.2627612910034077e-06, "learning_rate": 0.0009215138186769644, "loss": 2.5234, "step": 10396 }, { "crossentropy": 2.6853398084640503, "epoch": 0.5653769814296201, "grad_norm": 0.03294913470745087, "grad_norm_var": 1.2983078960552193e-06, "learning_rate": 0.0009207936337992523, "loss": 2.6853, "step": 10397 }, { "crossentropy": 2.4518927335739136, "epoch": 0.5654313602871204, "grad_norm": 0.03199293091893196, "grad_norm_var": 1.2900351676300067e-06, "learning_rate": 0.0009200737019101168, "loss": 2.4519, "step": 10398 }, { "crossentropy": 2.4929429292678833, "epoch": 0.5654857391446205, "grad_norm": 0.0310969240963459, "grad_norm_var": 1.2996379136903513e-06, "learning_rate": 0.0009193540230542046, "loss": 2.4929, "step": 10399 }, { "crossentropy": 2.5707980394363403, "epoch": 0.5655401180021208, "grad_norm": 0.0316130593419075, "grad_norm_var": 1.3097606515371455e-06, "learning_rate": 0.0009186345972761522, "loss": 2.5708, "step": 10400 }, { "crossentropy": 2.4122939109802246, "epoch": 0.5655944968596209, "grad_norm": 0.031106799840927124, "grad_norm_var": 1.3649331575455442e-06, "learning_rate": 0.000917915424620579, "loss": 2.4123, "step": 10401 }, { "crossentropy": 2.498523712158203, "epoch": 0.5656488757171212, "grad_norm": 0.03172942250967026, "grad_norm_var": 1.0871863267670426e-06, "learning_rate": 0.0009171965051320835, "loss": 2.4985, "step": 10402 }, { "crossentropy": 2.605736255645752, "epoch": 0.5657032545746213, "grad_norm": 0.0323607437312603, "grad_norm_var": 1.087692822500707e-06, "learning_rate": 0.0009164778388552536, "loss": 2.6057, "step": 10403 }, { "crossentropy": 2.459981679916382, "epoch": 0.5657576334321216, "grad_norm": 0.032202161848545074, "grad_norm_var": 1.0892968044697113e-06, "learning_rate": 0.000915759425834663, "loss": 2.46, "step": 10404 }, { "crossentropy": 2.458507537841797, "epoch": 0.5658120122896217, "grad_norm": 0.0317557193338871, "grad_norm_var": 1.0326478952225275e-06, "learning_rate": 0.0009150412661148644, "loss": 2.4585, "step": 10405 }, { "crossentropy": 2.4366745948791504, "epoch": 0.565866391147122, "grad_norm": 0.031807735562324524, "grad_norm_var": 9.727093667188327e-07, "learning_rate": 0.0009143233597403977, "loss": 2.4367, "step": 10406 }, { "crossentropy": 2.4532852172851562, "epoch": 0.5659207700046222, "grad_norm": 0.03103131614625454, "grad_norm_var": 9.143903196483545e-07, "learning_rate": 0.0009136057067557873, "loss": 2.4533, "step": 10407 }, { "crossentropy": 2.6519747972488403, "epoch": 0.5659751488621224, "grad_norm": 0.032471735030412674, "grad_norm_var": 7.764470654239199e-07, "learning_rate": 0.0009128883072055411, "loss": 2.652, "step": 10408 }, { "crossentropy": 2.566198229789734, "epoch": 0.5660295277196226, "grad_norm": 0.032044682651758194, "grad_norm_var": 3.958869056039458e-07, "learning_rate": 0.0009121711611341533, "loss": 2.5662, "step": 10409 }, { "crossentropy": 2.5247700214385986, "epoch": 0.5660839065771228, "grad_norm": 0.031515464186668396, "grad_norm_var": 3.969650394913208e-07, "learning_rate": 0.000911454268586096, "loss": 2.5248, "step": 10410 }, { "crossentropy": 2.5113850831985474, "epoch": 0.566138285434623, "grad_norm": 0.0315740592777729, "grad_norm_var": 3.1527477106519744e-07, "learning_rate": 0.0009107376296058345, "loss": 2.5114, "step": 10411 }, { "crossentropy": 2.552757978439331, "epoch": 0.5661926642921232, "grad_norm": 0.03259757533669472, "grad_norm_var": 3.101309388988388e-07, "learning_rate": 0.0009100212442378142, "loss": 2.5528, "step": 10412 }, { "crossentropy": 2.539646029472351, "epoch": 0.5662470431496234, "grad_norm": 0.0328359492123127, "grad_norm_var": 2.945794409009063e-07, "learning_rate": 0.0009093051125264623, "loss": 2.5396, "step": 10413 }, { "crossentropy": 2.4866316318511963, "epoch": 0.5663014220071236, "grad_norm": 0.032362814992666245, "grad_norm_var": 3.097593175488267e-07, "learning_rate": 0.0009085892345161916, "loss": 2.4866, "step": 10414 }, { "crossentropy": 2.4824084043502808, "epoch": 0.5663558008646238, "grad_norm": 0.031736407428979874, "grad_norm_var": 2.6841004515039346e-07, "learning_rate": 0.0009078736102514057, "loss": 2.4824, "step": 10415 }, { "crossentropy": 2.6170971393585205, "epoch": 0.566410179722124, "grad_norm": 0.031840745359659195, "grad_norm_var": 2.622832955510712e-07, "learning_rate": 0.000907158239776481, "loss": 2.6171, "step": 10416 }, { "crossentropy": 2.550269365310669, "epoch": 0.5664645585796242, "grad_norm": 0.033127084374427795, "grad_norm_var": 2.940623545333165e-07, "learning_rate": 0.0009064431231357872, "loss": 2.5503, "step": 10417 }, { "crossentropy": 2.5699535608291626, "epoch": 0.5665189374371244, "grad_norm": 0.03140195831656456, "grad_norm_var": 3.152897795302556e-07, "learning_rate": 0.0009057282603736744, "loss": 2.57, "step": 10418 }, { "crossentropy": 2.4841378927230835, "epoch": 0.5665733162946246, "grad_norm": 0.03142143785953522, "grad_norm_var": 3.3046776580397616e-07, "learning_rate": 0.0009050136515344781, "loss": 2.4841, "step": 10419 }, { "crossentropy": 2.4893211126327515, "epoch": 0.5666276951521249, "grad_norm": 0.032220806926488876, "grad_norm_var": 3.3103451103111356e-07, "learning_rate": 0.0009042992966625168, "loss": 2.4893, "step": 10420 }, { "crossentropy": 2.5479624271392822, "epoch": 0.566682074009625, "grad_norm": 0.034229010343551636, "grad_norm_var": 6.380461834520988e-07, "learning_rate": 0.0009035851958020952, "loss": 2.548, "step": 10421 }, { "crossentropy": 2.542055130004883, "epoch": 0.5667364528671253, "grad_norm": 0.03225662559270859, "grad_norm_var": 6.308327496688196e-07, "learning_rate": 0.0009028713489975005, "loss": 2.5421, "step": 10422 }, { "crossentropy": 2.536310911178589, "epoch": 0.5667908317246254, "grad_norm": 0.03145908936858177, "grad_norm_var": 5.775096927155068e-07, "learning_rate": 0.0009021577562930061, "loss": 2.5363, "step": 10423 }, { "crossentropy": 2.570290207862854, "epoch": 0.5668452105821257, "grad_norm": 0.031303759664297104, "grad_norm_var": 6.194351517038153e-07, "learning_rate": 0.0009014444177328657, "loss": 2.5703, "step": 10424 }, { "crossentropy": 2.538431167602539, "epoch": 0.5668995894396259, "grad_norm": 0.0318402536213398, "grad_norm_var": 6.241127710415161e-07, "learning_rate": 0.000900731333361321, "loss": 2.5384, "step": 10425 }, { "crossentropy": 2.590947389602661, "epoch": 0.5669539682971261, "grad_norm": 0.03264956548810005, "grad_norm_var": 6.149468214755594e-07, "learning_rate": 0.0009000185032225988, "loss": 2.5909, "step": 10426 }, { "crossentropy": 2.5433512926101685, "epoch": 0.5670083471546263, "grad_norm": 0.0308432225137949, "grad_norm_var": 7.072360990263328e-07, "learning_rate": 0.0008993059273609061, "loss": 2.5434, "step": 10427 }, { "crossentropy": 2.5128602981567383, "epoch": 0.5670627260121265, "grad_norm": 0.03210929036140442, "grad_norm_var": 6.918845753584354e-07, "learning_rate": 0.0008985936058204364, "loss": 2.5129, "step": 10428 }, { "crossentropy": 2.4048320055007935, "epoch": 0.5671171048696267, "grad_norm": 0.03166545554995537, "grad_norm_var": 6.63027385501659e-07, "learning_rate": 0.0008978815386453676, "loss": 2.4048, "step": 10429 }, { "crossentropy": 2.5061196088790894, "epoch": 0.5671714837271269, "grad_norm": 0.03185289725661278, "grad_norm_var": 6.565976241255557e-07, "learning_rate": 0.0008971697258798617, "loss": 2.5061, "step": 10430 }, { "crossentropy": 2.3947149515151978, "epoch": 0.5672258625846271, "grad_norm": 0.03197455778717995, "grad_norm_var": 6.518565198964371e-07, "learning_rate": 0.0008964581675680639, "loss": 2.3947, "step": 10431 }, { "crossentropy": 2.3850876092910767, "epoch": 0.5672802414421273, "grad_norm": 0.031205499544739723, "grad_norm_var": 6.916026766098926e-07, "learning_rate": 0.0008957468637541049, "loss": 2.3851, "step": 10432 }, { "crossentropy": 2.4892200231552124, "epoch": 0.5673346202996276, "grad_norm": 0.03097710572183132, "grad_norm_var": 6.495348411129544e-07, "learning_rate": 0.0008950358144820991, "loss": 2.4892, "step": 10433 }, { "crossentropy": 2.6152477264404297, "epoch": 0.5673889991571277, "grad_norm": 0.03300948813557625, "grad_norm_var": 7.175503760764908e-07, "learning_rate": 0.000894325019796145, "loss": 2.6152, "step": 10434 }, { "crossentropy": 2.5400055646896362, "epoch": 0.567443378014628, "grad_norm": 0.03250487893819809, "grad_norm_var": 7.162028291421516e-07, "learning_rate": 0.0008936144797403267, "loss": 2.54, "step": 10435 }, { "crossentropy": 2.5846303701400757, "epoch": 0.5674977568721281, "grad_norm": 0.03188513591885567, "grad_norm_var": 7.136464945487239e-07, "learning_rate": 0.0008929041943587074, "loss": 2.5846, "step": 10436 }, { "crossentropy": 2.542329430580139, "epoch": 0.5675521357296284, "grad_norm": 0.030688436701893806, "grad_norm_var": 4.37953019212974e-07, "learning_rate": 0.0008921941636953434, "loss": 2.5423, "step": 10437 }, { "crossentropy": 2.5405192375183105, "epoch": 0.5676065145871285, "grad_norm": 0.03147529810667038, "grad_norm_var": 4.247955218213602e-07, "learning_rate": 0.0008914843877942669, "loss": 2.5405, "step": 10438 }, { "crossentropy": 2.4916270971298218, "epoch": 0.5676608934446288, "grad_norm": 0.03060624748468399, "grad_norm_var": 4.993823645812444e-07, "learning_rate": 0.0008907748666994974, "loss": 2.4916, "step": 10439 }, { "crossentropy": 2.603752374649048, "epoch": 0.5677152723021289, "grad_norm": 0.030807513743638992, "grad_norm_var": 5.38473237830274e-07, "learning_rate": 0.0008900656004550406, "loss": 2.6038, "step": 10440 }, { "crossentropy": 2.3882800340652466, "epoch": 0.5677696511596292, "grad_norm": 0.03057503141462803, "grad_norm_var": 6.032099713785261e-07, "learning_rate": 0.0008893565891048827, "loss": 2.3883, "step": 10441 }, { "crossentropy": 2.612137198448181, "epoch": 0.5678240300171293, "grad_norm": 0.031221959739923477, "grad_norm_var": 5.21641575184833e-07, "learning_rate": 0.0008886478326929976, "loss": 2.6121, "step": 10442 }, { "crossentropy": 2.554288864135742, "epoch": 0.5678784088746296, "grad_norm": 0.03276605159044266, "grad_norm_var": 5.939200692341162e-07, "learning_rate": 0.0008879393312633405, "loss": 2.5543, "step": 10443 }, { "crossentropy": 2.5297330617904663, "epoch": 0.5679327877321297, "grad_norm": 0.03111264668405056, "grad_norm_var": 5.860385262683772e-07, "learning_rate": 0.0008872310848598525, "loss": 2.5297, "step": 10444 }, { "crossentropy": 2.4138482809066772, "epoch": 0.56798716658963, "grad_norm": 0.03591950610280037, "grad_norm_var": 1.799310179703102e-06, "learning_rate": 0.0008865230935264579, "loss": 2.4138, "step": 10445 }, { "crossentropy": 2.5567177534103394, "epoch": 0.5680415454471301, "grad_norm": 0.031465403735637665, "grad_norm_var": 1.8052585287207327e-06, "learning_rate": 0.0008858153573070676, "loss": 2.5567, "step": 10446 }, { "crossentropy": 2.5894887447357178, "epoch": 0.5680959243046304, "grad_norm": 0.032230962067842484, "grad_norm_var": 1.8166283392686504e-06, "learning_rate": 0.0008851078762455711, "loss": 2.5895, "step": 10447 }, { "crossentropy": 2.613479495048523, "epoch": 0.5681503031621306, "grad_norm": 0.03239528462290764, "grad_norm_var": 1.8142509025028652e-06, "learning_rate": 0.0008844006503858487, "loss": 2.6135, "step": 10448 }, { "crossentropy": 2.5744516849517822, "epoch": 0.5682046820196308, "grad_norm": 0.03298036754131317, "grad_norm_var": 1.8312319600785003e-06, "learning_rate": 0.0008836936797717621, "loss": 2.5745, "step": 10449 }, { "crossentropy": 2.5306259393692017, "epoch": 0.568259060877131, "grad_norm": 0.030902216210961342, "grad_norm_var": 1.8188858365958751e-06, "learning_rate": 0.0008829869644471544, "loss": 2.5306, "step": 10450 }, { "crossentropy": 2.471975803375244, "epoch": 0.5683134397346312, "grad_norm": 0.03147904947400093, "grad_norm_var": 1.794544600942453e-06, "learning_rate": 0.0008822805044558573, "loss": 2.472, "step": 10451 }, { "crossentropy": 2.6082913875579834, "epoch": 0.5683678185921314, "grad_norm": 0.03114900551736355, "grad_norm_var": 1.8182842815894166e-06, "learning_rate": 0.0008815742998416842, "loss": 2.6083, "step": 10452 }, { "crossentropy": 2.499286651611328, "epoch": 0.5684221974496316, "grad_norm": 0.03130670264363289, "grad_norm_var": 1.7558239702586755e-06, "learning_rate": 0.0008808683506484338, "loss": 2.4993, "step": 10453 }, { "crossentropy": 2.415414333343506, "epoch": 0.5684765763071318, "grad_norm": 0.031278371810913086, "grad_norm_var": 1.7661058600709189e-06, "learning_rate": 0.0008801626569198873, "loss": 2.4154, "step": 10454 }, { "crossentropy": 2.576721668243408, "epoch": 0.568530955164632, "grad_norm": 0.03413804993033409, "grad_norm_var": 2.0013285446984773e-06, "learning_rate": 0.0008794572186998123, "loss": 2.5767, "step": 10455 }, { "crossentropy": 2.5653700828552246, "epoch": 0.5685853340221322, "grad_norm": 0.03048228658735752, "grad_norm_var": 2.058913011866133e-06, "learning_rate": 0.0008787520360319595, "loss": 2.5654, "step": 10456 }, { "crossentropy": 2.7109793424606323, "epoch": 0.5686397128796324, "grad_norm": 0.0321006216108799, "grad_norm_var": 1.9221124926703065e-06, "learning_rate": 0.0008780471089600639, "loss": 2.711, "step": 10457 }, { "crossentropy": 2.5624589920043945, "epoch": 0.5686940917371326, "grad_norm": 0.03178996965289116, "grad_norm_var": 1.8789576759555004e-06, "learning_rate": 0.000877342437527841, "loss": 2.5625, "step": 10458 }, { "crossentropy": 2.5895702838897705, "epoch": 0.5687484705946328, "grad_norm": 0.03192939981818199, "grad_norm_var": 1.8476847386797793e-06, "learning_rate": 0.0008766380217789982, "loss": 2.5896, "step": 10459 }, { "crossentropy": 2.4554378986358643, "epoch": 0.568802849452133, "grad_norm": 0.031892016530036926, "grad_norm_var": 1.7891526134795512e-06, "learning_rate": 0.0008759338617572221, "loss": 2.4554, "step": 10460 }, { "crossentropy": 2.6950843334198, "epoch": 0.5688572283096333, "grad_norm": 0.0312541238963604, "grad_norm_var": 7.67336044567981e-07, "learning_rate": 0.0008752299575061822, "loss": 2.6951, "step": 10461 }, { "crossentropy": 2.541319966316223, "epoch": 0.5689116071671334, "grad_norm": 0.031402409076690674, "grad_norm_var": 7.703806981313912e-07, "learning_rate": 0.0008745263090695332, "loss": 2.5413, "step": 10462 }, { "crossentropy": 2.6132283210754395, "epoch": 0.5689659860246337, "grad_norm": 0.03099721670150757, "grad_norm_var": 7.937039773022221e-07, "learning_rate": 0.000873822916490919, "loss": 2.6132, "step": 10463 }, { "crossentropy": 2.429492712020874, "epoch": 0.5690203648821338, "grad_norm": 0.032216913998126984, "grad_norm_var": 7.795685769964074e-07, "learning_rate": 0.0008731197798139595, "loss": 2.4295, "step": 10464 }, { "crossentropy": 2.4525091648101807, "epoch": 0.5690747437396341, "grad_norm": 0.03202752023935318, "grad_norm_var": 6.744313668238886e-07, "learning_rate": 0.0008724168990822634, "loss": 2.4525, "step": 10465 }, { "crossentropy": 2.5303560495376587, "epoch": 0.5691291225971342, "grad_norm": 0.03325837105512619, "grad_norm_var": 7.875414665652868e-07, "learning_rate": 0.0008717142743394236, "loss": 2.5304, "step": 10466 }, { "crossentropy": 2.469369888305664, "epoch": 0.5691835014546345, "grad_norm": 0.03290938958525658, "grad_norm_var": 8.553671680820602e-07, "learning_rate": 0.000871011905629015, "loss": 2.4694, "step": 10467 }, { "crossentropy": 2.4923261404037476, "epoch": 0.5692378803121346, "grad_norm": 0.03087810054421425, "grad_norm_var": 8.864762419440724e-07, "learning_rate": 0.0008703097929946002, "loss": 2.4923, "step": 10468 }, { "crossentropy": 2.6313436031341553, "epoch": 0.5692922591696349, "grad_norm": 0.031776025891304016, "grad_norm_var": 8.652225592050879e-07, "learning_rate": 0.0008696079364797194, "loss": 2.6313, "step": 10469 }, { "crossentropy": 2.57629132270813, "epoch": 0.569346638027135, "grad_norm": 0.03032161481678486, "grad_norm_var": 1.0011818374208262e-06, "learning_rate": 0.0008689063361279048, "loss": 2.5763, "step": 10470 }, { "crossentropy": 2.5127099752426147, "epoch": 0.5694010168846353, "grad_norm": 0.03115270286798477, "grad_norm_var": 6.418289944214982e-07, "learning_rate": 0.0008682049919826695, "loss": 2.5127, "step": 10471 }, { "crossentropy": 2.5652639865875244, "epoch": 0.5694553957421354, "grad_norm": 0.03306663781404495, "grad_norm_var": 6.571312982463073e-07, "learning_rate": 0.0008675039040875072, "loss": 2.5653, "step": 10472 }, { "crossentropy": 2.4477040767669678, "epoch": 0.5695097745996357, "grad_norm": 0.031093910336494446, "grad_norm_var": 6.815727621039957e-07, "learning_rate": 0.0008668030724858983, "loss": 2.4477, "step": 10473 }, { "crossentropy": 2.4006850719451904, "epoch": 0.5695641534571358, "grad_norm": 0.031624265015125275, "grad_norm_var": 6.823592973412834e-07, "learning_rate": 0.0008661024972213122, "loss": 2.4007, "step": 10474 }, { "crossentropy": 2.4899877309799194, "epoch": 0.5696185323146361, "grad_norm": 0.03678750619292259, "grad_norm_var": 2.2817117479773655e-06, "learning_rate": 0.000865402178337194, "loss": 2.49, "step": 10475 }, { "crossentropy": 2.598593831062317, "epoch": 0.5696729111721363, "grad_norm": 0.03389531746506691, "grad_norm_var": 2.4926976818192505e-06, "learning_rate": 0.000864702115876978, "loss": 2.5986, "step": 10476 }, { "crossentropy": 2.5387152433395386, "epoch": 0.5697272900296365, "grad_norm": 0.033031243830919266, "grad_norm_var": 2.473924725217855e-06, "learning_rate": 0.0008640023098840816, "loss": 2.5387, "step": 10477 }, { "crossentropy": 2.537447452545166, "epoch": 0.5697816688871367, "grad_norm": 0.031504157930612564, "grad_norm_var": 2.462700568506152e-06, "learning_rate": 0.0008633027604019055, "loss": 2.5374, "step": 10478 }, { "crossentropy": 2.487791061401367, "epoch": 0.5698360477446369, "grad_norm": 0.03135181963443756, "grad_norm_var": 2.409729080914237e-06, "learning_rate": 0.000862603467473837, "loss": 2.4878, "step": 10479 }, { "crossentropy": 2.442283272743225, "epoch": 0.5698904266021371, "grad_norm": 0.031616728752851486, "grad_norm_var": 2.4393695422853287e-06, "learning_rate": 0.0008619044311432422, "loss": 2.4423, "step": 10480 }, { "crossentropy": 2.4068093299865723, "epoch": 0.5699448054596373, "grad_norm": 0.03056308627128601, "grad_norm_var": 2.6204497667361617e-06, "learning_rate": 0.0008612056514534772, "loss": 2.4068, "step": 10481 }, { "crossentropy": 2.508971929550171, "epoch": 0.5699991843171375, "grad_norm": 0.030825909227132797, "grad_norm_var": 2.639512257216027e-06, "learning_rate": 0.0008605071284478805, "loss": 2.509, "step": 10482 }, { "crossentropy": 2.548020601272583, "epoch": 0.5700535631746377, "grad_norm": 0.030653759837150574, "grad_norm_var": 2.6914933560976633e-06, "learning_rate": 0.0008598088621697714, "loss": 2.548, "step": 10483 }, { "crossentropy": 2.4861092567443848, "epoch": 0.5701079420321379, "grad_norm": 0.031905919313430786, "grad_norm_var": 2.619678485679085e-06, "learning_rate": 0.0008591108526624552, "loss": 2.4861, "step": 10484 }, { "crossentropy": 2.495797872543335, "epoch": 0.5701623208896381, "grad_norm": 0.03183244913816452, "grad_norm_var": 2.618582456310548e-06, "learning_rate": 0.0008584130999692263, "loss": 2.4958, "step": 10485 }, { "crossentropy": 2.587185263633728, "epoch": 0.5702166997471383, "grad_norm": 0.032304488122463226, "grad_norm_var": 2.4333549641299136e-06, "learning_rate": 0.0008577156041333545, "loss": 2.5872, "step": 10486 }, { "crossentropy": 2.4071866273880005, "epoch": 0.5702710786046385, "grad_norm": 0.03093690797686577, "grad_norm_var": 2.4628201696028433e-06, "learning_rate": 0.0008570183651980984, "loss": 2.4072, "step": 10487 }, { "crossentropy": 2.5762674808502197, "epoch": 0.5703254574621387, "grad_norm": 0.03438693284988403, "grad_norm_var": 2.7486014511546654e-06, "learning_rate": 0.0008563213832067012, "loss": 2.5763, "step": 10488 }, { "crossentropy": 2.5297365188598633, "epoch": 0.570379836319639, "grad_norm": 0.03305840864777565, "grad_norm_var": 2.7145812651338993e-06, "learning_rate": 0.0008556246582023891, "loss": 2.5297, "step": 10489 }, { "crossentropy": 2.502761960029602, "epoch": 0.5704342151771391, "grad_norm": 0.03210444748401642, "grad_norm_var": 2.6878139277810204e-06, "learning_rate": 0.0008549281902283712, "loss": 2.5028, "step": 10490 }, { "crossentropy": 2.461966037750244, "epoch": 0.5704885940346394, "grad_norm": 0.03105621598660946, "grad_norm_var": 1.3096134439887954e-06, "learning_rate": 0.000854231979327843, "loss": 2.462, "step": 10491 }, { "crossentropy": 2.591406226158142, "epoch": 0.5705429728921395, "grad_norm": 0.031194953247904778, "grad_norm_var": 1.06107742695079e-06, "learning_rate": 0.0008535360255439822, "loss": 2.5914, "step": 10492 }, { "crossentropy": 2.5350775718688965, "epoch": 0.5705973517496398, "grad_norm": 0.03207292780280113, "grad_norm_var": 9.573788255260243e-07, "learning_rate": 0.0008528403289199527, "loss": 2.5351, "step": 10493 }, { "crossentropy": 2.597747325897217, "epoch": 0.5706517306071399, "grad_norm": 0.03187278285622597, "grad_norm_var": 9.557264692381177e-07, "learning_rate": 0.0008521448894988981, "loss": 2.5977, "step": 10494 }, { "crossentropy": 2.5738470554351807, "epoch": 0.5707061094646402, "grad_norm": 0.03191627934575081, "grad_norm_var": 9.469059623110066e-07, "learning_rate": 0.0008514497073239491, "loss": 2.5738, "step": 10495 }, { "crossentropy": 2.5183156728744507, "epoch": 0.5707604883221403, "grad_norm": 0.03260141611099243, "grad_norm_var": 9.875293900215553e-07, "learning_rate": 0.0008507547824382245, "loss": 2.5183, "step": 10496 }, { "crossentropy": 2.547469973564148, "epoch": 0.5708148671796406, "grad_norm": 0.03252764791250229, "grad_norm_var": 8.967782401710898e-07, "learning_rate": 0.0008500601148848186, "loss": 2.5475, "step": 10497 }, { "crossentropy": 2.3709815740585327, "epoch": 0.5708692460371407, "grad_norm": 0.036629725247621536, "grad_norm_var": 2.129688774473298e-06, "learning_rate": 0.0008493657047068149, "loss": 2.371, "step": 10498 }, { "crossentropy": 2.4754605293273926, "epoch": 0.570923624894641, "grad_norm": 0.033633071929216385, "grad_norm_var": 2.024164921809318e-06, "learning_rate": 0.0008486715519472809, "loss": 2.4755, "step": 10499 }, { "crossentropy": 2.5434162616729736, "epoch": 0.5709780037521411, "grad_norm": 0.030952177941799164, "grad_norm_var": 2.1568377003034858e-06, "learning_rate": 0.000847977656649267, "loss": 2.5434, "step": 10500 }, { "crossentropy": 2.5811073780059814, "epoch": 0.5710323826096414, "grad_norm": 0.03160899877548218, "grad_norm_var": 2.1781353578916955e-06, "learning_rate": 0.0008472840188558079, "loss": 2.5811, "step": 10501 }, { "crossentropy": 2.6078773736953735, "epoch": 0.5710867614671415, "grad_norm": 0.032934337854385376, "grad_norm_var": 2.1925080076308038e-06, "learning_rate": 0.0008465906386099226, "loss": 2.6079, "step": 10502 }, { "crossentropy": 2.484349489212036, "epoch": 0.5711411403246418, "grad_norm": 0.032714858651161194, "grad_norm_var": 2.027127861817651e-06, "learning_rate": 0.0008458975159546134, "loss": 2.4843, "step": 10503 }, { "crossentropy": 2.494365096092224, "epoch": 0.571195519182142, "grad_norm": 0.03149493783712387, "grad_norm_var": 1.852745855903705e-06, "learning_rate": 0.0008452046509328676, "loss": 2.4944, "step": 10504 }, { "crossentropy": 2.6525635719299316, "epoch": 0.5712498980396422, "grad_norm": 0.03253450617194176, "grad_norm_var": 1.823791153688671e-06, "learning_rate": 0.0008445120435876569, "loss": 2.6526, "step": 10505 }, { "crossentropy": 2.5089077949523926, "epoch": 0.5713042768971424, "grad_norm": 0.03175274282693863, "grad_norm_var": 1.8437677154268041e-06, "learning_rate": 0.0008438196939619336, "loss": 2.5089, "step": 10506 }, { "crossentropy": 2.4980478286743164, "epoch": 0.5713586557546426, "grad_norm": 0.031033337116241455, "grad_norm_var": 1.8477276124721863e-06, "learning_rate": 0.0008431276020986384, "loss": 2.498, "step": 10507 }, { "crossentropy": 2.5032976865768433, "epoch": 0.5714130346121428, "grad_norm": 0.03166387602686882, "grad_norm_var": 1.7897432443911634e-06, "learning_rate": 0.0008424357680406935, "loss": 2.5033, "step": 10508 }, { "crossentropy": 2.5799670219421387, "epoch": 0.571467413469643, "grad_norm": 0.033618394285440445, "grad_norm_var": 1.877502804048005e-06, "learning_rate": 0.0008417441918310064, "loss": 2.58, "step": 10509 }, { "crossentropy": 2.4861565828323364, "epoch": 0.5715217923271432, "grad_norm": 0.03249659016728401, "grad_norm_var": 1.8523113316292726e-06, "learning_rate": 0.0008410528735124684, "loss": 2.4862, "step": 10510 }, { "crossentropy": 2.502276301383972, "epoch": 0.5715761711846434, "grad_norm": 0.03502678871154785, "grad_norm_var": 2.212000047935083e-06, "learning_rate": 0.0008403618131279534, "loss": 2.5023, "step": 10511 }, { "crossentropy": 2.5363556146621704, "epoch": 0.5716305500421436, "grad_norm": 0.030937178060412407, "grad_norm_var": 2.4073058052565536e-06, "learning_rate": 0.0008396710107203204, "loss": 2.5364, "step": 10512 }, { "crossentropy": 2.6446253061294556, "epoch": 0.5716849288996438, "grad_norm": 0.03160674497485161, "grad_norm_var": 2.468880253092886e-06, "learning_rate": 0.0008389804663324141, "loss": 2.6446, "step": 10513 }, { "crossentropy": 2.5339728593826294, "epoch": 0.571739307757144, "grad_norm": 0.03214471414685249, "grad_norm_var": 1.2803615008830644e-06, "learning_rate": 0.0008382901800070574, "loss": 2.534, "step": 10514 }, { "crossentropy": 2.5293564796447754, "epoch": 0.5717936866146442, "grad_norm": 0.03249397873878479, "grad_norm_var": 1.152852391977802e-06, "learning_rate": 0.0008376001517870652, "loss": 2.5294, "step": 10515 }, { "crossentropy": 2.5098992586135864, "epoch": 0.5718480654721444, "grad_norm": 0.030635762959718704, "grad_norm_var": 1.211263725119542e-06, "learning_rate": 0.0008369103817152319, "loss": 2.5099, "step": 10516 }, { "crossentropy": 2.4863022565841675, "epoch": 0.5719024443296447, "grad_norm": 0.030987635254859924, "grad_norm_var": 1.2817573818516322e-06, "learning_rate": 0.0008362208698343343, "loss": 2.4863, "step": 10517 }, { "crossentropy": 2.3912044763565063, "epoch": 0.5719568231871448, "grad_norm": 0.031547896564006805, "grad_norm_var": 1.2531653552945067e-06, "learning_rate": 0.0008355316161871368, "loss": 2.3912, "step": 10518 }, { "crossentropy": 2.555753707885742, "epoch": 0.5720112020446451, "grad_norm": 0.03189195320010185, "grad_norm_var": 1.2217851928544478e-06, "learning_rate": 0.0008348426208163856, "loss": 2.5558, "step": 10519 }, { "crossentropy": 2.4942654371261597, "epoch": 0.5720655809021452, "grad_norm": 0.03157081454992294, "grad_norm_var": 1.2171194349531172e-06, "learning_rate": 0.0008341538837648121, "loss": 2.4943, "step": 10520 }, { "crossentropy": 2.5417959690093994, "epoch": 0.5721199597596455, "grad_norm": 0.031969472765922546, "grad_norm_var": 1.196536047481803e-06, "learning_rate": 0.0008334654050751311, "loss": 2.5418, "step": 10521 }, { "crossentropy": 2.555578589439392, "epoch": 0.5721743386171456, "grad_norm": 0.031875960528850555, "grad_norm_var": 1.194061566638195e-06, "learning_rate": 0.0008327771847900406, "loss": 2.5556, "step": 10522 }, { "crossentropy": 2.6245847940444946, "epoch": 0.5722287174746459, "grad_norm": 0.03186991810798645, "grad_norm_var": 1.1334558278981344e-06, "learning_rate": 0.0008320892229522248, "loss": 2.6246, "step": 10523 }, { "crossentropy": 2.5423755645751953, "epoch": 0.572283096332146, "grad_norm": 0.031107449904084206, "grad_norm_var": 1.1793093213402958e-06, "learning_rate": 0.0008314015196043501, "loss": 2.5424, "step": 10524 }, { "crossentropy": 2.4233442544937134, "epoch": 0.5723374751896463, "grad_norm": 0.03143041580915451, "grad_norm_var": 1.0023890704299299e-06, "learning_rate": 0.0008307140747890646, "loss": 2.4233, "step": 10525 }, { "crossentropy": 2.5238651037216187, "epoch": 0.5723918540471464, "grad_norm": 0.032268915325403214, "grad_norm_var": 9.85987741153269e-07, "learning_rate": 0.0008300268885490064, "loss": 2.5239, "step": 10526 }, { "crossentropy": 2.606125831604004, "epoch": 0.5724462329046467, "grad_norm": 0.03491829335689545, "grad_norm_var": 9.405559401028917e-07, "learning_rate": 0.0008293399609267943, "loss": 2.6061, "step": 10527 }, { "crossentropy": 2.591193199157715, "epoch": 0.5725006117621468, "grad_norm": 0.0320218987762928, "grad_norm_var": 8.851732649132882e-07, "learning_rate": 0.000828653291965028, "loss": 2.5912, "step": 10528 }, { "crossentropy": 2.4790701866149902, "epoch": 0.5725549906196471, "grad_norm": 0.03266612067818642, "grad_norm_var": 9.144068469946285e-07, "learning_rate": 0.0008279668817062946, "loss": 2.4791, "step": 10529 }, { "crossentropy": 2.521967649459839, "epoch": 0.5726093694771472, "grad_norm": 0.03208005800843239, "grad_norm_var": 9.130979348499855e-07, "learning_rate": 0.0008272807301931684, "loss": 2.522, "step": 10530 }, { "crossentropy": 2.5778634548187256, "epoch": 0.5726637483346475, "grad_norm": 0.03354593366384506, "grad_norm_var": 1.0573628336634724e-06, "learning_rate": 0.0008265948374681992, "loss": 2.5779, "step": 10531 }, { "crossentropy": 2.6089099645614624, "epoch": 0.5727181271921477, "grad_norm": 0.03175264224410057, "grad_norm_var": 9.285522386542523e-07, "learning_rate": 0.000825909203573928, "loss": 2.6089, "step": 10532 }, { "crossentropy": 2.556137800216675, "epoch": 0.5727725060496479, "grad_norm": 0.03067953512072563, "grad_norm_var": 9.799381178737884e-07, "learning_rate": 0.0008252238285528757, "loss": 2.5561, "step": 10533 }, { "crossentropy": 2.5570660829544067, "epoch": 0.5728268849071481, "grad_norm": 0.031009411439299583, "grad_norm_var": 1.0358937765537918e-06, "learning_rate": 0.00082453871244755, "loss": 2.5571, "step": 10534 }, { "crossentropy": 2.54417884349823, "epoch": 0.5728812637646483, "grad_norm": 0.03242024779319763, "grad_norm_var": 1.0428261778219298e-06, "learning_rate": 0.0008238538553004415, "loss": 2.5442, "step": 10535 }, { "crossentropy": 2.541394352912903, "epoch": 0.5729356426221485, "grad_norm": 0.03399480879306793, "grad_norm_var": 1.2473689033551394e-06, "learning_rate": 0.0008231692571540212, "loss": 2.5414, "step": 10536 }, { "crossentropy": 2.5049946308135986, "epoch": 0.5729900214796487, "grad_norm": 0.032190389931201935, "grad_norm_var": 1.242872062066682e-06, "learning_rate": 0.0008224849180507504, "loss": 2.505, "step": 10537 }, { "crossentropy": 2.564782500267029, "epoch": 0.5730444003371489, "grad_norm": 0.03508247062563896, "grad_norm_var": 1.7300530321830038e-06, "learning_rate": 0.0008218008380330721, "loss": 2.5648, "step": 10538 }, { "crossentropy": 2.6204041242599487, "epoch": 0.5730987791946491, "grad_norm": 0.03337346017360687, "grad_norm_var": 1.7570760138348994e-06, "learning_rate": 0.000821117017143409, "loss": 2.6204, "step": 10539 }, { "crossentropy": 2.4798543453216553, "epoch": 0.5731531580521493, "grad_norm": 0.03413597494363785, "grad_norm_var": 1.7543272391769368e-06, "learning_rate": 0.000820433455424171, "loss": 2.4799, "step": 10540 }, { "crossentropy": 2.424629807472229, "epoch": 0.5732075369096495, "grad_norm": 0.03230800852179527, "grad_norm_var": 1.6511956269259537e-06, "learning_rate": 0.0008197501529177564, "loss": 2.4246, "step": 10541 }, { "crossentropy": 2.5142279863357544, "epoch": 0.5732619157671497, "grad_norm": 0.0309348925948143, "grad_norm_var": 1.8529742874549692e-06, "learning_rate": 0.0008190671096665386, "loss": 2.5142, "step": 10542 }, { "crossentropy": 2.5068429708480835, "epoch": 0.57331629462465, "grad_norm": 0.03213556110858917, "grad_norm_var": 1.5119027918848582e-06, "learning_rate": 0.0008183843257128809, "loss": 2.5068, "step": 10543 }, { "crossentropy": 2.5383888483047485, "epoch": 0.5733706734821501, "grad_norm": 0.03240488842129707, "grad_norm_var": 1.4955982471728195e-06, "learning_rate": 0.0008177018010991289, "loss": 2.5384, "step": 10544 }, { "crossentropy": 2.481842517852783, "epoch": 0.5734250523396504, "grad_norm": 0.03173287957906723, "grad_norm_var": 1.5349171058469478e-06, "learning_rate": 0.0008170195358676113, "loss": 2.4818, "step": 10545 }, { "crossentropy": 2.429246187210083, "epoch": 0.5734794311971505, "grad_norm": 0.03290126100182533, "grad_norm_var": 1.5325820673024825e-06, "learning_rate": 0.0008163375300606446, "loss": 2.4292, "step": 10546 }, { "crossentropy": 2.5308674573898315, "epoch": 0.5735338100546508, "grad_norm": 0.03288218379020691, "grad_norm_var": 1.4708840351547315e-06, "learning_rate": 0.0008156557837205209, "loss": 2.5309, "step": 10547 }, { "crossentropy": 2.462926506996155, "epoch": 0.5735881889121509, "grad_norm": 0.03219782933592796, "grad_norm_var": 1.4391368644374313e-06, "learning_rate": 0.0008149742968895252, "loss": 2.4629, "step": 10548 }, { "crossentropy": 2.582605004310608, "epoch": 0.5736425677696512, "grad_norm": 0.03190495818853378, "grad_norm_var": 1.231626088899262e-06, "learning_rate": 0.0008142930696099237, "loss": 2.5826, "step": 10549 }, { "crossentropy": 2.3190032243728638, "epoch": 0.5736969466271513, "grad_norm": 0.030807919800281525, "grad_norm_var": 1.2769110501709493e-06, "learning_rate": 0.0008136121019239623, "loss": 2.319, "step": 10550 }, { "crossentropy": 2.6056458950042725, "epoch": 0.5737513254846516, "grad_norm": 0.03177957981824875, "grad_norm_var": 1.3168928946741612e-06, "learning_rate": 0.0008129313938738742, "loss": 2.6056, "step": 10551 }, { "crossentropy": 2.5749682188034058, "epoch": 0.5738057043421517, "grad_norm": 0.030936162918806076, "grad_norm_var": 1.3115394996210394e-06, "learning_rate": 0.0008122509455018801, "loss": 2.575, "step": 10552 }, { "crossentropy": 2.5270402431488037, "epoch": 0.573860083199652, "grad_norm": 0.030763806775212288, "grad_norm_var": 1.4703842500260553e-06, "learning_rate": 0.0008115707568501768, "loss": 2.527, "step": 10553 }, { "crossentropy": 2.623627781867981, "epoch": 0.5739144620571521, "grad_norm": 0.031574640423059464, "grad_norm_var": 9.229006481601635e-07, "learning_rate": 0.00081089082796095, "loss": 2.6236, "step": 10554 }, { "crossentropy": 2.5085755586624146, "epoch": 0.5739688409146524, "grad_norm": 0.03163999691605568, "grad_norm_var": 8.044418225204199e-07, "learning_rate": 0.0008102111588763689, "loss": 2.5086, "step": 10555 }, { "crossentropy": 2.5337090492248535, "epoch": 0.5740232197721525, "grad_norm": 0.03183053061366081, "grad_norm_var": 4.616179762434071e-07, "learning_rate": 0.0008095317496385856, "loss": 2.5337, "step": 10556 }, { "crossentropy": 2.5620501041412354, "epoch": 0.5740775986296528, "grad_norm": 0.032223623245954514, "grad_norm_var": 4.563016005127886e-07, "learning_rate": 0.0008088526002897362, "loss": 2.5621, "step": 10557 }, { "crossentropy": 2.4426063299179077, "epoch": 0.574131977487153, "grad_norm": 0.03088723123073578, "grad_norm_var": 4.6188190948961463e-07, "learning_rate": 0.0008081737108719411, "loss": 2.4426, "step": 10558 }, { "crossentropy": 2.5245872735977173, "epoch": 0.5741863563446532, "grad_norm": 0.03258794546127319, "grad_norm_var": 4.956554445131271e-07, "learning_rate": 0.0008074950814273047, "loss": 2.5246, "step": 10559 }, { "crossentropy": 2.5009292364120483, "epoch": 0.5742407352021534, "grad_norm": 0.031199192628264427, "grad_norm_var": 4.918368173169203e-07, "learning_rate": 0.0008068167119979153, "loss": 2.5009, "step": 10560 }, { "crossentropy": 2.5681577920913696, "epoch": 0.5742951140596536, "grad_norm": 0.033254995942115784, "grad_norm_var": 6.350705579860037e-07, "learning_rate": 0.0008061386026258433, "loss": 2.5682, "step": 10561 }, { "crossentropy": 2.5156538486480713, "epoch": 0.5743494929171538, "grad_norm": 0.03206523135304451, "grad_norm_var": 5.599804958478574e-07, "learning_rate": 0.0008054607533531444, "loss": 2.5157, "step": 10562 }, { "crossentropy": 2.5617311000823975, "epoch": 0.574403871774654, "grad_norm": 0.030820880085229874, "grad_norm_var": 5.235755635069987e-07, "learning_rate": 0.0008047831642218611, "loss": 2.5617, "step": 10563 }, { "crossentropy": 2.495366334915161, "epoch": 0.5744582506321542, "grad_norm": 0.031606193631887436, "grad_norm_var": 5.026046570437889e-07, "learning_rate": 0.0008041058352740133, "loss": 2.4954, "step": 10564 }, { "crossentropy": 2.5654174089431763, "epoch": 0.5745126294896544, "grad_norm": 0.03533296287059784, "grad_norm_var": 1.3683608686645514e-06, "learning_rate": 0.0008034287665516099, "loss": 2.5654, "step": 10565 }, { "crossentropy": 2.494787096977234, "epoch": 0.5745670083471546, "grad_norm": 0.031609609723091125, "grad_norm_var": 1.2990714600836598e-06, "learning_rate": 0.0008027519580966419, "loss": 2.4948, "step": 10566 }, { "crossentropy": 2.589513421058655, "epoch": 0.5746213872046548, "grad_norm": 0.03522513061761856, "grad_norm_var": 1.9939909463783284e-06, "learning_rate": 0.0008020754099510847, "loss": 2.5895, "step": 10567 }, { "crossentropy": 2.397457718849182, "epoch": 0.574675766062155, "grad_norm": 0.03530055657029152, "grad_norm_var": 2.508750252137639e-06, "learning_rate": 0.0008013991221568967, "loss": 2.3975, "step": 10568 }, { "crossentropy": 2.536463737487793, "epoch": 0.5747301449196552, "grad_norm": 0.03278457000851631, "grad_norm_var": 2.3311606041614136e-06, "learning_rate": 0.0008007230947560207, "loss": 2.5365, "step": 10569 }, { "crossentropy": 2.6354905366897583, "epoch": 0.5747845237771554, "grad_norm": 0.032554253935813904, "grad_norm_var": 2.270735240944667e-06, "learning_rate": 0.0008000473277903836, "loss": 2.6355, "step": 10570 }, { "crossentropy": 2.5194510221481323, "epoch": 0.5748389026346556, "grad_norm": 0.030922114849090576, "grad_norm_var": 2.3907834864400942e-06, "learning_rate": 0.0007993718213018952, "loss": 2.5195, "step": 10571 }, { "crossentropy": 2.6483263969421387, "epoch": 0.5748932814921558, "grad_norm": 0.03260715678334236, "grad_norm_var": 2.3578297063207016e-06, "learning_rate": 0.0007986965753324516, "loss": 2.6483, "step": 10572 }, { "crossentropy": 2.4437220096588135, "epoch": 0.574947660349656, "grad_norm": 0.030648773536086082, "grad_norm_var": 2.583755674847622e-06, "learning_rate": 0.0007980215899239274, "loss": 2.4437, "step": 10573 }, { "crossentropy": 2.549086570739746, "epoch": 0.5750020392071563, "grad_norm": 0.03298534080386162, "grad_norm_var": 2.418087583784534e-06, "learning_rate": 0.0007973468651181887, "loss": 2.5491, "step": 10574 }, { "crossentropy": 2.595950484275818, "epoch": 0.5750564180646565, "grad_norm": 0.03149891644716263, "grad_norm_var": 2.4930989859092237e-06, "learning_rate": 0.0007966724009570781, "loss": 2.596, "step": 10575 }, { "crossentropy": 2.450586199760437, "epoch": 0.5751107969221567, "grad_norm": 0.031064966693520546, "grad_norm_var": 2.517970483667473e-06, "learning_rate": 0.0007959981974824265, "loss": 2.4506, "step": 10576 }, { "crossentropy": 2.474225401878357, "epoch": 0.5751651757796569, "grad_norm": 0.033434927463531494, "grad_norm_var": 2.537684632279655e-06, "learning_rate": 0.0007953242547360478, "loss": 2.4742, "step": 10577 }, { "crossentropy": 2.523801326751709, "epoch": 0.5752195546371571, "grad_norm": 0.03182311728596687, "grad_norm_var": 2.5563147836012968e-06, "learning_rate": 0.0007946505727597381, "loss": 2.5238, "step": 10578 }, { "crossentropy": 2.5886672735214233, "epoch": 0.5752739334946573, "grad_norm": 0.031003927811980247, "grad_norm_var": 2.517092945851261e-06, "learning_rate": 0.0007939771515952798, "loss": 2.5887, "step": 10579 }, { "crossentropy": 2.5097750425338745, "epoch": 0.5753283123521575, "grad_norm": 0.03102211281657219, "grad_norm_var": 2.60998139644529e-06, "learning_rate": 0.000793303991284437, "loss": 2.5098, "step": 10580 }, { "crossentropy": 2.476184368133545, "epoch": 0.5753826912096577, "grad_norm": 0.030907396227121353, "grad_norm_var": 2.155725824622825e-06, "learning_rate": 0.0007926310918689589, "loss": 2.4762, "step": 10581 }, { "crossentropy": 2.5443087816238403, "epoch": 0.5754370700671579, "grad_norm": 0.030886542052030563, "grad_norm_var": 2.2464836128492503e-06, "learning_rate": 0.0007919584533905777, "loss": 2.5443, "step": 10582 }, { "crossentropy": 2.5600147247314453, "epoch": 0.5754914489246581, "grad_norm": 0.031135326251387596, "grad_norm_var": 1.6241942070754333e-06, "learning_rate": 0.0007912860758910112, "loss": 2.56, "step": 10583 }, { "crossentropy": 2.432360529899597, "epoch": 0.5755458277821583, "grad_norm": 0.029985368251800537, "grad_norm_var": 9.879221733230988e-07, "learning_rate": 0.0007906139594119566, "loss": 2.4324, "step": 10584 }, { "crossentropy": 2.4813467264175415, "epoch": 0.5756002066396585, "grad_norm": 0.0319734662771225, "grad_norm_var": 8.986667577985548e-07, "learning_rate": 0.0007899421039951005, "loss": 2.4813, "step": 10585 }, { "crossentropy": 2.5383145809173584, "epoch": 0.5756545854971588, "grad_norm": 0.032463982701301575, "grad_norm_var": 8.868281960056415e-07, "learning_rate": 0.0007892705096821112, "loss": 2.5383, "step": 10586 }, { "crossentropy": 2.5062936544418335, "epoch": 0.5757089643546589, "grad_norm": 0.031744614243507385, "grad_norm_var": 8.632440320047522e-07, "learning_rate": 0.0007885991765146389, "loss": 2.5063, "step": 10587 }, { "crossentropy": 2.5360454320907593, "epoch": 0.5757633432121592, "grad_norm": 0.030917352065443993, "grad_norm_var": 8.089585273368295e-07, "learning_rate": 0.0007879281045343184, "loss": 2.536, "step": 10588 }, { "crossentropy": 2.4208446741104126, "epoch": 0.5758177220696593, "grad_norm": 0.03263482078909874, "grad_norm_var": 8.38411608869843e-07, "learning_rate": 0.0007872572937827704, "loss": 2.4208, "step": 10589 }, { "crossentropy": 2.5250285863876343, "epoch": 0.5758721009271596, "grad_norm": 0.032066430896520615, "grad_norm_var": 7.20550306884613e-07, "learning_rate": 0.0007865867443015978, "loss": 2.525, "step": 10590 }, { "crossentropy": 2.4763914346694946, "epoch": 0.5759264797846597, "grad_norm": 0.030897794291377068, "grad_norm_var": 7.460430022795805e-07, "learning_rate": 0.0007859164561323867, "loss": 2.4764, "step": 10591 }, { "crossentropy": 2.392060875892639, "epoch": 0.57598085864216, "grad_norm": 0.0315224789083004, "grad_norm_var": 7.327319230170914e-07, "learning_rate": 0.0007852464293167083, "loss": 2.3921, "step": 10592 }, { "crossentropy": 2.512618899345398, "epoch": 0.5760352374996601, "grad_norm": 0.03156528249382973, "grad_norm_var": 4.753933068005663e-07, "learning_rate": 0.0007845766638961172, "loss": 2.5126, "step": 10593 }, { "crossentropy": 2.5430291891098022, "epoch": 0.5760896163571604, "grad_norm": 0.030876122415065765, "grad_norm_var": 4.79201791245681e-07, "learning_rate": 0.0007839071599121522, "loss": 2.543, "step": 10594 }, { "crossentropy": 2.5081381797790527, "epoch": 0.5761439952146605, "grad_norm": 0.033048052340745926, "grad_norm_var": 6.459812380932382e-07, "learning_rate": 0.000783237917406332, "loss": 2.5081, "step": 10595 }, { "crossentropy": 2.489029288291931, "epoch": 0.5761983740721608, "grad_norm": 0.03156643733382225, "grad_norm_var": 6.314164563308757e-07, "learning_rate": 0.0007825689364201665, "loss": 2.489, "step": 10596 }, { "crossentropy": 2.520955801010132, "epoch": 0.5762527529296609, "grad_norm": 0.0321001335978508, "grad_norm_var": 6.241845156885883e-07, "learning_rate": 0.0007819002169951445, "loss": 2.521, "step": 10597 }, { "crossentropy": 2.5990960597991943, "epoch": 0.5763071317871612, "grad_norm": 0.032951436936855316, "grad_norm_var": 6.979559752108652e-07, "learning_rate": 0.0007812317591727374, "loss": 2.5991, "step": 10598 }, { "crossentropy": 2.6675418615341187, "epoch": 0.5763615106446613, "grad_norm": 0.0324246846139431, "grad_norm_var": 7.021067200643771e-07, "learning_rate": 0.0007805635629944025, "loss": 2.6675, "step": 10599 }, { "crossentropy": 2.4581139087677, "epoch": 0.5764158895021616, "grad_norm": 0.030910903587937355, "grad_norm_var": 5.321857509682591e-07, "learning_rate": 0.0007798956285015841, "loss": 2.4581, "step": 10600 }, { "crossentropy": 2.478045344352722, "epoch": 0.5764702683596618, "grad_norm": 0.030800500884652138, "grad_norm_var": 5.994922011230581e-07, "learning_rate": 0.0007792279557357029, "loss": 2.478, "step": 10601 }, { "crossentropy": 2.528579354286194, "epoch": 0.576524647217162, "grad_norm": 0.03088805079460144, "grad_norm_var": 6.111382614766454e-07, "learning_rate": 0.0007785605447381688, "loss": 2.5286, "step": 10602 }, { "crossentropy": 2.6654125452041626, "epoch": 0.5765790260746622, "grad_norm": 0.03148216754198074, "grad_norm_var": 6.132588714142957e-07, "learning_rate": 0.0007778933955503742, "loss": 2.6654, "step": 10603 }, { "crossentropy": 2.584971785545349, "epoch": 0.5766334049321624, "grad_norm": 0.0315428227186203, "grad_norm_var": 5.752928728916615e-07, "learning_rate": 0.0007772265082136948, "loss": 2.585, "step": 10604 }, { "crossentropy": 2.5616743564605713, "epoch": 0.5766877837896626, "grad_norm": 0.03250117599964142, "grad_norm_var": 5.598383269153715e-07, "learning_rate": 0.000776559882769492, "loss": 2.5617, "step": 10605 }, { "crossentropy": 2.4835076332092285, "epoch": 0.5767421626471628, "grad_norm": 0.031214306131005287, "grad_norm_var": 5.631936871417405e-07, "learning_rate": 0.0007758935192591049, "loss": 2.4835, "step": 10606 }, { "crossentropy": 2.5321316719055176, "epoch": 0.576796541504663, "grad_norm": 0.03248574212193489, "grad_norm_var": 5.629550556479941e-07, "learning_rate": 0.0007752274177238656, "loss": 2.5321, "step": 10607 }, { "crossentropy": 2.5466185808181763, "epoch": 0.5768509203621632, "grad_norm": 0.033947285264730453, "grad_norm_var": 8.592949199523788e-07, "learning_rate": 0.0007745615782050835, "loss": 2.5466, "step": 10608 }, { "crossentropy": 2.4927843809127808, "epoch": 0.5769052992196634, "grad_norm": 0.03069336898624897, "grad_norm_var": 9.450326178388087e-07, "learning_rate": 0.0007738960007440521, "loss": 2.4928, "step": 10609 }, { "crossentropy": 2.448106527328491, "epoch": 0.5769596780771636, "grad_norm": 0.03062911331653595, "grad_norm_var": 9.805768172212065e-07, "learning_rate": 0.0007732306853820493, "loss": 2.4481, "step": 10610 }, { "crossentropy": 2.5757755041122437, "epoch": 0.5770140569346638, "grad_norm": 0.0310199074447155, "grad_norm_var": 9.066920300226354e-07, "learning_rate": 0.0007725656321603413, "loss": 2.5758, "step": 10611 }, { "crossentropy": 2.5552269220352173, "epoch": 0.577068435792164, "grad_norm": 0.036045704036951065, "grad_norm_var": 2.082479395568102e-06, "learning_rate": 0.0007719008411201705, "loss": 2.5552, "step": 10612 }, { "crossentropy": 2.565617799758911, "epoch": 0.5771228146496642, "grad_norm": 0.030373012647032738, "grad_norm_var": 2.2406343587929076e-06, "learning_rate": 0.0007712363123027676, "loss": 2.5656, "step": 10613 }, { "crossentropy": 2.547174572944641, "epoch": 0.5771771935071645, "grad_norm": 0.0315588153898716, "grad_norm_var": 2.1609282870540485e-06, "learning_rate": 0.0007705720457493464, "loss": 2.5472, "step": 10614 }, { "crossentropy": 2.5582544803619385, "epoch": 0.5772315723646646, "grad_norm": 0.029990484938025475, "grad_norm_var": 2.322784415028506e-06, "learning_rate": 0.0007699080415011034, "loss": 2.5583, "step": 10615 }, { "crossentropy": 2.4471737146377563, "epoch": 0.5772859512221649, "grad_norm": 0.03174684941768646, "grad_norm_var": 2.286286249120181e-06, "learning_rate": 0.0007692442995992216, "loss": 2.4472, "step": 10616 }, { "crossentropy": 2.465412139892578, "epoch": 0.577340330079665, "grad_norm": 0.03487744927406311, "grad_norm_var": 2.845705382865417e-06, "learning_rate": 0.000768580820084861, "loss": 2.4654, "step": 10617 }, { "crossentropy": 2.4939284324645996, "epoch": 0.5773947089371653, "grad_norm": 0.03134208917617798, "grad_norm_var": 2.7950719478512034e-06, "learning_rate": 0.0007679176029991747, "loss": 2.4939, "step": 10618 }, { "crossentropy": 2.449761152267456, "epoch": 0.5774490877946654, "grad_norm": 0.031148772686719894, "grad_norm_var": 2.8235107377878876e-06, "learning_rate": 0.0007672546483832943, "loss": 2.4498, "step": 10619 }, { "crossentropy": 2.5529705286026, "epoch": 0.5775034666521657, "grad_norm": 0.03263542801141739, "grad_norm_var": 2.8395611427173282e-06, "learning_rate": 0.0007665919562783331, "loss": 2.553, "step": 10620 }, { "crossentropy": 2.51233446598053, "epoch": 0.5775578455096658, "grad_norm": 0.031363021582365036, "grad_norm_var": 2.846455010207099e-06, "learning_rate": 0.0007659295267253908, "loss": 2.5123, "step": 10621 }, { "crossentropy": 2.4571295976638794, "epoch": 0.5776122243671661, "grad_norm": 0.03126734867691994, "grad_norm_var": 2.8414846437515754e-06, "learning_rate": 0.0007652673597655541, "loss": 2.4571, "step": 10622 }, { "crossentropy": 2.6392208337783813, "epoch": 0.5776666032246662, "grad_norm": 0.0323929563164711, "grad_norm_var": 2.8353363567082025e-06, "learning_rate": 0.0007646054554398862, "loss": 2.6392, "step": 10623 }, { "crossentropy": 2.604546308517456, "epoch": 0.5777209820821665, "grad_norm": 0.0360136516392231, "grad_norm_var": 3.6553859902349923e-06, "learning_rate": 0.00076394381378944, "loss": 2.6045, "step": 10624 }, { "crossentropy": 2.478482246398926, "epoch": 0.5777753609396666, "grad_norm": 0.030507903546094894, "grad_norm_var": 3.691544116939058e-06, "learning_rate": 0.0007632824348552486, "loss": 2.4785, "step": 10625 }, { "crossentropy": 2.4506454467773438, "epoch": 0.5778297397971669, "grad_norm": 0.032039009034633636, "grad_norm_var": 3.5473531754041827e-06, "learning_rate": 0.0007626213186783309, "loss": 2.4506, "step": 10626 }, { "crossentropy": 2.557522177696228, "epoch": 0.577884118654667, "grad_norm": 0.03288069739937782, "grad_norm_var": 3.4845831471974968e-06, "learning_rate": 0.0007619604652996886, "loss": 2.5575, "step": 10627 }, { "crossentropy": 2.5281978845596313, "epoch": 0.5779384975121673, "grad_norm": 0.03168671578168869, "grad_norm_var": 2.4727290638316505e-06, "learning_rate": 0.0007612998747603073, "loss": 2.5282, "step": 10628 }, { "crossentropy": 2.5673916339874268, "epoch": 0.5779928763696675, "grad_norm": 0.03222746029496193, "grad_norm_var": 2.2880933666415457e-06, "learning_rate": 0.0007606395471011556, "loss": 2.5674, "step": 10629 }, { "crossentropy": 2.487450122833252, "epoch": 0.5780472552271677, "grad_norm": 0.03086034208536148, "grad_norm_var": 2.369443136330607e-06, "learning_rate": 0.0007599794823631884, "loss": 2.4875, "step": 10630 }, { "crossentropy": 2.5101661682128906, "epoch": 0.5781016340846679, "grad_norm": 0.031377121806144714, "grad_norm_var": 2.1067604467114287e-06, "learning_rate": 0.0007593196805873393, "loss": 2.5102, "step": 10631 }, { "crossentropy": 2.469811797142029, "epoch": 0.5781560129421681, "grad_norm": 0.03201461583375931, "grad_norm_var": 2.096922311563681e-06, "learning_rate": 0.000758660141814529, "loss": 2.4698, "step": 10632 }, { "crossentropy": 2.508271098136902, "epoch": 0.5782103917996683, "grad_norm": 0.032255273312330246, "grad_norm_var": 1.5782059642246067e-06, "learning_rate": 0.0007580008660856647, "loss": 2.5083, "step": 10633 }, { "crossentropy": 2.5012025833129883, "epoch": 0.5782647706571685, "grad_norm": 0.03240993246436119, "grad_norm_var": 1.5556908826749294e-06, "learning_rate": 0.000757341853441631, "loss": 2.5012, "step": 10634 }, { "crossentropy": 2.484726667404175, "epoch": 0.5783191495146687, "grad_norm": 0.031059646978974342, "grad_norm_var": 1.5671051602930784e-06, "learning_rate": 0.0007566831039232996, "loss": 2.4847, "step": 10635 }, { "crossentropy": 2.3895654678344727, "epoch": 0.5783735283721689, "grad_norm": 0.03236738592386246, "grad_norm_var": 1.5510999031706583e-06, "learning_rate": 0.0007560246175715269, "loss": 2.3896, "step": 10636 }, { "crossentropy": 2.4524590969085693, "epoch": 0.5784279072296691, "grad_norm": 0.030971389263868332, "grad_norm_var": 1.596307261039403e-06, "learning_rate": 0.00075536639442715, "loss": 2.4525, "step": 10637 }, { "crossentropy": 2.5159032344818115, "epoch": 0.5784822860871693, "grad_norm": 0.033838286995887756, "grad_norm_var": 1.7511669993564104e-06, "learning_rate": 0.0007547084345309923, "loss": 2.5159, "step": 10638 }, { "crossentropy": 2.5187233686447144, "epoch": 0.5785366649446695, "grad_norm": 0.031413208693265915, "grad_norm_var": 1.7835247549676983e-06, "learning_rate": 0.0007540507379238599, "loss": 2.5187, "step": 10639 }, { "crossentropy": 2.542533755302429, "epoch": 0.5785910438021697, "grad_norm": 0.0324716791510582, "grad_norm_var": 7.288731806988675e-07, "learning_rate": 0.0007533933046465419, "loss": 2.5425, "step": 10640 }, { "crossentropy": 2.4071309566497803, "epoch": 0.5786454226596699, "grad_norm": 0.03168134391307831, "grad_norm_var": 5.973167366737775e-07, "learning_rate": 0.0007527361347398132, "loss": 2.4071, "step": 10641 }, { "crossentropy": 2.5497446060180664, "epoch": 0.5786998015171702, "grad_norm": 0.0337427519261837, "grad_norm_var": 7.939301899044602e-07, "learning_rate": 0.0007520792282444284, "loss": 2.5497, "step": 10642 }, { "crossentropy": 2.605835199356079, "epoch": 0.5787541803746703, "grad_norm": 0.032318923622369766, "grad_norm_var": 7.53576091762686e-07, "learning_rate": 0.0007514225852011286, "loss": 2.6058, "step": 10643 }, { "crossentropy": 2.5847012996673584, "epoch": 0.5788085592321706, "grad_norm": 0.03351609408855438, "grad_norm_var": 8.757131892383884e-07, "learning_rate": 0.0007507662056506414, "loss": 2.5847, "step": 10644 }, { "crossentropy": 2.4944297075271606, "epoch": 0.5788629380896707, "grad_norm": 0.03150302544236183, "grad_norm_var": 9.01788938196613e-07, "learning_rate": 0.0007501100896336716, "loss": 2.4944, "step": 10645 }, { "crossentropy": 2.4565465450286865, "epoch": 0.578917316947171, "grad_norm": 0.0330645926296711, "grad_norm_var": 8.37430901381173e-07, "learning_rate": 0.0007494542371909113, "loss": 2.4565, "step": 10646 }, { "crossentropy": 2.619051933288574, "epoch": 0.5789716958046711, "grad_norm": 0.03213540464639664, "grad_norm_var": 7.850828338424546e-07, "learning_rate": 0.0007487986483630372, "loss": 2.6191, "step": 10647 }, { "crossentropy": 2.443752408027649, "epoch": 0.5790260746621714, "grad_norm": 0.03264288604259491, "grad_norm_var": 7.860374119979801e-07, "learning_rate": 0.000748143323190707, "loss": 2.4438, "step": 10648 }, { "crossentropy": 2.5195064544677734, "epoch": 0.5790804535196715, "grad_norm": 0.03285830095410347, "grad_norm_var": 8.021948100851572e-07, "learning_rate": 0.000747488261714564, "loss": 2.5195, "step": 10649 }, { "crossentropy": 2.5541352033615112, "epoch": 0.5791348323771718, "grad_norm": 0.03183414041996002, "grad_norm_var": 8.202092973644819e-07, "learning_rate": 0.0007468334639752344, "loss": 2.5541, "step": 10650 }, { "crossentropy": 2.4857993125915527, "epoch": 0.5791892112346719, "grad_norm": 0.03293853998184204, "grad_norm_var": 7.204242412988647e-07, "learning_rate": 0.0007461789300133282, "loss": 2.4858, "step": 10651 }, { "crossentropy": 2.3495582342147827, "epoch": 0.5792435900921722, "grad_norm": 0.03290287405252457, "grad_norm_var": 7.32010339350199e-07, "learning_rate": 0.0007455246598694387, "loss": 2.3496, "step": 10652 }, { "crossentropy": 2.615228056907654, "epoch": 0.5792979689496723, "grad_norm": 0.03191139176487923, "grad_norm_var": 5.969539537334715e-07, "learning_rate": 0.0007448706535841448, "loss": 2.6152, "step": 10653 }, { "crossentropy": 2.482768416404724, "epoch": 0.5793523478071726, "grad_norm": 0.03259687498211861, "grad_norm_var": 4.797588921761724e-07, "learning_rate": 0.0007442169111980047, "loss": 2.4828, "step": 10654 }, { "crossentropy": 2.550092935562134, "epoch": 0.5794067266646727, "grad_norm": 0.030760176479816437, "grad_norm_var": 5.984933987114466e-07, "learning_rate": 0.0007435634327515644, "loss": 2.5501, "step": 10655 }, { "crossentropy": 2.5019359588623047, "epoch": 0.579461105522173, "grad_norm": 0.03139973059296608, "grad_norm_var": 6.643445195103625e-07, "learning_rate": 0.0007429102182853514, "loss": 2.5019, "step": 10656 }, { "crossentropy": 2.5776034593582153, "epoch": 0.5795154843796732, "grad_norm": 0.03220890462398529, "grad_norm_var": 6.337950456042352e-07, "learning_rate": 0.0007422572678398775, "loss": 2.5776, "step": 10657 }, { "crossentropy": 2.607934355735779, "epoch": 0.5795698632371734, "grad_norm": 0.03202658146619797, "grad_norm_var": 5.096853146095587e-07, "learning_rate": 0.0007416045814556388, "loss": 2.6079, "step": 10658 }, { "crossentropy": 2.4650187492370605, "epoch": 0.5796242420946736, "grad_norm": 0.03094366192817688, "grad_norm_var": 6.2234361859273e-07, "learning_rate": 0.0007409521591731138, "loss": 2.465, "step": 10659 }, { "crossentropy": 2.518948197364807, "epoch": 0.5796786209521738, "grad_norm": 0.03056463412940502, "grad_norm_var": 6.499305304745549e-07, "learning_rate": 0.0007403000010327649, "loss": 2.5189, "step": 10660 }, { "crossentropy": 2.537497878074646, "epoch": 0.579732999809674, "grad_norm": 0.03251846507191658, "grad_norm_var": 6.446204933943743e-07, "learning_rate": 0.0007396481070750399, "loss": 2.5375, "step": 10661 }, { "crossentropy": 2.598644256591797, "epoch": 0.5797873786671742, "grad_norm": 0.033543992787599564, "grad_norm_var": 7.218112036763789e-07, "learning_rate": 0.0007389964773403652, "loss": 2.5986, "step": 10662 }, { "crossentropy": 2.600491523742676, "epoch": 0.5798417575246744, "grad_norm": 0.031752560287714005, "grad_norm_var": 7.297597489318152e-07, "learning_rate": 0.0007383451118691576, "loss": 2.6005, "step": 10663 }, { "crossentropy": 2.486173391342163, "epoch": 0.5798961363821746, "grad_norm": 0.03211084380745888, "grad_norm_var": 7.080695204672206e-07, "learning_rate": 0.0007376940107018143, "loss": 2.4862, "step": 10664 }, { "crossentropy": 2.488582134246826, "epoch": 0.5799505152396748, "grad_norm": 0.03079172596335411, "grad_norm_var": 7.535026635305571e-07, "learning_rate": 0.0007370431738787131, "loss": 2.4886, "step": 10665 }, { "crossentropy": 2.6309733390808105, "epoch": 0.580004894097175, "grad_norm": 0.03198612481355667, "grad_norm_var": 7.530986780333032e-07, "learning_rate": 0.0007363926014402189, "loss": 2.631, "step": 10666 }, { "crossentropy": 2.563888192176819, "epoch": 0.5800592729546752, "grad_norm": 0.03204864636063576, "grad_norm_var": 6.834989498305608e-07, "learning_rate": 0.0007357422934266833, "loss": 2.5639, "step": 10667 }, { "crossentropy": 2.5546764135360718, "epoch": 0.5801136518121754, "grad_norm": 0.03521539270877838, "grad_norm_var": 1.3333684473687863e-06, "learning_rate": 0.0007350922498784335, "loss": 2.5547, "step": 10668 }, { "crossentropy": 2.570392608642578, "epoch": 0.5801680306696756, "grad_norm": 0.03419452905654907, "grad_norm_var": 1.6249648471439026e-06, "learning_rate": 0.0007344424708357867, "loss": 2.5704, "step": 10669 }, { "crossentropy": 2.361067533493042, "epoch": 0.5802224095271759, "grad_norm": 0.030965246260166168, "grad_norm_var": 1.6977091204922137e-06, "learning_rate": 0.0007337929563390405, "loss": 2.3611, "step": 10670 }, { "crossentropy": 2.590111494064331, "epoch": 0.580276788384676, "grad_norm": 0.03422771394252777, "grad_norm_var": 1.8461815127877953e-06, "learning_rate": 0.0007331437064284779, "loss": 2.5901, "step": 10671 }, { "crossentropy": 2.453604221343994, "epoch": 0.5803311672421763, "grad_norm": 0.03258471563458443, "grad_norm_var": 1.7946773574872005e-06, "learning_rate": 0.0007324947211443661, "loss": 2.4536, "step": 10672 }, { "crossentropy": 2.4873803853988647, "epoch": 0.5803855460996764, "grad_norm": 0.03145890682935715, "grad_norm_var": 1.8444662630736578e-06, "learning_rate": 0.0007318460005269506, "loss": 2.4874, "step": 10673 }, { "crossentropy": 2.5558594465255737, "epoch": 0.5804399249571767, "grad_norm": 0.03131859749555588, "grad_norm_var": 1.9023930240375283e-06, "learning_rate": 0.0007311975446164682, "loss": 2.5559, "step": 10674 }, { "crossentropy": 2.483452320098877, "epoch": 0.5804943038146768, "grad_norm": 0.03263165429234505, "grad_norm_var": 1.783287952323538e-06, "learning_rate": 0.0007305493534531354, "loss": 2.4835, "step": 10675 }, { "crossentropy": 2.6158872842788696, "epoch": 0.5805486826721771, "grad_norm": 0.03056051954627037, "grad_norm_var": 1.7842792398437348e-06, "learning_rate": 0.0007299014270771498, "loss": 2.6159, "step": 10676 }, { "crossentropy": 2.535353899002075, "epoch": 0.5806030615296772, "grad_norm": 0.032324910163879395, "grad_norm_var": 1.7827725044328168e-06, "learning_rate": 0.0007292537655286957, "loss": 2.5354, "step": 10677 }, { "crossentropy": 2.593615174293518, "epoch": 0.5806574403871775, "grad_norm": 0.03299345076084137, "grad_norm_var": 1.714602833331226e-06, "learning_rate": 0.0007286063688479444, "loss": 2.5936, "step": 10678 }, { "crossentropy": 2.5156408548355103, "epoch": 0.5807118192446776, "grad_norm": 0.03303759545087814, "grad_norm_var": 1.7200983976145557e-06, "learning_rate": 0.000727959237075042, "loss": 2.5156, "step": 10679 }, { "crossentropy": 2.5579060316085815, "epoch": 0.5807661981021779, "grad_norm": 0.03134821727871895, "grad_norm_var": 1.7861721766080733e-06, "learning_rate": 0.000727312370250125, "loss": 2.5579, "step": 10680 }, { "crossentropy": 2.5061373710632324, "epoch": 0.580820576959678, "grad_norm": 0.03306008130311966, "grad_norm_var": 1.6348035753736717e-06, "learning_rate": 0.0007266657684133115, "loss": 2.5061, "step": 10681 }, { "crossentropy": 2.503692626953125, "epoch": 0.5808749558171783, "grad_norm": 0.03160696476697922, "grad_norm_var": 1.6696294421774648e-06, "learning_rate": 0.0007260194316047031, "loss": 2.5037, "step": 10682 }, { "crossentropy": 2.458352565765381, "epoch": 0.5809293346746784, "grad_norm": 0.03206504136323929, "grad_norm_var": 1.668717355725787e-06, "learning_rate": 0.0007253733598643858, "loss": 2.4584, "step": 10683 }, { "crossentropy": 2.5258963108062744, "epoch": 0.5809837135321787, "grad_norm": 0.03160844370722771, "grad_norm_var": 1.1637255898347757e-06, "learning_rate": 0.0007247275532324255, "loss": 2.5259, "step": 10684 }, { "crossentropy": 2.5684491395950317, "epoch": 0.5810380923896789, "grad_norm": 0.0326378308236599, "grad_norm_var": 9.114024448954316e-07, "learning_rate": 0.0007240820117488778, "loss": 2.5684, "step": 10685 }, { "crossentropy": 2.5427119731903076, "epoch": 0.5810924712471791, "grad_norm": 0.031551793217659, "grad_norm_var": 8.401035208216445e-07, "learning_rate": 0.000723436735453778, "loss": 2.5427, "step": 10686 }, { "crossentropy": 2.5086649656295776, "epoch": 0.5811468501046793, "grad_norm": 0.031151803210377693, "grad_norm_var": 5.951160482968772e-07, "learning_rate": 0.000722791724387144, "loss": 2.5087, "step": 10687 }, { "crossentropy": 2.402792453765869, "epoch": 0.5812012289621795, "grad_norm": 0.030276181176304817, "grad_norm_var": 7.470769474581436e-07, "learning_rate": 0.0007221469785889784, "loss": 2.4028, "step": 10688 }, { "crossentropy": 2.4021071195602417, "epoch": 0.5812556078196797, "grad_norm": 0.03172985091805458, "grad_norm_var": 7.374642994874166e-07, "learning_rate": 0.0007215024980992707, "loss": 2.4021, "step": 10689 }, { "crossentropy": 2.5034520626068115, "epoch": 0.5813099866771799, "grad_norm": 0.033657465130090714, "grad_norm_var": 9.077364304381496e-07, "learning_rate": 0.0007208582829579885, "loss": 2.5035, "step": 10690 }, { "crossentropy": 2.5148074626922607, "epoch": 0.5813643655346801, "grad_norm": 0.032082706689834595, "grad_norm_var": 8.814438597232535e-07, "learning_rate": 0.0007202143332050864, "loss": 2.5148, "step": 10691 }, { "crossentropy": 2.5491751432418823, "epoch": 0.5814187443921803, "grad_norm": 0.03199128806591034, "grad_norm_var": 7.384411827397741e-07, "learning_rate": 0.000719570648880501, "loss": 2.5492, "step": 10692 }, { "crossentropy": 2.605390191078186, "epoch": 0.5814731232496805, "grad_norm": 0.03155624866485596, "grad_norm_var": 7.492666439191017e-07, "learning_rate": 0.0007189272300241534, "loss": 2.6054, "step": 10693 }, { "crossentropy": 2.5260363817214966, "epoch": 0.5815275021071807, "grad_norm": 0.03037681058049202, "grad_norm_var": 8.383316615442314e-07, "learning_rate": 0.000718284076675948, "loss": 2.526, "step": 10694 }, { "crossentropy": 2.5739728212356567, "epoch": 0.5815818809646809, "grad_norm": 0.03334876149892807, "grad_norm_var": 8.932964218180106e-07, "learning_rate": 0.0007176411888757722, "loss": 2.574, "step": 10695 }, { "crossentropy": 2.391914129257202, "epoch": 0.5816362598221811, "grad_norm": 0.03172225505113602, "grad_norm_var": 8.756146319422823e-07, "learning_rate": 0.0007169985666634971, "loss": 2.3919, "step": 10696 }, { "crossentropy": 2.4678237438201904, "epoch": 0.5816906386796813, "grad_norm": 0.030732188373804092, "grad_norm_var": 8.546911866931401e-07, "learning_rate": 0.0007163562100789794, "loss": 2.4678, "step": 10697 }, { "crossentropy": 2.5026187896728516, "epoch": 0.5817450175371816, "grad_norm": 0.031928155571222305, "grad_norm_var": 8.547573896448834e-07, "learning_rate": 0.0007157141191620548, "loss": 2.5026, "step": 10698 }, { "crossentropy": 2.4971089363098145, "epoch": 0.5817993963946817, "grad_norm": 0.03087017871439457, "grad_norm_var": 8.979480111240365e-07, "learning_rate": 0.000715072293952545, "loss": 2.4971, "step": 10699 }, { "crossentropy": 2.5168248414993286, "epoch": 0.581853775252182, "grad_norm": 0.030898243188858032, "grad_norm_var": 9.382718114653154e-07, "learning_rate": 0.0007144307344902589, "loss": 2.5168, "step": 10700 }, { "crossentropy": 2.63741934299469, "epoch": 0.5819081541096821, "grad_norm": 0.03212227299809456, "grad_norm_var": 8.874599418544898e-07, "learning_rate": 0.000713789440814982, "loss": 2.6374, "step": 10701 }, { "crossentropy": 2.5071667432785034, "epoch": 0.5819625329671824, "grad_norm": 0.031533870846033096, "grad_norm_var": 8.876543890355317e-07, "learning_rate": 0.0007131484129664878, "loss": 2.5072, "step": 10702 }, { "crossentropy": 2.532489061355591, "epoch": 0.5820169118246825, "grad_norm": 0.032545965164899826, "grad_norm_var": 9.214254685367569e-07, "learning_rate": 0.0007125076509845329, "loss": 2.5325, "step": 10703 }, { "crossentropy": 2.5593055486679077, "epoch": 0.5820712906821828, "grad_norm": 0.031593214720487595, "grad_norm_var": 7.779149936654965e-07, "learning_rate": 0.0007118671549088562, "loss": 2.5593, "step": 10704 }, { "crossentropy": 2.594022750854492, "epoch": 0.5821256695396829, "grad_norm": 0.03235020861029625, "grad_norm_var": 7.967367556631904e-07, "learning_rate": 0.0007112269247791802, "loss": 2.594, "step": 10705 }, { "crossentropy": 2.7021923065185547, "epoch": 0.5821800483971832, "grad_norm": 0.03356058523058891, "grad_norm_var": 7.737414969455203e-07, "learning_rate": 0.0007105869606352122, "loss": 2.7022, "step": 10706 }, { "crossentropy": 2.5370222330093384, "epoch": 0.5822344272546833, "grad_norm": 0.03233478218317032, "grad_norm_var": 7.863472026524518e-07, "learning_rate": 0.0007099472625166419, "loss": 2.537, "step": 10707 }, { "crossentropy": 2.552727460861206, "epoch": 0.5822888061121836, "grad_norm": 0.030393460765480995, "grad_norm_var": 9.140152705149525e-07, "learning_rate": 0.0007093078304631434, "loss": 2.5527, "step": 10708 }, { "crossentropy": 2.5242542028427124, "epoch": 0.5823431849696837, "grad_norm": 0.031072447076439857, "grad_norm_var": 9.40607164529603e-07, "learning_rate": 0.0007086686645143736, "loss": 2.5243, "step": 10709 }, { "crossentropy": 2.489717721939087, "epoch": 0.582397563827184, "grad_norm": 0.031253356486558914, "grad_norm_var": 8.326434655320323e-07, "learning_rate": 0.0007080297647099698, "loss": 2.4897, "step": 10710 }, { "crossentropy": 2.5932793617248535, "epoch": 0.5824519426846841, "grad_norm": 0.03146820887923241, "grad_norm_var": 6.568730255259673e-07, "learning_rate": 0.0007073911310895614, "loss": 2.5933, "step": 10711 }, { "crossentropy": 2.583553910255432, "epoch": 0.5825063215421844, "grad_norm": 0.031562335789203644, "grad_norm_var": 6.569032869239844e-07, "learning_rate": 0.0007067527636927518, "loss": 2.5836, "step": 10712 }, { "crossentropy": 2.4541512727737427, "epoch": 0.5825607003996846, "grad_norm": 0.033054888248443604, "grad_norm_var": 7.133407989945526e-07, "learning_rate": 0.0007061146625591331, "loss": 2.4542, "step": 10713 }, { "crossentropy": 2.57925546169281, "epoch": 0.5826150792571848, "grad_norm": 0.034611862152814865, "grad_norm_var": 1.2151070230159104e-06, "learning_rate": 0.0007054768277282797, "loss": 2.5793, "step": 10714 }, { "crossentropy": 2.435154438018799, "epoch": 0.582669458114685, "grad_norm": 0.033433955162763596, "grad_norm_var": 1.2562407167834276e-06, "learning_rate": 0.0007048392592397501, "loss": 2.4352, "step": 10715 }, { "crossentropy": 2.518923759460449, "epoch": 0.5827238369721852, "grad_norm": 0.031697828322649, "grad_norm_var": 1.1668145958351286e-06, "learning_rate": 0.0007042019571330849, "loss": 2.5189, "step": 10716 }, { "crossentropy": 2.5568623542785645, "epoch": 0.5827782158296854, "grad_norm": 0.03205455094575882, "grad_norm_var": 1.1674584012031156e-06, "learning_rate": 0.0007035649214478096, "loss": 2.5569, "step": 10717 }, { "crossentropy": 2.53107750415802, "epoch": 0.5828325946871856, "grad_norm": 0.03185319900512695, "grad_norm_var": 1.1472751961054406e-06, "learning_rate": 0.0007029281522234321, "loss": 2.5311, "step": 10718 }, { "crossentropy": 2.5180028676986694, "epoch": 0.5828869735446858, "grad_norm": 0.033901359885931015, "grad_norm_var": 1.3286728029445841e-06, "learning_rate": 0.0007022916494994441, "loss": 2.518, "step": 10719 }, { "crossentropy": 2.5457262992858887, "epoch": 0.582941352402186, "grad_norm": 0.03293619304895401, "grad_norm_var": 1.321594518448726e-06, "learning_rate": 0.000701655413315323, "loss": 2.5457, "step": 10720 }, { "crossentropy": 2.4468491077423096, "epoch": 0.5829957312596862, "grad_norm": 0.030501876026391983, "grad_norm_var": 1.5341277885859313e-06, "learning_rate": 0.0007010194437105233, "loss": 2.4468, "step": 10721 }, { "crossentropy": 2.572696566581726, "epoch": 0.5830501101171864, "grad_norm": 0.03332819417119026, "grad_norm_var": 1.496295411976672e-06, "learning_rate": 0.0007003837407244911, "loss": 2.5727, "step": 10722 }, { "crossentropy": 2.5819047689437866, "epoch": 0.5831044889746867, "grad_norm": 0.032636284828186035, "grad_norm_var": 1.5067457112831356e-06, "learning_rate": 0.0006997483043966518, "loss": 2.5819, "step": 10723 }, { "crossentropy": 2.5659772157669067, "epoch": 0.5831588678321868, "grad_norm": 0.03156766667962074, "grad_norm_var": 1.3046053393471566e-06, "learning_rate": 0.0006991131347664126, "loss": 2.566, "step": 10724 }, { "crossentropy": 2.5476202964782715, "epoch": 0.5832132466896871, "grad_norm": 0.031675439327955246, "grad_norm_var": 1.2279619509297328e-06, "learning_rate": 0.0006984782318731664, "loss": 2.5476, "step": 10725 }, { "crossentropy": 2.4906294345855713, "epoch": 0.5832676255471873, "grad_norm": 0.03218451514840126, "grad_norm_var": 1.146487079506144e-06, "learning_rate": 0.0006978435957562906, "loss": 2.4906, "step": 10726 }, { "crossentropy": 2.6107006072998047, "epoch": 0.5833220044046875, "grad_norm": 0.03260146081447601, "grad_norm_var": 1.0853138941205667e-06, "learning_rate": 0.0006972092264551439, "loss": 2.6107, "step": 10727 }, { "crossentropy": 2.5905520915985107, "epoch": 0.5833763832621877, "grad_norm": 0.03363708406686783, "grad_norm_var": 1.1018492082801381e-06, "learning_rate": 0.0006965751240090696, "loss": 2.5906, "step": 10728 }, { "crossentropy": 2.4863297939300537, "epoch": 0.5834307621196879, "grad_norm": 0.03189690038561821, "grad_norm_var": 1.1161605374045147e-06, "learning_rate": 0.0006959412884573934, "loss": 2.4863, "step": 10729 }, { "crossentropy": 2.5822086334228516, "epoch": 0.5834851409771881, "grad_norm": 0.031517144292593, "grad_norm_var": 8.566931891401212e-07, "learning_rate": 0.0006953077198394264, "loss": 2.5822, "step": 10730 }, { "crossentropy": 2.6407161951065063, "epoch": 0.5835395198346883, "grad_norm": 0.031875211745500565, "grad_norm_var": 7.809765150116703e-07, "learning_rate": 0.0006946744181944625, "loss": 2.6407, "step": 10731 }, { "crossentropy": 2.6370439529418945, "epoch": 0.5835938986921885, "grad_norm": 0.03248396888375282, "grad_norm_var": 7.626096423583399e-07, "learning_rate": 0.0006940413835617754, "loss": 2.637, "step": 10732 }, { "crossentropy": 2.5255234241485596, "epoch": 0.5836482775496887, "grad_norm": 0.032175447791814804, "grad_norm_var": 7.597166737426919e-07, "learning_rate": 0.000693408615980628, "loss": 2.5255, "step": 10733 }, { "crossentropy": 2.561630606651306, "epoch": 0.5837026564071889, "grad_norm": 0.0314561128616333, "grad_norm_var": 7.931344776556636e-07, "learning_rate": 0.0006927761154902639, "loss": 2.5616, "step": 10734 }, { "crossentropy": 2.471834897994995, "epoch": 0.5837570352646891, "grad_norm": 0.03090016357600689, "grad_norm_var": 7.046510599007511e-07, "learning_rate": 0.0006921438821299086, "loss": 2.4718, "step": 10735 }, { "crossentropy": 2.507754921913147, "epoch": 0.5838114141221893, "grad_norm": 0.03047989495098591, "grad_norm_var": 8.032470825355964e-07, "learning_rate": 0.0006915119159387729, "loss": 2.5078, "step": 10736 }, { "crossentropy": 2.587330460548401, "epoch": 0.5838657929796895, "grad_norm": 0.03170914202928543, "grad_norm_var": 6.640809618156547e-07, "learning_rate": 0.000690880216956053, "loss": 2.5873, "step": 10737 }, { "crossentropy": 2.487208843231201, "epoch": 0.5839201718371897, "grad_norm": 0.03247008100152016, "grad_norm_var": 5.5902912852081e-07, "learning_rate": 0.0006902487852209238, "loss": 2.4872, "step": 10738 }, { "crossentropy": 2.5114043951034546, "epoch": 0.58397455069469, "grad_norm": 0.030900994315743446, "grad_norm_var": 5.894059670323437e-07, "learning_rate": 0.0006896176207725474, "loss": 2.5114, "step": 10739 }, { "crossentropy": 2.5265438556671143, "epoch": 0.5840289295521901, "grad_norm": 0.031317874789237976, "grad_norm_var": 6.025658368231096e-07, "learning_rate": 0.0006889867236500669, "loss": 2.5265, "step": 10740 }, { "crossentropy": 2.459317207336426, "epoch": 0.5840833084096904, "grad_norm": 0.030161995440721512, "grad_norm_var": 7.769301596964913e-07, "learning_rate": 0.0006883560938926109, "loss": 2.4593, "step": 10741 }, { "crossentropy": 2.5017441511154175, "epoch": 0.5841376872671905, "grad_norm": 0.030375927686691284, "grad_norm_var": 8.730890727901789e-07, "learning_rate": 0.0006877257315392915, "loss": 2.5017, "step": 10742 }, { "crossentropy": 2.6436764001846313, "epoch": 0.5841920661246908, "grad_norm": 0.030788175761699677, "grad_norm_var": 8.418955757426381e-07, "learning_rate": 0.0006870956366291997, "loss": 2.6437, "step": 10743 }, { "crossentropy": 2.432081937789917, "epoch": 0.5842464449821909, "grad_norm": 0.03215944021940231, "grad_norm_var": 5.591127572489856e-07, "learning_rate": 0.0006864658092014164, "loss": 2.4321, "step": 10744 }, { "crossentropy": 2.5086296796798706, "epoch": 0.5843008238396912, "grad_norm": 0.03176804259419441, "grad_norm_var": 5.519015563157225e-07, "learning_rate": 0.0006858362492950033, "loss": 2.5086, "step": 10745 }, { "crossentropy": 2.503992199897766, "epoch": 0.5843552026971913, "grad_norm": 0.03076912648975849, "grad_norm_var": 5.760590579300649e-07, "learning_rate": 0.0006852069569490027, "loss": 2.504, "step": 10746 }, { "crossentropy": 2.4854246377944946, "epoch": 0.5844095815546916, "grad_norm": 0.03232907876372337, "grad_norm_var": 6.199925926407441e-07, "learning_rate": 0.0006845779322024432, "loss": 2.4854, "step": 10747 }, { "crossentropy": 2.5311925411224365, "epoch": 0.5844639604121917, "grad_norm": 0.03049337863922119, "grad_norm_var": 5.773838732641793e-07, "learning_rate": 0.0006839491750943389, "loss": 2.5312, "step": 10748 }, { "crossentropy": 2.5819798707962036, "epoch": 0.584518339269692, "grad_norm": 0.03124886192381382, "grad_norm_var": 5.186777574121461e-07, "learning_rate": 0.000683320685663682, "loss": 2.582, "step": 10749 }, { "crossentropy": 2.5450756549835205, "epoch": 0.5845727181271921, "grad_norm": 0.03262714669108391, "grad_norm_var": 6.431222388525303e-07, "learning_rate": 0.0006826924639494519, "loss": 2.5451, "step": 10750 }, { "crossentropy": 2.690354108810425, "epoch": 0.5846270969846924, "grad_norm": 0.031058816239237785, "grad_norm_var": 6.36634914685088e-07, "learning_rate": 0.0006820645099906103, "loss": 2.6904, "step": 10751 }, { "crossentropy": 2.4527931213378906, "epoch": 0.5846814758421925, "grad_norm": 0.03298007324337959, "grad_norm_var": 7.568867999139043e-07, "learning_rate": 0.0006814368238261015, "loss": 2.4528, "step": 10752 }, { "crossentropy": 2.465464472770691, "epoch": 0.5847358546996928, "grad_norm": 0.030914029106497765, "grad_norm_var": 7.686493710588275e-07, "learning_rate": 0.0006808094054948565, "loss": 2.4655, "step": 10753 }, { "crossentropy": 2.430977702140808, "epoch": 0.584790233557193, "grad_norm": 0.031640563160181046, "grad_norm_var": 6.930466358948898e-07, "learning_rate": 0.0006801822550357828, "loss": 2.431, "step": 10754 }, { "crossentropy": 2.4488853216171265, "epoch": 0.5848446124146932, "grad_norm": 0.03315145894885063, "grad_norm_var": 8.761007182106831e-07, "learning_rate": 0.0006795553724877795, "loss": 2.4489, "step": 10755 }, { "crossentropy": 2.489277482032776, "epoch": 0.5848989912721934, "grad_norm": 0.032057736068964005, "grad_norm_var": 8.936783827432256e-07, "learning_rate": 0.0006789287578897252, "loss": 2.4893, "step": 10756 }, { "crossentropy": 2.628995180130005, "epoch": 0.5849533701296936, "grad_norm": 0.03248235210776329, "grad_norm_var": 8.06099441615545e-07, "learning_rate": 0.0006783024112804792, "loss": 2.629, "step": 10757 }, { "crossentropy": 2.580460786819458, "epoch": 0.5850077489871938, "grad_norm": 0.03288603946566582, "grad_norm_var": 7.641904832572076e-07, "learning_rate": 0.0006776763326988883, "loss": 2.5805, "step": 10758 }, { "crossentropy": 2.591380476951599, "epoch": 0.585062127844694, "grad_norm": 0.032862961292266846, "grad_norm_var": 7.437431994170365e-07, "learning_rate": 0.0006770505221837831, "loss": 2.5914, "step": 10759 }, { "crossentropy": 2.642226815223694, "epoch": 0.5851165067021942, "grad_norm": 0.03301781788468361, "grad_norm_var": 8.121256490097502e-07, "learning_rate": 0.0006764249797739736, "loss": 2.6422, "step": 10760 }, { "crossentropy": 2.4633712768554688, "epoch": 0.5851708855596944, "grad_norm": 0.030017521232366562, "grad_norm_var": 1.0619791869695879e-06, "learning_rate": 0.0006757997055082554, "loss": 2.4634, "step": 10761 }, { "crossentropy": 2.5038424730300903, "epoch": 0.5852252644171946, "grad_norm": 0.031400084495544434, "grad_norm_var": 9.910029712947941e-07, "learning_rate": 0.0006751746994254082, "loss": 2.5038, "step": 10762 }, { "crossentropy": 2.494153618812561, "epoch": 0.5852796432746948, "grad_norm": 0.030884530395269394, "grad_norm_var": 1.048023771175197e-06, "learning_rate": 0.0006745499615641948, "loss": 2.4942, "step": 10763 }, { "crossentropy": 2.488824725151062, "epoch": 0.585334022132195, "grad_norm": 0.030125658959150314, "grad_norm_var": 1.1233671136210659e-06, "learning_rate": 0.0006739254919633603, "loss": 2.4888, "step": 10764 }, { "crossentropy": 2.6000837087631226, "epoch": 0.5853884009896952, "grad_norm": 0.04835496470332146, "grad_norm_var": 1.807578729906689e-05, "learning_rate": 0.0006733012906616321, "loss": 2.6001, "step": 10765 }, { "crossentropy": 2.529252290725708, "epoch": 0.5854427798471954, "grad_norm": 0.03238317742943764, "grad_norm_var": 1.8088508621617815e-05, "learning_rate": 0.0006726773576977246, "loss": 2.5293, "step": 10766 }, { "crossentropy": 2.384984850883484, "epoch": 0.5854971587046957, "grad_norm": 0.031059717759490013, "grad_norm_var": 1.808828872607715e-05, "learning_rate": 0.0006720536931103344, "loss": 2.385, "step": 10767 }, { "crossentropy": 2.5627228021621704, "epoch": 0.5855515375621958, "grad_norm": 0.031128598377108574, "grad_norm_var": 1.8279971546462227e-05, "learning_rate": 0.0006714302969381387, "loss": 2.5627, "step": 10768 }, { "crossentropy": 2.5228748321533203, "epoch": 0.5856059164196961, "grad_norm": 0.030369382351636887, "grad_norm_var": 1.8433505634575e-05, "learning_rate": 0.0006708071692197987, "loss": 2.5229, "step": 10769 }, { "crossentropy": 2.482355237007141, "epoch": 0.5856602952771962, "grad_norm": 0.03626963496208191, "grad_norm_var": 1.909486438044644e-05, "learning_rate": 0.0006701843099939642, "loss": 2.4824, "step": 10770 }, { "crossentropy": 2.5448739528656006, "epoch": 0.5857146741346965, "grad_norm": 0.031708359718322754, "grad_norm_var": 1.9201311438177674e-05, "learning_rate": 0.0006695617192992609, "loss": 2.5449, "step": 10771 }, { "crossentropy": 2.540568709373474, "epoch": 0.5857690529921966, "grad_norm": 0.03185411915183067, "grad_norm_var": 1.9227801807320382e-05, "learning_rate": 0.0006689393971743024, "loss": 2.5406, "step": 10772 }, { "crossentropy": 2.4844900369644165, "epoch": 0.5858234318496969, "grad_norm": 0.030317332595586777, "grad_norm_var": 1.9648626264448e-05, "learning_rate": 0.0006683173436576851, "loss": 2.4845, "step": 10773 }, { "crossentropy": 2.492066979408264, "epoch": 0.585877810707197, "grad_norm": 0.03201816603541374, "grad_norm_var": 1.968458746826555e-05, "learning_rate": 0.0006676955587879874, "loss": 2.4921, "step": 10774 }, { "crossentropy": 2.4445953369140625, "epoch": 0.5859321895646973, "grad_norm": 0.030954400077462196, "grad_norm_var": 1.9879878700770543e-05, "learning_rate": 0.0006670740426037724, "loss": 2.4446, "step": 10775 }, { "crossentropy": 2.4913206100463867, "epoch": 0.5859865684221974, "grad_norm": 0.031164273619651794, "grad_norm_var": 1.9995415705740617e-05, "learning_rate": 0.0006664527951435856, "loss": 2.4913, "step": 10776 }, { "crossentropy": 2.562492251396179, "epoch": 0.5860409472796977, "grad_norm": 0.03061101585626602, "grad_norm_var": 1.982093633575038e-05, "learning_rate": 0.0006658318164459565, "loss": 2.5625, "step": 10777 }, { "crossentropy": 2.546806812286377, "epoch": 0.5860953261371978, "grad_norm": 0.031264252960681915, "grad_norm_var": 1.9842692926672632e-05, "learning_rate": 0.0006652111065493982, "loss": 2.5468, "step": 10778 }, { "crossentropy": 2.620829224586487, "epoch": 0.5861497049946981, "grad_norm": 0.0325145348906517, "grad_norm_var": 1.9651302350435343e-05, "learning_rate": 0.0006645906654924044, "loss": 2.6208, "step": 10779 }, { "crossentropy": 2.5761220455169678, "epoch": 0.5862040838521982, "grad_norm": 0.03143451362848282, "grad_norm_var": 1.9321136822275406e-05, "learning_rate": 0.0006639704933134549, "loss": 2.5761, "step": 10780 }, { "crossentropy": 2.48614764213562, "epoch": 0.5862584627096985, "grad_norm": 0.03358609229326248, "grad_norm_var": 2.151529057419508e-06, "learning_rate": 0.000663350590051015, "loss": 2.4861, "step": 10781 }, { "crossentropy": 2.566390037536621, "epoch": 0.5863128415671987, "grad_norm": 0.03154122084379196, "grad_norm_var": 2.129227081445551e-06, "learning_rate": 0.0006627309557435273, "loss": 2.5664, "step": 10782 }, { "crossentropy": 2.5730968713760376, "epoch": 0.5863672204246989, "grad_norm": 0.03227054327726364, "grad_norm_var": 2.111479006981776e-06, "learning_rate": 0.0006621115904294229, "loss": 2.5731, "step": 10783 }, { "crossentropy": 2.475194215774536, "epoch": 0.5864215992821991, "grad_norm": 0.031121229752898216, "grad_norm_var": 2.1121547178835907e-06, "learning_rate": 0.0006614924941471123, "loss": 2.4752, "step": 10784 }, { "crossentropy": 2.498706340789795, "epoch": 0.5864759781396993, "grad_norm": 0.033448826521635056, "grad_norm_var": 2.1123311927682963e-06, "learning_rate": 0.000660873666934993, "loss": 2.4987, "step": 10785 }, { "crossentropy": 2.583027482032776, "epoch": 0.5865303569971995, "grad_norm": 0.031742215156555176, "grad_norm_var": 8.189984695448904e-07, "learning_rate": 0.0006602551088314434, "loss": 2.583, "step": 10786 }, { "crossentropy": 2.5151772499084473, "epoch": 0.5865847358546997, "grad_norm": 0.030897002667188644, "grad_norm_var": 8.616117449117624e-07, "learning_rate": 0.0006596368198748259, "loss": 2.5152, "step": 10787 }, { "crossentropy": 2.5939542055130005, "epoch": 0.5866391147121999, "grad_norm": 0.031943272799253464, "grad_norm_var": 8.642825054647544e-07, "learning_rate": 0.0006590188001034863, "loss": 2.594, "step": 10788 }, { "crossentropy": 2.541326880455017, "epoch": 0.5866934935697001, "grad_norm": 0.03171389549970627, "grad_norm_var": 7.330364371315364e-07, "learning_rate": 0.0006584010495557541, "loss": 2.5413, "step": 10789 }, { "crossentropy": 2.534942150115967, "epoch": 0.5867478724272003, "grad_norm": 0.03113563545048237, "grad_norm_var": 7.518180022179072e-07, "learning_rate": 0.0006577835682699417, "loss": 2.5349, "step": 10790 }, { "crossentropy": 2.540582060813904, "epoch": 0.5868022512847005, "grad_norm": 0.030318407341837883, "grad_norm_var": 8.410820713713642e-07, "learning_rate": 0.0006571663562843421, "loss": 2.5406, "step": 10791 }, { "crossentropy": 2.5752058029174805, "epoch": 0.5868566301422007, "grad_norm": 0.03134705871343613, "grad_norm_var": 8.308648927077134e-07, "learning_rate": 0.0006565494136372385, "loss": 2.5752, "step": 10792 }, { "crossentropy": 2.5091660022735596, "epoch": 0.586911008999701, "grad_norm": 0.032206591218709946, "grad_norm_var": 7.624326767042734e-07, "learning_rate": 0.0006559327403668897, "loss": 2.5092, "step": 10793 }, { "crossentropy": 2.44450843334198, "epoch": 0.5869653878572011, "grad_norm": 0.030785419046878815, "grad_norm_var": 8.097115402622233e-07, "learning_rate": 0.0006553163365115428, "loss": 2.4445, "step": 10794 }, { "crossentropy": 2.6356427669525146, "epoch": 0.5870197667147014, "grad_norm": 0.03208157420158386, "grad_norm_var": 7.773156392221058e-07, "learning_rate": 0.0006547002021094256, "loss": 2.6356, "step": 10795 }, { "crossentropy": 2.605223059654236, "epoch": 0.5870741455722015, "grad_norm": 0.03170403838157654, "grad_norm_var": 7.714762850280586e-07, "learning_rate": 0.0006540843371987509, "loss": 2.6052, "step": 10796 }, { "crossentropy": 2.5508817434310913, "epoch": 0.5871285244297018, "grad_norm": 0.03391311690211296, "grad_norm_var": 8.586477960526578e-07, "learning_rate": 0.0006534687418177143, "loss": 2.5509, "step": 10797 }, { "crossentropy": 2.508892297744751, "epoch": 0.5871829032872019, "grad_norm": 0.032382331788539886, "grad_norm_var": 8.782584225739404e-07, "learning_rate": 0.0006528534160044946, "loss": 2.5089, "step": 10798 }, { "crossentropy": 2.5847870111465454, "epoch": 0.5872372821447022, "grad_norm": 0.032262422144412994, "grad_norm_var": 8.777673224284367e-07, "learning_rate": 0.0006522383597972514, "loss": 2.5848, "step": 10799 }, { "crossentropy": 2.490127921104431, "epoch": 0.5872916610022023, "grad_norm": 0.03178434818983078, "grad_norm_var": 8.441142094999775e-07, "learning_rate": 0.0006516235732341324, "loss": 2.4901, "step": 10800 }, { "crossentropy": 2.5074304342269897, "epoch": 0.5873460398597026, "grad_norm": 0.03348792344331741, "grad_norm_var": 8.525227503129729e-07, "learning_rate": 0.0006510090563532667, "loss": 2.5074, "step": 10801 }, { "crossentropy": 2.4814743995666504, "epoch": 0.5874004187172027, "grad_norm": 0.03152487054467201, "grad_norm_var": 8.587893298327963e-07, "learning_rate": 0.0006503948091927642, "loss": 2.4815, "step": 10802 }, { "crossentropy": 2.5459131002426147, "epoch": 0.587454797574703, "grad_norm": 0.032834362238645554, "grad_norm_var": 8.49011009937337e-07, "learning_rate": 0.0006497808317907189, "loss": 2.5459, "step": 10803 }, { "crossentropy": 2.377631664276123, "epoch": 0.5875091764322031, "grad_norm": 0.03078540973365307, "grad_norm_var": 9.360135750510149e-07, "learning_rate": 0.0006491671241852127, "loss": 2.3776, "step": 10804 }, { "crossentropy": 2.5224850177764893, "epoch": 0.5875635552897034, "grad_norm": 0.03347060829401016, "grad_norm_var": 1.0872412077154396e-06, "learning_rate": 0.0006485536864143044, "loss": 2.5225, "step": 10805 }, { "crossentropy": 2.591214656829834, "epoch": 0.5876179341472035, "grad_norm": 0.03173644468188286, "grad_norm_var": 1.0404387634381506e-06, "learning_rate": 0.0006479405185160392, "loss": 2.5912, "step": 10806 }, { "crossentropy": 2.454797625541687, "epoch": 0.5876723130047038, "grad_norm": 0.03045402653515339, "grad_norm_var": 1.0104745328387912e-06, "learning_rate": 0.0006473276205284462, "loss": 2.4548, "step": 10807 }, { "crossentropy": 2.610575795173645, "epoch": 0.5877266918622039, "grad_norm": 0.03144710510969162, "grad_norm_var": 1.0017561074654804e-06, "learning_rate": 0.000646714992489536, "loss": 2.6106, "step": 10808 }, { "crossentropy": 2.566322684288025, "epoch": 0.5877810707197042, "grad_norm": 0.03348291665315628, "grad_norm_var": 1.1295727374870112e-06, "learning_rate": 0.0006461026344373044, "loss": 2.5663, "step": 10809 }, { "crossentropy": 2.5796003341674805, "epoch": 0.5878354495772043, "grad_norm": 0.03227461129426956, "grad_norm_var": 1.000493632832139e-06, "learning_rate": 0.000645490546409726, "loss": 2.5796, "step": 10810 }, { "crossentropy": 2.502760887145996, "epoch": 0.5878898284347046, "grad_norm": 0.031289730221033096, "grad_norm_var": 1.0549972620657946e-06, "learning_rate": 0.0006448787284447654, "loss": 2.5028, "step": 10811 }, { "crossentropy": 2.557142496109009, "epoch": 0.5879442072922048, "grad_norm": 0.03453498333692551, "grad_norm_var": 1.3773106101061875e-06, "learning_rate": 0.0006442671805803669, "loss": 2.5571, "step": 10812 }, { "crossentropy": 2.486691474914551, "epoch": 0.587998586149705, "grad_norm": 0.036508411169052124, "grad_norm_var": 2.337772058610271e-06, "learning_rate": 0.0006436559028544558, "loss": 2.4867, "step": 10813 }, { "crossentropy": 2.57336688041687, "epoch": 0.5880529650072052, "grad_norm": 0.03147687390446663, "grad_norm_var": 2.405184385723992e-06, "learning_rate": 0.0006430448953049434, "loss": 2.5734, "step": 10814 }, { "crossentropy": 2.5290948152542114, "epoch": 0.5881073438647054, "grad_norm": 0.03138788044452667, "grad_norm_var": 2.475988418160618e-06, "learning_rate": 0.0006424341579697268, "loss": 2.5291, "step": 10815 }, { "crossentropy": 2.5849623680114746, "epoch": 0.5881617227222056, "grad_norm": 0.03133070096373558, "grad_norm_var": 2.5263934977283e-06, "learning_rate": 0.0006418236908866798, "loss": 2.585, "step": 10816 }, { "crossentropy": 2.551080822944641, "epoch": 0.5882161015797058, "grad_norm": 0.03117694891989231, "grad_norm_var": 2.517773416409634e-06, "learning_rate": 0.0006412134940936643, "loss": 2.5511, "step": 10817 }, { "crossentropy": 2.603623628616333, "epoch": 0.588270480437206, "grad_norm": 0.031329888850450516, "grad_norm_var": 2.538539483605305e-06, "learning_rate": 0.0006406035676285244, "loss": 2.6036, "step": 10818 }, { "crossentropy": 2.5434387922286987, "epoch": 0.5883248592947062, "grad_norm": 0.031546276062726974, "grad_norm_var": 2.536733510055432e-06, "learning_rate": 0.0006399939115290871, "loss": 2.5434, "step": 10819 }, { "crossentropy": 2.5198118686676025, "epoch": 0.5883792381522064, "grad_norm": 0.03066120482981205, "grad_norm_var": 2.5601231536396487e-06, "learning_rate": 0.0006393845258331631, "loss": 2.5198, "step": 10820 }, { "crossentropy": 2.571366310119629, "epoch": 0.5884336170097066, "grad_norm": 0.03234884515404701, "grad_norm_var": 2.438524983853921e-06, "learning_rate": 0.0006387754105785437, "loss": 2.5714, "step": 10821 }, { "crossentropy": 2.4447038173675537, "epoch": 0.5884879958672068, "grad_norm": 0.030936602503061295, "grad_norm_var": 2.5131939142698644e-06, "learning_rate": 0.0006381665658030084, "loss": 2.4447, "step": 10822 }, { "crossentropy": 2.507004141807556, "epoch": 0.588542374724707, "grad_norm": 0.03213738277554512, "grad_norm_var": 2.3406862378676114e-06, "learning_rate": 0.0006375579915443174, "loss": 2.507, "step": 10823 }, { "crossentropy": 2.4059832096099854, "epoch": 0.5885967535822072, "grad_norm": 0.0310217272490263, "grad_norm_var": 2.3899840381213225e-06, "learning_rate": 0.0006369496878402115, "loss": 2.406, "step": 10824 }, { "crossentropy": 2.437023162841797, "epoch": 0.5886511324397075, "grad_norm": 0.03143903985619545, "grad_norm_var": 2.271565118361684e-06, "learning_rate": 0.0006363416547284168, "loss": 2.437, "step": 10825 }, { "crossentropy": 2.4597781896591187, "epoch": 0.5887055112972076, "grad_norm": 0.03213340416550636, "grad_norm_var": 2.2669363254901176e-06, "learning_rate": 0.0006357338922466471, "loss": 2.4598, "step": 10826 }, { "crossentropy": 2.4901278018951416, "epoch": 0.5887598901547079, "grad_norm": 0.03163520619273186, "grad_norm_var": 2.2438091647079453e-06, "learning_rate": 0.0006351264004325912, "loss": 2.4901, "step": 10827 }, { "crossentropy": 2.575141429901123, "epoch": 0.588814269012208, "grad_norm": 0.03209326043725014, "grad_norm_var": 1.7831082399630359e-06, "learning_rate": 0.0006345191793239269, "loss": 2.5751, "step": 10828 }, { "crossentropy": 2.4383403062820435, "epoch": 0.5888686478697083, "grad_norm": 0.03340113162994385, "grad_norm_var": 4.452605234707968e-07, "learning_rate": 0.0006339122289583126, "loss": 2.4383, "step": 10829 }, { "crossentropy": 2.6038806438446045, "epoch": 0.5889230267272084, "grad_norm": 0.031755346804857254, "grad_norm_var": 4.444765184913411e-07, "learning_rate": 0.000633305549373392, "loss": 2.6039, "step": 10830 }, { "crossentropy": 2.440659284591675, "epoch": 0.5889774055847087, "grad_norm": 0.031466592103242874, "grad_norm_var": 4.4215555942959427e-07, "learning_rate": 0.00063269914060679, "loss": 2.4407, "step": 10831 }, { "crossentropy": 2.5311927795410156, "epoch": 0.5890317844422088, "grad_norm": 0.031010864302515984, "grad_norm_var": 4.6220163677727216e-07, "learning_rate": 0.0006320930026961158, "loss": 2.5312, "step": 10832 }, { "crossentropy": 2.5132557153701782, "epoch": 0.5890861632997091, "grad_norm": 0.03211553022265434, "grad_norm_var": 4.604560370854386e-07, "learning_rate": 0.0006314871356789609, "loss": 2.5133, "step": 10833 }, { "crossentropy": 2.523421883583069, "epoch": 0.5891405421572092, "grad_norm": 0.032613225281238556, "grad_norm_var": 5.018537079454992e-07, "learning_rate": 0.0006308815395929024, "loss": 2.5234, "step": 10834 }, { "crossentropy": 2.4635597467422485, "epoch": 0.5891949210147095, "grad_norm": 0.031084289774298668, "grad_norm_var": 5.289573644798665e-07, "learning_rate": 0.0006302762144754964, "loss": 2.4636, "step": 10835 }, { "crossentropy": 2.5544148683547974, "epoch": 0.5892492998722096, "grad_norm": 0.03158238157629967, "grad_norm_var": 4.493864978231347e-07, "learning_rate": 0.0006296711603642846, "loss": 2.5544, "step": 10836 }, { "crossentropy": 2.5470030307769775, "epoch": 0.5893036787297099, "grad_norm": 0.031115001067519188, "grad_norm_var": 4.539839881523289e-07, "learning_rate": 0.0006290663772967953, "loss": 2.547, "step": 10837 }, { "crossentropy": 2.4891830682754517, "epoch": 0.58935805758721, "grad_norm": 0.031023401767015457, "grad_norm_var": 4.453732469813139e-07, "learning_rate": 0.0006284618653105328, "loss": 2.4892, "step": 10838 }, { "crossentropy": 2.408829689025879, "epoch": 0.5894124364447103, "grad_norm": 0.02997685596346855, "grad_norm_var": 6.188205534854903e-07, "learning_rate": 0.0006278576244429902, "loss": 2.4088, "step": 10839 }, { "crossentropy": 2.5042182207107544, "epoch": 0.5894668153022105, "grad_norm": 0.0319812148809433, "grad_norm_var": 6.034410448104426e-07, "learning_rate": 0.0006272536547316415, "loss": 2.5042, "step": 10840 }, { "crossentropy": 2.55232310295105, "epoch": 0.5895211941597107, "grad_norm": 0.03406934812664986, "grad_norm_var": 9.612770307590996e-07, "learning_rate": 0.0006266499562139444, "loss": 2.5523, "step": 10841 }, { "crossentropy": 2.447689414024353, "epoch": 0.5895755730172109, "grad_norm": 0.033512845635414124, "grad_norm_var": 1.1385723134664602e-06, "learning_rate": 0.0006260465289273399, "loss": 2.4477, "step": 10842 }, { "crossentropy": 2.507606863975525, "epoch": 0.5896299518747111, "grad_norm": 0.031328607350587845, "grad_norm_var": 1.1553654678838568e-06, "learning_rate": 0.0006254433729092518, "loss": 2.5076, "step": 10843 }, { "crossentropy": 2.555777907371521, "epoch": 0.5896843307322113, "grad_norm": 0.031402457505464554, "grad_norm_var": 1.1658354608502178e-06, "learning_rate": 0.000624840488197087, "loss": 2.5558, "step": 10844 }, { "crossentropy": 2.6291972398757935, "epoch": 0.5897387095897115, "grad_norm": 0.032740093767642975, "grad_norm_var": 1.0555455455426756e-06, "learning_rate": 0.0006242378748282363, "loss": 2.6292, "step": 10845 }, { "crossentropy": 2.502722978591919, "epoch": 0.5897930884472117, "grad_norm": 0.03125306963920593, "grad_norm_var": 1.0742117806533987e-06, "learning_rate": 0.0006236355328400745, "loss": 2.5027, "step": 10846 }, { "crossentropy": 2.604376792907715, "epoch": 0.5898474673047119, "grad_norm": 0.03227050229907036, "grad_norm_var": 1.0823783101249294e-06, "learning_rate": 0.000623033462269954, "loss": 2.6044, "step": 10847 }, { "crossentropy": 2.4939002990722656, "epoch": 0.5899018461622121, "grad_norm": 0.03283621743321419, "grad_norm_var": 1.0943083338677921e-06, "learning_rate": 0.0006224316631552207, "loss": 2.4939, "step": 10848 }, { "crossentropy": 2.484865188598633, "epoch": 0.5899562250197123, "grad_norm": 0.030747724696993828, "grad_norm_var": 1.1776886208195024e-06, "learning_rate": 0.0006218301355331924, "loss": 2.4849, "step": 10849 }, { "crossentropy": 2.498655676841736, "epoch": 0.5900106038772125, "grad_norm": 0.031008195132017136, "grad_norm_var": 1.1745234797557915e-06, "learning_rate": 0.000621228879441178, "loss": 2.4987, "step": 10850 }, { "crossentropy": 2.579045295715332, "epoch": 0.5900649827347128, "grad_norm": 0.0320977084338665, "grad_norm_var": 1.1493321697040107e-06, "learning_rate": 0.0006206278949164657, "loss": 2.579, "step": 10851 }, { "crossentropy": 2.5810129642486572, "epoch": 0.5901193615922129, "grad_norm": 0.03236229717731476, "grad_norm_var": 1.1637726122185054e-06, "learning_rate": 0.0006200271819963288, "loss": 2.581, "step": 10852 }, { "crossentropy": 2.427393674850464, "epoch": 0.5901737404497132, "grad_norm": 0.031533997505903244, "grad_norm_var": 1.1332450536889373e-06, "learning_rate": 0.0006194267407180226, "loss": 2.4274, "step": 10853 }, { "crossentropy": 2.5799556970596313, "epoch": 0.5902281193072133, "grad_norm": 0.03360482305288315, "grad_norm_var": 1.2535081063754962e-06, "learning_rate": 0.0006188265711187857, "loss": 2.58, "step": 10854 }, { "crossentropy": 2.45813250541687, "epoch": 0.5902824981647136, "grad_norm": 0.03251709043979645, "grad_norm_var": 9.56205316208107e-07, "learning_rate": 0.0006182266732358405, "loss": 2.4581, "step": 10855 }, { "crossentropy": 2.4681979417800903, "epoch": 0.5903368770222137, "grad_norm": 0.032455649226903915, "grad_norm_var": 9.561717207720675e-07, "learning_rate": 0.0006176270471063916, "loss": 2.4682, "step": 10856 }, { "crossentropy": 2.581622838973999, "epoch": 0.590391255879714, "grad_norm": 0.0324157252907753, "grad_norm_var": 7.22366383025053e-07, "learning_rate": 0.0006170276927676294, "loss": 2.5816, "step": 10857 }, { "crossentropy": 2.5382529497146606, "epoch": 0.5904456347372141, "grad_norm": 0.034157585352659225, "grad_norm_var": 8.671860638625641e-07, "learning_rate": 0.0006164286102567207, "loss": 2.5383, "step": 10858 }, { "crossentropy": 2.534242868423462, "epoch": 0.5905000135947144, "grad_norm": 0.03294205293059349, "grad_norm_var": 8.487230720420942e-07, "learning_rate": 0.0006158297996108237, "loss": 2.5342, "step": 10859 }, { "crossentropy": 2.5612611770629883, "epoch": 0.5905543924522145, "grad_norm": 0.0312572680413723, "grad_norm_var": 8.668654523518443e-07, "learning_rate": 0.0006152312608670768, "loss": 2.5613, "step": 10860 }, { "crossentropy": 2.4643503427505493, "epoch": 0.5906087713097148, "grad_norm": 0.03068271093070507, "grad_norm_var": 1.0004045387539592e-06, "learning_rate": 0.0006146329940625978, "loss": 2.4644, "step": 10861 }, { "crossentropy": 2.4483437538146973, "epoch": 0.5906631501672149, "grad_norm": 0.032312218099832535, "grad_norm_var": 9.461241519042573e-07, "learning_rate": 0.0006140349992344918, "loss": 2.4483, "step": 10862 }, { "crossentropy": 2.5134958028793335, "epoch": 0.5907175290247152, "grad_norm": 0.03153982385993004, "grad_norm_var": 9.726344903495641e-07, "learning_rate": 0.0006134372764198465, "loss": 2.5135, "step": 10863 }, { "crossentropy": 2.479333281517029, "epoch": 0.5907719078822153, "grad_norm": 0.0336778350174427, "grad_norm_var": 1.0934102831977946e-06, "learning_rate": 0.0006128398256557316, "loss": 2.4793, "step": 10864 }, { "crossentropy": 2.4652493000030518, "epoch": 0.5908262867397156, "grad_norm": 0.033364541828632355, "grad_norm_var": 1.0122239340287168e-06, "learning_rate": 0.0006122426469792003, "loss": 2.4652, "step": 10865 }, { "crossentropy": 2.46225368976593, "epoch": 0.5908806655972157, "grad_norm": 0.031481336802244186, "grad_norm_var": 9.40267611678142e-07, "learning_rate": 0.0006116457404272896, "loss": 2.4623, "step": 10866 }, { "crossentropy": 2.530022382736206, "epoch": 0.590935044454716, "grad_norm": 0.031060868874192238, "grad_norm_var": 1.0492707770453612e-06, "learning_rate": 0.0006110491060370188, "loss": 2.53, "step": 10867 }, { "crossentropy": 2.6094419956207275, "epoch": 0.5909894233122162, "grad_norm": 0.03209040313959122, "grad_norm_var": 1.0529147799628302e-06, "learning_rate": 0.0006104527438453916, "loss": 2.6094, "step": 10868 }, { "crossentropy": 2.5264954566955566, "epoch": 0.5910438021697164, "grad_norm": 0.03171868994832039, "grad_norm_var": 1.0357310305491562e-06, "learning_rate": 0.0006098566538893908, "loss": 2.5265, "step": 10869 }, { "crossentropy": 2.5815094709396362, "epoch": 0.5910981810272167, "grad_norm": 0.031065426766872406, "grad_norm_var": 1.0070977743164543e-06, "learning_rate": 0.0006092608362059887, "loss": 2.5815, "step": 10870 }, { "crossentropy": 2.516520142555237, "epoch": 0.5911525598847168, "grad_norm": 0.03112333081662655, "grad_norm_var": 1.0642300334290223e-06, "learning_rate": 0.0006086652908321366, "loss": 2.5165, "step": 10871 }, { "crossentropy": 2.585049033164978, "epoch": 0.5912069387422171, "grad_norm": 0.031393177807331085, "grad_norm_var": 1.0821469754389158e-06, "learning_rate": 0.0006080700178047688, "loss": 2.585, "step": 10872 }, { "crossentropy": 2.558128595352173, "epoch": 0.5912613175997172, "grad_norm": 0.0313371941447258, "grad_norm_var": 1.0976092873329756e-06, "learning_rate": 0.0006074750171608035, "loss": 2.5581, "step": 10873 }, { "crossentropy": 2.483397960662842, "epoch": 0.5913156964572175, "grad_norm": 0.02978689596056938, "grad_norm_var": 1.0052154325289649e-06, "learning_rate": 0.0006068802889371422, "loss": 2.4834, "step": 10874 }, { "crossentropy": 2.5009526014328003, "epoch": 0.5913700753147176, "grad_norm": 0.031497519463300705, "grad_norm_var": 8.91999268360953e-07, "learning_rate": 0.00060628583317067, "loss": 2.501, "step": 10875 }, { "crossentropy": 2.4932388067245483, "epoch": 0.5914244541722179, "grad_norm": 0.03768487647175789, "grad_norm_var": 3.191696350711296e-06, "learning_rate": 0.000605691649898254, "loss": 2.4932, "step": 10876 }, { "crossentropy": 2.65879487991333, "epoch": 0.591478833029718, "grad_norm": 0.031632907688617706, "grad_norm_var": 3.0826850327556055e-06, "learning_rate": 0.0006050977391567452, "loss": 2.6588, "step": 10877 }, { "crossentropy": 2.4892475605010986, "epoch": 0.5915332118872183, "grad_norm": 0.030614620074629784, "grad_norm_var": 3.2029816709607754e-06, "learning_rate": 0.0006045041009829771, "loss": 2.4892, "step": 10878 }, { "crossentropy": 2.454942226409912, "epoch": 0.5915875907447185, "grad_norm": 0.03204089403152466, "grad_norm_var": 3.191815152578945e-06, "learning_rate": 0.000603910735413768, "loss": 2.4549, "step": 10879 }, { "crossentropy": 2.465291738510132, "epoch": 0.5916419696022187, "grad_norm": 0.03249647468328476, "grad_norm_var": 3.0105291191835725e-06, "learning_rate": 0.0006033176424859143, "loss": 2.4653, "step": 10880 }, { "crossentropy": 2.6117929220199585, "epoch": 0.5916963484597189, "grad_norm": 0.034219320863485336, "grad_norm_var": 3.2231864148975076e-06, "learning_rate": 0.0006027248222362031, "loss": 2.6118, "step": 10881 }, { "crossentropy": 2.611107587814331, "epoch": 0.5917507273172191, "grad_norm": 0.03282264247536659, "grad_norm_var": 3.2513230270054256e-06, "learning_rate": 0.0006021322747013991, "loss": 2.6111, "step": 10882 }, { "crossentropy": 2.56829833984375, "epoch": 0.5918051061747193, "grad_norm": 0.03157898038625717, "grad_norm_var": 3.2006970348015415e-06, "learning_rate": 0.0006015399999182508, "loss": 2.5683, "step": 10883 }, { "crossentropy": 2.5738741159439087, "epoch": 0.5918594850322195, "grad_norm": 0.03158876672387123, "grad_norm_var": 3.2149902347960078e-06, "learning_rate": 0.0006009479979234894, "loss": 2.5739, "step": 10884 }, { "crossentropy": 2.578825354576111, "epoch": 0.5919138638897197, "grad_norm": 0.03575092554092407, "grad_norm_var": 4.059712940108981e-06, "learning_rate": 0.0006003562687538344, "loss": 2.5788, "step": 10885 }, { "crossentropy": 2.52337110042572, "epoch": 0.5919682427472199, "grad_norm": 0.030775699764490128, "grad_norm_var": 4.112250291885231e-06, "learning_rate": 0.000599764812445981, "loss": 2.5234, "step": 10886 }, { "crossentropy": 2.478753447532654, "epoch": 0.5920226216047201, "grad_norm": 0.030911095440387726, "grad_norm_var": 4.147556883263394e-06, "learning_rate": 0.0005991736290366113, "loss": 2.4788, "step": 10887 }, { "crossentropy": 2.5905909538269043, "epoch": 0.5920770004622203, "grad_norm": 0.03131115436553955, "grad_norm_var": 4.157438194265584e-06, "learning_rate": 0.0005985827185623899, "loss": 2.5906, "step": 10888 }, { "crossentropy": 2.526419997215271, "epoch": 0.5921313793197205, "grad_norm": 0.03191244602203369, "grad_norm_var": 4.107868391571206e-06, "learning_rate": 0.0005979920810599654, "loss": 2.5264, "step": 10889 }, { "crossentropy": 2.5320874452590942, "epoch": 0.5921857581772207, "grad_norm": 0.03211285173892975, "grad_norm_var": 3.6700030247436974e-06, "learning_rate": 0.000597401716565969, "loss": 2.5321, "step": 10890 }, { "crossentropy": 2.5348236560821533, "epoch": 0.5922401370347209, "grad_norm": 0.03157878667116165, "grad_norm_var": 3.66026358312881e-06, "learning_rate": 0.0005968116251170114, "loss": 2.5348, "step": 10891 }, { "crossentropy": 2.5908647775650024, "epoch": 0.5922945158922212, "grad_norm": 0.03371037170290947, "grad_norm_var": 1.8678680900922288e-06, "learning_rate": 0.0005962218067496927, "loss": 2.5909, "step": 10892 }, { "crossentropy": 2.5596688985824585, "epoch": 0.5923488947497213, "grad_norm": 0.032916419208049774, "grad_norm_var": 1.875300918566464e-06, "learning_rate": 0.0005956322615005927, "loss": 2.5597, "step": 10893 }, { "crossentropy": 2.5595104694366455, "epoch": 0.5924032736072216, "grad_norm": 0.03198084235191345, "grad_norm_var": 1.690167992577686e-06, "learning_rate": 0.0005950429894062731, "loss": 2.5595, "step": 10894 }, { "crossentropy": 2.576335906982422, "epoch": 0.5924576524647217, "grad_norm": 0.03330698236823082, "grad_norm_var": 1.7370374756036735e-06, "learning_rate": 0.0005944539905032792, "loss": 2.5763, "step": 10895 }, { "crossentropy": 2.5690267086029053, "epoch": 0.592512031322222, "grad_norm": 0.031169813126325607, "grad_norm_var": 1.8363173867353214e-06, "learning_rate": 0.000593865264828144, "loss": 2.569, "step": 10896 }, { "crossentropy": 2.5864288806915283, "epoch": 0.5925664101797221, "grad_norm": 0.03218517825007439, "grad_norm_var": 1.5887289137104034e-06, "learning_rate": 0.000593276812417376, "loss": 2.5864, "step": 10897 }, { "crossentropy": 2.4640796184539795, "epoch": 0.5926207890372224, "grad_norm": 0.03269209340214729, "grad_norm_var": 1.5794053105861253e-06, "learning_rate": 0.0005926886333074715, "loss": 2.4641, "step": 10898 }, { "crossentropy": 2.462743043899536, "epoch": 0.5926751678947225, "grad_norm": 0.03060649149119854, "grad_norm_var": 1.7213269977789698e-06, "learning_rate": 0.0005921007275349094, "loss": 2.4627, "step": 10899 }, { "crossentropy": 2.428894877433777, "epoch": 0.5927295467522228, "grad_norm": 0.03200686722993851, "grad_norm_var": 1.7005826039349539e-06, "learning_rate": 0.0005915130951361503, "loss": 2.4289, "step": 10900 }, { "crossentropy": 2.46857225894928, "epoch": 0.5927839256097229, "grad_norm": 0.030822845175862312, "grad_norm_var": 8.740537024878407e-07, "learning_rate": 0.0005909257361476405, "loss": 2.4686, "step": 10901 }, { "crossentropy": 2.5856072902679443, "epoch": 0.5928383044672232, "grad_norm": 0.03402860835194588, "grad_norm_var": 1.0586040103346499e-06, "learning_rate": 0.0005903386506058034, "loss": 2.5856, "step": 10902 }, { "crossentropy": 2.53219735622406, "epoch": 0.5928926833247233, "grad_norm": 0.03226489573717117, "grad_norm_var": 9.624636834240179e-07, "learning_rate": 0.0005897518385470535, "loss": 2.5322, "step": 10903 }, { "crossentropy": 2.5800962448120117, "epoch": 0.5929470621822236, "grad_norm": 0.032722581177949905, "grad_norm_var": 9.266784121090069e-07, "learning_rate": 0.000589165300007784, "loss": 2.5801, "step": 10904 }, { "crossentropy": 2.5131616592407227, "epoch": 0.5930014410397237, "grad_norm": 0.031078414991497993, "grad_norm_var": 1.0078169150641436e-06, "learning_rate": 0.0005885790350243691, "loss": 2.5132, "step": 10905 }, { "crossentropy": 2.4122772216796875, "epoch": 0.593055819897224, "grad_norm": 0.03101225756108761, "grad_norm_var": 1.0961659340113482e-06, "learning_rate": 0.0005879930436331682, "loss": 2.4123, "step": 10906 }, { "crossentropy": 2.5403023958206177, "epoch": 0.5931101987547241, "grad_norm": 0.03306535631418228, "grad_norm_var": 1.1249856913058665e-06, "learning_rate": 0.0005874073258705281, "loss": 2.5403, "step": 10907 }, { "crossentropy": 2.512934684753418, "epoch": 0.5931645776122244, "grad_norm": 0.03154950588941574, "grad_norm_var": 9.883210989899045e-07, "learning_rate": 0.0005868218817727705, "loss": 2.5129, "step": 10908 }, { "crossentropy": 2.5261131525039673, "epoch": 0.5932189564697246, "grad_norm": 0.03235640004277229, "grad_norm_var": 9.460704019067489e-07, "learning_rate": 0.0005862367113762057, "loss": 2.5261, "step": 10909 }, { "crossentropy": 2.4788955450057983, "epoch": 0.5932733353272248, "grad_norm": 0.031067317351698875, "grad_norm_var": 1.0070260680607498e-06, "learning_rate": 0.0005856518147171252, "loss": 2.4789, "step": 10910 }, { "crossentropy": 2.55900239944458, "epoch": 0.593327714184725, "grad_norm": 0.033127788454294205, "grad_norm_var": 9.777097121689183e-07, "learning_rate": 0.0005850671918318035, "loss": 2.559, "step": 10911 }, { "crossentropy": 2.5117560625076294, "epoch": 0.5933820930422252, "grad_norm": 0.031578872352838516, "grad_norm_var": 9.437187300958035e-07, "learning_rate": 0.000584482842756498, "loss": 2.5118, "step": 10912 }, { "crossentropy": 2.537652850151062, "epoch": 0.5934364718997254, "grad_norm": 0.031161068007349968, "grad_norm_var": 9.853953111311001e-07, "learning_rate": 0.0005838987675274505, "loss": 2.5377, "step": 10913 }, { "crossentropy": 2.5031532049179077, "epoch": 0.5934908507572256, "grad_norm": 0.0312935970723629, "grad_norm_var": 9.685736328710406e-07, "learning_rate": 0.0005833149661808846, "loss": 2.5032, "step": 10914 }, { "crossentropy": 2.4697293043136597, "epoch": 0.5935452296147258, "grad_norm": 0.03137620911002159, "grad_norm_var": 8.770662591973579e-07, "learning_rate": 0.000582731438753007, "loss": 2.4697, "step": 10915 }, { "crossentropy": 2.427139401435852, "epoch": 0.593599608472226, "grad_norm": 0.03272780403494835, "grad_norm_var": 9.191468412342163e-07, "learning_rate": 0.000582148185280007, "loss": 2.4271, "step": 10916 }, { "crossentropy": 2.360235810279846, "epoch": 0.5936539873297262, "grad_norm": 0.03193875402212143, "grad_norm_var": 8.28956461572788e-07, "learning_rate": 0.0005815652057980564, "loss": 2.3602, "step": 10917 }, { "crossentropy": 2.521076202392578, "epoch": 0.5937083661872264, "grad_norm": 0.03112456388771534, "grad_norm_var": 5.790157684944549e-07, "learning_rate": 0.000580982500343315, "loss": 2.5211, "step": 10918 }, { "crossentropy": 2.543339490890503, "epoch": 0.5937627450447266, "grad_norm": 0.031993091106414795, "grad_norm_var": 5.682468439801759e-07, "learning_rate": 0.0005804000689519173, "loss": 2.5433, "step": 10919 }, { "crossentropy": 2.5685765743255615, "epoch": 0.5938171239022269, "grad_norm": 0.03275985270738602, "grad_norm_var": 5.728024355386759e-07, "learning_rate": 0.0005798179116599878, "loss": 2.5686, "step": 10920 }, { "crossentropy": 2.454542398452759, "epoch": 0.593871502759727, "grad_norm": 0.03099728748202324, "grad_norm_var": 5.812969383120943e-07, "learning_rate": 0.0005792360285036297, "loss": 2.4545, "step": 10921 }, { "crossentropy": 2.533475875854492, "epoch": 0.5939258816172273, "grad_norm": 0.030852027237415314, "grad_norm_var": 6.00171178287293e-07, "learning_rate": 0.0005786544195189319, "loss": 2.5335, "step": 10922 }, { "crossentropy": 2.6173969507217407, "epoch": 0.5939802604747274, "grad_norm": 0.03252314776182175, "grad_norm_var": 5.278331390348809e-07, "learning_rate": 0.0005780730847419652, "loss": 2.6174, "step": 10923 }, { "crossentropy": 2.5461541414260864, "epoch": 0.5940346393322277, "grad_norm": 0.03248274326324463, "grad_norm_var": 5.539955778956999e-07, "learning_rate": 0.0005774920242087833, "loss": 2.5462, "step": 10924 }, { "crossentropy": 2.491024374961853, "epoch": 0.5940890181897278, "grad_norm": 0.030951745808124542, "grad_norm_var": 5.796659848454708e-07, "learning_rate": 0.0005769112379554226, "loss": 2.491, "step": 10925 }, { "crossentropy": 2.5082037448883057, "epoch": 0.5941433970472281, "grad_norm": 0.03153350204229355, "grad_norm_var": 5.509862760565143e-07, "learning_rate": 0.0005763307260179035, "loss": 2.5082, "step": 10926 }, { "crossentropy": 2.519564628601074, "epoch": 0.5941977759047282, "grad_norm": 0.03099207393825054, "grad_norm_var": 4.512359133730052e-07, "learning_rate": 0.0005757504884322295, "loss": 2.5196, "step": 10927 }, { "crossentropy": 2.4427435398101807, "epoch": 0.5942521547622285, "grad_norm": 0.03226639702916145, "grad_norm_var": 4.749099827170185e-07, "learning_rate": 0.0005751705252343836, "loss": 2.4427, "step": 10928 }, { "crossentropy": 2.5447601079940796, "epoch": 0.5943065336197286, "grad_norm": 0.030896935611963272, "grad_norm_var": 4.977525250455158e-07, "learning_rate": 0.0005745908364603381, "loss": 2.5448, "step": 10929 }, { "crossentropy": 2.6057796478271484, "epoch": 0.5943609124772289, "grad_norm": 0.03182109817862511, "grad_norm_var": 4.887150215221921e-07, "learning_rate": 0.0005740114221460424, "loss": 2.6058, "step": 10930 }, { "crossentropy": 2.5672527551651, "epoch": 0.594415291334729, "grad_norm": 0.0334455631673336, "grad_norm_var": 6.663736883592841e-07, "learning_rate": 0.0005734322823274319, "loss": 2.5673, "step": 10931 }, { "crossentropy": 2.49786639213562, "epoch": 0.5944696701922293, "grad_norm": 0.031735971570014954, "grad_norm_var": 6.093471749498679e-07, "learning_rate": 0.0005728534170404248, "loss": 2.4979, "step": 10932 }, { "crossentropy": 2.5751041173934937, "epoch": 0.5945240490497294, "grad_norm": 0.030619898810982704, "grad_norm_var": 6.883257701581346e-07, "learning_rate": 0.000572274826320921, "loss": 2.5751, "step": 10933 }, { "crossentropy": 2.5023869276046753, "epoch": 0.5945784279072297, "grad_norm": 0.031164301559329033, "grad_norm_var": 6.854431846032751e-07, "learning_rate": 0.0005716965102048044, "loss": 2.5024, "step": 10934 }, { "crossentropy": 2.5842264890670776, "epoch": 0.5946328067647298, "grad_norm": 0.03230055794119835, "grad_norm_var": 7.037882488730342e-07, "learning_rate": 0.0005711184687279414, "loss": 2.5842, "step": 10935 }, { "crossentropy": 2.5633333921432495, "epoch": 0.5946871856222301, "grad_norm": 0.03237902745604515, "grad_norm_var": 6.594908148729496e-07, "learning_rate": 0.0005705407019261827, "loss": 2.5633, "step": 10936 }, { "crossentropy": 2.4457714557647705, "epoch": 0.5947415644797303, "grad_norm": 0.03281335532665253, "grad_norm_var": 6.990633825217404e-07, "learning_rate": 0.0005699632098353596, "loss": 2.4458, "step": 10937 }, { "crossentropy": 2.631859540939331, "epoch": 0.5947959433372305, "grad_norm": 0.03203282132744789, "grad_norm_var": 6.371705328512272e-07, "learning_rate": 0.0005693859924912892, "loss": 2.6319, "step": 10938 }, { "crossentropy": 2.4871219396591187, "epoch": 0.5948503221947307, "grad_norm": 0.03182429075241089, "grad_norm_var": 6.070626324322503e-07, "learning_rate": 0.0005688090499297677, "loss": 2.4871, "step": 10939 }, { "crossentropy": 2.5173280239105225, "epoch": 0.5949047010522309, "grad_norm": 0.03238547220826149, "grad_norm_var": 5.991722674758183e-07, "learning_rate": 0.000568232382186577, "loss": 2.5173, "step": 10940 }, { "crossentropy": 2.5300880670547485, "epoch": 0.5949590799097311, "grad_norm": 0.03250233456492424, "grad_norm_var": 5.693794602700525e-07, "learning_rate": 0.0005676559892974842, "loss": 2.5301, "step": 10941 }, { "crossentropy": 2.490498900413513, "epoch": 0.5950134587672313, "grad_norm": 0.03281519562005997, "grad_norm_var": 6.060693597448868e-07, "learning_rate": 0.0005670798712982339, "loss": 2.4905, "step": 10942 }, { "crossentropy": 2.527683138847351, "epoch": 0.5950678376247315, "grad_norm": 0.03068690374493599, "grad_norm_var": 6.528898131015019e-07, "learning_rate": 0.0005665040282245565, "loss": 2.5277, "step": 10943 }, { "crossentropy": 2.5705000162124634, "epoch": 0.5951222164822317, "grad_norm": 0.03211253881454468, "grad_norm_var": 6.485070451982151e-07, "learning_rate": 0.0005659284601121667, "loss": 2.5705, "step": 10944 }, { "crossentropy": 2.4884510040283203, "epoch": 0.5951765953397319, "grad_norm": 0.032043684273958206, "grad_norm_var": 5.664697745266211e-07, "learning_rate": 0.0005653531669967593, "loss": 2.4885, "step": 10945 }, { "crossentropy": 2.6038182973861694, "epoch": 0.5952309741972321, "grad_norm": 0.03296084702014923, "grad_norm_var": 6.139846865986972e-07, "learning_rate": 0.0005647781489140153, "loss": 2.6038, "step": 10946 }, { "crossentropy": 2.4164477586746216, "epoch": 0.5952853530547323, "grad_norm": 0.03151373192667961, "grad_norm_var": 5.042323276078601e-07, "learning_rate": 0.0005642034058995938, "loss": 2.4164, "step": 10947 }, { "crossentropy": 2.4971498250961304, "epoch": 0.5953397319122326, "grad_norm": 0.03226799517869949, "grad_norm_var": 5.03677202061172e-07, "learning_rate": 0.0005636289379891423, "loss": 2.4971, "step": 10948 }, { "crossentropy": 2.551819324493408, "epoch": 0.5953941107697327, "grad_norm": 0.031795792281627655, "grad_norm_var": 3.695726808758028e-07, "learning_rate": 0.0005630547452182888, "loss": 2.5518, "step": 10949 }, { "crossentropy": 2.6009517908096313, "epoch": 0.595448489627233, "grad_norm": 0.031016146764159203, "grad_norm_var": 3.8942688770590185e-07, "learning_rate": 0.0005624808276226428, "loss": 2.601, "step": 10950 }, { "crossentropy": 2.2953702211380005, "epoch": 0.5955028684847331, "grad_norm": 0.03322330489754677, "grad_norm_var": 4.684665778394246e-07, "learning_rate": 0.0005619071852377977, "loss": 2.2954, "step": 10951 }, { "crossentropy": 2.499357223510742, "epoch": 0.5955572473422334, "grad_norm": 0.03276081010699272, "grad_norm_var": 4.893194413098927e-07, "learning_rate": 0.0005613338180993333, "loss": 2.4994, "step": 10952 }, { "crossentropy": 2.5370346307754517, "epoch": 0.5956116261997335, "grad_norm": 0.030951036140322685, "grad_norm_var": 5.46879591149738e-07, "learning_rate": 0.0005607607262428067, "loss": 2.537, "step": 10953 }, { "crossentropy": 2.583603262901306, "epoch": 0.5956660050572338, "grad_norm": 0.030436748638749123, "grad_norm_var": 7.109865819459321e-07, "learning_rate": 0.0005601879097037604, "loss": 2.5836, "step": 10954 }, { "crossentropy": 2.4350417852401733, "epoch": 0.5957203839147339, "grad_norm": 0.03147410601377487, "grad_norm_var": 7.248030228598623e-07, "learning_rate": 0.0005596153685177213, "loss": 2.435, "step": 10955 }, { "crossentropy": 2.4430654048919678, "epoch": 0.5957747627722342, "grad_norm": 0.031372442841529846, "grad_norm_var": 7.279840457760037e-07, "learning_rate": 0.000559043102720197, "loss": 2.4431, "step": 10956 }, { "crossentropy": 2.5445045232772827, "epoch": 0.5958291416297343, "grad_norm": 0.031275663524866104, "grad_norm_var": 7.187461762082064e-07, "learning_rate": 0.0005584711123466807, "loss": 2.5445, "step": 10957 }, { "crossentropy": 2.4649497270584106, "epoch": 0.5958835204872346, "grad_norm": 0.03068745881319046, "grad_norm_var": 7.120410377752974e-07, "learning_rate": 0.0005578993974326424, "loss": 2.4649, "step": 10958 }, { "crossentropy": 2.6102460622787476, "epoch": 0.5959378993447347, "grad_norm": 0.03299630433320999, "grad_norm_var": 7.453686394722024e-07, "learning_rate": 0.0005573279580135438, "loss": 2.6102, "step": 10959 }, { "crossentropy": 2.592745065689087, "epoch": 0.595992278202235, "grad_norm": 0.03137722238898277, "grad_norm_var": 7.490627694914463e-07, "learning_rate": 0.0005567567941248242, "loss": 2.5927, "step": 10960 }, { "crossentropy": 2.4862853288650513, "epoch": 0.5960466570597351, "grad_norm": 0.030974194407463074, "grad_norm_var": 7.800380547699625e-07, "learning_rate": 0.0005561859058019042, "loss": 2.4863, "step": 10961 }, { "crossentropy": 2.4833273887634277, "epoch": 0.5961010359172354, "grad_norm": 0.032505057752132416, "grad_norm_var": 7.159566378754732e-07, "learning_rate": 0.0005556152930801911, "loss": 2.4833, "step": 10962 }, { "crossentropy": 2.3960987329483032, "epoch": 0.5961554147747355, "grad_norm": 0.03167746588587761, "grad_norm_var": 7.143461772744861e-07, "learning_rate": 0.0005550449559950754, "loss": 2.3961, "step": 10963 }, { "crossentropy": 2.567711114883423, "epoch": 0.5962097936322358, "grad_norm": 0.031987886875867844, "grad_norm_var": 6.970836613430827e-07, "learning_rate": 0.0005544748945819272, "loss": 2.5677, "step": 10964 }, { "crossentropy": 2.388104796409607, "epoch": 0.596264172489736, "grad_norm": 0.03333489969372749, "grad_norm_var": 8.736236478367922e-07, "learning_rate": 0.000553905108876101, "loss": 2.3881, "step": 10965 }, { "crossentropy": 2.5510531663894653, "epoch": 0.5963185513472362, "grad_norm": 0.03268755227327347, "grad_norm_var": 8.839743900848976e-07, "learning_rate": 0.0005533355989129346, "loss": 2.5511, "step": 10966 }, { "crossentropy": 2.570399045944214, "epoch": 0.5963729302047364, "grad_norm": 0.03212892264127731, "grad_norm_var": 7.595535528949019e-07, "learning_rate": 0.0005527663647277492, "loss": 2.5704, "step": 10967 }, { "crossentropy": 2.5212570428848267, "epoch": 0.5964273090622366, "grad_norm": 0.033836640417575836, "grad_norm_var": 9.712582807967658e-07, "learning_rate": 0.0005521974063558477, "loss": 2.5213, "step": 10968 }, { "crossentropy": 2.4771987199783325, "epoch": 0.5964816879197368, "grad_norm": 0.03146114572882652, "grad_norm_var": 9.25938434034286e-07, "learning_rate": 0.0005516287238325162, "loss": 2.4772, "step": 10969 }, { "crossentropy": 2.5032267570495605, "epoch": 0.596536066777237, "grad_norm": 0.03208650276064873, "grad_norm_var": 7.767377219737466e-07, "learning_rate": 0.0005510603171930239, "loss": 2.5032, "step": 10970 }, { "crossentropy": 2.543106198310852, "epoch": 0.5965904456347372, "grad_norm": 0.03247671201825142, "grad_norm_var": 7.704027133383084e-07, "learning_rate": 0.0005504921864726242, "loss": 2.5431, "step": 10971 }, { "crossentropy": 2.4906210899353027, "epoch": 0.5966448244922374, "grad_norm": 0.030806588008999825, "grad_norm_var": 8.41846121067792e-07, "learning_rate": 0.00054992433170655, "loss": 2.4906, "step": 10972 }, { "crossentropy": 2.6276639699935913, "epoch": 0.5966992033497376, "grad_norm": 0.03145653009414673, "grad_norm_var": 8.259704044285734e-07, "learning_rate": 0.0005493567529300187, "loss": 2.6277, "step": 10973 }, { "crossentropy": 2.4800833463668823, "epoch": 0.5967535822072378, "grad_norm": 0.03292396664619446, "grad_norm_var": 7.382259709517858e-07, "learning_rate": 0.0005487894501782343, "loss": 2.4801, "step": 10974 }, { "crossentropy": 2.626419186592102, "epoch": 0.596807961064738, "grad_norm": 0.032574135810136795, "grad_norm_var": 7.028446832171015e-07, "learning_rate": 0.0005482224234863775, "loss": 2.6264, "step": 10975 }, { "crossentropy": 2.452057719230652, "epoch": 0.5968623399222382, "grad_norm": 0.0313127301633358, "grad_norm_var": 7.09693519428224e-07, "learning_rate": 0.0005476556728896154, "loss": 2.4521, "step": 10976 }, { "crossentropy": 2.5252691507339478, "epoch": 0.5969167187797384, "grad_norm": 0.031186960637569427, "grad_norm_var": 6.794663992515699e-07, "learning_rate": 0.0005470891984230975, "loss": 2.5253, "step": 10977 }, { "crossentropy": 2.6349756717681885, "epoch": 0.5969710976372387, "grad_norm": 0.03169297054409981, "grad_norm_var": 6.825349210757044e-07, "learning_rate": 0.0005465230001219562, "loss": 2.635, "step": 10978 }, { "crossentropy": 2.5728018283843994, "epoch": 0.5970254764947388, "grad_norm": 0.03337494283914566, "grad_norm_var": 7.665447688659597e-07, "learning_rate": 0.0005459570780213063, "loss": 2.5728, "step": 10979 }, { "crossentropy": 2.5891791582107544, "epoch": 0.5970798553522391, "grad_norm": 0.03116740472614765, "grad_norm_var": 8.327064888916103e-07, "learning_rate": 0.0005453914321562464, "loss": 2.5892, "step": 10980 }, { "crossentropy": 2.4493428468704224, "epoch": 0.5971342342097392, "grad_norm": 0.03178367391228676, "grad_norm_var": 7.39431324853675e-07, "learning_rate": 0.0005448260625618562, "loss": 2.4493, "step": 10981 }, { "crossentropy": 2.641374707221985, "epoch": 0.5971886130672395, "grad_norm": 0.03339746966958046, "grad_norm_var": 8.30347118787172e-07, "learning_rate": 0.0005442609692732003, "loss": 2.6414, "step": 10982 }, { "crossentropy": 2.4429104328155518, "epoch": 0.5972429919247396, "grad_norm": 0.030656782910227776, "grad_norm_var": 9.60945324819185e-07, "learning_rate": 0.0005436961523253265, "loss": 2.4429, "step": 10983 }, { "crossentropy": 2.499277114868164, "epoch": 0.5972973707822399, "grad_norm": 0.03225315734744072, "grad_norm_var": 7.32462352009836e-07, "learning_rate": 0.0005431316117532603, "loss": 2.4993, "step": 10984 }, { "crossentropy": 2.4538103342056274, "epoch": 0.59735174963974, "grad_norm": 0.030589301139116287, "grad_norm_var": 8.32522332331687e-07, "learning_rate": 0.0005425673475920184, "loss": 2.4538, "step": 10985 }, { "crossentropy": 2.4368016719818115, "epoch": 0.5974061284972403, "grad_norm": 0.03146898373961449, "grad_norm_var": 8.376023407981181e-07, "learning_rate": 0.0005420033598765928, "loss": 2.4368, "step": 10986 }, { "crossentropy": 2.494004964828491, "epoch": 0.5974605073547404, "grad_norm": 0.030085807666182518, "grad_norm_var": 9.8557342978108e-07, "learning_rate": 0.0005414396486419626, "loss": 2.494, "step": 10987 }, { "crossentropy": 2.5666013956069946, "epoch": 0.5975148862122407, "grad_norm": 0.03143412619829178, "grad_norm_var": 9.378833445678926e-07, "learning_rate": 0.0005408762139230888, "loss": 2.5666, "step": 10988 }, { "crossentropy": 2.471604108810425, "epoch": 0.5975692650697408, "grad_norm": 0.03173276036977768, "grad_norm_var": 9.333192481898589e-07, "learning_rate": 0.0005403130557549152, "loss": 2.4716, "step": 10989 }, { "crossentropy": 2.5058282613754272, "epoch": 0.5976236439272411, "grad_norm": 0.030400972813367844, "grad_norm_var": 9.285710100905096e-07, "learning_rate": 0.0005397501741723676, "loss": 2.5058, "step": 10990 }, { "crossentropy": 2.4466497898101807, "epoch": 0.5976780227847412, "grad_norm": 0.031569503247737885, "grad_norm_var": 8.570809529382609e-07, "learning_rate": 0.0005391875692103565, "loss": 2.4466, "step": 10991 }, { "crossentropy": 2.5699589252471924, "epoch": 0.5977324016422415, "grad_norm": 0.03244723752140999, "grad_norm_var": 9.081804867561247e-07, "learning_rate": 0.0005386252409037734, "loss": 2.57, "step": 10992 }, { "crossentropy": 2.5592442750930786, "epoch": 0.5977867804997417, "grad_norm": 0.03127578645944595, "grad_norm_var": 9.04046761423952e-07, "learning_rate": 0.0005380631892874932, "loss": 2.5592, "step": 10993 }, { "crossentropy": 2.4922406673431396, "epoch": 0.5978411593572419, "grad_norm": 0.03286616876721382, "grad_norm_var": 1.007245522885317e-06, "learning_rate": 0.0005375014143963752, "loss": 2.4922, "step": 10994 }, { "crossentropy": 2.584412693977356, "epoch": 0.5978955382147421, "grad_norm": 0.03219449892640114, "grad_norm_var": 8.238667408242643e-07, "learning_rate": 0.0005369399162652577, "loss": 2.5844, "step": 10995 }, { "crossentropy": 2.4632092714309692, "epoch": 0.5979499170722423, "grad_norm": 0.030126431956887245, "grad_norm_var": 9.492385132614147e-07, "learning_rate": 0.0005363786949289672, "loss": 2.4632, "step": 10996 }, { "crossentropy": 2.3598923683166504, "epoch": 0.5980042959297425, "grad_norm": 0.0316653810441494, "grad_norm_var": 9.459175164587408e-07, "learning_rate": 0.0005358177504223077, "loss": 2.3599, "step": 10997 }, { "crossentropy": 2.4328489303588867, "epoch": 0.5980586747872427, "grad_norm": 0.03086080215871334, "grad_norm_var": 7.0979311903348e-07, "learning_rate": 0.0005352570827800691, "loss": 2.4328, "step": 10998 }, { "crossentropy": 2.493914008140564, "epoch": 0.5981130536447429, "grad_norm": 0.030973168089985847, "grad_norm_var": 6.86733156212103e-07, "learning_rate": 0.0005346966920370239, "loss": 2.4939, "step": 10999 }, { "crossentropy": 2.5443865060806274, "epoch": 0.5981674325022431, "grad_norm": 0.031002620235085487, "grad_norm_var": 6.374682282186585e-07, "learning_rate": 0.0005341365782279273, "loss": 2.5444, "step": 11000 }, { "crossentropy": 2.5195261240005493, "epoch": 0.5982218113597433, "grad_norm": 0.03204922750592232, "grad_norm_var": 6.336324620098323e-07, "learning_rate": 0.0005335767413875158, "loss": 2.5195, "step": 11001 }, { "crossentropy": 2.6130878925323486, "epoch": 0.5982761902172435, "grad_norm": 0.03111700527369976, "grad_norm_var": 6.374149849860492e-07, "learning_rate": 0.0005330171815505114, "loss": 2.6131, "step": 11002 }, { "crossentropy": 2.6045788526535034, "epoch": 0.5983305690747437, "grad_norm": 0.030917372554540634, "grad_norm_var": 5.390696938109881e-07, "learning_rate": 0.0005324578987516165, "loss": 2.6046, "step": 11003 }, { "crossentropy": 2.508515238761902, "epoch": 0.598384947932244, "grad_norm": 0.03268381208181381, "grad_norm_var": 6.399360089272595e-07, "learning_rate": 0.0005318988930255181, "loss": 2.5085, "step": 11004 }, { "crossentropy": 2.57733952999115, "epoch": 0.5984393267897441, "grad_norm": 0.03264545276761055, "grad_norm_var": 7.212159056680373e-07, "learning_rate": 0.000531340164406885, "loss": 2.5773, "step": 11005 }, { "crossentropy": 2.6310964822769165, "epoch": 0.5984937056472444, "grad_norm": 0.03394556790590286, "grad_norm_var": 9.635653900132707e-07, "learning_rate": 0.0005307817129303671, "loss": 2.6311, "step": 11006 }, { "crossentropy": 2.5491265058517456, "epoch": 0.5985480845047445, "grad_norm": 0.03186142444610596, "grad_norm_var": 9.61038871532616e-07, "learning_rate": 0.0005302235386306021, "loss": 2.5491, "step": 11007 }, { "crossentropy": 2.2712773084640503, "epoch": 0.5986024633622448, "grad_norm": 0.032105907797813416, "grad_norm_var": 9.383863261115764e-07, "learning_rate": 0.0005296656415422069, "loss": 2.2713, "step": 11008 }, { "crossentropy": 2.520817518234253, "epoch": 0.5986568422197449, "grad_norm": 0.031342409551143646, "grad_norm_var": 9.342899101405597e-07, "learning_rate": 0.0005291080216997796, "loss": 2.5208, "step": 11009 }, { "crossentropy": 2.496206045150757, "epoch": 0.5987112210772452, "grad_norm": 0.031224681064486504, "grad_norm_var": 8.63291580344092e-07, "learning_rate": 0.0005285506791379047, "loss": 2.4962, "step": 11010 }, { "crossentropy": 2.5602749586105347, "epoch": 0.5987655999347453, "grad_norm": 0.03058312088251114, "grad_norm_var": 9.128295829230934e-07, "learning_rate": 0.0005279936138911479, "loss": 2.5603, "step": 11011 }, { "crossentropy": 2.6159952878952026, "epoch": 0.5988199787922456, "grad_norm": 0.031001459807157516, "grad_norm_var": 7.923764200891039e-07, "learning_rate": 0.0005274368259940581, "loss": 2.616, "step": 11012 }, { "crossentropy": 2.4894856214523315, "epoch": 0.5988743576497457, "grad_norm": 0.030975177884101868, "grad_norm_var": 8.183156361737798e-07, "learning_rate": 0.0005268803154811669, "loss": 2.4895, "step": 11013 }, { "crossentropy": 2.4205822944641113, "epoch": 0.598928736507246, "grad_norm": 0.030932234600186348, "grad_norm_var": 8.117791914751995e-07, "learning_rate": 0.0005263240823869875, "loss": 2.4206, "step": 11014 }, { "crossentropy": 2.5468465089797974, "epoch": 0.5989831153647461, "grad_norm": 0.03180958703160286, "grad_norm_var": 7.872664640630351e-07, "learning_rate": 0.0005257681267460184, "loss": 2.5468, "step": 11015 }, { "crossentropy": 2.583471179008484, "epoch": 0.5990374942222464, "grad_norm": 0.03175180405378342, "grad_norm_var": 7.589456988022485e-07, "learning_rate": 0.000525212448592739, "loss": 2.5835, "step": 11016 }, { "crossentropy": 2.5444953441619873, "epoch": 0.5990918730797465, "grad_norm": 0.031595464795827866, "grad_norm_var": 7.497260929994886e-07, "learning_rate": 0.0005246570479616103, "loss": 2.5445, "step": 11017 }, { "crossentropy": 2.427805542945862, "epoch": 0.5991462519372468, "grad_norm": 0.031119266524910927, "grad_norm_var": 7.495639718639909e-07, "learning_rate": 0.0005241019248870799, "loss": 2.4278, "step": 11018 }, { "crossentropy": 2.452537417411804, "epoch": 0.5992006307947471, "grad_norm": 0.03236331418156624, "grad_norm_var": 7.37849185668835e-07, "learning_rate": 0.0005235470794035762, "loss": 2.4525, "step": 11019 }, { "crossentropy": 2.5218228101730347, "epoch": 0.5992550096522472, "grad_norm": 0.03166556358337402, "grad_norm_var": 6.753673857181105e-07, "learning_rate": 0.0005229925115455087, "loss": 2.5218, "step": 11020 }, { "crossentropy": 2.586486577987671, "epoch": 0.5993093885097475, "grad_norm": 0.032796476036310196, "grad_norm_var": 6.961802571802625e-07, "learning_rate": 0.00052243822134727, "loss": 2.5865, "step": 11021 }, { "crossentropy": 2.495527982711792, "epoch": 0.5993637673672476, "grad_norm": 0.03125893697142601, "grad_norm_var": 3.400696924143875e-07, "learning_rate": 0.0005218842088432413, "loss": 2.4955, "step": 11022 }, { "crossentropy": 2.564525008201599, "epoch": 0.5994181462247479, "grad_norm": 0.030825840309262276, "grad_norm_var": 3.605304108366657e-07, "learning_rate": 0.0005213304740677771, "loss": 2.5645, "step": 11023 }, { "crossentropy": 2.6745622158050537, "epoch": 0.599472525082248, "grad_norm": 0.03445836529135704, "grad_norm_var": 9.091766421322663e-07, "learning_rate": 0.0005207770170552218, "loss": 2.6746, "step": 11024 }, { "crossentropy": 2.476106882095337, "epoch": 0.5995269039397483, "grad_norm": 0.0312812514603138, "grad_norm_var": 9.115637624054184e-07, "learning_rate": 0.0005202238378398999, "loss": 2.4761, "step": 11025 }, { "crossentropy": 2.4909071922302246, "epoch": 0.5995812827972484, "grad_norm": 0.03160325065255165, "grad_norm_var": 9.014421506972018e-07, "learning_rate": 0.0005196709364561192, "loss": 2.4909, "step": 11026 }, { "crossentropy": 2.5111751556396484, "epoch": 0.5996356616547487, "grad_norm": 0.031972650438547134, "grad_norm_var": 8.288426093893634e-07, "learning_rate": 0.000519118312938171, "loss": 2.5112, "step": 11027 }, { "crossentropy": 2.4954336881637573, "epoch": 0.5996900405122488, "grad_norm": 0.031799547374248505, "grad_norm_var": 7.929178284829154e-07, "learning_rate": 0.0005185659673203257, "loss": 2.4954, "step": 11028 }, { "crossentropy": 2.5173609256744385, "epoch": 0.5997444193697491, "grad_norm": 0.03207835555076599, "grad_norm_var": 7.530926316738253e-07, "learning_rate": 0.0005180138996368417, "loss": 2.5174, "step": 11029 }, { "crossentropy": 2.462458372116089, "epoch": 0.5997987982272492, "grad_norm": 0.032309580594301224, "grad_norm_var": 7.064228661557773e-07, "learning_rate": 0.0005174621099219584, "loss": 2.4625, "step": 11030 }, { "crossentropy": 2.5000221729278564, "epoch": 0.5998531770847495, "grad_norm": 0.03139745816588402, "grad_norm_var": 7.230001619372312e-07, "learning_rate": 0.0005169105982098948, "loss": 2.5, "step": 11031 }, { "crossentropy": 2.582789182662964, "epoch": 0.5999075559422496, "grad_norm": 0.03166429325938225, "grad_norm_var": 7.251183549413879e-07, "learning_rate": 0.0005163593645348552, "loss": 2.5828, "step": 11032 }, { "crossentropy": 2.5725916624069214, "epoch": 0.5999619347997499, "grad_norm": 0.032231517136096954, "grad_norm_var": 7.256919368457566e-07, "learning_rate": 0.00051580840893103, "loss": 2.5726, "step": 11033 }, { "crossentropy": 2.4864017963409424, "epoch": 0.6000163136572501, "grad_norm": 0.03349652886390686, "grad_norm_var": 8.230024845989979e-07, "learning_rate": 0.0005152577314325852, "loss": 2.4864, "step": 11034 }, { "crossentropy": 2.4905136823654175, "epoch": 0.6000706925147503, "grad_norm": 0.03158947825431824, "grad_norm_var": 8.307000396229423e-07, "learning_rate": 0.0005147073320736751, "loss": 2.4905, "step": 11035 }, { "crossentropy": 2.4552005529403687, "epoch": 0.6001250713722505, "grad_norm": 0.03170981630682945, "grad_norm_var": 8.286908991630375e-07, "learning_rate": 0.0005141572108884345, "loss": 2.4552, "step": 11036 }, { "crossentropy": 2.499848484992981, "epoch": 0.6001794502297507, "grad_norm": 0.032156966626644135, "grad_norm_var": 7.888603913947517e-07, "learning_rate": 0.0005136073679109821, "loss": 2.4998, "step": 11037 }, { "crossentropy": 2.513715624809265, "epoch": 0.6002338290872509, "grad_norm": 0.03172361105680466, "grad_norm_var": 7.570852414211555e-07, "learning_rate": 0.0005130578031754185, "loss": 2.5137, "step": 11038 }, { "crossentropy": 2.568361282348633, "epoch": 0.6002882079447511, "grad_norm": 0.03138339892029762, "grad_norm_var": 6.878393574706483e-07, "learning_rate": 0.000512508516715825, "loss": 2.5684, "step": 11039 }, { "crossentropy": 2.434032917022705, "epoch": 0.6003425868022513, "grad_norm": 0.03276044502854347, "grad_norm_var": 3.2358773909043025e-07, "learning_rate": 0.0005119595085662703, "loss": 2.434, "step": 11040 }, { "crossentropy": 2.5318201780319214, "epoch": 0.6003969656597515, "grad_norm": 0.03173774480819702, "grad_norm_var": 2.9606717706516056e-07, "learning_rate": 0.0005114107787608036, "loss": 2.5318, "step": 11041 }, { "crossentropy": 2.4320852756500244, "epoch": 0.6004513445172517, "grad_norm": 0.032458771020174026, "grad_norm_var": 2.9930225718746705e-07, "learning_rate": 0.0005108623273334551, "loss": 2.4321, "step": 11042 }, { "crossentropy": 2.4825379848480225, "epoch": 0.6005057233747519, "grad_norm": 0.031047655269503593, "grad_norm_var": 3.597755133907665e-07, "learning_rate": 0.0005103141543182388, "loss": 2.4825, "step": 11043 }, { "crossentropy": 2.498470664024353, "epoch": 0.6005601022322521, "grad_norm": 0.03233625367283821, "grad_norm_var": 3.6546856727186157e-07, "learning_rate": 0.0005097662597491548, "loss": 2.4985, "step": 11044 }, { "crossentropy": 2.559143304824829, "epoch": 0.6006144810897524, "grad_norm": 0.031966377049684525, "grad_norm_var": 3.651587829294704e-07, "learning_rate": 0.0005092186436601804, "loss": 2.5591, "step": 11045 }, { "crossentropy": 2.5959101915359497, "epoch": 0.6006688599472525, "grad_norm": 0.03169691935181618, "grad_norm_var": 3.631756283201896e-07, "learning_rate": 0.0005086713060852788, "loss": 2.5959, "step": 11046 }, { "crossentropy": 2.391480803489685, "epoch": 0.6007232388047528, "grad_norm": 0.030829284340143204, "grad_norm_var": 4.259550853115962e-07, "learning_rate": 0.0005081242470583952, "loss": 2.3915, "step": 11047 }, { "crossentropy": 2.485723376274109, "epoch": 0.6007776176622529, "grad_norm": 0.030930396169424057, "grad_norm_var": 4.850619153649032e-07, "learning_rate": 0.0005075774666134586, "loss": 2.4857, "step": 11048 }, { "crossentropy": 2.6058905124664307, "epoch": 0.6008319965197532, "grad_norm": 0.03212195634841919, "grad_norm_var": 4.806544634655078e-07, "learning_rate": 0.000507030964784379, "loss": 2.6059, "step": 11049 }, { "crossentropy": 2.408133387565613, "epoch": 0.6008863753772533, "grad_norm": 0.03127935528755188, "grad_norm_var": 3.0752910850800377e-07, "learning_rate": 0.00050648474160505, "loss": 2.4081, "step": 11050 }, { "crossentropy": 2.507148027420044, "epoch": 0.6009407542347536, "grad_norm": 0.03095722757279873, "grad_norm_var": 3.4461407546782876e-07, "learning_rate": 0.0005059387971093482, "loss": 2.5071, "step": 11051 }, { "crossentropy": 2.508813977241516, "epoch": 0.6009951330922537, "grad_norm": 0.03236297145485878, "grad_norm_var": 3.7269727404115607e-07, "learning_rate": 0.0005053931313311333, "loss": 2.5088, "step": 11052 }, { "crossentropy": 2.4632993936538696, "epoch": 0.601049511949754, "grad_norm": 0.03161466866731644, "grad_norm_var": 3.6051862957880986e-07, "learning_rate": 0.0005048477443042449, "loss": 2.4633, "step": 11053 }, { "crossentropy": 2.584404468536377, "epoch": 0.6011038908072541, "grad_norm": 0.03283391892910004, "grad_norm_var": 4.4099790719477154e-07, "learning_rate": 0.000504302636062508, "loss": 2.5844, "step": 11054 }, { "crossentropy": 2.628438949584961, "epoch": 0.6011582696647544, "grad_norm": 0.032643117010593414, "grad_norm_var": 4.7527194444770223e-07, "learning_rate": 0.0005037578066397319, "loss": 2.6284, "step": 11055 }, { "crossentropy": 2.487012267112732, "epoch": 0.6012126485222545, "grad_norm": 0.03130091726779938, "grad_norm_var": 4.309558132531884e-07, "learning_rate": 0.0005032132560697039, "loss": 2.487, "step": 11056 }, { "crossentropy": 2.5117839574813843, "epoch": 0.6012670273797548, "grad_norm": 0.03091832436621189, "grad_norm_var": 4.7506296636314005e-07, "learning_rate": 0.0005026689843861976, "loss": 2.5118, "step": 11057 }, { "crossentropy": 2.6233320236206055, "epoch": 0.6013214062372549, "grad_norm": 0.03102412447333336, "grad_norm_var": 4.5973169442886597e-07, "learning_rate": 0.000502124991622968, "loss": 2.6233, "step": 11058 }, { "crossentropy": 2.467722535133362, "epoch": 0.6013757850947552, "grad_norm": 0.030513960868120193, "grad_norm_var": 5.18009748484242e-07, "learning_rate": 0.000501581277813753, "loss": 2.4677, "step": 11059 }, { "crossentropy": 2.4174022674560547, "epoch": 0.6014301639522553, "grad_norm": 0.03158920258283615, "grad_norm_var": 4.778719213190003e-07, "learning_rate": 0.000501037842992273, "loss": 2.4174, "step": 11060 }, { "crossentropy": 2.551022171974182, "epoch": 0.6014845428097556, "grad_norm": 0.03201952204108238, "grad_norm_var": 4.810951200912364e-07, "learning_rate": 0.0005004946871922317, "loss": 2.551, "step": 11061 }, { "crossentropy": 2.5540255308151245, "epoch": 0.6015389216672558, "grad_norm": 0.0319041982293129, "grad_norm_var": 4.881243518481404e-07, "learning_rate": 0.0004999518104473155, "loss": 2.554, "step": 11062 }, { "crossentropy": 2.5687415599823, "epoch": 0.601593300524756, "grad_norm": 0.030662188306450844, "grad_norm_var": 5.059866622198015e-07, "learning_rate": 0.000499409212791192, "loss": 2.5687, "step": 11063 }, { "crossentropy": 2.4657055139541626, "epoch": 0.6016476793822562, "grad_norm": 0.031149400398135185, "grad_norm_var": 4.911177731721506e-07, "learning_rate": 0.0004988668942575148, "loss": 2.4657, "step": 11064 }, { "crossentropy": 2.5002427101135254, "epoch": 0.6017020582397564, "grad_norm": 0.03118562512099743, "grad_norm_var": 4.7524880371200597e-07, "learning_rate": 0.0004983248548799141, "loss": 2.5002, "step": 11065 }, { "crossentropy": 2.5371216535568237, "epoch": 0.6017564370972566, "grad_norm": 0.03125723451375961, "grad_norm_var": 4.759225552500157e-07, "learning_rate": 0.0004977830946920109, "loss": 2.5371, "step": 11066 }, { "crossentropy": 2.613538384437561, "epoch": 0.6018108159547568, "grad_norm": 0.0306458231061697, "grad_norm_var": 5.043550653588269e-07, "learning_rate": 0.0004972416137274016, "loss": 2.6135, "step": 11067 }, { "crossentropy": 2.5509787797927856, "epoch": 0.601865194812257, "grad_norm": 0.03154804930090904, "grad_norm_var": 4.4954863181420556e-07, "learning_rate": 0.00049670041201967, "loss": 2.551, "step": 11068 }, { "crossentropy": 2.533604860305786, "epoch": 0.6019195736697572, "grad_norm": 0.03125353902578354, "grad_norm_var": 4.485978040007308e-07, "learning_rate": 0.0004961594896023803, "loss": 2.5336, "step": 11069 }, { "crossentropy": 2.4824403524398804, "epoch": 0.6019739525272574, "grad_norm": 0.032158032059669495, "grad_norm_var": 3.4820378867979745e-07, "learning_rate": 0.0004956188465090799, "loss": 2.4824, "step": 11070 }, { "crossentropy": 2.4850070476531982, "epoch": 0.6020283313847576, "grad_norm": 0.032774463295936584, "grad_norm_var": 3.717385371871147e-07, "learning_rate": 0.000495078482773299, "loss": 2.485, "step": 11071 }, { "crossentropy": 2.6252399682998657, "epoch": 0.6020827102422578, "grad_norm": 0.03167203441262245, "grad_norm_var": 3.7697577299672786e-07, "learning_rate": 0.0004945383984285512, "loss": 2.6252, "step": 11072 }, { "crossentropy": 2.5662307739257812, "epoch": 0.602137089099758, "grad_norm": 0.03131648898124695, "grad_norm_var": 3.6172508069582755e-07, "learning_rate": 0.0004939985935083313, "loss": 2.5662, "step": 11073 }, { "crossentropy": 2.5177172422409058, "epoch": 0.6021914679572582, "grad_norm": 0.03365711867809296, "grad_norm_var": 6.570496402416003e-07, "learning_rate": 0.0004934590680461176, "loss": 2.5177, "step": 11074 }, { "crossentropy": 2.5983442068099976, "epoch": 0.6022458468147585, "grad_norm": 0.03267660737037659, "grad_norm_var": 6.414847442050286e-07, "learning_rate": 0.0004929198220753723, "loss": 2.5983, "step": 11075 }, { "crossentropy": 2.524384617805481, "epoch": 0.6023002256722586, "grad_norm": 0.03203817829489708, "grad_norm_var": 6.464423031222906e-07, "learning_rate": 0.0004923808556295356, "loss": 2.5244, "step": 11076 }, { "crossentropy": 2.4896005392074585, "epoch": 0.6023546045297589, "grad_norm": 0.031055545434355736, "grad_norm_var": 6.692240829956799e-07, "learning_rate": 0.0004918421687420365, "loss": 2.4896, "step": 11077 }, { "crossentropy": 2.5724246501922607, "epoch": 0.602408983387259, "grad_norm": 0.031221074983477592, "grad_norm_var": 6.783937556440259e-07, "learning_rate": 0.0004913037614462845, "loss": 2.5724, "step": 11078 }, { "crossentropy": 2.6146847009658813, "epoch": 0.6024633622447593, "grad_norm": 0.03144051507115364, "grad_norm_var": 6.14577838464617e-07, "learning_rate": 0.0004907656337756683, "loss": 2.6147, "step": 11079 }, { "crossentropy": 2.522170066833496, "epoch": 0.6025177411022594, "grad_norm": 0.03389054164290428, "grad_norm_var": 8.863902188339818e-07, "learning_rate": 0.000490227785763564, "loss": 2.5222, "step": 11080 }, { "crossentropy": 2.5235674381256104, "epoch": 0.6025721199597597, "grad_norm": 0.030830951407551765, "grad_norm_var": 9.262346249999202e-07, "learning_rate": 0.0004896902174433276, "loss": 2.5236, "step": 11081 }, { "crossentropy": 2.5811221599578857, "epoch": 0.6026264988172598, "grad_norm": 0.03113011084496975, "grad_norm_var": 9.371183951041671e-07, "learning_rate": 0.0004891529288482993, "loss": 2.5811, "step": 11082 }, { "crossentropy": 2.525633215904236, "epoch": 0.6026808776747601, "grad_norm": 0.031986724585294724, "grad_norm_var": 8.374542934424499e-07, "learning_rate": 0.0004886159200118007, "loss": 2.5256, "step": 11083 }, { "crossentropy": 2.5580638647079468, "epoch": 0.6027352565322602, "grad_norm": 0.031543515622615814, "grad_norm_var": 8.376777731483136e-07, "learning_rate": 0.00048807919096713673, "loss": 2.5581, "step": 11084 }, { "crossentropy": 2.5510852336883545, "epoch": 0.6027896353897605, "grad_norm": 0.03226703032851219, "grad_norm_var": 8.124449432914474e-07, "learning_rate": 0.0004875427417475953, "loss": 2.5511, "step": 11085 }, { "crossentropy": 2.432081699371338, "epoch": 0.6028440142472606, "grad_norm": 0.03231111541390419, "grad_norm_var": 8.175703080639159e-07, "learning_rate": 0.0004870065723864464, "loss": 2.4321, "step": 11086 }, { "crossentropy": 2.4895639419555664, "epoch": 0.6028983931047609, "grad_norm": 0.030476100742816925, "grad_norm_var": 9.067912634050842e-07, "learning_rate": 0.0004864706829169419, "loss": 2.4896, "step": 11087 }, { "crossentropy": 2.5069233179092407, "epoch": 0.602952771962261, "grad_norm": 0.03253863751888275, "grad_norm_var": 9.337889838834448e-07, "learning_rate": 0.0004859350733723167, "loss": 2.5069, "step": 11088 }, { "crossentropy": 2.5973373651504517, "epoch": 0.6030071508197613, "grad_norm": 0.03326425328850746, "grad_norm_var": 1.0196821286813373e-06, "learning_rate": 0.00048539974378579155, "loss": 2.5973, "step": 11089 }, { "crossentropy": 2.6248035430908203, "epoch": 0.6030615296772615, "grad_norm": 0.03292641043663025, "grad_norm_var": 8.936010655008024e-07, "learning_rate": 0.00048486469419056435, "loss": 2.6248, "step": 11090 }, { "crossentropy": 2.535990595817566, "epoch": 0.6031159085347617, "grad_norm": 0.03172983601689339, "grad_norm_var": 8.610351307224961e-07, "learning_rate": 0.0004843299246198196, "loss": 2.536, "step": 11091 }, { "crossentropy": 2.6654754877090454, "epoch": 0.6031702873922619, "grad_norm": 0.032256029546260834, "grad_norm_var": 8.67560130562763e-07, "learning_rate": 0.00048379543510672254, "loss": 2.6655, "step": 11092 }, { "crossentropy": 2.5186867713928223, "epoch": 0.6032246662497621, "grad_norm": 0.03284604474902153, "grad_norm_var": 8.593399321685554e-07, "learning_rate": 0.00048326122568442267, "loss": 2.5187, "step": 11093 }, { "crossentropy": 2.42758047580719, "epoch": 0.6032790451072623, "grad_norm": 0.03193853795528412, "grad_norm_var": 8.130592646616679e-07, "learning_rate": 0.0004827272963860513, "loss": 2.4276, "step": 11094 }, { "crossentropy": 2.4521411657333374, "epoch": 0.6033334239647625, "grad_norm": 0.03229280561208725, "grad_norm_var": 7.851045967487032e-07, "learning_rate": 0.0004821936472447197, "loss": 2.4521, "step": 11095 }, { "crossentropy": 2.5121243000030518, "epoch": 0.6033878028222627, "grad_norm": 0.032013047486543655, "grad_norm_var": 5.670209085349593e-07, "learning_rate": 0.00048166027829352697, "loss": 2.5121, "step": 11096 }, { "crossentropy": 2.4997713565826416, "epoch": 0.6034421816797629, "grad_norm": 0.03235166147351265, "grad_norm_var": 4.70067993415613e-07, "learning_rate": 0.0004811271895655528, "loss": 2.4998, "step": 11097 }, { "crossentropy": 2.5132038593292236, "epoch": 0.6034965605372631, "grad_norm": 0.033209677785634995, "grad_norm_var": 4.6671753298008905e-07, "learning_rate": 0.0004805943810938557, "loss": 2.5132, "step": 11098 }, { "crossentropy": 2.4491019248962402, "epoch": 0.6035509393947633, "grad_norm": 0.03147745132446289, "grad_norm_var": 5.005985691065862e-07, "learning_rate": 0.0004800618529114814, "loss": 2.4491, "step": 11099 }, { "crossentropy": 2.532234787940979, "epoch": 0.6036053182522635, "grad_norm": 0.032343558967113495, "grad_norm_var": 4.689596532767478e-07, "learning_rate": 0.0004795296050514586, "loss": 2.5322, "step": 11100 }, { "crossentropy": 2.5703155994415283, "epoch": 0.6036596971097637, "grad_norm": 0.031137138605117798, "grad_norm_var": 5.484654383546815e-07, "learning_rate": 0.00047899763754679416, "loss": 2.5703, "step": 11101 }, { "crossentropy": 2.6281425952911377, "epoch": 0.6037140759672639, "grad_norm": 0.03184332326054573, "grad_norm_var": 5.54869908414984e-07, "learning_rate": 0.00047846595043048114, "loss": 2.6281, "step": 11102 }, { "crossentropy": 2.5441190004348755, "epoch": 0.6037684548247642, "grad_norm": 0.06335696578025818, "grad_norm_var": 6.072125904142591e-05, "learning_rate": 0.00047793454373549484, "loss": 2.5441, "step": 11103 }, { "crossentropy": 2.597430467605591, "epoch": 0.6038228336822643, "grad_norm": 0.03331680968403816, "grad_norm_var": 6.0584619218561946e-05, "learning_rate": 0.0004774034174947922, "loss": 2.5974, "step": 11104 }, { "crossentropy": 2.469013214111328, "epoch": 0.6038772125397646, "grad_norm": 0.032611701637506485, "grad_norm_var": 6.0698650644182514e-05, "learning_rate": 0.00047687257174131416, "loss": 2.469, "step": 11105 }, { "crossentropy": 2.5741270780563354, "epoch": 0.6039315913972647, "grad_norm": 0.03083234839141369, "grad_norm_var": 6.133618606565676e-05, "learning_rate": 0.00047634200650798045, "loss": 2.5741, "step": 11106 }, { "crossentropy": 2.4290162324905396, "epoch": 0.603985970254765, "grad_norm": 0.03389759734272957, "grad_norm_var": 6.094560330889436e-05, "learning_rate": 0.00047581172182769915, "loss": 2.429, "step": 11107 }, { "crossentropy": 2.6191877126693726, "epoch": 0.6040403491122651, "grad_norm": 0.034923482686281204, "grad_norm_var": 6.06872531743271e-05, "learning_rate": 0.00047528171773335737, "loss": 2.6192, "step": 11108 }, { "crossentropy": 2.5111833810806274, "epoch": 0.6040947279697654, "grad_norm": 0.03021533414721489, "grad_norm_var": 6.166468857447837e-05, "learning_rate": 0.00047475199425782433, "loss": 2.5112, "step": 11109 }, { "crossentropy": 2.541750192642212, "epoch": 0.6041491068272655, "grad_norm": 0.031072210520505905, "grad_norm_var": 6.197687176367274e-05, "learning_rate": 0.0004742225514339532, "loss": 2.5418, "step": 11110 }, { "crossentropy": 2.5310404300689697, "epoch": 0.6042034856847658, "grad_norm": 0.03183683007955551, "grad_norm_var": 6.210465906399744e-05, "learning_rate": 0.0004736933892945816, "loss": 2.531, "step": 11111 }, { "crossentropy": 2.6210726499557495, "epoch": 0.6042578645422659, "grad_norm": 0.03344695270061493, "grad_norm_var": 6.182413832848363e-05, "learning_rate": 0.00047316450787252494, "loss": 2.6211, "step": 11112 }, { "crossentropy": 2.337275505065918, "epoch": 0.6043122433997662, "grad_norm": 0.03243075683712959, "grad_norm_var": 6.18045930431351e-05, "learning_rate": 0.0004726359072005859, "loss": 2.3373, "step": 11113 }, { "crossentropy": 2.5388091802597046, "epoch": 0.6043666222572663, "grad_norm": 0.03230448067188263, "grad_norm_var": 6.198100293733168e-05, "learning_rate": 0.00047210758731154655, "loss": 2.5388, "step": 11114 }, { "crossentropy": 2.5945487022399902, "epoch": 0.6044210011147666, "grad_norm": 0.03161311149597168, "grad_norm_var": 6.193308067633146e-05, "learning_rate": 0.000471579548238173, "loss": 2.5945, "step": 11115 }, { "crossentropy": 2.497977375984192, "epoch": 0.6044753799722667, "grad_norm": 0.0311804860830307, "grad_norm_var": 6.230534837983341e-05, "learning_rate": 0.0004710517900132144, "loss": 2.498, "step": 11116 }, { "crossentropy": 2.4193109273910522, "epoch": 0.604529758829767, "grad_norm": 0.03126026317477226, "grad_norm_var": 6.22572252669919e-05, "learning_rate": 0.00047052431266940143, "loss": 2.4193, "step": 11117 }, { "crossentropy": 2.592501401901245, "epoch": 0.6045841376872672, "grad_norm": 0.03073570877313614, "grad_norm_var": 6.267218002927789e-05, "learning_rate": 0.00046999711623944786, "loss": 2.5925, "step": 11118 }, { "crossentropy": 2.4707189798355103, "epoch": 0.6046385165447674, "grad_norm": 0.03234374523162842, "grad_norm_var": 1.6595460281201932e-06, "learning_rate": 0.0004694702007560503, "loss": 2.4707, "step": 11119 }, { "crossentropy": 2.5517693758010864, "epoch": 0.6046928954022676, "grad_norm": 0.03258739411830902, "grad_norm_var": 1.5770216606164014e-06, "learning_rate": 0.0004689435662518865, "loss": 2.5518, "step": 11120 }, { "crossentropy": 2.617580533027649, "epoch": 0.6047472742597678, "grad_norm": 0.03365952521562576, "grad_norm_var": 1.7198181771844635e-06, "learning_rate": 0.0004684172127596176, "loss": 2.6176, "step": 11121 }, { "crossentropy": 2.501326084136963, "epoch": 0.604801653117268, "grad_norm": 0.032013434916734695, "grad_norm_var": 1.6000904098737238e-06, "learning_rate": 0.00046789114031189015, "loss": 2.5013, "step": 11122 }, { "crossentropy": 2.37063467502594, "epoch": 0.6048560319747682, "grad_norm": 0.03275240585207939, "grad_norm_var": 1.4259133946809347e-06, "learning_rate": 0.0004673653489413276, "loss": 2.3706, "step": 11123 }, { "crossentropy": 2.5563158988952637, "epoch": 0.6049104108322684, "grad_norm": 0.03146718069911003, "grad_norm_var": 8.937196308811748e-07, "learning_rate": 0.0004668398386805406, "loss": 2.5563, "step": 11124 }, { "crossentropy": 2.437572956085205, "epoch": 0.6049647896897686, "grad_norm": 0.03193407878279686, "grad_norm_var": 6.848364503513869e-07, "learning_rate": 0.0004663146095621207, "loss": 2.4376, "step": 11125 }, { "crossentropy": 2.467031478881836, "epoch": 0.6050191685472688, "grad_norm": 0.031187457963824272, "grad_norm_var": 6.707965832720841e-07, "learning_rate": 0.0004657896616186419, "loss": 2.467, "step": 11126 }, { "crossentropy": 2.472238063812256, "epoch": 0.605073547404769, "grad_norm": 0.03034360520541668, "grad_norm_var": 8.520207982503612e-07, "learning_rate": 0.00046526499488266096, "loss": 2.4722, "step": 11127 }, { "crossentropy": 2.5369250774383545, "epoch": 0.6051279262622692, "grad_norm": 0.031281061470508575, "grad_norm_var": 7.140089228538718e-07, "learning_rate": 0.00046474060938671736, "loss": 2.5369, "step": 11128 }, { "crossentropy": 2.4691739082336426, "epoch": 0.6051823051197694, "grad_norm": 0.03211449459195137, "grad_norm_var": 6.944390208253488e-07, "learning_rate": 0.0004642165051633329, "loss": 2.4692, "step": 11129 }, { "crossentropy": 2.4999406337738037, "epoch": 0.6052366839772696, "grad_norm": 0.032628245651721954, "grad_norm_var": 7.228264478663255e-07, "learning_rate": 0.00046369268224501214, "loss": 2.4999, "step": 11130 }, { "crossentropy": 2.474348783493042, "epoch": 0.6052910628347699, "grad_norm": 0.03167829290032387, "grad_norm_var": 7.21303617563553e-07, "learning_rate": 0.000463169140664243, "loss": 2.4743, "step": 11131 }, { "crossentropy": 2.497266411781311, "epoch": 0.60534544169227, "grad_norm": 0.0317964144051075, "grad_norm_var": 6.922516085703171e-07, "learning_rate": 0.0004626458804534922, "loss": 2.4973, "step": 11132 }, { "crossentropy": 2.6410969495773315, "epoch": 0.6053998205497703, "grad_norm": 0.032997217029333115, "grad_norm_var": 7.41581905036072e-07, "learning_rate": 0.00046212290164521555, "loss": 2.6411, "step": 11133 }, { "crossentropy": 2.4958441257476807, "epoch": 0.6054541994072704, "grad_norm": 0.031078733503818512, "grad_norm_var": 6.924829545659182e-07, "learning_rate": 0.00046160020427184477, "loss": 2.4958, "step": 11134 }, { "crossentropy": 2.552426815032959, "epoch": 0.6055085782647707, "grad_norm": 0.032015614211559296, "grad_norm_var": 6.837993676659173e-07, "learning_rate": 0.0004610777883657985, "loss": 2.5524, "step": 11135 }, { "crossentropy": 2.5319035053253174, "epoch": 0.6055629571222708, "grad_norm": 0.031038081273436546, "grad_norm_var": 7.064799773950538e-07, "learning_rate": 0.00046055565395947573, "loss": 2.5319, "step": 11136 }, { "crossentropy": 2.5185866355895996, "epoch": 0.6056173359797711, "grad_norm": 0.030144769698381424, "grad_norm_var": 6.418701774890302e-07, "learning_rate": 0.0004600338010852595, "loss": 2.5186, "step": 11137 }, { "crossentropy": 2.5910459756851196, "epoch": 0.6056717148372712, "grad_norm": 0.03157572075724602, "grad_norm_var": 6.328933373840925e-07, "learning_rate": 0.00045951222977551445, "loss": 2.591, "step": 11138 }, { "crossentropy": 2.514024257659912, "epoch": 0.6057260936947715, "grad_norm": 0.03677695617079735, "grad_norm_var": 2.249060416396178e-06, "learning_rate": 0.00045899094006258745, "loss": 2.514, "step": 11139 }, { "crossentropy": 2.588816285133362, "epoch": 0.6057804725522716, "grad_norm": 0.03167959675192833, "grad_norm_var": 2.2402276033399766e-06, "learning_rate": 0.00045846993197880883, "loss": 2.5888, "step": 11140 }, { "crossentropy": 2.5100516080856323, "epoch": 0.6058348514097719, "grad_norm": 0.03209834545850754, "grad_norm_var": 2.2428379653909124e-06, "learning_rate": 0.000457949205556491, "loss": 2.5101, "step": 11141 }, { "crossentropy": 2.546290874481201, "epoch": 0.605889230267272, "grad_norm": 0.03240057826042175, "grad_norm_var": 2.2192136870375147e-06, "learning_rate": 0.00045742876082792986, "loss": 2.5463, "step": 11142 }, { "crossentropy": 2.505516767501831, "epoch": 0.6059436091247723, "grad_norm": 0.031009137630462646, "grad_norm_var": 2.1018661731352404e-06, "learning_rate": 0.00045690859782540063, "loss": 2.5055, "step": 11143 }, { "crossentropy": 2.6345399618148804, "epoch": 0.6059979879822724, "grad_norm": 0.03264327347278595, "grad_norm_var": 2.083706908392287e-06, "learning_rate": 0.00045638871658116475, "loss": 2.6345, "step": 11144 }, { "crossentropy": 2.3766433000564575, "epoch": 0.6060523668397727, "grad_norm": 0.03188733011484146, "grad_norm_var": 2.0866359896397433e-06, "learning_rate": 0.00045586911712746595, "loss": 2.3766, "step": 11145 }, { "crossentropy": 2.430039405822754, "epoch": 0.6061067456972729, "grad_norm": 0.031898077577352524, "grad_norm_var": 2.067606820371062e-06, "learning_rate": 0.0004553497994965272, "loss": 2.43, "step": 11146 }, { "crossentropy": 2.537544846534729, "epoch": 0.6061611245547731, "grad_norm": 0.03324466571211815, "grad_norm_var": 2.1443896751982172e-06, "learning_rate": 0.0004548307637205568, "loss": 2.5375, "step": 11147 }, { "crossentropy": 2.471724271774292, "epoch": 0.6062155034122733, "grad_norm": 0.0315573588013649, "grad_norm_var": 2.159001547711092e-06, "learning_rate": 0.0004543120098317449, "loss": 2.4717, "step": 11148 }, { "crossentropy": 2.4733551740646362, "epoch": 0.6062698822697735, "grad_norm": 0.031155573204159737, "grad_norm_var": 2.157502336741964e-06, "learning_rate": 0.0004537935378622643, "loss": 2.4734, "step": 11149 }, { "crossentropy": 2.5572439432144165, "epoch": 0.6063242611272737, "grad_norm": 0.032287079840898514, "grad_norm_var": 2.0982784732558857e-06, "learning_rate": 0.0004532753478442697, "loss": 2.5572, "step": 11150 }, { "crossentropy": 2.627477288246155, "epoch": 0.6063786399847739, "grad_norm": 0.03236968815326691, "grad_norm_var": 2.102684401235286e-06, "learning_rate": 0.00045275743980989936, "loss": 2.6275, "step": 11151 }, { "crossentropy": 2.5341910123825073, "epoch": 0.6064330188422741, "grad_norm": 0.03283439204096794, "grad_norm_var": 2.0475286529744352e-06, "learning_rate": 0.00045223981379127363, "loss": 2.5342, "step": 11152 }, { "crossentropy": 2.5091127157211304, "epoch": 0.6064873976997743, "grad_norm": 0.03250962123274803, "grad_norm_var": 1.741874706595115e-06, "learning_rate": 0.0004517224698204953, "loss": 2.5091, "step": 11153 }, { "crossentropy": 2.5516490936279297, "epoch": 0.6065417765572745, "grad_norm": 0.03198787942528725, "grad_norm_var": 1.7088172697108378e-06, "learning_rate": 0.0004512054079296468, "loss": 2.5516, "step": 11154 }, { "crossentropy": 2.4428542852401733, "epoch": 0.6065961554147747, "grad_norm": 0.033059630542993546, "grad_norm_var": 4.011921373401565e-07, "learning_rate": 0.0004506886281507999, "loss": 2.4429, "step": 11155 }, { "crossentropy": 2.5593299865722656, "epoch": 0.6066505342722749, "grad_norm": 0.03192583844065666, "grad_norm_var": 3.8908142212591643e-07, "learning_rate": 0.00045017213051600337, "loss": 2.5593, "step": 11156 }, { "crossentropy": 2.4702736139297485, "epoch": 0.6067049131297751, "grad_norm": 0.03193020820617676, "grad_norm_var": 3.9266270631976795e-07, "learning_rate": 0.0004496559150572882, "loss": 2.4703, "step": 11157 }, { "crossentropy": 2.5923672914505005, "epoch": 0.6067592919872753, "grad_norm": 0.033096976578235626, "grad_norm_var": 4.444974141176583e-07, "learning_rate": 0.0004491399818066705, "loss": 2.5924, "step": 11158 }, { "crossentropy": 2.433030843734741, "epoch": 0.6068136708447756, "grad_norm": 0.03051387332379818, "grad_norm_var": 5.392786669761636e-07, "learning_rate": 0.0004486243307961502, "loss": 2.433, "step": 11159 }, { "crossentropy": 2.4354242086410522, "epoch": 0.6068680497022757, "grad_norm": 0.031525276601314545, "grad_norm_var": 5.485400378974862e-07, "learning_rate": 0.00044810896205770446, "loss": 2.4354, "step": 11160 }, { "crossentropy": 2.355642795562744, "epoch": 0.606922428559776, "grad_norm": 0.03223724663257599, "grad_norm_var": 5.457354159634868e-07, "learning_rate": 0.0004475938756232978, "loss": 2.3556, "step": 11161 }, { "crossentropy": 2.486321449279785, "epoch": 0.6069768074172761, "grad_norm": 0.032213013619184494, "grad_norm_var": 5.420555857729252e-07, "learning_rate": 0.00044707907152487404, "loss": 2.4863, "step": 11162 }, { "crossentropy": 2.4778228998184204, "epoch": 0.6070311862747764, "grad_norm": 0.03565693646669388, "grad_norm_var": 1.2568588455968398e-06, "learning_rate": 0.0004465645497943621, "loss": 2.4778, "step": 11163 }, { "crossentropy": 2.51955509185791, "epoch": 0.6070855651322765, "grad_norm": 0.03131759911775589, "grad_norm_var": 1.2843134270985762e-06, "learning_rate": 0.00044605031046367205, "loss": 2.5196, "step": 11164 }, { "crossentropy": 2.469216465950012, "epoch": 0.6071399439897768, "grad_norm": 0.030714556574821472, "grad_norm_var": 1.3631057773929075e-06, "learning_rate": 0.00044553635356469446, "loss": 2.4692, "step": 11165 }, { "crossentropy": 2.443313479423523, "epoch": 0.6071943228472769, "grad_norm": 0.032040562480688095, "grad_norm_var": 1.3660545760339384e-06, "learning_rate": 0.00044502267912930715, "loss": 2.4433, "step": 11166 }, { "crossentropy": 2.4754323959350586, "epoch": 0.6072487017047772, "grad_norm": 0.03414207696914673, "grad_norm_var": 1.5916593951955839e-06, "learning_rate": 0.00044450928718936744, "loss": 2.4754, "step": 11167 }, { "crossentropy": 2.5840282440185547, "epoch": 0.6073030805622774, "grad_norm": 0.031516604125499725, "grad_norm_var": 1.6162451992109227e-06, "learning_rate": 0.00044399617777671376, "loss": 2.584, "step": 11168 }, { "crossentropy": 2.5092720985412598, "epoch": 0.6073574594197776, "grad_norm": 0.031619343906641006, "grad_norm_var": 1.6378421442561023e-06, "learning_rate": 0.00044348335092316894, "loss": 2.5093, "step": 11169 }, { "crossentropy": 2.5928653478622437, "epoch": 0.6074118382772778, "grad_norm": 0.03171585500240326, "grad_norm_var": 1.6508352443266126e-06, "learning_rate": 0.00044297080666054034, "loss": 2.5929, "step": 11170 }, { "crossentropy": 2.5263147354125977, "epoch": 0.607466217134778, "grad_norm": 0.032067250460386276, "grad_norm_var": 1.5988540491086116e-06, "learning_rate": 0.0004424585450206126, "loss": 2.5263, "step": 11171 }, { "crossentropy": 2.570863723754883, "epoch": 0.6075205959922783, "grad_norm": 0.030819686129689217, "grad_norm_var": 1.706850886677482e-06, "learning_rate": 0.00044194656603515736, "loss": 2.5709, "step": 11172 }, { "crossentropy": 2.5369083881378174, "epoch": 0.6075749748497784, "grad_norm": 0.03222868964076042, "grad_norm_var": 1.7068381411799427e-06, "learning_rate": 0.00044143486973592606, "loss": 2.5369, "step": 11173 }, { "crossentropy": 2.6434744596481323, "epoch": 0.6076293537072787, "grad_norm": 0.034553397446870804, "grad_norm_var": 2.0351303834935847e-06, "learning_rate": 0.0004409234561546549, "loss": 2.6435, "step": 11174 }, { "crossentropy": 2.455951690673828, "epoch": 0.6076837325647788, "grad_norm": 0.030684178695082664, "grad_norm_var": 1.999106958453419e-06, "learning_rate": 0.00044041232532306075, "loss": 2.456, "step": 11175 }, { "crossentropy": 2.516268014907837, "epoch": 0.6077381114222791, "grad_norm": 0.03161775320768356, "grad_norm_var": 1.9914358130308323e-06, "learning_rate": 0.00043990147727284215, "loss": 2.5163, "step": 11176 }, { "crossentropy": 2.5256766080856323, "epoch": 0.6077924902797792, "grad_norm": 0.0312935933470726, "grad_norm_var": 2.0419700506482213e-06, "learning_rate": 0.00043939091203568315, "loss": 2.5257, "step": 11177 }, { "crossentropy": 2.474653482437134, "epoch": 0.6078468691372795, "grad_norm": 0.032235532999038696, "grad_norm_var": 2.042228275885421e-06, "learning_rate": 0.0004388806296432485, "loss": 2.4747, "step": 11178 }, { "crossentropy": 2.5019806623458862, "epoch": 0.6079012479947796, "grad_norm": 0.03168705478310585, "grad_norm_var": 1.16510758839695e-06, "learning_rate": 0.00043837063012718447, "loss": 2.502, "step": 11179 }, { "crossentropy": 2.5401833057403564, "epoch": 0.6079556268522799, "grad_norm": 0.03355184197425842, "grad_norm_var": 1.3063242565304148e-06, "learning_rate": 0.00043786091351912025, "loss": 2.5402, "step": 11180 }, { "crossentropy": 2.529635190963745, "epoch": 0.60801000570978, "grad_norm": 0.033541254699230194, "grad_norm_var": 1.309743698813213e-06, "learning_rate": 0.00043735147985067004, "loss": 2.5296, "step": 11181 }, { "crossentropy": 2.525713086128235, "epoch": 0.6080643845672803, "grad_norm": 0.032963819801807404, "grad_norm_var": 1.342509745161311e-06, "learning_rate": 0.0004368423291534274, "loss": 2.5257, "step": 11182 }, { "crossentropy": 2.3648953437805176, "epoch": 0.6081187634247804, "grad_norm": 0.03136860206723213, "grad_norm_var": 1.1290853841700495e-06, "learning_rate": 0.0004363334614589687, "loss": 2.3649, "step": 11183 }, { "crossentropy": 2.601301908493042, "epoch": 0.6081731422822807, "grad_norm": 0.03173370659351349, "grad_norm_var": 1.1153888892172688e-06, "learning_rate": 0.00043582487679885376, "loss": 2.6013, "step": 11184 }, { "crossentropy": 2.452298164367676, "epoch": 0.6082275211397808, "grad_norm": 0.030555907636880875, "grad_norm_var": 1.2549456715587233e-06, "learning_rate": 0.0004353165752046251, "loss": 2.4523, "step": 11185 }, { "crossentropy": 2.550430417060852, "epoch": 0.6082818999972811, "grad_norm": 0.0313098169863224, "grad_norm_var": 1.282724524108841e-06, "learning_rate": 0.00043480855670780693, "loss": 2.5504, "step": 11186 }, { "crossentropy": 2.5010926723480225, "epoch": 0.6083362788547813, "grad_norm": 0.03390760347247124, "grad_norm_var": 1.5076550532223837e-06, "learning_rate": 0.0004343008213399058, "loss": 2.5011, "step": 11187 }, { "crossentropy": 2.4666197299957275, "epoch": 0.6083906577122815, "grad_norm": 0.03443789854645729, "grad_norm_var": 1.6945698479027635e-06, "learning_rate": 0.00043379336913241086, "loss": 2.4666, "step": 11188 }, { "crossentropy": 2.511524796485901, "epoch": 0.6084450365697817, "grad_norm": 0.030903616920113564, "grad_norm_var": 1.8265212856452898e-06, "learning_rate": 0.0004332862001167953, "loss": 2.5115, "step": 11189 }, { "crossentropy": 2.5131165981292725, "epoch": 0.6084994154272819, "grad_norm": 0.031362902373075485, "grad_norm_var": 1.4920492728608935e-06, "learning_rate": 0.00043277931432451124, "loss": 2.5131, "step": 11190 }, { "crossentropy": 2.4151861667633057, "epoch": 0.6085537942847821, "grad_norm": 0.03349292278289795, "grad_norm_var": 1.4653043104621468e-06, "learning_rate": 0.0004322727117869951, "loss": 2.4152, "step": 11191 }, { "crossentropy": 2.5076483488082886, "epoch": 0.6086081731422823, "grad_norm": 0.032377053052186966, "grad_norm_var": 1.4375580512563143e-06, "learning_rate": 0.0004317663925356685, "loss": 2.5076, "step": 11192 }, { "crossentropy": 2.4484235048294067, "epoch": 0.6086625519997825, "grad_norm": 0.030591536313295364, "grad_norm_var": 1.5621208845834384e-06, "learning_rate": 0.00043126035660193075, "loss": 2.4484, "step": 11193 }, { "crossentropy": 2.5375367403030396, "epoch": 0.6087169308572827, "grad_norm": 0.03023914434015751, "grad_norm_var": 1.815420314544893e-06, "learning_rate": 0.00043075460401716613, "loss": 2.5375, "step": 11194 }, { "crossentropy": 2.416481137275696, "epoch": 0.6087713097147829, "grad_norm": 0.03155010566115379, "grad_norm_var": 1.8246175028523623e-06, "learning_rate": 0.00043024913481274073, "loss": 2.4165, "step": 11195 }, { "crossentropy": 2.515727400779724, "epoch": 0.6088256885722831, "grad_norm": 0.03125082328915596, "grad_norm_var": 1.715624034248573e-06, "learning_rate": 0.0004297439490200039, "loss": 2.5157, "step": 11196 }, { "crossentropy": 2.606711268424988, "epoch": 0.6088800674297833, "grad_norm": 0.03151176497340202, "grad_norm_var": 1.5489997531622593e-06, "learning_rate": 0.00042923904667028666, "loss": 2.6067, "step": 11197 }, { "crossentropy": 2.4615970849990845, "epoch": 0.6089344462872835, "grad_norm": 0.0322795994579792, "grad_norm_var": 1.4764026094339517e-06, "learning_rate": 0.00042873442779490237, "loss": 2.4616, "step": 11198 }, { "crossentropy": 2.448703408241272, "epoch": 0.6089888251447837, "grad_norm": 0.031271882355213165, "grad_norm_var": 1.4826094109532859e-06, "learning_rate": 0.000428230092425147, "loss": 2.4487, "step": 11199 }, { "crossentropy": 2.606143832206726, "epoch": 0.609043204002284, "grad_norm": 0.031809110194444656, "grad_norm_var": 1.4823131676416334e-06, "learning_rate": 0.0004277260405922989, "loss": 2.6061, "step": 11200 }, { "crossentropy": 2.573448896408081, "epoch": 0.6090975828597841, "grad_norm": 0.030985398218035698, "grad_norm_var": 1.4224135951441114e-06, "learning_rate": 0.00042722227232761976, "loss": 2.5734, "step": 11201 }, { "crossentropy": 2.4391682147979736, "epoch": 0.6091519617172844, "grad_norm": 0.047558631747961044, "grad_norm_var": 1.6796771790680827e-05, "learning_rate": 0.00042671878766235074, "loss": 2.4392, "step": 11202 }, { "crossentropy": 2.4516676664352417, "epoch": 0.6092063405747845, "grad_norm": 0.030630389228463173, "grad_norm_var": 1.700398581340138e-05, "learning_rate": 0.00042621558662772065, "loss": 2.4517, "step": 11203 }, { "crossentropy": 2.476908802986145, "epoch": 0.6092607194322848, "grad_norm": 0.031079232692718506, "grad_norm_var": 1.690424453831991e-05, "learning_rate": 0.0004257126692549346, "loss": 2.4769, "step": 11204 }, { "crossentropy": 2.3947904109954834, "epoch": 0.6093150982897849, "grad_norm": 0.032213300466537476, "grad_norm_var": 1.6744751098353192e-05, "learning_rate": 0.0004252100355751842, "loss": 2.3948, "step": 11205 }, { "crossentropy": 2.5430866479873657, "epoch": 0.6093694771472852, "grad_norm": 0.03411497920751572, "grad_norm_var": 1.679619697970967e-05, "learning_rate": 0.00042470768561964247, "loss": 2.5431, "step": 11206 }, { "crossentropy": 2.44619357585907, "epoch": 0.6094238560047853, "grad_norm": 0.031463656574487686, "grad_norm_var": 1.683489857017919e-05, "learning_rate": 0.0004242056194194643, "loss": 2.4462, "step": 11207 }, { "crossentropy": 2.5215595960617065, "epoch": 0.6094782348622856, "grad_norm": 0.03090847283601761, "grad_norm_var": 1.7005108370272724e-05, "learning_rate": 0.00042370383700578775, "loss": 2.5216, "step": 11208 }, { "crossentropy": 2.5171756744384766, "epoch": 0.6095326137197857, "grad_norm": 0.03647405281662941, "grad_norm_var": 1.7697550526973845e-05, "learning_rate": 0.00042320233840973276, "loss": 2.5172, "step": 11209 }, { "crossentropy": 2.4546420574188232, "epoch": 0.609586992577286, "grad_norm": 0.03315429016947746, "grad_norm_var": 1.7220179828176565e-05, "learning_rate": 0.00042270112366240176, "loss": 2.4546, "step": 11210 }, { "crossentropy": 2.512226700782776, "epoch": 0.6096413714347861, "grad_norm": 0.033737149089574814, "grad_norm_var": 1.7091669637516308e-05, "learning_rate": 0.00042220019279487985, "loss": 2.5122, "step": 11211 }, { "crossentropy": 2.509724259376526, "epoch": 0.6096957502922864, "grad_norm": 0.0309432540088892, "grad_norm_var": 1.7175575383739888e-05, "learning_rate": 0.000421699545838235, "loss": 2.5097, "step": 11212 }, { "crossentropy": 2.5223482847213745, "epoch": 0.6097501291497865, "grad_norm": 0.03157460317015648, "grad_norm_var": 1.7162235024137487e-05, "learning_rate": 0.000421199182823514, "loss": 2.5223, "step": 11213 }, { "crossentropy": 2.46023428440094, "epoch": 0.6098045080072868, "grad_norm": 0.033227935433387756, "grad_norm_var": 1.7109982572579525e-05, "learning_rate": 0.0004206991037817515, "loss": 2.4602, "step": 11214 }, { "crossentropy": 2.4668877124786377, "epoch": 0.609858886864787, "grad_norm": 0.030437756329774857, "grad_norm_var": 1.7367534036940427e-05, "learning_rate": 0.00042019930874396264, "loss": 2.4669, "step": 11215 }, { "crossentropy": 2.3981621265411377, "epoch": 0.6099132657222872, "grad_norm": 0.031263984739780426, "grad_norm_var": 1.748316827423524e-05, "learning_rate": 0.0004196997977411421, "loss": 2.3982, "step": 11216 }, { "crossentropy": 2.369547724723816, "epoch": 0.6099676445797874, "grad_norm": 0.030949944630265236, "grad_norm_var": 1.749329222855768e-05, "learning_rate": 0.0004192005708042695, "loss": 2.3695, "step": 11217 }, { "crossentropy": 2.465918183326721, "epoch": 0.6100220234372876, "grad_norm": 0.03210592642426491, "grad_norm_var": 2.644377555596636e-06, "learning_rate": 0.0004187016279643069, "loss": 2.4659, "step": 11218 }, { "crossentropy": 2.5226467847824097, "epoch": 0.6100764022947878, "grad_norm": 0.03214704990386963, "grad_norm_var": 2.4823761394635046e-06, "learning_rate": 0.00041820296925219826, "loss": 2.5226, "step": 11219 }, { "crossentropy": 2.5020065307617188, "epoch": 0.610130781152288, "grad_norm": 0.03158192336559296, "grad_norm_var": 2.4205548829189793e-06, "learning_rate": 0.00041770459469887003, "loss": 2.502, "step": 11220 }, { "crossentropy": 2.4478635787963867, "epoch": 0.6101851600097882, "grad_norm": 0.0319502167403698, "grad_norm_var": 2.426821974977194e-06, "learning_rate": 0.0004172065043352308, "loss": 2.4479, "step": 11221 }, { "crossentropy": 2.3615437746047974, "epoch": 0.6102395388672884, "grad_norm": 0.031324632465839386, "grad_norm_var": 2.220409066998735e-06, "learning_rate": 0.0004167086981921714, "loss": 2.3615, "step": 11222 }, { "crossentropy": 2.4447914361953735, "epoch": 0.6102939177247886, "grad_norm": 0.03216530382633209, "grad_norm_var": 2.193723146816142e-06, "learning_rate": 0.00041621117630056604, "loss": 2.4448, "step": 11223 }, { "crossentropy": 2.486512064933777, "epoch": 0.6103482965822888, "grad_norm": 0.03208056464791298, "grad_norm_var": 2.0899906578941117e-06, "learning_rate": 0.00041571393869127007, "loss": 2.4865, "step": 11224 }, { "crossentropy": 2.643149971961975, "epoch": 0.610402675439789, "grad_norm": 0.03255981579422951, "grad_norm_var": 8.142925485787299e-07, "learning_rate": 0.0004152169853951204, "loss": 2.6431, "step": 11225 }, { "crossentropy": 2.484549045562744, "epoch": 0.6104570542972892, "grad_norm": 0.032491620630025864, "grad_norm_var": 7.353560685785868e-07, "learning_rate": 0.0004147203164429403, "loss": 2.4845, "step": 11226 }, { "crossentropy": 2.540893316268921, "epoch": 0.6105114331547894, "grad_norm": 0.031592369079589844, "grad_norm_var": 5.000226886405871e-07, "learning_rate": 0.0004142239318655311, "loss": 2.5409, "step": 11227 }, { "crossentropy": 2.446417450904846, "epoch": 0.6105658120122897, "grad_norm": 0.030324699357151985, "grad_norm_var": 5.925172117703566e-07, "learning_rate": 0.00041372783169367765, "loss": 2.4464, "step": 11228 }, { "crossentropy": 2.458654284477234, "epoch": 0.6106201908697898, "grad_norm": 0.03137258440256119, "grad_norm_var": 5.994192445355275e-07, "learning_rate": 0.00041323201595814873, "loss": 2.4587, "step": 11229 }, { "crossentropy": 2.5656856298446655, "epoch": 0.6106745697272901, "grad_norm": 0.03204365819692612, "grad_norm_var": 4.49523696767889e-07, "learning_rate": 0.00041273648468969417, "loss": 2.5657, "step": 11230 }, { "crossentropy": 2.6165367364883423, "epoch": 0.6107289485847902, "grad_norm": 0.03236009180545807, "grad_norm_var": 3.699000296716881e-07, "learning_rate": 0.00041224123791904654, "loss": 2.6165, "step": 11231 }, { "crossentropy": 2.5363831520080566, "epoch": 0.6107833274422905, "grad_norm": 0.03161315619945526, "grad_norm_var": 3.539782643754522e-07, "learning_rate": 0.0004117462756769186, "loss": 2.5364, "step": 11232 }, { "crossentropy": 2.535640835762024, "epoch": 0.6108377062997906, "grad_norm": 0.03277868404984474, "grad_norm_var": 3.5780493607115844e-07, "learning_rate": 0.00041125159799400957, "loss": 2.5356, "step": 11233 }, { "crossentropy": 2.435998797416687, "epoch": 0.6108920851572909, "grad_norm": 0.03238995373249054, "grad_norm_var": 3.7042694564780264e-07, "learning_rate": 0.000410757204901, "loss": 2.436, "step": 11234 }, { "crossentropy": 2.5970832109451294, "epoch": 0.610946464014791, "grad_norm": 0.03243101388216019, "grad_norm_var": 3.8392991579433377e-07, "learning_rate": 0.00041026309642854864, "loss": 2.5971, "step": 11235 }, { "crossentropy": 2.5234144926071167, "epoch": 0.6110008428722913, "grad_norm": 0.029928231611847878, "grad_norm_var": 6.34081145400404e-07, "learning_rate": 0.0004097692726072999, "loss": 2.5234, "step": 11236 }, { "crossentropy": 2.5032821893692017, "epoch": 0.6110552217297914, "grad_norm": 0.03166346997022629, "grad_norm_var": 6.349264016445102e-07, "learning_rate": 0.0004092757334678837, "loss": 2.5033, "step": 11237 }, { "crossentropy": 2.553426504135132, "epoch": 0.6111096005872917, "grad_norm": 0.03173311799764633, "grad_norm_var": 6.183756275409656e-07, "learning_rate": 0.00040878247904090504, "loss": 2.5534, "step": 11238 }, { "crossentropy": 2.496281862258911, "epoch": 0.6111639794447918, "grad_norm": 0.03145292401313782, "grad_norm_var": 6.197191910750054e-07, "learning_rate": 0.0004082895093569572, "loss": 2.4963, "step": 11239 }, { "crossentropy": 2.4243714809417725, "epoch": 0.6112183583022921, "grad_norm": 0.03133044391870499, "grad_norm_var": 6.269255163638779e-07, "learning_rate": 0.0004077968244466129, "loss": 2.4244, "step": 11240 }, { "crossentropy": 2.4964178800582886, "epoch": 0.6112727371597922, "grad_norm": 0.032566558569669724, "grad_norm_var": 6.276527128324291e-07, "learning_rate": 0.00040730442434042834, "loss": 2.4964, "step": 11241 }, { "crossentropy": 2.476338028907776, "epoch": 0.6113271160172925, "grad_norm": 0.0309431254863739, "grad_norm_var": 6.253346273951109e-07, "learning_rate": 0.00040681230906894296, "loss": 2.4763, "step": 11242 }, { "crossentropy": 2.588338613510132, "epoch": 0.6113814948747927, "grad_norm": 0.031232070177793503, "grad_norm_var": 6.365892204274655e-07, "learning_rate": 0.0004063204786626734, "loss": 2.5883, "step": 11243 }, { "crossentropy": 2.506944417953491, "epoch": 0.6114358737322929, "grad_norm": 0.032878484576940536, "grad_norm_var": 5.979588667194413e-07, "learning_rate": 0.00040582893315212643, "loss": 2.5069, "step": 11244 }, { "crossentropy": 2.3721789121627808, "epoch": 0.6114902525897931, "grad_norm": 0.030110104009509087, "grad_norm_var": 7.686548543626992e-07, "learning_rate": 0.00040533767256778654, "loss": 2.3722, "step": 11245 }, { "crossentropy": 2.611955404281616, "epoch": 0.6115446314472933, "grad_norm": 0.031984247267246246, "grad_norm_var": 7.66279476185211e-07, "learning_rate": 0.00040484669694011987, "loss": 2.612, "step": 11246 }, { "crossentropy": 2.4423590898513794, "epoch": 0.6115990103047935, "grad_norm": 0.031549159437417984, "grad_norm_var": 7.373305499139629e-07, "learning_rate": 0.000404356006299576, "loss": 2.4424, "step": 11247 }, { "crossentropy": 2.510143518447876, "epoch": 0.6116533891622937, "grad_norm": 0.0329778790473938, "grad_norm_var": 8.449295714006595e-07, "learning_rate": 0.00040386560067659006, "loss": 2.5101, "step": 11248 }, { "crossentropy": 2.5971662998199463, "epoch": 0.6117077680197939, "grad_norm": 0.031148750334978104, "grad_norm_var": 7.867277056029682e-07, "learning_rate": 0.0004033754801015732, "loss": 2.5972, "step": 11249 }, { "crossentropy": 2.5571188926696777, "epoch": 0.6117621468772941, "grad_norm": 0.03340715914964676, "grad_norm_var": 9.524369638889858e-07, "learning_rate": 0.0004028856446049245, "loss": 2.5571, "step": 11250 }, { "crossentropy": 2.498443841934204, "epoch": 0.6118165257347943, "grad_norm": 0.031742848455905914, "grad_norm_var": 9.157448896350446e-07, "learning_rate": 0.0004023960942170213, "loss": 2.4984, "step": 11251 }, { "crossentropy": 2.3667508363723755, "epoch": 0.6118709045922945, "grad_norm": 0.030289174988865852, "grad_norm_var": 8.402782681298769e-07, "learning_rate": 0.0004019068289682265, "loss": 2.3668, "step": 11252 }, { "crossentropy": 2.415993094444275, "epoch": 0.6119252834497947, "grad_norm": 0.030759336426854134, "grad_norm_var": 8.943379154569547e-07, "learning_rate": 0.00040141784888888366, "loss": 2.416, "step": 11253 }, { "crossentropy": 2.4861894845962524, "epoch": 0.611979662307295, "grad_norm": 0.0314013734459877, "grad_norm_var": 8.96725315214702e-07, "learning_rate": 0.00040092915400931795, "loss": 2.4862, "step": 11254 }, { "crossentropy": 2.581143021583557, "epoch": 0.6120340411647951, "grad_norm": 0.03198136389255524, "grad_norm_var": 9.03050934330018e-07, "learning_rate": 0.000400440744359839, "loss": 2.5811, "step": 11255 }, { "crossentropy": 2.488244652748108, "epoch": 0.6120884200222954, "grad_norm": 0.034980326890945435, "grad_norm_var": 1.5831198138998991e-06, "learning_rate": 0.00039995261997073704, "loss": 2.4882, "step": 11256 }, { "crossentropy": 2.625553607940674, "epoch": 0.6121427988797955, "grad_norm": 0.032486844807863235, "grad_norm_var": 1.5761348146499463e-06, "learning_rate": 0.00039946478087228457, "loss": 2.6256, "step": 11257 }, { "crossentropy": 2.496448040008545, "epoch": 0.6121971777372958, "grad_norm": 0.029771970584988594, "grad_norm_var": 1.8061291652472777e-06, "learning_rate": 0.0003989772270947367, "loss": 2.4964, "step": 11258 }, { "crossentropy": 2.421982169151306, "epoch": 0.6122515565947959, "grad_norm": 0.03172948956489563, "grad_norm_var": 1.7843367027295734e-06, "learning_rate": 0.00039848995866833273, "loss": 2.422, "step": 11259 }, { "crossentropy": 2.4128520488739014, "epoch": 0.6123059354522962, "grad_norm": 0.03134120628237724, "grad_norm_var": 1.7160859899560561e-06, "learning_rate": 0.00039800297562329114, "loss": 2.4129, "step": 11260 }, { "crossentropy": 2.4703707695007324, "epoch": 0.6123603143097963, "grad_norm": 0.032068464905023575, "grad_norm_var": 1.5331120276811887e-06, "learning_rate": 0.0003975162779898145, "loss": 2.4704, "step": 11261 }, { "crossentropy": 2.4887224435806274, "epoch": 0.6124146931672966, "grad_norm": 0.06663820147514343, "grad_norm_var": 7.72037802300235e-05, "learning_rate": 0.00039702986579808754, "loss": 2.4887, "step": 11262 }, { "crossentropy": 2.5565168857574463, "epoch": 0.6124690720247967, "grad_norm": 0.0308831799775362, "grad_norm_var": 7.745064685392966e-05, "learning_rate": 0.0003965437390782767, "loss": 2.5565, "step": 11263 }, { "crossentropy": 2.5108642578125, "epoch": 0.612523450882297, "grad_norm": 0.033281389623880386, "grad_norm_var": 7.741603355641042e-05, "learning_rate": 0.0003960578978605317, "loss": 2.5109, "step": 11264 }, { "crossentropy": 2.5686017274856567, "epoch": 0.6125778297397971, "grad_norm": 0.03344675898551941, "grad_norm_var": 7.68741629142207e-05, "learning_rate": 0.00039557234217498297, "loss": 2.5686, "step": 11265 }, { "crossentropy": 2.4286513328552246, "epoch": 0.6126322085972974, "grad_norm": 0.031284086406230927, "grad_norm_var": 7.736278082821138e-05, "learning_rate": 0.0003950870720517458, "loss": 2.4287, "step": 11266 }, { "crossentropy": 2.4867981672286987, "epoch": 0.6126865874547975, "grad_norm": 0.03187045827507973, "grad_norm_var": 7.732530249663632e-05, "learning_rate": 0.0003946020875209144, "loss": 2.4868, "step": 11267 }, { "crossentropy": 2.4846670627593994, "epoch": 0.6127409663122978, "grad_norm": 0.03259163722395897, "grad_norm_var": 7.651333228302843e-05, "learning_rate": 0.0003941173886125693, "loss": 2.4847, "step": 11268 }, { "crossentropy": 2.6153491735458374, "epoch": 0.612795345169798, "grad_norm": 0.032839030027389526, "grad_norm_var": 7.58414352048361e-05, "learning_rate": 0.000393632975356768, "loss": 2.6153, "step": 11269 }, { "crossentropy": 2.456825613975525, "epoch": 0.6128497240272982, "grad_norm": 0.030632102862000465, "grad_norm_var": 7.617442254559356e-05, "learning_rate": 0.00039314884778355653, "loss": 2.4568, "step": 11270 }, { "crossentropy": 2.5152748823165894, "epoch": 0.6129041028847984, "grad_norm": 0.032120347023010254, "grad_norm_var": 7.61337904613859e-05, "learning_rate": 0.0003926650059229575, "loss": 2.5153, "step": 11271 }, { "crossentropy": 2.4894356727600098, "epoch": 0.6129584817422986, "grad_norm": 0.03231518715620041, "grad_norm_var": 7.631743663813135e-05, "learning_rate": 0.00039218144980497974, "loss": 2.4894, "step": 11272 }, { "crossentropy": 2.5764825344085693, "epoch": 0.6130128605997988, "grad_norm": 0.03233540803194046, "grad_norm_var": 7.635106394917079e-05, "learning_rate": 0.0003916981794596125, "loss": 2.5765, "step": 11273 }, { "crossentropy": 2.4037511348724365, "epoch": 0.613067239457299, "grad_norm": 0.03133165463805199, "grad_norm_var": 7.560891742420338e-05, "learning_rate": 0.000391215194916828, "loss": 2.4038, "step": 11274 }, { "crossentropy": 2.4679659605026245, "epoch": 0.6131216183147992, "grad_norm": 0.03219849243760109, "grad_norm_var": 7.547009551345726e-05, "learning_rate": 0.00039073249620658003, "loss": 2.468, "step": 11275 }, { "crossentropy": 2.5579981803894043, "epoch": 0.6131759971722994, "grad_norm": 0.0321798212826252, "grad_norm_var": 7.519454974002221e-05, "learning_rate": 0.0003902500833588057, "loss": 2.558, "step": 11276 }, { "crossentropy": 2.533384680747986, "epoch": 0.6132303760297996, "grad_norm": 0.030315130949020386, "grad_norm_var": 7.589691759251194e-05, "learning_rate": 0.00038976795640342343, "loss": 2.5334, "step": 11277 }, { "crossentropy": 2.565589427947998, "epoch": 0.6132847548872998, "grad_norm": 0.032752808183431625, "grad_norm_var": 8.385441816187444e-07, "learning_rate": 0.0003892861153703342, "loss": 2.5656, "step": 11278 }, { "crossentropy": 2.5403796434402466, "epoch": 0.6133391337448, "grad_norm": 0.03103114478290081, "grad_norm_var": 8.174137255400592e-07, "learning_rate": 0.000388804560289423, "loss": 2.5404, "step": 11279 }, { "crossentropy": 2.4592604637145996, "epoch": 0.6133935126023002, "grad_norm": 0.031362954527139664, "grad_norm_var": 7.280703986737094e-07, "learning_rate": 0.00038832329119055153, "loss": 2.4593, "step": 11280 }, { "crossentropy": 2.5699537992477417, "epoch": 0.6134478914598004, "grad_norm": 0.030837779864668846, "grad_norm_var": 6.19933050078356e-07, "learning_rate": 0.00038784230810357147, "loss": 2.57, "step": 11281 }, { "crossentropy": 2.4629822969436646, "epoch": 0.6135022703173006, "grad_norm": 0.03059096448123455, "grad_norm_var": 6.930058657400595e-07, "learning_rate": 0.00038736161105831226, "loss": 2.463, "step": 11282 }, { "crossentropy": 2.628265619277954, "epoch": 0.6135566491748008, "grad_norm": 0.03243231400847435, "grad_norm_var": 7.250144528271607e-07, "learning_rate": 0.0003868812000845845, "loss": 2.6283, "step": 11283 }, { "crossentropy": 2.4503661394119263, "epoch": 0.613611028032301, "grad_norm": 0.03170402720570564, "grad_norm_var": 6.736636764315129e-07, "learning_rate": 0.00038640107521218384, "loss": 2.4504, "step": 11284 }, { "crossentropy": 2.3862192630767822, "epoch": 0.6136654068898012, "grad_norm": 0.030677741393446922, "grad_norm_var": 6.333980006625045e-07, "learning_rate": 0.00038592123647088753, "loss": 2.3862, "step": 11285 }, { "crossentropy": 2.5753802061080933, "epoch": 0.6137197857473015, "grad_norm": 0.03341890126466751, "grad_norm_var": 7.773072247682066e-07, "learning_rate": 0.0003854416838904545, "loss": 2.5754, "step": 11286 }, { "crossentropy": 2.4438154697418213, "epoch": 0.6137741646048016, "grad_norm": 0.030419781804084778, "grad_norm_var": 8.684768503158224e-07, "learning_rate": 0.0003849624175006256, "loss": 2.4438, "step": 11287 }, { "crossentropy": 2.5414609909057617, "epoch": 0.6138285434623019, "grad_norm": 0.03191816061735153, "grad_norm_var": 8.414751301763848e-07, "learning_rate": 0.00038448343733112566, "loss": 2.5415, "step": 11288 }, { "crossentropy": 2.381280541419983, "epoch": 0.613882922319802, "grad_norm": 0.030678875744342804, "grad_norm_var": 8.492684121961038e-07, "learning_rate": 0.00038400474341165945, "loss": 2.3813, "step": 11289 }, { "crossentropy": 2.449218988418579, "epoch": 0.6139373011773023, "grad_norm": 0.03225962817668915, "grad_norm_var": 8.834156929730015e-07, "learning_rate": 0.00038352633577191654, "loss": 2.4492, "step": 11290 }, { "crossentropy": 2.5369768142700195, "epoch": 0.6139916800348024, "grad_norm": 0.031406741589307785, "grad_norm_var": 8.539940934480332e-07, "learning_rate": 0.0003830482144415648, "loss": 2.537, "step": 11291 }, { "crossentropy": 2.5746097564697266, "epoch": 0.6140460588923027, "grad_norm": 0.04233342036604881, "grad_norm_var": 8.218937460937926e-06, "learning_rate": 0.00038257037945025886, "loss": 2.5746, "step": 11292 }, { "crossentropy": 2.529116988182068, "epoch": 0.6141004377498028, "grad_norm": 0.0308114942163229, "grad_norm_var": 8.113975014911719e-06, "learning_rate": 0.00038209283082763445, "loss": 2.5291, "step": 11293 }, { "crossentropy": 2.486280679702759, "epoch": 0.6141548166073031, "grad_norm": 0.031032217666506767, "grad_norm_var": 8.16410493820613e-06, "learning_rate": 0.0003816155686033063, "loss": 2.4863, "step": 11294 }, { "crossentropy": 2.455820083618164, "epoch": 0.6142091954648032, "grad_norm": 0.03379959985613823, "grad_norm_var": 8.264359506250406e-06, "learning_rate": 0.0003811385928068739, "loss": 2.4558, "step": 11295 }, { "crossentropy": 2.5292608737945557, "epoch": 0.6142635743223035, "grad_norm": 0.03223947808146477, "grad_norm_var": 8.21101279490515e-06, "learning_rate": 0.00038066190346792154, "loss": 2.5293, "step": 11296 }, { "crossentropy": 2.469179153442383, "epoch": 0.6143179531798036, "grad_norm": 0.031111372634768486, "grad_norm_var": 8.16289534423067e-06, "learning_rate": 0.0003801855006160104, "loss": 2.4692, "step": 11297 }, { "crossentropy": 2.5968236923217773, "epoch": 0.6143723320373039, "grad_norm": 0.03091050498187542, "grad_norm_var": 8.096370384076201e-06, "learning_rate": 0.00037970938428068816, "loss": 2.5968, "step": 11298 }, { "crossentropy": 2.5425983667373657, "epoch": 0.614426710894804, "grad_norm": 0.031567543745040894, "grad_norm_var": 8.130406390062039e-06, "learning_rate": 0.00037923355449148157, "loss": 2.5426, "step": 11299 }, { "crossentropy": 2.4730085134506226, "epoch": 0.6144810897523043, "grad_norm": 0.03294239565730095, "grad_norm_var": 8.133117481326835e-06, "learning_rate": 0.00037875801127790264, "loss": 2.473, "step": 11300 }, { "crossentropy": 2.5528327226638794, "epoch": 0.6145354686098045, "grad_norm": 0.031211448833346367, "grad_norm_var": 8.032241483917054e-06, "learning_rate": 0.00037828275466944393, "loss": 2.5528, "step": 11301 }, { "crossentropy": 2.4873099327087402, "epoch": 0.6145898474673047, "grad_norm": 0.03271918743848801, "grad_norm_var": 7.965809479471687e-06, "learning_rate": 0.0003778077846955785, "loss": 2.4873, "step": 11302 }, { "crossentropy": 2.4594709873199463, "epoch": 0.6146442263248049, "grad_norm": 0.033747825771570206, "grad_norm_var": 7.808143041865149e-06, "learning_rate": 0.0003773331013857656, "loss": 2.4595, "step": 11303 }, { "crossentropy": 2.5553205013275146, "epoch": 0.6146986051823051, "grad_norm": 0.03090537339448929, "grad_norm_var": 7.956644906811093e-06, "learning_rate": 0.00037685870476944494, "loss": 2.5553, "step": 11304 }, { "crossentropy": 2.4569296836853027, "epoch": 0.6147529840398053, "grad_norm": 0.03259654715657234, "grad_norm_var": 7.72600399873978e-06, "learning_rate": 0.0003763845948760364, "loss": 2.4569, "step": 11305 }, { "crossentropy": 2.433039426803589, "epoch": 0.6148073628973055, "grad_norm": 0.031055165454745293, "grad_norm_var": 7.871284276326125e-06, "learning_rate": 0.00037591077173494415, "loss": 2.433, "step": 11306 }, { "crossentropy": 2.4972022771835327, "epoch": 0.6148617417548057, "grad_norm": 0.03230324387550354, "grad_norm_var": 7.787919393345034e-06, "learning_rate": 0.00037543723537555586, "loss": 2.4972, "step": 11307 }, { "crossentropy": 2.627129554748535, "epoch": 0.6149161206123059, "grad_norm": 0.03236698359251022, "grad_norm_var": 1.0356891322898151e-06, "learning_rate": 0.00037496398582723855, "loss": 2.6271, "step": 11308 }, { "crossentropy": 2.533022880554199, "epoch": 0.6149704994698061, "grad_norm": 0.030414467677474022, "grad_norm_var": 1.1062082381778823e-06, "learning_rate": 0.00037449102311934267, "loss": 2.533, "step": 11309 }, { "crossentropy": 2.5045887231826782, "epoch": 0.6150248783273063, "grad_norm": 0.03181612119078636, "grad_norm_var": 1.0504949296177702e-06, "learning_rate": 0.00037401834728120086, "loss": 2.5046, "step": 11310 }, { "crossentropy": 2.550342082977295, "epoch": 0.6150792571848065, "grad_norm": 0.030382856726646423, "grad_norm_var": 9.519570575891445e-07, "learning_rate": 0.0003735459583421291, "loss": 2.5503, "step": 11311 }, { "crossentropy": 2.553563952445984, "epoch": 0.6151336360423068, "grad_norm": 0.03097180649638176, "grad_norm_var": 9.727300094107868e-07, "learning_rate": 0.000373073856331424, "loss": 2.5536, "step": 11312 }, { "crossentropy": 2.4893518686294556, "epoch": 0.6151880148998069, "grad_norm": 0.030816983431577682, "grad_norm_var": 1.000816705657807e-06, "learning_rate": 0.0003726020412783632, "loss": 2.4894, "step": 11313 }, { "crossentropy": 2.6095809936523438, "epoch": 0.6152423937573072, "grad_norm": 0.03233129531145096, "grad_norm_var": 9.83004159705554e-07, "learning_rate": 0.0003721305132122105, "loss": 2.6096, "step": 11314 }, { "crossentropy": 2.49786114692688, "epoch": 0.6152967726148073, "grad_norm": 0.032488539814949036, "grad_norm_var": 1.0124677809818787e-06, "learning_rate": 0.00037165927216220975, "loss": 2.4979, "step": 11315 }, { "crossentropy": 2.45782470703125, "epoch": 0.6153511514723076, "grad_norm": 0.03142062574625015, "grad_norm_var": 9.288361971299668e-07, "learning_rate": 0.0003711883181575848, "loss": 2.4578, "step": 11316 }, { "crossentropy": 2.526463747024536, "epoch": 0.6154055303298078, "grad_norm": 0.03215836361050606, "grad_norm_var": 9.204447192531194e-07, "learning_rate": 0.00037071765122754455, "loss": 2.5265, "step": 11317 }, { "crossentropy": 2.5123205184936523, "epoch": 0.615459909187308, "grad_norm": 0.031847599893808365, "grad_norm_var": 8.588909861821648e-07, "learning_rate": 0.00037024727140128055, "loss": 2.5123, "step": 11318 }, { "crossentropy": 2.549424886703491, "epoch": 0.6155142880448082, "grad_norm": 0.0314079187810421, "grad_norm_var": 5.704562912022445e-07, "learning_rate": 0.00036977717870796456, "loss": 2.5494, "step": 11319 }, { "crossentropy": 2.4537527561187744, "epoch": 0.6155686669023084, "grad_norm": 0.03165339305996895, "grad_norm_var": 5.381183391762907e-07, "learning_rate": 0.00036930737317675044, "loss": 2.4538, "step": 11320 }, { "crossentropy": 2.4703696966171265, "epoch": 0.6156230457598086, "grad_norm": 0.031492386013269424, "grad_norm_var": 4.7157759428395387e-07, "learning_rate": 0.00036883785483677657, "loss": 2.4704, "step": 11321 }, { "crossentropy": 2.450606942176819, "epoch": 0.6156774246173088, "grad_norm": 0.032165732234716415, "grad_norm_var": 4.7420730026139353e-07, "learning_rate": 0.00036836862371716076, "loss": 2.4506, "step": 11322 }, { "crossentropy": 2.5658371448516846, "epoch": 0.615731803474809, "grad_norm": 0.03200184553861618, "grad_norm_var": 4.527248901077385e-07, "learning_rate": 0.0003678996798470052, "loss": 2.5658, "step": 11323 }, { "crossentropy": 2.4986987113952637, "epoch": 0.6157861823323092, "grad_norm": 0.03249932453036308, "grad_norm_var": 4.6720229986775956e-07, "learning_rate": 0.00036743102325539293, "loss": 2.4987, "step": 11324 }, { "crossentropy": 2.499051094055176, "epoch": 0.6158405611898095, "grad_norm": 0.03638802841305733, "grad_norm_var": 1.7397662859685557e-06, "learning_rate": 0.0003669626539713894, "loss": 2.4991, "step": 11325 }, { "crossentropy": 2.4816198348999023, "epoch": 0.6158949400473096, "grad_norm": 0.03134181350469589, "grad_norm_var": 1.7648341957406573e-06, "learning_rate": 0.00036649457202404367, "loss": 2.4816, "step": 11326 }, { "crossentropy": 2.5290335416793823, "epoch": 0.6159493189048099, "grad_norm": 0.031446799635887146, "grad_norm_var": 1.6117750703666574e-06, "learning_rate": 0.00036602677744238367, "loss": 2.529, "step": 11327 }, { "crossentropy": 2.516040325164795, "epoch": 0.61600369776231, "grad_norm": 0.030243001878261566, "grad_norm_var": 1.7475124204297324e-06, "learning_rate": 0.00036555927025542226, "loss": 2.516, "step": 11328 }, { "crossentropy": 2.487818956375122, "epoch": 0.6160580766198103, "grad_norm": 0.030120791867375374, "grad_norm_var": 1.8858999478711524e-06, "learning_rate": 0.00036509205049215567, "loss": 2.4878, "step": 11329 }, { "crossentropy": 2.5369807481765747, "epoch": 0.6161124554773104, "grad_norm": 0.03185108304023743, "grad_norm_var": 1.8751284940389147e-06, "learning_rate": 0.00036462511818155855, "loss": 2.537, "step": 11330 }, { "crossentropy": 2.5394036769866943, "epoch": 0.6161668343348107, "grad_norm": 0.031492095440626144, "grad_norm_var": 1.8600484986136158e-06, "learning_rate": 0.00036415847335258934, "loss": 2.5394, "step": 11331 }, { "crossentropy": 2.627823829650879, "epoch": 0.6162212131923108, "grad_norm": 0.03264148160815239, "grad_norm_var": 1.88401418115557e-06, "learning_rate": 0.0003636921160341905, "loss": 2.6278, "step": 11332 }, { "crossentropy": 2.4848111867904663, "epoch": 0.6162755920498111, "grad_norm": 0.031582463532686234, "grad_norm_var": 1.8865917748847692e-06, "learning_rate": 0.0003632260462552833, "loss": 2.4848, "step": 11333 }, { "crossentropy": 2.500171184539795, "epoch": 0.6163299709073112, "grad_norm": 0.030931783840060234, "grad_norm_var": 1.943698866844051e-06, "learning_rate": 0.0003627602640447747, "loss": 2.5002, "step": 11334 }, { "crossentropy": 2.425475597381592, "epoch": 0.6163843497648115, "grad_norm": 0.030695194378495216, "grad_norm_var": 2.0154385896570578e-06, "learning_rate": 0.00036229476943155036, "loss": 2.4255, "step": 11335 }, { "crossentropy": 2.556177854537964, "epoch": 0.6164387286223116, "grad_norm": 0.03334639593958855, "grad_norm_var": 2.165051963500539e-06, "learning_rate": 0.00036182956244448116, "loss": 2.5562, "step": 11336 }, { "crossentropy": 2.420334219932556, "epoch": 0.6164931074798119, "grad_norm": 0.030975718051195145, "grad_norm_var": 2.209128282604555e-06, "learning_rate": 0.00036136464311241844, "loss": 2.4203, "step": 11337 }, { "crossentropy": 2.556939125061035, "epoch": 0.616547486337312, "grad_norm": 0.03190396726131439, "grad_norm_var": 2.202660669944192e-06, "learning_rate": 0.0003609000114641964, "loss": 2.5569, "step": 11338 }, { "crossentropy": 2.5855021476745605, "epoch": 0.6166018651948123, "grad_norm": 0.03293973580002785, "grad_norm_var": 2.2777068745750134e-06, "learning_rate": 0.00036043566752862964, "loss": 2.5855, "step": 11339 }, { "crossentropy": 2.300764799118042, "epoch": 0.6166562440523125, "grad_norm": 0.029058581218123436, "grad_norm_var": 2.742667738783675e-06, "learning_rate": 0.0003599716113345186, "loss": 2.3008, "step": 11340 }, { "crossentropy": 2.53536057472229, "epoch": 0.6167106229098127, "grad_norm": 0.032245174050331116, "grad_norm_var": 1.21747205647967e-06, "learning_rate": 0.0003595078429106413, "loss": 2.5354, "step": 11341 }, { "crossentropy": 2.477362275123596, "epoch": 0.6167650017673129, "grad_norm": 0.030538253486156464, "grad_norm_var": 1.2668492467236374e-06, "learning_rate": 0.00035904436228576144, "loss": 2.4774, "step": 11342 }, { "crossentropy": 2.504316210746765, "epoch": 0.6168193806248131, "grad_norm": 0.03095044568181038, "grad_norm_var": 1.2775472524668868e-06, "learning_rate": 0.0003585811694886232, "loss": 2.5043, "step": 11343 }, { "crossentropy": 2.4409292936325073, "epoch": 0.6168737594823133, "grad_norm": 0.031142570078372955, "grad_norm_var": 1.1959760908623769e-06, "learning_rate": 0.00035811826454795417, "loss": 2.4409, "step": 11344 }, { "crossentropy": 2.440220832824707, "epoch": 0.6169281383398135, "grad_norm": 0.03130502253770828, "grad_norm_var": 1.0814872811886472e-06, "learning_rate": 0.0003576556474924625, "loss": 2.4402, "step": 11345 }, { "crossentropy": 2.4888654947280884, "epoch": 0.6169825171973137, "grad_norm": 0.030262799933552742, "grad_norm_var": 1.1595085201959154e-06, "learning_rate": 0.00035719331835083913, "loss": 2.4889, "step": 11346 }, { "crossentropy": 2.6433522701263428, "epoch": 0.6170368960548139, "grad_norm": 0.03200812637805939, "grad_norm_var": 1.1841579258366671e-06, "learning_rate": 0.00035673127715175835, "loss": 2.6434, "step": 11347 }, { "crossentropy": 2.458452582359314, "epoch": 0.6170912749123141, "grad_norm": 0.032878246158361435, "grad_norm_var": 1.2266013770856342e-06, "learning_rate": 0.0003562695239238745, "loss": 2.4585, "step": 11348 }, { "crossentropy": 2.6384897232055664, "epoch": 0.6171456537698143, "grad_norm": 0.033359017223119736, "grad_norm_var": 1.4616851895472848e-06, "learning_rate": 0.00035580805869582624, "loss": 2.6385, "step": 11349 }, { "crossentropy": 2.5790607929229736, "epoch": 0.6172000326273145, "grad_norm": 0.03187892213463783, "grad_norm_var": 1.4417246180629344e-06, "learning_rate": 0.00035534688149623075, "loss": 2.5791, "step": 11350 }, { "crossentropy": 2.3778640031814575, "epoch": 0.6172544114848147, "grad_norm": 0.030893292278051376, "grad_norm_var": 1.4204632228134729e-06, "learning_rate": 0.0003548859923536929, "loss": 2.3779, "step": 11351 }, { "crossentropy": 2.2903698682785034, "epoch": 0.6173087903423149, "grad_norm": 0.030933663249015808, "grad_norm_var": 1.224216137118e-06, "learning_rate": 0.00035442539129679483, "loss": 2.2904, "step": 11352 }, { "crossentropy": 2.5746705532073975, "epoch": 0.6173631691998152, "grad_norm": 0.03146209940314293, "grad_norm_var": 1.207945924729982e-06, "learning_rate": 0.00035396507835410264, "loss": 2.5747, "step": 11353 }, { "crossentropy": 2.5769015550613403, "epoch": 0.6174175480573153, "grad_norm": 0.03244246169924736, "grad_norm_var": 1.256151352646793e-06, "learning_rate": 0.0003535050535541645, "loss": 2.5769, "step": 11354 }, { "crossentropy": 2.5875929594039917, "epoch": 0.6174719269148156, "grad_norm": 0.03356076404452324, "grad_norm_var": 1.3979273051009655e-06, "learning_rate": 0.00035304531692551157, "loss": 2.5876, "step": 11355 }, { "crossentropy": 2.419751286506653, "epoch": 0.6175263057723157, "grad_norm": 0.03208547085523605, "grad_norm_var": 9.62043397887769e-07, "learning_rate": 0.00035258586849665544, "loss": 2.4198, "step": 11356 }, { "crossentropy": 2.5344265699386597, "epoch": 0.617580684629816, "grad_norm": 0.031696218997240067, "grad_norm_var": 9.443885792403585e-07, "learning_rate": 0.00035212670829609107, "loss": 2.5344, "step": 11357 }, { "crossentropy": 2.5330309867858887, "epoch": 0.6176350634873161, "grad_norm": 0.03197302669286728, "grad_norm_var": 8.484438683915013e-07, "learning_rate": 0.00035166783635229515, "loss": 2.533, "step": 11358 }, { "crossentropy": 2.591813564300537, "epoch": 0.6176894423448164, "grad_norm": 0.0327962301671505, "grad_norm_var": 8.518026917029986e-07, "learning_rate": 0.00035120925269372603, "loss": 2.5918, "step": 11359 }, { "crossentropy": 2.6262972354888916, "epoch": 0.6177438212023165, "grad_norm": 0.03204751014709473, "grad_norm_var": 8.094985529110878e-07, "learning_rate": 0.000350750957348826, "loss": 2.6263, "step": 11360 }, { "crossentropy": 2.6056686639785767, "epoch": 0.6177982000598168, "grad_norm": 0.031702231615781784, "grad_norm_var": 7.839333716023646e-07, "learning_rate": 0.00035029295034601563, "loss": 2.6057, "step": 11361 }, { "crossentropy": 2.563286542892456, "epoch": 0.6178525789173169, "grad_norm": 0.032034557312726974, "grad_norm_var": 5.700364604898621e-07, "learning_rate": 0.0003498352317137027, "loss": 2.5633, "step": 11362 }, { "crossentropy": 2.525391936302185, "epoch": 0.6179069577748172, "grad_norm": 0.03086378425359726, "grad_norm_var": 6.673473309251031e-07, "learning_rate": 0.0003493778014802734, "loss": 2.5254, "step": 11363 }, { "crossentropy": 2.5208956003189087, "epoch": 0.6179613366323173, "grad_norm": 0.030731098726391792, "grad_norm_var": 7.149274638136438e-07, "learning_rate": 0.00034892065967409683, "loss": 2.5209, "step": 11364 }, { "crossentropy": 2.5545636415481567, "epoch": 0.6180157154898176, "grad_norm": 0.03113280050456524, "grad_norm_var": 5.927211266104048e-07, "learning_rate": 0.0003484638063235246, "loss": 2.5546, "step": 11365 }, { "crossentropy": 2.4168615341186523, "epoch": 0.6180700943473177, "grad_norm": 0.03277420625090599, "grad_norm_var": 6.564597821343345e-07, "learning_rate": 0.00034800724145689024, "loss": 2.4169, "step": 11366 }, { "crossentropy": 2.50643253326416, "epoch": 0.618124473204818, "grad_norm": 0.031405527144670486, "grad_norm_var": 6.095263553268851e-07, "learning_rate": 0.0003475509651025099, "loss": 2.5064, "step": 11367 }, { "crossentropy": 2.611703634262085, "epoch": 0.6181788520623182, "grad_norm": 0.03225133195519447, "grad_norm_var": 5.565942441852767e-07, "learning_rate": 0.00034709497728868113, "loss": 2.6117, "step": 11368 }, { "crossentropy": 2.4694483280181885, "epoch": 0.6182332309198184, "grad_norm": 0.031388621777296066, "grad_norm_var": 5.615642778476759e-07, "learning_rate": 0.0003466392780436839, "loss": 2.4694, "step": 11369 }, { "crossentropy": 2.632403254508972, "epoch": 0.6182876097773186, "grad_norm": 0.032021209597587585, "grad_norm_var": 5.438922105896818e-07, "learning_rate": 0.00034618386739578065, "loss": 2.6324, "step": 11370 }, { "crossentropy": 2.587318778038025, "epoch": 0.6183419886348188, "grad_norm": 0.03365366905927658, "grad_norm_var": 5.649541043671602e-07, "learning_rate": 0.0003457287453732155, "loss": 2.5873, "step": 11371 }, { "crossentropy": 2.499471068382263, "epoch": 0.618396367492319, "grad_norm": 0.03100152686238289, "grad_norm_var": 6.130048115235778e-07, "learning_rate": 0.0003452739120042142, "loss": 2.4995, "step": 11372 }, { "crossentropy": 2.6351351737976074, "epoch": 0.6184507463498192, "grad_norm": 0.03359729424118996, "grad_norm_var": 8.019085944133488e-07, "learning_rate": 0.0003448193673169842, "loss": 2.6351, "step": 11373 }, { "crossentropy": 2.503792881965637, "epoch": 0.6185051252073194, "grad_norm": 0.03166645020246506, "grad_norm_var": 8.072877929249725e-07, "learning_rate": 0.0003443651113397184, "loss": 2.5038, "step": 11374 }, { "crossentropy": 2.595561385154724, "epoch": 0.6185595040648196, "grad_norm": 0.03146437183022499, "grad_norm_var": 7.664142222871609e-07, "learning_rate": 0.00034391114410058766, "loss": 2.5956, "step": 11375 }, { "crossentropy": 2.4947903156280518, "epoch": 0.6186138829223198, "grad_norm": 0.03179711103439331, "grad_norm_var": 7.640229576545583e-07, "learning_rate": 0.00034345746562774613, "loss": 2.4948, "step": 11376 }, { "crossentropy": 2.5807650089263916, "epoch": 0.61866826177982, "grad_norm": 0.03280522674322128, "grad_norm_var": 8.193783944882478e-07, "learning_rate": 0.0003430040759493319, "loss": 2.5808, "step": 11377 }, { "crossentropy": 2.5147705078125, "epoch": 0.6187226406373202, "grad_norm": 0.030768966302275658, "grad_norm_var": 8.987710602791395e-07, "learning_rate": 0.000342550975093463, "loss": 2.5148, "step": 11378 }, { "crossentropy": 2.508605480194092, "epoch": 0.6187770194948204, "grad_norm": 0.03136073797941208, "grad_norm_var": 8.500054239931095e-07, "learning_rate": 0.00034209816308824136, "loss": 2.5086, "step": 11379 }, { "crossentropy": 2.5772629976272583, "epoch": 0.6188313983523206, "grad_norm": 0.03210252523422241, "grad_norm_var": 7.604413098415604e-07, "learning_rate": 0.00034164563996174735, "loss": 2.5773, "step": 11380 }, { "crossentropy": 2.5723360776901245, "epoch": 0.6188857772098209, "grad_norm": 0.03146414831280708, "grad_norm_var": 7.312228942037696e-07, "learning_rate": 0.00034119340574204846, "loss": 2.5723, "step": 11381 }, { "crossentropy": 2.599989414215088, "epoch": 0.618940156067321, "grad_norm": 0.03127293288707733, "grad_norm_var": 7.111455490886222e-07, "learning_rate": 0.0003407414604571923, "loss": 2.6, "step": 11382 }, { "crossentropy": 2.5229324102401733, "epoch": 0.6189945349248213, "grad_norm": 0.030889680609107018, "grad_norm_var": 7.60159850547668e-07, "learning_rate": 0.0003402898041352059, "loss": 2.5229, "step": 11383 }, { "crossentropy": 2.5603597164154053, "epoch": 0.6190489137823214, "grad_norm": 0.03299926966428757, "grad_norm_var": 8.357329692662563e-07, "learning_rate": 0.0003398384368041013, "loss": 2.5604, "step": 11384 }, { "crossentropy": 2.574575662612915, "epoch": 0.6191032926398217, "grad_norm": 0.03157322108745575, "grad_norm_var": 8.255010924264223e-07, "learning_rate": 0.0003393873584918733, "loss": 2.5746, "step": 11385 }, { "crossentropy": 2.4478503465652466, "epoch": 0.6191576714973218, "grad_norm": 0.03053469769656658, "grad_norm_var": 9.400594708006918e-07, "learning_rate": 0.00033893656922649506, "loss": 2.4479, "step": 11386 }, { "crossentropy": 2.475093126296997, "epoch": 0.6192120503548221, "grad_norm": 0.03265557438135147, "grad_norm_var": 7.568994065872917e-07, "learning_rate": 0.00033848606903592594, "loss": 2.4751, "step": 11387 }, { "crossentropy": 2.43424654006958, "epoch": 0.6192664292123222, "grad_norm": 0.03380359709262848, "grad_norm_var": 9.690679953269168e-07, "learning_rate": 0.00033803585794810463, "loss": 2.4342, "step": 11388 }, { "crossentropy": 2.529985785484314, "epoch": 0.6193208080698225, "grad_norm": 0.03136279433965683, "grad_norm_var": 7.820747406813196e-07, "learning_rate": 0.0003375859359909533, "loss": 2.53, "step": 11389 }, { "crossentropy": 2.492349624633789, "epoch": 0.6193751869273226, "grad_norm": 0.031002286821603775, "grad_norm_var": 8.199283313865679e-07, "learning_rate": 0.0003371363031923752, "loss": 2.4923, "step": 11390 }, { "crossentropy": 2.4777551889419556, "epoch": 0.6194295657848229, "grad_norm": 0.031669579446315765, "grad_norm_var": 8.149894361875042e-07, "learning_rate": 0.0003366869595802563, "loss": 2.4778, "step": 11391 }, { "crossentropy": 2.5079060792922974, "epoch": 0.619483944642323, "grad_norm": 0.030570903792977333, "grad_norm_var": 9.018981761717166e-07, "learning_rate": 0.00033623790518246487, "loss": 2.5079, "step": 11392 }, { "crossentropy": 2.537869691848755, "epoch": 0.6195383234998233, "grad_norm": 0.030761249363422394, "grad_norm_var": 8.556079623637964e-07, "learning_rate": 0.00033578914002685146, "loss": 2.5379, "step": 11393 }, { "crossentropy": 2.577264904975891, "epoch": 0.6195927023573234, "grad_norm": 0.032905612140893936, "grad_norm_var": 9.185702789713529e-07, "learning_rate": 0.00033534066414124596, "loss": 2.5773, "step": 11394 }, { "crossentropy": 2.527804970741272, "epoch": 0.6196470812148237, "grad_norm": 0.03080226108431816, "grad_norm_var": 9.620643659810027e-07, "learning_rate": 0.00033489247755346286, "loss": 2.5278, "step": 11395 }, { "crossentropy": 2.427986264228821, "epoch": 0.6197014600723239, "grad_norm": 0.03090238757431507, "grad_norm_var": 9.793759782467104e-07, "learning_rate": 0.000334444580291301, "loss": 2.428, "step": 11396 }, { "crossentropy": 2.5817354917526245, "epoch": 0.6197558389298241, "grad_norm": 0.03182981163263321, "grad_norm_var": 9.824190637686484e-07, "learning_rate": 0.00033399697238253536, "loss": 2.5817, "step": 11397 }, { "crossentropy": 2.5687276124954224, "epoch": 0.6198102177873243, "grad_norm": 0.03170313686132431, "grad_norm_var": 9.754554824806597e-07, "learning_rate": 0.00033354965385492765, "loss": 2.5687, "step": 11398 }, { "crossentropy": 2.5845264196395874, "epoch": 0.6198645966448245, "grad_norm": 0.031707312911748886, "grad_norm_var": 9.373065881920165e-07, "learning_rate": 0.00033310262473621957, "loss": 2.5845, "step": 11399 }, { "crossentropy": 2.4935762882232666, "epoch": 0.6199189755023247, "grad_norm": 0.03154030442237854, "grad_norm_var": 8.125361308826493e-07, "learning_rate": 0.00033265588505413556, "loss": 2.4936, "step": 11400 }, { "crossentropy": 2.523786425590515, "epoch": 0.6199733543598249, "grad_norm": 0.03073192946612835, "grad_norm_var": 8.578458570392134e-07, "learning_rate": 0.00033220943483638297, "loss": 2.5238, "step": 11401 }, { "crossentropy": 2.5193933248519897, "epoch": 0.6200277332173251, "grad_norm": 0.03222663328051567, "grad_norm_var": 8.12181093655948e-07, "learning_rate": 0.0003317632741106491, "loss": 2.5194, "step": 11402 }, { "crossentropy": 2.3498247861862183, "epoch": 0.6200821120748253, "grad_norm": 0.03108915686607361, "grad_norm_var": 7.525830304696505e-07, "learning_rate": 0.00033131740290460456, "loss": 2.3498, "step": 11403 }, { "crossentropy": 2.4235053062438965, "epoch": 0.6201364909323255, "grad_norm": 0.03162027522921562, "grad_norm_var": 3.909943195426566e-07, "learning_rate": 0.0003308718212459022, "loss": 2.4235, "step": 11404 }, { "crossentropy": 2.5510250329971313, "epoch": 0.6201908697898257, "grad_norm": 0.03260565921664238, "grad_norm_var": 4.811078312220389e-07, "learning_rate": 0.00033042652916217717, "loss": 2.551, "step": 11405 }, { "crossentropy": 2.5715484619140625, "epoch": 0.6202452486473259, "grad_norm": 0.032422780990600586, "grad_norm_var": 5.168781862827648e-07, "learning_rate": 0.00032998152668104407, "loss": 2.5715, "step": 11406 }, { "crossentropy": 2.5035157203674316, "epoch": 0.6202996275048261, "grad_norm": 0.03171450272202492, "grad_norm_var": 5.176123828872034e-07, "learning_rate": 0.0003295368138301041, "loss": 2.5035, "step": 11407 }, { "crossentropy": 2.572995185852051, "epoch": 0.6203540063623263, "grad_norm": 0.03136148676276207, "grad_norm_var": 4.512687380167172e-07, "learning_rate": 0.0003290923906369364, "loss": 2.573, "step": 11408 }, { "crossentropy": 2.5464318990707397, "epoch": 0.6204083852198266, "grad_norm": 0.030807755887508392, "grad_norm_var": 4.460771714123272e-07, "learning_rate": 0.0003286482571291033, "loss": 2.5464, "step": 11409 }, { "crossentropy": 2.441628932952881, "epoch": 0.6204627640773267, "grad_norm": 0.031765520572662354, "grad_norm_var": 3.32371084717636e-07, "learning_rate": 0.0003282044133341505, "loss": 2.4416, "step": 11410 }, { "crossentropy": 2.5172985792160034, "epoch": 0.620517142934827, "grad_norm": 0.0331743098795414, "grad_norm_var": 4.4693368486294305e-07, "learning_rate": 0.0003277608592796044, "loss": 2.5173, "step": 11411 }, { "crossentropy": 2.593425154685974, "epoch": 0.6205715217923271, "grad_norm": 0.03196880966424942, "grad_norm_var": 4.0457364507509065e-07, "learning_rate": 0.0003273175949929741, "loss": 2.5934, "step": 11412 }, { "crossentropy": 2.402799129486084, "epoch": 0.6206259006498274, "grad_norm": 0.03130829334259033, "grad_norm_var": 4.1719346382100046e-07, "learning_rate": 0.0003268746205017503, "loss": 2.4028, "step": 11413 }, { "crossentropy": 2.528038740158081, "epoch": 0.6206802795073275, "grad_norm": 0.03174661844968796, "grad_norm_var": 4.171312973309891e-07, "learning_rate": 0.0003264319358334056, "loss": 2.528, "step": 11414 }, { "crossentropy": 2.4933968782424927, "epoch": 0.6207346583648278, "grad_norm": 0.030073009431362152, "grad_norm_var": 5.905257101276959e-07, "learning_rate": 0.0003259895410153946, "loss": 2.4934, "step": 11415 }, { "crossentropy": 2.613949418067932, "epoch": 0.6207890372223279, "grad_norm": 0.030682701617479324, "grad_norm_var": 6.47300419869554e-07, "learning_rate": 0.0003255474360751559, "loss": 2.6139, "step": 11416 }, { "crossentropy": 2.5754090547561646, "epoch": 0.6208434160798282, "grad_norm": 0.031417857855558395, "grad_norm_var": 5.990333024801878e-07, "learning_rate": 0.000325105621040106, "loss": 2.5754, "step": 11417 }, { "crossentropy": 2.4134639501571655, "epoch": 0.6208977949373283, "grad_norm": 0.0321037583053112, "grad_norm_var": 5.90105208414284e-07, "learning_rate": 0.0003246640959376473, "loss": 2.4135, "step": 11418 }, { "crossentropy": 2.506338596343994, "epoch": 0.6209521737948286, "grad_norm": 0.0323772132396698, "grad_norm_var": 6.032480619234948e-07, "learning_rate": 0.0003242228607951636, "loss": 2.5063, "step": 11419 }, { "crossentropy": 2.5456372499465942, "epoch": 0.6210065526523287, "grad_norm": 0.03155346214771271, "grad_norm_var": 6.042097513850667e-07, "learning_rate": 0.0003237819156400179, "loss": 2.5456, "step": 11420 }, { "crossentropy": 2.452825427055359, "epoch": 0.621060931509829, "grad_norm": 0.03155532851815224, "grad_norm_var": 5.453095940710986e-07, "learning_rate": 0.0003233412604995584, "loss": 2.4528, "step": 11421 }, { "crossentropy": 2.519632339477539, "epoch": 0.6211153103673291, "grad_norm": 0.03032323531806469, "grad_norm_var": 5.98069533599962e-07, "learning_rate": 0.00032290089540111365, "loss": 2.5196, "step": 11422 }, { "crossentropy": 2.4390552043914795, "epoch": 0.6211696892248294, "grad_norm": 0.029957694932818413, "grad_norm_var": 7.397543042172415e-07, "learning_rate": 0.0003224608203719953, "loss": 2.4391, "step": 11423 }, { "crossentropy": 2.5373719930648804, "epoch": 0.6212240680823296, "grad_norm": 0.03265831992030144, "grad_norm_var": 8.406152972872371e-07, "learning_rate": 0.00032202103543949593, "loss": 2.5374, "step": 11424 }, { "crossentropy": 2.4259297847747803, "epoch": 0.6212784469398298, "grad_norm": 0.031416866928339005, "grad_norm_var": 8.102538423623388e-07, "learning_rate": 0.0003215815406308903, "loss": 2.4259, "step": 11425 }, { "crossentropy": 2.4531731605529785, "epoch": 0.62133282579733, "grad_norm": 0.03135994076728821, "grad_norm_var": 8.064566673453288e-07, "learning_rate": 0.0003211423359734361, "loss": 2.4532, "step": 11426 }, { "crossentropy": 2.430475115776062, "epoch": 0.6213872046548302, "grad_norm": 0.03216855227947235, "grad_norm_var": 6.424481463415099e-07, "learning_rate": 0.0003207034214943727, "loss": 2.4305, "step": 11427 }, { "crossentropy": 2.5675283670425415, "epoch": 0.6214415835123304, "grad_norm": 0.03321158513426781, "grad_norm_var": 8.304190540072791e-07, "learning_rate": 0.0003202647972209194, "loss": 2.5675, "step": 11428 }, { "crossentropy": 2.5308762788772583, "epoch": 0.6214959623698306, "grad_norm": 0.03134583681821823, "grad_norm_var": 8.295742730447346e-07, "learning_rate": 0.00031982646318028053, "loss": 2.5309, "step": 11429 }, { "crossentropy": 2.499634265899658, "epoch": 0.6215503412273308, "grad_norm": 0.0344783179461956, "grad_norm_var": 1.3868787532672968e-06, "learning_rate": 0.0003193884193996427, "loss": 2.4996, "step": 11430 }, { "crossentropy": 2.5553109645843506, "epoch": 0.621604720084831, "grad_norm": 0.031432900577783585, "grad_norm_var": 1.2133073847772624e-06, "learning_rate": 0.0003189506659061703, "loss": 2.5553, "step": 11431 }, { "crossentropy": 2.5015941858291626, "epoch": 0.6216590989423312, "grad_norm": 0.031462401151657104, "grad_norm_var": 1.1400637105515287e-06, "learning_rate": 0.00031851320272701213, "loss": 2.5016, "step": 11432 }, { "crossentropy": 2.5132189989089966, "epoch": 0.6217134777998314, "grad_norm": 0.03285719454288483, "grad_norm_var": 1.19592771388131e-06, "learning_rate": 0.0003180760298893026, "loss": 2.5132, "step": 11433 }, { "crossentropy": 2.5109145641326904, "epoch": 0.6217678566573316, "grad_norm": 0.032179538160562515, "grad_norm_var": 1.1984321580682984e-06, "learning_rate": 0.0003176391474201523, "loss": 2.5109, "step": 11434 }, { "crossentropy": 2.4984065294265747, "epoch": 0.6218222355148318, "grad_norm": 0.035110313445329666, "grad_norm_var": 1.8406030904360415e-06, "learning_rate": 0.0003172025553466562, "loss": 2.4984, "step": 11435 }, { "crossentropy": 2.3590359687805176, "epoch": 0.621876614372332, "grad_norm": 0.03127022087574005, "grad_norm_var": 1.8650099997296586e-06, "learning_rate": 0.00031676625369589205, "loss": 2.359, "step": 11436 }, { "crossentropy": 2.54527485370636, "epoch": 0.6219309932298323, "grad_norm": 0.03180114924907684, "grad_norm_var": 1.8525974139586843e-06, "learning_rate": 0.0003163302424949188, "loss": 2.5453, "step": 11437 }, { "crossentropy": 2.51099693775177, "epoch": 0.6219853720873324, "grad_norm": 0.03180591017007828, "grad_norm_var": 1.6457365761212654e-06, "learning_rate": 0.00031589452177077816, "loss": 2.511, "step": 11438 }, { "crossentropy": 2.5756852626800537, "epoch": 0.6220397509448327, "grad_norm": 0.03280237689614296, "grad_norm_var": 1.3172111486046041e-06, "learning_rate": 0.0003154590915504907, "loss": 2.5757, "step": 11439 }, { "crossentropy": 2.537126660346985, "epoch": 0.6220941298023328, "grad_norm": 0.030130483210086823, "grad_norm_var": 1.6076402373901265e-06, "learning_rate": 0.00031502395186106316, "loss": 2.5371, "step": 11440 }, { "crossentropy": 2.5414934158325195, "epoch": 0.6221485086598331, "grad_norm": 0.030957819893956184, "grad_norm_var": 1.667341484905257e-06, "learning_rate": 0.00031458910272948393, "loss": 2.5415, "step": 11441 }, { "crossentropy": 2.652040958404541, "epoch": 0.6222028875173332, "grad_norm": 0.03186073899269104, "grad_norm_var": 1.6303679602762728e-06, "learning_rate": 0.00031415454418271926, "loss": 2.652, "step": 11442 }, { "crossentropy": 2.420379400253296, "epoch": 0.6222572663748335, "grad_norm": 0.033001527190208435, "grad_norm_var": 1.6724943402763541e-06, "learning_rate": 0.00031372027624772036, "loss": 2.4204, "step": 11443 }, { "crossentropy": 2.5830881595611572, "epoch": 0.6223116452323336, "grad_norm": 0.0325937420129776, "grad_norm_var": 1.615636171241894e-06, "learning_rate": 0.00031328629895142234, "loss": 2.5831, "step": 11444 }, { "crossentropy": 2.591017007827759, "epoch": 0.6223660240898339, "grad_norm": 0.031730446964502335, "grad_norm_var": 1.5814298873026289e-06, "learning_rate": 0.0003128526123207376, "loss": 2.591, "step": 11445 }, { "crossentropy": 2.5468201637268066, "epoch": 0.622420402947334, "grad_norm": 0.030637117102742195, "grad_norm_var": 1.345548160444863e-06, "learning_rate": 0.00031241921638256457, "loss": 2.5468, "step": 11446 }, { "crossentropy": 2.484151005744934, "epoch": 0.6224747818048343, "grad_norm": 0.030985020101070404, "grad_norm_var": 1.3905846876035553e-06, "learning_rate": 0.0003119861111637812, "loss": 2.4842, "step": 11447 }, { "crossentropy": 2.486920118331909, "epoch": 0.6225291606623344, "grad_norm": 0.0320126935839653, "grad_norm_var": 1.3737989878053724e-06, "learning_rate": 0.0003115532966912488, "loss": 2.4869, "step": 11448 }, { "crossentropy": 2.4966315031051636, "epoch": 0.6225835395198347, "grad_norm": 0.03132462128996849, "grad_norm_var": 1.3420680688598179e-06, "learning_rate": 0.0003111207729918109, "loss": 2.4966, "step": 11449 }, { "crossentropy": 2.4966049194335938, "epoch": 0.6226379183773348, "grad_norm": 0.030829543247818947, "grad_norm_var": 1.403448629732591e-06, "learning_rate": 0.0003106885400922893, "loss": 2.4966, "step": 11450 }, { "crossentropy": 2.5314844846725464, "epoch": 0.6226922972348351, "grad_norm": 0.0321362242102623, "grad_norm_var": 6.449165477332888e-07, "learning_rate": 0.00031025659801949327, "loss": 2.5315, "step": 11451 }, { "crossentropy": 2.5521081686019897, "epoch": 0.6227466760923352, "grad_norm": 0.03242979943752289, "grad_norm_var": 6.752659871018708e-07, "learning_rate": 0.0003098249468002118, "loss": 2.5521, "step": 11452 }, { "crossentropy": 2.4845153093338013, "epoch": 0.6228010549498355, "grad_norm": 0.03328368812799454, "grad_norm_var": 8.346168764712062e-07, "learning_rate": 0.0003093935864612135, "loss": 2.4845, "step": 11453 }, { "crossentropy": 2.4957690238952637, "epoch": 0.6228554338073357, "grad_norm": 0.03256555646657944, "grad_norm_var": 8.730433154207576e-07, "learning_rate": 0.00030896251702925147, "loss": 2.4958, "step": 11454 }, { "crossentropy": 2.568367600440979, "epoch": 0.6229098126648359, "grad_norm": 0.031731534749269485, "grad_norm_var": 8.058897736311673e-07, "learning_rate": 0.0003085317385310615, "loss": 2.5684, "step": 11455 }, { "crossentropy": 2.6469658613204956, "epoch": 0.6229641915223361, "grad_norm": 0.031214017421007156, "grad_norm_var": 6.433928826707665e-07, "learning_rate": 0.00030810125099335953, "loss": 2.647, "step": 11456 }, { "crossentropy": 2.5681872367858887, "epoch": 0.6230185703798363, "grad_norm": 0.03254057839512825, "grad_norm_var": 6.157172459021291e-07, "learning_rate": 0.00030767105444284295, "loss": 2.5682, "step": 11457 }, { "crossentropy": 2.432350277900696, "epoch": 0.6230729492373365, "grad_norm": 0.032349593937397, "grad_norm_var": 6.261518004121539e-07, "learning_rate": 0.00030724114890619324, "loss": 2.4324, "step": 11458 }, { "crossentropy": 2.5086957216262817, "epoch": 0.6231273280948367, "grad_norm": 0.030796099454164505, "grad_norm_var": 6.239827110569747e-07, "learning_rate": 0.0003068115344100725, "loss": 2.5087, "step": 11459 }, { "crossentropy": 2.431569814682007, "epoch": 0.6231817069523369, "grad_norm": 0.030607016757130623, "grad_norm_var": 6.663801517407201e-07, "learning_rate": 0.0003063822109811254, "loss": 2.4316, "step": 11460 }, { "crossentropy": 2.5701440572738647, "epoch": 0.6232360858098371, "grad_norm": 0.0315726138651371, "grad_norm_var": 6.672615812848877e-07, "learning_rate": 0.0003059531786459763, "loss": 2.5701, "step": 11461 }, { "crossentropy": 2.4338544607162476, "epoch": 0.6232904646673373, "grad_norm": 0.031455617398023605, "grad_norm_var": 5.943939630115765e-07, "learning_rate": 0.0003055244374312355, "loss": 2.4339, "step": 11462 }, { "crossentropy": 2.594725728034973, "epoch": 0.6233448435248375, "grad_norm": 0.03149651363492012, "grad_norm_var": 5.59281230588735e-07, "learning_rate": 0.00030509598736349345, "loss": 2.5947, "step": 11463 }, { "crossentropy": 2.577369213104248, "epoch": 0.6233992223823377, "grad_norm": 0.031596362590789795, "grad_norm_var": 5.56731540073072e-07, "learning_rate": 0.0003046678284693205, "loss": 2.5774, "step": 11464 }, { "crossentropy": 2.511527180671692, "epoch": 0.623453601239838, "grad_norm": 0.030507463961839676, "grad_norm_var": 6.443316248799213e-07, "learning_rate": 0.00030423996077527047, "loss": 2.5115, "step": 11465 }, { "crossentropy": 2.5321106910705566, "epoch": 0.6235079800973382, "grad_norm": 0.03269943967461586, "grad_norm_var": 6.472095748520501e-07, "learning_rate": 0.0003038123843078816, "loss": 2.5321, "step": 11466 }, { "crossentropy": 2.582363486289978, "epoch": 0.6235623589548384, "grad_norm": 0.03153793513774872, "grad_norm_var": 6.436682059133275e-07, "learning_rate": 0.00030338509909366983, "loss": 2.5824, "step": 11467 }, { "crossentropy": 2.5590606927871704, "epoch": 0.6236167378123386, "grad_norm": 0.03091471642255783, "grad_norm_var": 6.546546109762227e-07, "learning_rate": 0.0003029581051591351, "loss": 2.5591, "step": 11468 }, { "crossentropy": 2.5531914234161377, "epoch": 0.6236711166698388, "grad_norm": 0.030917905271053314, "grad_norm_var": 4.98377094700061e-07, "learning_rate": 0.00030253140253076017, "loss": 2.5532, "step": 11469 }, { "crossentropy": 2.61147940158844, "epoch": 0.623725495527339, "grad_norm": 0.03272058442234993, "grad_norm_var": 5.212548905192954e-07, "learning_rate": 0.0003021049912350082, "loss": 2.6115, "step": 11470 }, { "crossentropy": 2.5910327434539795, "epoch": 0.6237798743848392, "grad_norm": 0.03211146220564842, "grad_norm_var": 5.399220524550703e-07, "learning_rate": 0.0003016788712983248, "loss": 2.591, "step": 11471 }, { "crossentropy": 2.5713136196136475, "epoch": 0.6238342532423394, "grad_norm": 0.033088888972997665, "grad_norm_var": 6.719113664923332e-07, "learning_rate": 0.0003012530427471372, "loss": 2.5713, "step": 11472 }, { "crossentropy": 2.52815318107605, "epoch": 0.6238886320998396, "grad_norm": 0.03304654732346535, "grad_norm_var": 7.458301768439492e-07, "learning_rate": 0.00030082750560785575, "loss": 2.5282, "step": 11473 }, { "crossentropy": 2.515854001045227, "epoch": 0.6239430109573398, "grad_norm": 0.03184359148144722, "grad_norm_var": 7.189288786437943e-07, "learning_rate": 0.00030040225990687176, "loss": 2.5159, "step": 11474 }, { "crossentropy": 2.55231511592865, "epoch": 0.62399738981484, "grad_norm": 0.030717894434928894, "grad_norm_var": 7.285492071411644e-07, "learning_rate": 0.0002999773056705574, "loss": 2.5523, "step": 11475 }, { "crossentropy": 2.5431004762649536, "epoch": 0.6240517686723402, "grad_norm": 0.03089282661676407, "grad_norm_var": 6.92873678562883e-07, "learning_rate": 0.00029955264292526817, "loss": 2.5431, "step": 11476 }, { "crossentropy": 2.4358521699905396, "epoch": 0.6241061475298404, "grad_norm": 0.03211187943816185, "grad_norm_var": 7.022476880937563e-07, "learning_rate": 0.00029912827169734313, "loss": 2.4359, "step": 11477 }, { "crossentropy": 2.5464354753494263, "epoch": 0.6241605263873407, "grad_norm": 0.032764337956905365, "grad_norm_var": 7.616380113196925e-07, "learning_rate": 0.0002987041920130995, "loss": 2.5464, "step": 11478 }, { "crossentropy": 2.5719624757766724, "epoch": 0.6242149052448408, "grad_norm": 0.03106827102601528, "grad_norm_var": 7.910295540152253e-07, "learning_rate": 0.000298280403898839, "loss": 2.572, "step": 11479 }, { "crossentropy": 2.544095277786255, "epoch": 0.6242692841023411, "grad_norm": 0.03223540261387825, "grad_norm_var": 8.005858367417096e-07, "learning_rate": 0.000297856907380844, "loss": 2.5441, "step": 11480 }, { "crossentropy": 2.5347412824630737, "epoch": 0.6243236629598412, "grad_norm": 0.03168490529060364, "grad_norm_var": 6.805956010361938e-07, "learning_rate": 0.00029743370248538014, "loss": 2.5347, "step": 11481 }, { "crossentropy": 2.5779401063919067, "epoch": 0.6243780418173415, "grad_norm": 0.03347744420170784, "grad_norm_var": 8.016367703253853e-07, "learning_rate": 0.0002970107892386936, "loss": 2.5779, "step": 11482 }, { "crossentropy": 2.507872700691223, "epoch": 0.6244324206748416, "grad_norm": 0.0324992761015892, "grad_norm_var": 8.071038091695861e-07, "learning_rate": 0.00029658816766701344, "loss": 2.5079, "step": 11483 }, { "crossentropy": 2.6100754737854004, "epoch": 0.6244867995323419, "grad_norm": 0.041247349232435226, "grad_norm_var": 5.976371477084509e-06, "learning_rate": 0.0002961658377965504, "loss": 2.6101, "step": 11484 }, { "crossentropy": 2.544836401939392, "epoch": 0.624541178389842, "grad_norm": 0.029989104717969894, "grad_norm_var": 6.245012225800543e-06, "learning_rate": 0.0002957437996534962, "loss": 2.5448, "step": 11485 }, { "crossentropy": 2.5003732442855835, "epoch": 0.6245955572473423, "grad_norm": 0.031747929751873016, "grad_norm_var": 6.287690086236016e-06, "learning_rate": 0.0002953220532640272, "loss": 2.5004, "step": 11486 }, { "crossentropy": 2.599825382232666, "epoch": 0.6246499361048424, "grad_norm": 0.03212352097034454, "grad_norm_var": 6.2870215005544065e-06, "learning_rate": 0.0002949005986542963, "loss": 2.5998, "step": 11487 }, { "crossentropy": 2.3806607723236084, "epoch": 0.6247043149623427, "grad_norm": 0.03016599826514721, "grad_norm_var": 6.604608844948632e-06, "learning_rate": 0.00029447943585044547, "loss": 2.3807, "step": 11488 }, { "crossentropy": 2.4914097785949707, "epoch": 0.6247586938198428, "grad_norm": 0.03195872902870178, "grad_norm_var": 6.5776867876690105e-06, "learning_rate": 0.00029405856487859207, "loss": 2.4914, "step": 11489 }, { "crossentropy": 2.4717750549316406, "epoch": 0.6248130726773431, "grad_norm": 0.03083866275846958, "grad_norm_var": 6.699684823509633e-06, "learning_rate": 0.00029363798576483914, "loss": 2.4718, "step": 11490 }, { "crossentropy": 2.628127098083496, "epoch": 0.6248674515348432, "grad_norm": 0.03106795623898506, "grad_norm_var": 6.637222835141607e-06, "learning_rate": 0.0002932176985352708, "loss": 2.6281, "step": 11491 }, { "crossentropy": 2.5303114652633667, "epoch": 0.6249218303923435, "grad_norm": 0.03176204860210419, "grad_norm_var": 6.528068810629839e-06, "learning_rate": 0.00029279770321595234, "loss": 2.5303, "step": 11492 }, { "crossentropy": 2.577233672142029, "epoch": 0.6249762092498437, "grad_norm": 0.032916899770498276, "grad_norm_var": 6.548763919850597e-06, "learning_rate": 0.0002923779998329318, "loss": 2.5772, "step": 11493 }, { "crossentropy": 2.506618618965149, "epoch": 0.6250305881073439, "grad_norm": 0.031044190749526024, "grad_norm_var": 6.637918193093799e-06, "learning_rate": 0.00029195858841223944, "loss": 2.5066, "step": 11494 }, { "crossentropy": 2.470311760902405, "epoch": 0.6250849669648441, "grad_norm": 0.03237622231245041, "grad_norm_var": 6.540631487473543e-06, "learning_rate": 0.0002915394689798856, "loss": 2.4703, "step": 11495 }, { "crossentropy": 2.455222487449646, "epoch": 0.6251393458223443, "grad_norm": 0.03027890995144844, "grad_norm_var": 6.802196520063715e-06, "learning_rate": 0.00029112064156186447, "loss": 2.4552, "step": 11496 }, { "crossentropy": 2.4130754470825195, "epoch": 0.6251937246798445, "grad_norm": 0.03025437518954277, "grad_norm_var": 7.028096762299905e-06, "learning_rate": 0.00029070210618415137, "loss": 2.4131, "step": 11497 }, { "crossentropy": 2.5027809143066406, "epoch": 0.6252481035373447, "grad_norm": 0.030360572040081024, "grad_norm_var": 7.06669537540321e-06, "learning_rate": 0.0002902838628727017, "loss": 2.5028, "step": 11498 }, { "crossentropy": 2.498176097869873, "epoch": 0.6253024823948449, "grad_norm": 0.03183981031179428, "grad_norm_var": 7.04245627878446e-06, "learning_rate": 0.0002898659116534574, "loss": 2.4982, "step": 11499 }, { "crossentropy": 2.367801547050476, "epoch": 0.6253568612523451, "grad_norm": 0.030382797122001648, "grad_norm_var": 8.405021247024352e-07, "learning_rate": 0.0002894482525523384, "loss": 2.3678, "step": 11500 }, { "crossentropy": 2.5611096620559692, "epoch": 0.6254112401098453, "grad_norm": 0.03274522349238396, "grad_norm_var": 8.724004905770553e-07, "learning_rate": 0.00028903088559524613, "loss": 2.5611, "step": 11501 }, { "crossentropy": 2.4702577590942383, "epoch": 0.6254656189673455, "grad_norm": 0.031643833965063095, "grad_norm_var": 8.677835725866597e-07, "learning_rate": 0.0002886138108080666, "loss": 2.4703, "step": 11502 }, { "crossentropy": 2.5781038999557495, "epoch": 0.6255199978248457, "grad_norm": 0.032853864133358, "grad_norm_var": 9.754736365102557e-07, "learning_rate": 0.00028819702821666573, "loss": 2.5781, "step": 11503 }, { "crossentropy": 2.4833956956863403, "epoch": 0.625574376682346, "grad_norm": 0.03049040585756302, "grad_norm_var": 9.284316571790223e-07, "learning_rate": 0.00028778053784689285, "loss": 2.4834, "step": 11504 }, { "crossentropy": 2.5263973474502563, "epoch": 0.6256287555398461, "grad_norm": 0.031427085399627686, "grad_norm_var": 9.083273951053979e-07, "learning_rate": 0.0002873643397245773, "loss": 2.5264, "step": 11505 }, { "crossentropy": 2.4719892740249634, "epoch": 0.6256831343973464, "grad_norm": 0.03223845735192299, "grad_norm_var": 9.273903308685266e-07, "learning_rate": 0.0002869484338755324, "loss": 2.472, "step": 11506 }, { "crossentropy": 2.5672255754470825, "epoch": 0.6257375132548465, "grad_norm": 0.033622004091739655, "grad_norm_var": 1.1947141387439226e-06, "learning_rate": 0.0002865328203255513, "loss": 2.5672, "step": 11507 }, { "crossentropy": 2.472177267074585, "epoch": 0.6257918921123468, "grad_norm": 0.032106515020132065, "grad_norm_var": 1.2077452328020187e-06, "learning_rate": 0.0002861174991004112, "loss": 2.4722, "step": 11508 }, { "crossentropy": 2.62052059173584, "epoch": 0.6258462709698469, "grad_norm": 0.03260258957743645, "grad_norm_var": 1.1613009170247188e-06, "learning_rate": 0.0002857024702258676, "loss": 2.6205, "step": 11509 }, { "crossentropy": 2.479225277900696, "epoch": 0.6259006498273472, "grad_norm": 0.031808171421289444, "grad_norm_var": 1.1369175218672404e-06, "learning_rate": 0.00028528773372766213, "loss": 2.4792, "step": 11510 }, { "crossentropy": 2.5131070613861084, "epoch": 0.6259550286848473, "grad_norm": 0.03099769726395607, "grad_norm_var": 1.1294530166840062e-06, "learning_rate": 0.00028487328963151706, "loss": 2.5131, "step": 11511 }, { "crossentropy": 2.4812344312667847, "epoch": 0.6260094075423476, "grad_norm": 0.030785709619522095, "grad_norm_var": 1.0560145542413506e-06, "learning_rate": 0.0002844591379631334, "loss": 2.4812, "step": 11512 }, { "crossentropy": 2.5286436080932617, "epoch": 0.6260637863998477, "grad_norm": 0.03188461437821388, "grad_norm_var": 9.220317854346608e-07, "learning_rate": 0.00028404527874819783, "loss": 2.5286, "step": 11513 }, { "crossentropy": 2.4276434183120728, "epoch": 0.626118165257348, "grad_norm": 0.03051597625017166, "grad_norm_var": 8.950242602239848e-07, "learning_rate": 0.0002836317120123777, "loss": 2.4276, "step": 11514 }, { "crossentropy": 2.4914731979370117, "epoch": 0.6261725441148481, "grad_norm": 0.031172992661595345, "grad_norm_var": 9.145226898254425e-07, "learning_rate": 0.00028321843778132184, "loss": 2.4915, "step": 11515 }, { "crossentropy": 2.42811381816864, "epoch": 0.6262269229723484, "grad_norm": 0.03139226511120796, "grad_norm_var": 8.002662764152451e-07, "learning_rate": 0.00028280545608066034, "loss": 2.4281, "step": 11516 }, { "crossentropy": 2.4765918254852295, "epoch": 0.6262813018298485, "grad_norm": 0.03135832026600838, "grad_norm_var": 7.397696029268063e-07, "learning_rate": 0.0002823927669360071, "loss": 2.4766, "step": 11517 }, { "crossentropy": 2.5183240175247192, "epoch": 0.6263356806873488, "grad_norm": 0.031756725162267685, "grad_norm_var": 7.400024645882411e-07, "learning_rate": 0.0002819803703729562, "loss": 2.5183, "step": 11518 }, { "crossentropy": 2.5493916273117065, "epoch": 0.6263900595448489, "grad_norm": 0.03170893341302872, "grad_norm_var": 6.440052506289599e-07, "learning_rate": 0.0002815682664170849, "loss": 2.5494, "step": 11519 }, { "crossentropy": 2.5814719200134277, "epoch": 0.6264444384023492, "grad_norm": 0.030891308560967445, "grad_norm_var": 5.938416355391607e-07, "learning_rate": 0.00028115645509394995, "loss": 2.5815, "step": 11520 }, { "crossentropy": 2.565725803375244, "epoch": 0.6264988172598493, "grad_norm": 0.031202055513858795, "grad_norm_var": 6.034498927769138e-07, "learning_rate": 0.0002807449364290915, "loss": 2.5657, "step": 11521 }, { "crossentropy": 2.555144786834717, "epoch": 0.6265531961173496, "grad_norm": 0.03302580118179321, "grad_norm_var": 7.063036399611196e-07, "learning_rate": 0.0002803337104480336, "loss": 2.5551, "step": 11522 }, { "crossentropy": 2.47503924369812, "epoch": 0.6266075749748498, "grad_norm": 0.03201600909233093, "grad_norm_var": 4.510116839693985e-07, "learning_rate": 0.00027992277717627857, "loss": 2.475, "step": 11523 }, { "crossentropy": 2.500156283378601, "epoch": 0.62666195383235, "grad_norm": 0.03181828558444977, "grad_norm_var": 4.3583920226232995e-07, "learning_rate": 0.0002795121366393122, "loss": 2.5002, "step": 11524 }, { "crossentropy": 2.5824116468429565, "epoch": 0.6267163326898502, "grad_norm": 0.03491639345884323, "grad_norm_var": 1.0925258069453709e-06, "learning_rate": 0.00027910178886260184, "loss": 2.5824, "step": 11525 }, { "crossentropy": 2.4463634490966797, "epoch": 0.6267707115473504, "grad_norm": 0.03229083865880966, "grad_norm_var": 1.1138415508875806e-06, "learning_rate": 0.00027869173387159773, "loss": 2.4464, "step": 11526 }, { "crossentropy": 2.5030269622802734, "epoch": 0.6268250904048506, "grad_norm": 0.03155016526579857, "grad_norm_var": 1.0787264087331236e-06, "learning_rate": 0.0002782819716917301, "loss": 2.503, "step": 11527 }, { "crossentropy": 2.4878355264663696, "epoch": 0.6268794692623508, "grad_norm": 0.0320395901799202, "grad_norm_var": 1.0127834083333473e-06, "learning_rate": 0.000277872502348413, "loss": 2.4878, "step": 11528 }, { "crossentropy": 2.4366661310195923, "epoch": 0.626933848119851, "grad_norm": 0.032460473477840424, "grad_norm_var": 1.0364536098275123e-06, "learning_rate": 0.0002774633258670406, "loss": 2.4367, "step": 11529 }, { "crossentropy": 2.4800424575805664, "epoch": 0.6269882269773512, "grad_norm": 0.03247789293527603, "grad_norm_var": 9.196192036157897e-07, "learning_rate": 0.0002770544422729909, "loss": 2.48, "step": 11530 }, { "crossentropy": 2.496108889579773, "epoch": 0.6270426058348514, "grad_norm": 0.0319969467818737, "grad_norm_var": 8.70659080515187e-07, "learning_rate": 0.00027664585159162036, "loss": 2.4961, "step": 11531 }, { "crossentropy": 2.4676016569137573, "epoch": 0.6270969846923516, "grad_norm": 0.03145698085427284, "grad_norm_var": 8.651903873157455e-07, "learning_rate": 0.00027623755384827034, "loss": 2.4676, "step": 11532 }, { "crossentropy": 2.46444308757782, "epoch": 0.6271513635498518, "grad_norm": 0.03053341805934906, "grad_norm_var": 9.849411841198005e-07, "learning_rate": 0.0002758295490682644, "loss": 2.4644, "step": 11533 }, { "crossentropy": 2.4609774351119995, "epoch": 0.627205742407352, "grad_norm": 0.03256389498710632, "grad_norm_var": 9.985255704477673e-07, "learning_rate": 0.00027542183727690463, "loss": 2.461, "step": 11534 }, { "crossentropy": 2.4791104793548584, "epoch": 0.6272601212648522, "grad_norm": 0.031277112662792206, "grad_norm_var": 1.030353311166895e-06, "learning_rate": 0.00027501441849947793, "loss": 2.4791, "step": 11535 }, { "crossentropy": 2.5435153245925903, "epoch": 0.6273145001223525, "grad_norm": 0.0324179045855999, "grad_norm_var": 9.437602036523514e-07, "learning_rate": 0.00027460729276125194, "loss": 2.5435, "step": 11536 }, { "crossentropy": 2.527760863304138, "epoch": 0.6273688789798526, "grad_norm": 0.03210950270295143, "grad_norm_var": 8.83225754218173e-07, "learning_rate": 0.00027420046008747647, "loss": 2.5278, "step": 11537 }, { "crossentropy": 2.5788140296936035, "epoch": 0.6274232578373529, "grad_norm": 0.03086675889790058, "grad_norm_var": 9.323657129414135e-07, "learning_rate": 0.0002737939205033824, "loss": 2.5788, "step": 11538 }, { "crossentropy": 2.4465479850769043, "epoch": 0.627477636694853, "grad_norm": 0.0303583275526762, "grad_norm_var": 1.111514592936659e-06, "learning_rate": 0.00027338767403418284, "loss": 2.4465, "step": 11539 }, { "crossentropy": 2.503102421760559, "epoch": 0.6275320155523533, "grad_norm": 0.03342790529131889, "grad_norm_var": 1.2460550653443e-06, "learning_rate": 0.0002729817207050733, "loss": 2.5031, "step": 11540 }, { "crossentropy": 2.4249980449676514, "epoch": 0.6275863944098534, "grad_norm": 0.031530387699604034, "grad_norm_var": 6.669593165182137e-07, "learning_rate": 0.0002725760605412303, "loss": 2.425, "step": 11541 }, { "crossentropy": 2.5015673637390137, "epoch": 0.6276407732673537, "grad_norm": 0.0309283547103405, "grad_norm_var": 7.001506933584344e-07, "learning_rate": 0.00027217069356781366, "loss": 2.5016, "step": 11542 }, { "crossentropy": 2.5150896310806274, "epoch": 0.6276951521248538, "grad_norm": 0.0311295036226511, "grad_norm_var": 7.224034654250509e-07, "learning_rate": 0.00027176561980996164, "loss": 2.5151, "step": 11543 }, { "crossentropy": 2.495773434638977, "epoch": 0.6277495309823541, "grad_norm": 0.03236112371087074, "grad_norm_var": 7.424188992157515e-07, "learning_rate": 0.0002713608392927985, "loss": 2.4958, "step": 11544 }, { "crossentropy": 2.5112799406051636, "epoch": 0.6278039098398542, "grad_norm": 0.03153739497065544, "grad_norm_var": 7.074342368001135e-07, "learning_rate": 0.0002709563520414271, "loss": 2.5113, "step": 11545 }, { "crossentropy": 2.508120059967041, "epoch": 0.6278582886973545, "grad_norm": 0.032036665827035904, "grad_norm_var": 6.730050148241827e-07, "learning_rate": 0.0002705521580809345, "loss": 2.5081, "step": 11546 }, { "crossentropy": 2.502885580062866, "epoch": 0.6279126675548546, "grad_norm": 0.031119653955101967, "grad_norm_var": 6.81490853457883e-07, "learning_rate": 0.00027014825743638726, "loss": 2.5029, "step": 11547 }, { "crossentropy": 2.410045027732849, "epoch": 0.6279670464123549, "grad_norm": 0.030781125649809837, "grad_norm_var": 7.232367986853185e-07, "learning_rate": 0.00026974465013283546, "loss": 2.41, "step": 11548 }, { "crossentropy": 2.4257360696792603, "epoch": 0.628021425269855, "grad_norm": 0.03150147944688797, "grad_norm_var": 6.491487572784915e-07, "learning_rate": 0.00026934133619531007, "loss": 2.4257, "step": 11549 }, { "crossentropy": 2.519436478614807, "epoch": 0.6280758041273553, "grad_norm": 0.03093339502811432, "grad_norm_var": 6.104722951138456e-07, "learning_rate": 0.0002689383156488251, "loss": 2.5194, "step": 11550 }, { "crossentropy": 2.6110771894454956, "epoch": 0.6281301829848555, "grad_norm": 0.031162003055214882, "grad_norm_var": 6.150249911421324e-07, "learning_rate": 0.0002685355885183749, "loss": 2.6111, "step": 11551 }, { "crossentropy": 2.3771843910217285, "epoch": 0.6281845618423557, "grad_norm": 0.03127491846680641, "grad_norm_var": 5.587082479887688e-07, "learning_rate": 0.00026813315482893677, "loss": 2.3772, "step": 11552 }, { "crossentropy": 2.549595832824707, "epoch": 0.6282389406998559, "grad_norm": 0.031795743852853775, "grad_norm_var": 5.369010894209585e-07, "learning_rate": 0.000267731014605469, "loss": 2.5496, "step": 11553 }, { "crossentropy": 2.5618503093719482, "epoch": 0.6282933195573561, "grad_norm": 0.031982000917196274, "grad_norm_var": 5.321400819588285e-07, "learning_rate": 0.00026732916787291063, "loss": 2.5619, "step": 11554 }, { "crossentropy": 2.3408141136169434, "epoch": 0.6283476984148563, "grad_norm": 0.03028738498687744, "grad_norm_var": 5.431709487024293e-07, "learning_rate": 0.0002669276146561855, "loss": 2.3408, "step": 11555 }, { "crossentropy": 2.4616270065307617, "epoch": 0.6284020772723565, "grad_norm": 0.03111727349460125, "grad_norm_var": 2.788403515693033e-07, "learning_rate": 0.0002665263549801983, "loss": 2.4616, "step": 11556 }, { "crossentropy": 2.4716391563415527, "epoch": 0.6284564561298567, "grad_norm": 0.030650600790977478, "grad_norm_var": 3.051651009541711e-07, "learning_rate": 0.0002661253888698323, "loss": 2.4716, "step": 11557 }, { "crossentropy": 2.5577577352523804, "epoch": 0.6285108349873569, "grad_norm": 0.031237037852406502, "grad_norm_var": 2.963423616899584e-07, "learning_rate": 0.0002657247163499565, "loss": 2.5578, "step": 11558 }, { "crossentropy": 2.522173762321472, "epoch": 0.6285652138448571, "grad_norm": 0.03133528307080269, "grad_norm_var": 2.9412697135331303e-07, "learning_rate": 0.00026532433744542007, "loss": 2.5222, "step": 11559 }, { "crossentropy": 2.4677541255950928, "epoch": 0.6286195927023573, "grad_norm": 0.03273691236972809, "grad_norm_var": 3.5514036009566943e-07, "learning_rate": 0.0002649242521810541, "loss": 2.4678, "step": 11560 }, { "crossentropy": 2.4008716344833374, "epoch": 0.6286739715598575, "grad_norm": 0.03256836161017418, "grad_norm_var": 4.482855806870924e-07, "learning_rate": 0.0002645244605816716, "loss": 2.4009, "step": 11561 }, { "crossentropy": 2.5369049310684204, "epoch": 0.6287283504173578, "grad_norm": 0.031936969608068466, "grad_norm_var": 4.4054326270025143e-07, "learning_rate": 0.0002641249626720671, "loss": 2.5369, "step": 11562 }, { "crossentropy": 2.4999219179153442, "epoch": 0.6287827292748579, "grad_norm": 0.031892161816358566, "grad_norm_var": 4.488356716361141e-07, "learning_rate": 0.00026372575847701695, "loss": 2.4999, "step": 11563 }, { "crossentropy": 2.560964584350586, "epoch": 0.6288371081323582, "grad_norm": 0.03082495927810669, "grad_norm_var": 4.4504921710892113e-07, "learning_rate": 0.00026332684802128083, "loss": 2.561, "step": 11564 }, { "crossentropy": 2.6650896072387695, "epoch": 0.6288914869898583, "grad_norm": 0.03338878974318504, "grad_norm_var": 6.800510043385946e-07, "learning_rate": 0.00026292823132959556, "loss": 2.6651, "step": 11565 }, { "crossentropy": 2.4894309043884277, "epoch": 0.6289458658473586, "grad_norm": 0.03148390352725983, "grad_norm_var": 6.522472893156905e-07, "learning_rate": 0.0002625299084266863, "loss": 2.4894, "step": 11566 }, { "crossentropy": 2.51720654964447, "epoch": 0.6290002447048587, "grad_norm": 0.03394313529133797, "grad_norm_var": 9.715267445364306e-07, "learning_rate": 0.0002621318793372557, "loss": 2.5172, "step": 11567 }, { "crossentropy": 2.546446919441223, "epoch": 0.629054623562359, "grad_norm": 0.03190689906477928, "grad_norm_var": 9.5405834408113e-07, "learning_rate": 0.00026173414408598827, "loss": 2.5464, "step": 11568 }, { "crossentropy": 2.451634407043457, "epoch": 0.6291090024198591, "grad_norm": 0.034556567668914795, "grad_norm_var": 1.4222633029086404e-06, "learning_rate": 0.00026133670269755106, "loss": 2.4516, "step": 11569 }, { "crossentropy": 2.452850103378296, "epoch": 0.6291633812773594, "grad_norm": 0.031248101964592934, "grad_norm_var": 1.456759419312748e-06, "learning_rate": 0.0002609395551965948, "loss": 2.4529, "step": 11570 }, { "crossentropy": 2.6596657037734985, "epoch": 0.6292177601348595, "grad_norm": 0.032640665769577026, "grad_norm_var": 1.2828798856982186e-06, "learning_rate": 0.00026054270160774884, "loss": 2.6597, "step": 11571 }, { "crossentropy": 2.430206298828125, "epoch": 0.6292721389923598, "grad_norm": 0.0315365269780159, "grad_norm_var": 1.2393933565865248e-06, "learning_rate": 0.0002601461419556256, "loss": 2.4302, "step": 11572 }, { "crossentropy": 2.513853907585144, "epoch": 0.6293265178498599, "grad_norm": 0.031798105686903, "grad_norm_var": 1.0971890320181249e-06, "learning_rate": 0.0002597498762648198, "loss": 2.5139, "step": 11573 }, { "crossentropy": 2.5367348194122314, "epoch": 0.6293808967073602, "grad_norm": 0.03137162700295448, "grad_norm_var": 1.0812263583238514e-06, "learning_rate": 0.0002593539045599075, "loss": 2.5367, "step": 11574 }, { "crossentropy": 2.6339516639709473, "epoch": 0.6294352755648603, "grad_norm": 0.031207608059048653, "grad_norm_var": 1.0969325168423578e-06, "learning_rate": 0.00025895822686544667, "loss": 2.634, "step": 11575 }, { "crossentropy": 2.488800048828125, "epoch": 0.6294896544223606, "grad_norm": 0.031029818579554558, "grad_norm_var": 1.1546024263185077e-06, "learning_rate": 0.0002585628432059756, "loss": 2.4888, "step": 11576 }, { "crossentropy": 2.5566688776016235, "epoch": 0.6295440332798607, "grad_norm": 0.03187918663024902, "grad_norm_var": 1.139723300661828e-06, "learning_rate": 0.00025816775360601674, "loss": 2.5567, "step": 11577 }, { "crossentropy": 2.548269510269165, "epoch": 0.629598412137361, "grad_norm": 0.031650278717279434, "grad_norm_var": 1.1488106723210457e-06, "learning_rate": 0.0002577729580900745, "loss": 2.5483, "step": 11578 }, { "crossentropy": 2.479658603668213, "epoch": 0.6296527909948612, "grad_norm": 0.03049200028181076, "grad_norm_var": 1.2956521258312142e-06, "learning_rate": 0.00025737845668263095, "loss": 2.4797, "step": 11579 }, { "crossentropy": 2.5349429845809937, "epoch": 0.6297071698523614, "grad_norm": 0.029834719374775887, "grad_norm_var": 1.5034838836103584e-06, "learning_rate": 0.0002569842494081531, "loss": 2.5349, "step": 11580 }, { "crossentropy": 2.57180392742157, "epoch": 0.6297615487098616, "grad_norm": 0.030803509056568146, "grad_norm_var": 1.398713306919458e-06, "learning_rate": 0.0002565903362910921, "loss": 2.5718, "step": 11581 }, { "crossentropy": 2.526862382888794, "epoch": 0.6298159275673618, "grad_norm": 0.03099271096289158, "grad_norm_var": 1.4286930054255615e-06, "learning_rate": 0.0002561967173558749, "loss": 2.5269, "step": 11582 }, { "crossentropy": 2.4694573879241943, "epoch": 0.629870306424862, "grad_norm": 0.03238033875823021, "grad_norm_var": 1.1099120811225988e-06, "learning_rate": 0.0002558033926269149, "loss": 2.4695, "step": 11583 }, { "crossentropy": 2.4280035495758057, "epoch": 0.6299246852823622, "grad_norm": 0.03102664090692997, "grad_norm_var": 1.1203300516138733e-06, "learning_rate": 0.0002554103621286058, "loss": 2.428, "step": 11584 }, { "crossentropy": 2.4950543642044067, "epoch": 0.6299790641398624, "grad_norm": 0.031802721321582794, "grad_norm_var": 4.822907327871791e-07, "learning_rate": 0.00025501762588532205, "loss": 2.4951, "step": 11585 }, { "crossentropy": 2.520883798599243, "epoch": 0.6300334429973626, "grad_norm": 0.03171522542834282, "grad_norm_var": 4.892138815153069e-07, "learning_rate": 0.00025462518392142255, "loss": 2.5209, "step": 11586 }, { "crossentropy": 2.498202919960022, "epoch": 0.6300878218548628, "grad_norm": 0.03060295805335045, "grad_norm_var": 4.076009547985948e-07, "learning_rate": 0.0002542330362612438, "loss": 2.4982, "step": 11587 }, { "crossentropy": 2.5215142965316772, "epoch": 0.630142200712363, "grad_norm": 0.03228192776441574, "grad_norm_var": 4.700342447224211e-07, "learning_rate": 0.0002538411829291082, "loss": 2.5215, "step": 11588 }, { "crossentropy": 2.4386013746261597, "epoch": 0.6301965795698632, "grad_norm": 0.0305987149477005, "grad_norm_var": 4.809798460125713e-07, "learning_rate": 0.0002534496239493178, "loss": 2.4386, "step": 11589 }, { "crossentropy": 2.6038914918899536, "epoch": 0.6302509584273635, "grad_norm": 0.03465786203742027, "grad_norm_var": 1.2182688296029451e-06, "learning_rate": 0.00025305835934615604, "loss": 2.6039, "step": 11590 }, { "crossentropy": 2.52068030834198, "epoch": 0.6303053372848636, "grad_norm": 0.03101939521729946, "grad_norm_var": 1.226183318010993e-06, "learning_rate": 0.0002526673891438891, "loss": 2.5207, "step": 11591 }, { "crossentropy": 2.558812141418457, "epoch": 0.6303597161423639, "grad_norm": 0.03287428244948387, "grad_norm_var": 1.3421166066784318e-06, "learning_rate": 0.00025227671336676584, "loss": 2.5588, "step": 11592 }, { "crossentropy": 2.5740703344345093, "epoch": 0.630414094999864, "grad_norm": 0.030835522338747978, "grad_norm_var": 1.3627547898246157e-06, "learning_rate": 0.0002518863320390136, "loss": 2.5741, "step": 11593 }, { "crossentropy": 2.492470383644104, "epoch": 0.6304684738573643, "grad_norm": 0.030879005789756775, "grad_norm_var": 1.3817081477766977e-06, "learning_rate": 0.0002514962451848446, "loss": 2.4925, "step": 11594 }, { "crossentropy": 2.639250874519348, "epoch": 0.6305228527148644, "grad_norm": 0.03240085765719414, "grad_norm_var": 1.3720191080302529e-06, "learning_rate": 0.00025110645282845124, "loss": 2.6393, "step": 11595 }, { "crossentropy": 2.5746419429779053, "epoch": 0.6305772315723647, "grad_norm": 0.03216242417693138, "grad_norm_var": 1.180117381457899e-06, "learning_rate": 0.00025071695499400857, "loss": 2.5746, "step": 11596 }, { "crossentropy": 2.3779406547546387, "epoch": 0.6306316104298648, "grad_norm": 0.029334738850593567, "grad_norm_var": 1.488482363190026e-06, "learning_rate": 0.0002503277517056729, "loss": 2.3779, "step": 11597 }, { "crossentropy": 2.481170892715454, "epoch": 0.6306859892873651, "grad_norm": 0.031818047165870667, "grad_norm_var": 1.4644655629708943e-06, "learning_rate": 0.00024993884298758095, "loss": 2.4812, "step": 11598 }, { "crossentropy": 2.437014102935791, "epoch": 0.6307403681448652, "grad_norm": 0.03095131926238537, "grad_norm_var": 1.4528296401490499e-06, "learning_rate": 0.00024955022886385405, "loss": 2.437, "step": 11599 }, { "crossentropy": 2.4620866775512695, "epoch": 0.6307947470023655, "grad_norm": 0.030573660507798195, "grad_norm_var": 1.4978737915342817e-06, "learning_rate": 0.00024916190935859393, "loss": 2.4621, "step": 11600 }, { "crossentropy": 2.459870457649231, "epoch": 0.6308491258598656, "grad_norm": 0.03316851705312729, "grad_norm_var": 1.6637991495547024e-06, "learning_rate": 0.000248773884495882, "loss": 2.4599, "step": 11601 }, { "crossentropy": 2.4964793920516968, "epoch": 0.6309035047173659, "grad_norm": 0.0320357009768486, "grad_norm_var": 1.6744087990877565e-06, "learning_rate": 0.00024838615429978396, "loss": 2.4965, "step": 11602 }, { "crossentropy": 2.5735172033309937, "epoch": 0.630957883574866, "grad_norm": 0.03060847334563732, "grad_norm_var": 1.6736501595707946e-06, "learning_rate": 0.00024799871879434775, "loss": 2.5735, "step": 11603 }, { "crossentropy": 2.594445586204529, "epoch": 0.6310122624323663, "grad_norm": 0.03172058239579201, "grad_norm_var": 1.6451136792704994e-06, "learning_rate": 0.0002476115780035998, "loss": 2.5944, "step": 11604 }, { "crossentropy": 2.4275081157684326, "epoch": 0.6310666412898664, "grad_norm": 0.03226777911186218, "grad_norm_var": 1.5958528669794085e-06, "learning_rate": 0.00024722473195155193, "loss": 2.4275, "step": 11605 }, { "crossentropy": 2.4760318994522095, "epoch": 0.6311210201473667, "grad_norm": 0.03205455467104912, "grad_norm_var": 9.950785358525064e-07, "learning_rate": 0.00024683818066219444, "loss": 2.476, "step": 11606 }, { "crossentropy": 2.5935733318328857, "epoch": 0.6311753990048669, "grad_norm": 0.0316011942923069, "grad_norm_var": 9.755347198417765e-07, "learning_rate": 0.00024645192415950203, "loss": 2.5936, "step": 11607 }, { "crossentropy": 2.467391610145569, "epoch": 0.6312297778623671, "grad_norm": 0.030409391969442368, "grad_norm_var": 9.300332411791082e-07, "learning_rate": 0.00024606596246742884, "loss": 2.4674, "step": 11608 }, { "crossentropy": 2.5912543535232544, "epoch": 0.6312841567198673, "grad_norm": 0.03170741721987724, "grad_norm_var": 9.088592700898321e-07, "learning_rate": 0.0002456802956099136, "loss": 2.5913, "step": 11609 }, { "crossentropy": 2.4436172246932983, "epoch": 0.6313385355773675, "grad_norm": 0.032112665474414825, "grad_norm_var": 9.049822459953563e-07, "learning_rate": 0.0002452949236108731, "loss": 2.4436, "step": 11610 }, { "crossentropy": 2.5448917150497437, "epoch": 0.6313929144348677, "grad_norm": 0.03133651241660118, "grad_norm_var": 8.561659710042916e-07, "learning_rate": 0.00024490984649420955, "loss": 2.5449, "step": 11611 }, { "crossentropy": 2.576555013656616, "epoch": 0.6314472932923679, "grad_norm": 0.031134136021137238, "grad_norm_var": 8.302561363138083e-07, "learning_rate": 0.0002445250642838037, "loss": 2.5766, "step": 11612 }, { "crossentropy": 2.4768874645233154, "epoch": 0.6315016721498681, "grad_norm": 0.03172933682799339, "grad_norm_var": 5.205670842913506e-07, "learning_rate": 0.00024414057700351932, "loss": 2.4769, "step": 11613 }, { "crossentropy": 2.581300139427185, "epoch": 0.6315560510073683, "grad_norm": 0.03142617270350456, "grad_norm_var": 5.17561382757621e-07, "learning_rate": 0.0002437563846772034, "loss": 2.5813, "step": 11614 }, { "crossentropy": 2.5512155294418335, "epoch": 0.6316104298648686, "grad_norm": 0.03346933424472809, "grad_norm_var": 7.120529925244238e-07, "learning_rate": 0.00024337248732868188, "loss": 2.5512, "step": 11615 }, { "crossentropy": 2.5039727687835693, "epoch": 0.6316648087223687, "grad_norm": 0.031388163566589355, "grad_norm_var": 6.30140536480199e-07, "learning_rate": 0.00024298888498176408, "loss": 2.504, "step": 11616 }, { "crossentropy": 2.471606969833374, "epoch": 0.631719187579869, "grad_norm": 0.03146882355213165, "grad_norm_var": 4.916347892334651e-07, "learning_rate": 0.00024260557766024037, "loss": 2.4716, "step": 11617 }, { "crossentropy": 2.5731970071792603, "epoch": 0.6317735664373691, "grad_norm": 0.03208041191101074, "grad_norm_var": 4.940329007422588e-07, "learning_rate": 0.00024222256538788346, "loss": 2.5732, "step": 11618 }, { "crossentropy": 2.5885788202285767, "epoch": 0.6318279452948694, "grad_norm": 0.03116113692522049, "grad_norm_var": 4.3584479766725635e-07, "learning_rate": 0.00024183984818844763, "loss": 2.5886, "step": 11619 }, { "crossentropy": 2.513185977935791, "epoch": 0.6318823241523696, "grad_norm": 0.03143995255231857, "grad_norm_var": 4.396871318992334e-07, "learning_rate": 0.00024145742608566834, "loss": 2.5132, "step": 11620 }, { "crossentropy": 2.412350296974182, "epoch": 0.6319367030098698, "grad_norm": 0.03165842592716217, "grad_norm_var": 4.146664125125853e-07, "learning_rate": 0.0002410752991032633, "loss": 2.4124, "step": 11621 }, { "crossentropy": 2.4908004999160767, "epoch": 0.63199108186737, "grad_norm": 0.03134795278310776, "grad_norm_var": 4.064478652937113e-07, "learning_rate": 0.0002406934672649308, "loss": 2.4908, "step": 11622 }, { "crossentropy": 2.5286024808883667, "epoch": 0.6320454607248702, "grad_norm": 0.03173394128680229, "grad_norm_var": 4.077130356892426e-07, "learning_rate": 0.00024031193059435353, "loss": 2.5286, "step": 11623 }, { "crossentropy": 2.526586890220642, "epoch": 0.6320998395823704, "grad_norm": 0.031265418976545334, "grad_norm_var": 3.1759264304912863e-07, "learning_rate": 0.00023993068911519144, "loss": 2.5266, "step": 11624 }, { "crossentropy": 2.4950685501098633, "epoch": 0.6321542184398706, "grad_norm": 0.03133387491106987, "grad_norm_var": 3.236399636489621e-07, "learning_rate": 0.00023954974285109176, "loss": 2.4951, "step": 11625 }, { "crossentropy": 2.3725290298461914, "epoch": 0.6322085972973708, "grad_norm": 0.03040282242000103, "grad_norm_var": 3.964142381556451e-07, "learning_rate": 0.0002391690918256778, "loss": 2.3725, "step": 11626 }, { "crossentropy": 2.5455187559127808, "epoch": 0.632262976154871, "grad_norm": 0.03149228170514107, "grad_norm_var": 3.940466115660701e-07, "learning_rate": 0.00023878873606255846, "loss": 2.5455, "step": 11627 }, { "crossentropy": 2.5183539390563965, "epoch": 0.6323173550123712, "grad_norm": 0.03240816667675972, "grad_norm_var": 4.276939728920509e-07, "learning_rate": 0.0002384086755853232, "loss": 2.5184, "step": 11628 }, { "crossentropy": 2.4751734733581543, "epoch": 0.6323717338698714, "grad_norm": 0.03148353472352028, "grad_norm_var": 4.276537094949206e-07, "learning_rate": 0.00023802891041754205, "loss": 2.4752, "step": 11629 }, { "crossentropy": 2.5788251161575317, "epoch": 0.6324261127273716, "grad_norm": 0.030469181016087532, "grad_norm_var": 5.067577592741956e-07, "learning_rate": 0.0002376494405827684, "loss": 2.5788, "step": 11630 }, { "crossentropy": 2.5628809928894043, "epoch": 0.6324804915848719, "grad_norm": 0.03058462217450142, "grad_norm_var": 2.8389970785075205e-07, "learning_rate": 0.00023727026610453672, "loss": 2.5629, "step": 11631 }, { "crossentropy": 2.463092803955078, "epoch": 0.632534870442372, "grad_norm": 0.03158028423786163, "grad_norm_var": 2.8699414906096834e-07, "learning_rate": 0.0002368913870063627, "loss": 2.4631, "step": 11632 }, { "crossentropy": 2.478727102279663, "epoch": 0.6325892492998723, "grad_norm": 0.03205902501940727, "grad_norm_var": 3.165871263498266e-07, "learning_rate": 0.0002365128033117442, "loss": 2.4787, "step": 11633 }, { "crossentropy": 2.5704561471939087, "epoch": 0.6326436281573724, "grad_norm": 0.032364241778850555, "grad_norm_var": 3.471326160626546e-07, "learning_rate": 0.00023613451504416128, "loss": 2.5705, "step": 11634 }, { "crossentropy": 2.4353126287460327, "epoch": 0.6326980070148727, "grad_norm": 0.032194506376981735, "grad_norm_var": 3.7764800372239867e-07, "learning_rate": 0.00023575652222707357, "loss": 2.4353, "step": 11635 }, { "crossentropy": 2.3656346797943115, "epoch": 0.6327523858723728, "grad_norm": 0.03130558505654335, "grad_norm_var": 3.7964867744960663e-07, "learning_rate": 0.0002353788248839245, "loss": 2.3656, "step": 11636 }, { "crossentropy": 2.4131271839141846, "epoch": 0.6328067647298731, "grad_norm": 0.031803201884031296, "grad_norm_var": 3.8439825713927853e-07, "learning_rate": 0.00023500142303813976, "loss": 2.4131, "step": 11637 }, { "crossentropy": 2.4934595823287964, "epoch": 0.6328611435873732, "grad_norm": 0.030611973255872726, "grad_norm_var": 4.3212188638036554e-07, "learning_rate": 0.000234624316713124, "loss": 2.4935, "step": 11638 }, { "crossentropy": 2.4656569957733154, "epoch": 0.6329155224448735, "grad_norm": 0.030462170019745827, "grad_norm_var": 4.839241549326231e-07, "learning_rate": 0.00023424750593226462, "loss": 2.4657, "step": 11639 }, { "crossentropy": 2.4300392866134644, "epoch": 0.6329699013023736, "grad_norm": 0.03372672200202942, "grad_norm_var": 8.302620225409621e-07, "learning_rate": 0.00023387099071893236, "loss": 2.43, "step": 11640 }, { "crossentropy": 2.451447010040283, "epoch": 0.6330242801598739, "grad_norm": 0.03177786245942116, "grad_norm_var": 8.31703916248982e-07, "learning_rate": 0.00023349477109647744, "loss": 2.4514, "step": 11641 }, { "crossentropy": 2.465875744819641, "epoch": 0.633078659017374, "grad_norm": 0.030718142166733742, "grad_norm_var": 7.898816813797406e-07, "learning_rate": 0.0002331188470882334, "loss": 2.4659, "step": 11642 }, { "crossentropy": 2.586975932121277, "epoch": 0.6331330378748743, "grad_norm": 0.032973527908325195, "grad_norm_var": 9.126319772014782e-07, "learning_rate": 0.00023274321871751436, "loss": 2.587, "step": 11643 }, { "crossentropy": 2.475345015525818, "epoch": 0.6331874167323744, "grad_norm": 0.03193164989352226, "grad_norm_var": 8.79140613780369e-07, "learning_rate": 0.00023236788600761616, "loss": 2.4753, "step": 11644 }, { "crossentropy": 2.5798016786575317, "epoch": 0.6332417955898747, "grad_norm": 0.031316354870796204, "grad_norm_var": 8.841051912739182e-07, "learning_rate": 0.0002319928489818174, "loss": 2.5798, "step": 11645 }, { "crossentropy": 2.518284320831299, "epoch": 0.6332961744473748, "grad_norm": 0.03173817694187164, "grad_norm_var": 7.904672041656249e-07, "learning_rate": 0.00023161810766337553, "loss": 2.5183, "step": 11646 }, { "crossentropy": 2.467641592025757, "epoch": 0.6333505533048751, "grad_norm": 0.030268654227256775, "grad_norm_var": 8.435599588689965e-07, "learning_rate": 0.0002312436620755337, "loss": 2.4676, "step": 11647 }, { "crossentropy": 2.4430837631225586, "epoch": 0.6334049321623753, "grad_norm": 0.03206909820437431, "grad_norm_var": 8.521898653930544e-07, "learning_rate": 0.00023086951224151497, "loss": 2.4431, "step": 11648 }, { "crossentropy": 2.433646559715271, "epoch": 0.6334593110198755, "grad_norm": 0.030479423701763153, "grad_norm_var": 9.341119602853757e-07, "learning_rate": 0.00023049565818452245, "loss": 2.4336, "step": 11649 }, { "crossentropy": 2.3738508224487305, "epoch": 0.6335136898773757, "grad_norm": 0.03152759000658989, "grad_norm_var": 8.935922914990804e-07, "learning_rate": 0.00023012209992774203, "loss": 2.3739, "step": 11650 }, { "crossentropy": 2.4580471515655518, "epoch": 0.6335680687348759, "grad_norm": 0.03209686651825905, "grad_norm_var": 8.858826774523742e-07, "learning_rate": 0.0002297488374943424, "loss": 2.458, "step": 11651 }, { "crossentropy": 2.510232925415039, "epoch": 0.6336224475923761, "grad_norm": 0.03179623931646347, "grad_norm_var": 8.849106453113117e-07, "learning_rate": 0.00022937587090747226, "loss": 2.5102, "step": 11652 }, { "crossentropy": 2.4573111534118652, "epoch": 0.6336768264498763, "grad_norm": 0.029872190207242966, "grad_norm_var": 1.0607777077021225e-06, "learning_rate": 0.0002290032001902631, "loss": 2.4573, "step": 11653 }, { "crossentropy": 2.5484120845794678, "epoch": 0.6337312053073765, "grad_norm": 0.06884460896253586, "grad_norm_var": 8.80940906838494e-05, "learning_rate": 0.0002286308253658276, "loss": 2.5484, "step": 11654 }, { "crossentropy": 2.4771242141723633, "epoch": 0.6337855841648767, "grad_norm": 0.030902251601219177, "grad_norm_var": 8.790740827784916e-05, "learning_rate": 0.0002282587464572594, "loss": 2.4771, "step": 11655 }, { "crossentropy": 2.5780436992645264, "epoch": 0.6338399630223769, "grad_norm": 0.03196027874946594, "grad_norm_var": 8.813793105583039e-05, "learning_rate": 0.00022788696348763628, "loss": 2.578, "step": 11656 }, { "crossentropy": 2.5779820680618286, "epoch": 0.6338943418798771, "grad_norm": 0.03219190239906311, "grad_norm_var": 8.803883122824768e-05, "learning_rate": 0.00022751547648001413, "loss": 2.578, "step": 11657 }, { "crossentropy": 2.539939045906067, "epoch": 0.6339487207373773, "grad_norm": 0.03146860748529434, "grad_norm_var": 8.7766361091357e-05, "learning_rate": 0.00022714428545743181, "loss": 2.5399, "step": 11658 }, { "crossentropy": 2.488526940345764, "epoch": 0.6340030995948776, "grad_norm": 0.03159099817276001, "grad_norm_var": 8.80455162820148e-05, "learning_rate": 0.00022677339044291202, "loss": 2.4885, "step": 11659 }, { "crossentropy": 2.487597346305847, "epoch": 0.6340574784523777, "grad_norm": 0.03934912011027336, "grad_norm_var": 8.968246132129148e-05, "learning_rate": 0.0002264027914594563, "loss": 2.4876, "step": 11660 }, { "crossentropy": 2.4691436290740967, "epoch": 0.634111857309878, "grad_norm": 0.03260679543018341, "grad_norm_var": 8.928745338668918e-05, "learning_rate": 0.0002260324885300491, "loss": 2.4691, "step": 11661 }, { "crossentropy": 2.492649555206299, "epoch": 0.6341662361673781, "grad_norm": 0.031922075897455215, "grad_norm_var": 8.922680852422663e-05, "learning_rate": 0.00022566248167765646, "loss": 2.4926, "step": 11662 }, { "crossentropy": 2.559002995491028, "epoch": 0.6342206150248784, "grad_norm": 0.03212055191397667, "grad_norm_var": 8.844347122436512e-05, "learning_rate": 0.00022529277092522504, "loss": 2.559, "step": 11663 }, { "crossentropy": 2.52719509601593, "epoch": 0.6342749938823785, "grad_norm": 0.031588394194841385, "grad_norm_var": 8.860890674096348e-05, "learning_rate": 0.00022492335629568482, "loss": 2.5272, "step": 11664 }, { "crossentropy": 2.4743822813034058, "epoch": 0.6343293727398788, "grad_norm": 0.031815823167562485, "grad_norm_var": 8.80228496983354e-05, "learning_rate": 0.0002245542378119464, "loss": 2.4744, "step": 11665 }, { "crossentropy": 2.4973347187042236, "epoch": 0.6343837515973789, "grad_norm": 0.030189745128154755, "grad_norm_var": 8.866107629706866e-05, "learning_rate": 0.0002241854154969014, "loss": 2.4973, "step": 11666 }, { "crossentropy": 2.5849204063415527, "epoch": 0.6344381304548792, "grad_norm": 0.03152275085449219, "grad_norm_var": 8.88575791161988e-05, "learning_rate": 0.0002238168893734255, "loss": 2.5849, "step": 11667 }, { "crossentropy": 2.4289947748184204, "epoch": 0.6344925093123793, "grad_norm": 0.030715998262166977, "grad_norm_var": 8.929961656815767e-05, "learning_rate": 0.00022344865946437255, "loss": 2.429, "step": 11668 }, { "crossentropy": 2.524024486541748, "epoch": 0.6345468881698796, "grad_norm": 0.030260931700468063, "grad_norm_var": 8.90800051991454e-05, "learning_rate": 0.00022308072579257988, "loss": 2.524, "step": 11669 }, { "crossentropy": 2.4951452016830444, "epoch": 0.6346012670273797, "grad_norm": 0.031960275024175644, "grad_norm_var": 4.29816551404579e-06, "learning_rate": 0.00022271308838086868, "loss": 2.4951, "step": 11670 }, { "crossentropy": 2.5409404039382935, "epoch": 0.63465564588488, "grad_norm": 0.031448863446712494, "grad_norm_var": 4.236075484459247e-06, "learning_rate": 0.00022234574725203682, "loss": 2.5409, "step": 11671 }, { "crossentropy": 2.5222742557525635, "epoch": 0.6347100247423801, "grad_norm": 0.03292137756943703, "grad_norm_var": 4.283005853373117e-06, "learning_rate": 0.00022197870242886774, "loss": 2.5223, "step": 11672 }, { "crossentropy": 2.564137578010559, "epoch": 0.6347644035998804, "grad_norm": 0.032103877514600754, "grad_norm_var": 4.282465937343929e-06, "learning_rate": 0.00022161195393412492, "loss": 2.5641, "step": 11673 }, { "crossentropy": 2.5415948629379272, "epoch": 0.6348187824573805, "grad_norm": 0.03153827786445618, "grad_norm_var": 4.276912082794756e-06, "learning_rate": 0.00022124550179055403, "loss": 2.5416, "step": 11674 }, { "crossentropy": 2.470739960670471, "epoch": 0.6348731613148808, "grad_norm": 0.032026588916778564, "grad_norm_var": 4.2590058369753235e-06, "learning_rate": 0.0002208793460208819, "loss": 2.4707, "step": 11675 }, { "crossentropy": 2.289481520652771, "epoch": 0.634927540172381, "grad_norm": 0.03008732758462429, "grad_norm_var": 7.062570265196997e-07, "learning_rate": 0.000220513486647817, "loss": 2.2895, "step": 11676 }, { "crossentropy": 2.4701216220855713, "epoch": 0.6349819190298812, "grad_norm": 0.03162950277328491, "grad_norm_var": 6.28485889060309e-07, "learning_rate": 0.00022014792369404957, "loss": 2.4701, "step": 11677 }, { "crossentropy": 2.4601900577545166, "epoch": 0.6350362978873814, "grad_norm": 0.030876776203513145, "grad_norm_var": 6.36664445908576e-07, "learning_rate": 0.000219782657182252, "loss": 2.4602, "step": 11678 }, { "crossentropy": 2.510531783103943, "epoch": 0.6350906767448816, "grad_norm": 0.030102144926786423, "grad_norm_var": 7.042185749534316e-07, "learning_rate": 0.0002194176871350778, "loss": 2.5105, "step": 11679 }, { "crossentropy": 2.557137370109558, "epoch": 0.6351450556023818, "grad_norm": 0.031104907393455505, "grad_norm_var": 7.001915288751732e-07, "learning_rate": 0.0002190530135751606, "loss": 2.5571, "step": 11680 }, { "crossentropy": 2.4680100679397583, "epoch": 0.635199434459882, "grad_norm": 0.031215718016028404, "grad_norm_var": 6.789517386307364e-07, "learning_rate": 0.0002186886365251195, "loss": 2.468, "step": 11681 }, { "crossentropy": 2.5654619932174683, "epoch": 0.6352538133173822, "grad_norm": 0.03200926631689072, "grad_norm_var": 6.331190277167186e-07, "learning_rate": 0.0002183245560075503, "loss": 2.5655, "step": 11682 }, { "crossentropy": 2.560668110847473, "epoch": 0.6353081921748824, "grad_norm": 0.03165053203701973, "grad_norm_var": 6.371630765811351e-07, "learning_rate": 0.00021796077204503495, "loss": 2.5607, "step": 11683 }, { "crossentropy": 2.422996997833252, "epoch": 0.6353625710323826, "grad_norm": 0.03200077265501022, "grad_norm_var": 6.31161195089449e-07, "learning_rate": 0.0002175972846601343, "loss": 2.423, "step": 11684 }, { "crossentropy": 2.401883840560913, "epoch": 0.6354169498898828, "grad_norm": 0.03168219327926636, "grad_norm_var": 5.351932204683176e-07, "learning_rate": 0.00021723409387539085, "loss": 2.4019, "step": 11685 }, { "crossentropy": 2.5275685787200928, "epoch": 0.635471328747383, "grad_norm": 0.03196056932210922, "grad_norm_var": 5.352104079730935e-07, "learning_rate": 0.00021687119971333047, "loss": 2.5276, "step": 11686 }, { "crossentropy": 2.480176091194153, "epoch": 0.6355257076048832, "grad_norm": 0.03112536296248436, "grad_norm_var": 5.449238720745536e-07, "learning_rate": 0.00021650860219645852, "loss": 2.4802, "step": 11687 }, { "crossentropy": 2.5855700969696045, "epoch": 0.6355800864623834, "grad_norm": 0.030791105702519417, "grad_norm_var": 4.254545550821364e-07, "learning_rate": 0.00021614630134726364, "loss": 2.5856, "step": 11688 }, { "crossentropy": 2.497917056083679, "epoch": 0.6356344653198837, "grad_norm": 0.031688567250967026, "grad_norm_var": 3.955442913316486e-07, "learning_rate": 0.00021578429718821512, "loss": 2.4979, "step": 11689 }, { "crossentropy": 2.466940999031067, "epoch": 0.6356888441773838, "grad_norm": 0.033144425600767136, "grad_norm_var": 5.985739553709957e-07, "learning_rate": 0.000215422589741765, "loss": 2.4669, "step": 11690 }, { "crossentropy": 2.5688825845718384, "epoch": 0.6357432230348841, "grad_norm": 0.03207974508404732, "grad_norm_var": 6.02883296164404e-07, "learning_rate": 0.00021506117903034362, "loss": 2.5689, "step": 11691 }, { "crossentropy": 2.574246644973755, "epoch": 0.6357976018923842, "grad_norm": 0.03172852098941803, "grad_norm_var": 4.737388098941756e-07, "learning_rate": 0.0002147000650763675, "loss": 2.5742, "step": 11692 }, { "crossentropy": 2.4968560934066772, "epoch": 0.6358519807498845, "grad_norm": 0.031355421990156174, "grad_norm_var": 4.755058818242761e-07, "learning_rate": 0.00021433924790223324, "loss": 2.4969, "step": 11693 }, { "crossentropy": 2.5057212114334106, "epoch": 0.6359063596073846, "grad_norm": 0.03147323802113533, "grad_norm_var": 4.4561247385041526e-07, "learning_rate": 0.00021397872753031621, "loss": 2.5057, "step": 11694 }, { "crossentropy": 2.5173689126968384, "epoch": 0.6359607384648849, "grad_norm": 0.03176775574684143, "grad_norm_var": 2.93124532287486e-07, "learning_rate": 0.00021361850398297634, "loss": 2.5174, "step": 11695 }, { "crossentropy": 2.4128347635269165, "epoch": 0.636015117322385, "grad_norm": 0.03951907902956009, "grad_norm_var": 4.079971784018826e-06, "learning_rate": 0.0002132585772825546, "loss": 2.4128, "step": 11696 }, { "crossentropy": 2.5459758043289185, "epoch": 0.6360694961798853, "grad_norm": 0.030494840815663338, "grad_norm_var": 4.207010559822514e-06, "learning_rate": 0.00021289894745137318, "loss": 2.546, "step": 11697 }, { "crossentropy": 2.5599279403686523, "epoch": 0.6361238750373854, "grad_norm": 0.033065102994441986, "grad_norm_var": 4.256244570305841e-06, "learning_rate": 0.00021253961451173586, "loss": 2.5599, "step": 11698 }, { "crossentropy": 2.476962089538574, "epoch": 0.6361782538948857, "grad_norm": 0.030809571966528893, "grad_norm_var": 4.364349436663144e-06, "learning_rate": 0.00021218057848592875, "loss": 2.477, "step": 11699 }, { "crossentropy": 2.501114010810852, "epoch": 0.6362326327523858, "grad_norm": 0.03148670494556427, "grad_norm_var": 4.392320797843537e-06, "learning_rate": 0.0002118218393962179, "loss": 2.5011, "step": 11700 }, { "crossentropy": 2.4703320264816284, "epoch": 0.6362870116098861, "grad_norm": 0.031343910843133926, "grad_norm_var": 4.41993093528759e-06, "learning_rate": 0.00021146339726485386, "loss": 2.4703, "step": 11701 }, { "crossentropy": 2.407586693763733, "epoch": 0.6363413904673862, "grad_norm": 0.03252364695072174, "grad_norm_var": 4.428181280570125e-06, "learning_rate": 0.00021110525211406384, "loss": 2.4076, "step": 11702 }, { "crossentropy": 2.573591470718384, "epoch": 0.6363957693248865, "grad_norm": 0.031330786645412445, "grad_norm_var": 4.402759216704646e-06, "learning_rate": 0.0002107474039660623, "loss": 2.5736, "step": 11703 }, { "crossentropy": 2.557437539100647, "epoch": 0.6364501481823867, "grad_norm": 0.03141527250409126, "grad_norm_var": 4.312965112149234e-06, "learning_rate": 0.00021038985284304202, "loss": 2.5574, "step": 11704 }, { "crossentropy": 2.54653537273407, "epoch": 0.6365045270398869, "grad_norm": 0.031131787225604057, "grad_norm_var": 4.370431146706238e-06, "learning_rate": 0.00021003259876717752, "loss": 2.5465, "step": 11705 }, { "crossentropy": 2.5931395292282104, "epoch": 0.6365589058973871, "grad_norm": 0.03213644027709961, "grad_norm_var": 4.302550825062062e-06, "learning_rate": 0.00020967564176062548, "loss": 2.5931, "step": 11706 }, { "crossentropy": 2.6216373443603516, "epoch": 0.6366132847548873, "grad_norm": 0.031032728031277657, "grad_norm_var": 4.37443319257468e-06, "learning_rate": 0.00020931898184552378, "loss": 2.6216, "step": 11707 }, { "crossentropy": 2.502422571182251, "epoch": 0.6366676636123875, "grad_norm": 0.031097836792469025, "grad_norm_var": 4.42535360779354e-06, "learning_rate": 0.0002089626190439925, "loss": 2.5024, "step": 11708 }, { "crossentropy": 2.5267962217330933, "epoch": 0.6367220424698877, "grad_norm": 0.03035142458975315, "grad_norm_var": 4.574508745377306e-06, "learning_rate": 0.0002086065533781334, "loss": 2.5268, "step": 11709 }, { "crossentropy": 2.5988515615463257, "epoch": 0.6367764213273879, "grad_norm": 0.03183431550860405, "grad_norm_var": 4.560365827993443e-06, "learning_rate": 0.0002082507848700288, "loss": 2.5989, "step": 11710 }, { "crossentropy": 2.5552021265029907, "epoch": 0.6368308001848881, "grad_norm": 0.030875809490680695, "grad_norm_var": 4.632812005912973e-06, "learning_rate": 0.00020789531354174274, "loss": 2.5552, "step": 11711 }, { "crossentropy": 2.4983654022216797, "epoch": 0.6368851790423883, "grad_norm": 0.033108096569776535, "grad_norm_var": 6.914662104509032e-07, "learning_rate": 0.00020754013941532258, "loss": 2.4984, "step": 11712 }, { "crossentropy": 2.595494508743286, "epoch": 0.6369395578998885, "grad_norm": 0.03251103684306145, "grad_norm_var": 6.746754493149853e-07, "learning_rate": 0.00020718526251279346, "loss": 2.5955, "step": 11713 }, { "crossentropy": 2.6204137802124023, "epoch": 0.6369939367573887, "grad_norm": 0.03135383874177933, "grad_norm_var": 5.298925990400323e-07, "learning_rate": 0.00020683068285616668, "loss": 2.6204, "step": 11714 }, { "crossentropy": 2.521240472793579, "epoch": 0.637048315614889, "grad_norm": 0.03238430991768837, "grad_norm_var": 5.354104568402642e-07, "learning_rate": 0.00020647640046743298, "loss": 2.5212, "step": 11715 }, { "crossentropy": 2.425597906112671, "epoch": 0.6371026944723891, "grad_norm": 0.030767539516091347, "grad_norm_var": 5.805045712467885e-07, "learning_rate": 0.0002061224153685637, "loss": 2.4256, "step": 11716 }, { "crossentropy": 2.557319164276123, "epoch": 0.6371570733298894, "grad_norm": 0.03100384771823883, "grad_norm_var": 5.982067850379691e-07, "learning_rate": 0.00020576872758151178, "loss": 2.5573, "step": 11717 }, { "crossentropy": 2.5662237405776978, "epoch": 0.6372114521873895, "grad_norm": 0.03123171441257, "grad_norm_var": 5.354388902201046e-07, "learning_rate": 0.00020541533712821526, "loss": 2.5662, "step": 11718 }, { "crossentropy": 2.5155826807022095, "epoch": 0.6372658310448898, "grad_norm": 0.03250419721007347, "grad_norm_var": 5.992565914963855e-07, "learning_rate": 0.0002050622440305888, "loss": 2.5156, "step": 11719 }, { "crossentropy": 2.584119439125061, "epoch": 0.6373202099023899, "grad_norm": 0.03168980032205582, "grad_norm_var": 5.991722276139545e-07, "learning_rate": 0.00020470944831053162, "loss": 2.5841, "step": 11720 }, { "crossentropy": 2.531845450401306, "epoch": 0.6373745887598902, "grad_norm": 0.03243020176887512, "grad_norm_var": 6.298146108509039e-07, "learning_rate": 0.00020435694998992393, "loss": 2.5318, "step": 11721 }, { "crossentropy": 2.3871251344680786, "epoch": 0.6374289676173903, "grad_norm": 0.031054094433784485, "grad_norm_var": 6.32048636722606e-07, "learning_rate": 0.0002040047490906266, "loss": 2.3871, "step": 11722 }, { "crossentropy": 2.562189221382141, "epoch": 0.6374833464748906, "grad_norm": 0.031838756054639816, "grad_norm_var": 6.141686971529302e-07, "learning_rate": 0.00020365284563448493, "loss": 2.5622, "step": 11723 }, { "crossentropy": 2.559567451477051, "epoch": 0.6375377253323907, "grad_norm": 0.03280267119407654, "grad_norm_var": 6.754692402199108e-07, "learning_rate": 0.00020330123964332038, "loss": 2.5596, "step": 11724 }, { "crossentropy": 2.4995663166046143, "epoch": 0.637592104189891, "grad_norm": 0.03161679953336716, "grad_norm_var": 5.423038302312481e-07, "learning_rate": 0.00020294993113894157, "loss": 2.4996, "step": 11725 }, { "crossentropy": 2.585093379020691, "epoch": 0.6376464830473911, "grad_norm": 0.0307261161506176, "grad_norm_var": 6.159019010634297e-07, "learning_rate": 0.00020259892014313664, "loss": 2.5851, "step": 11726 }, { "crossentropy": 2.5281472206115723, "epoch": 0.6377008619048914, "grad_norm": 0.03156726062297821, "grad_norm_var": 5.657717254659948e-07, "learning_rate": 0.00020224820667767373, "loss": 2.5281, "step": 11727 }, { "crossentropy": 2.4058704376220703, "epoch": 0.6377552407623915, "grad_norm": 0.03171246498823166, "grad_norm_var": 4.416532399455104e-07, "learning_rate": 0.0002018977907643027, "loss": 2.4059, "step": 11728 }, { "crossentropy": 2.49804425239563, "epoch": 0.6378096196198918, "grad_norm": 0.031061461195349693, "grad_norm_var": 4.16163385912382e-07, "learning_rate": 0.00020154767242475946, "loss": 2.498, "step": 11729 }, { "crossentropy": 2.4389050006866455, "epoch": 0.637863998477392, "grad_norm": 0.032336313277482986, "grad_norm_var": 4.430578494013833e-07, "learning_rate": 0.000201197851680755, "loss": 2.4389, "step": 11730 }, { "crossentropy": 2.520706534385681, "epoch": 0.6379183773348922, "grad_norm": 0.031363725662231445, "grad_norm_var": 4.1101976667709943e-07, "learning_rate": 0.00020084832855398583, "loss": 2.5207, "step": 11731 }, { "crossentropy": 2.4204955101013184, "epoch": 0.6379727561923924, "grad_norm": 0.030559910461306572, "grad_norm_var": 4.369449355475733e-07, "learning_rate": 0.00020049910306612907, "loss": 2.4205, "step": 11732 }, { "crossentropy": 2.5949233770370483, "epoch": 0.6380271350498926, "grad_norm": 0.03163894638419151, "grad_norm_var": 4.1220502944813197e-07, "learning_rate": 0.0002001501752388435, "loss": 2.5949, "step": 11733 }, { "crossentropy": 2.5428744554519653, "epoch": 0.6380815139073928, "grad_norm": 0.03233799338340759, "grad_norm_var": 4.2944536584182956e-07, "learning_rate": 0.00019980154509376957, "loss": 2.5429, "step": 11734 }, { "crossentropy": 2.452563762664795, "epoch": 0.638135892764893, "grad_norm": 0.030680932104587555, "grad_norm_var": 4.4233046501227643e-07, "learning_rate": 0.00019945321265252725, "loss": 2.4526, "step": 11735 }, { "crossentropy": 2.654557943344116, "epoch": 0.6381902716223932, "grad_norm": 0.03107181005179882, "grad_norm_var": 4.5786040297408845e-07, "learning_rate": 0.000199105177936722, "loss": 2.6546, "step": 11736 }, { "crossentropy": 2.535559058189392, "epoch": 0.6382446504798934, "grad_norm": 0.029901618137955666, "grad_norm_var": 5.6070224412613e-07, "learning_rate": 0.00019875744096793825, "loss": 2.5356, "step": 11737 }, { "crossentropy": 2.5916332006454468, "epoch": 0.6382990293373936, "grad_norm": 0.03314376622438431, "grad_norm_var": 7.394941399641378e-07, "learning_rate": 0.0001984100017677415, "loss": 2.5916, "step": 11738 }, { "crossentropy": 2.468578815460205, "epoch": 0.6383534081948938, "grad_norm": 0.03444443270564079, "grad_norm_var": 1.273704024492596e-06, "learning_rate": 0.00019806286035767894, "loss": 2.4686, "step": 11739 }, { "crossentropy": 2.491665244102478, "epoch": 0.638407787052394, "grad_norm": 0.031164566054940224, "grad_norm_var": 1.1973856823150093e-06, "learning_rate": 0.0001977160167592823, "loss": 2.4917, "step": 11740 }, { "crossentropy": 2.5843303203582764, "epoch": 0.6384621659098942, "grad_norm": 0.03201849013566971, "grad_norm_var": 1.2092802602767093e-06, "learning_rate": 0.00019736947099406043, "loss": 2.5843, "step": 11741 }, { "crossentropy": 2.6053942441940308, "epoch": 0.6385165447673944, "grad_norm": 0.032130952924489975, "grad_norm_var": 1.167419948769098e-06, "learning_rate": 0.00019702322308350672, "loss": 2.6054, "step": 11742 }, { "crossentropy": 2.5704636573791504, "epoch": 0.6385709236248946, "grad_norm": 0.031588006764650345, "grad_norm_var": 1.1670909705818137e-06, "learning_rate": 0.0001966772730490951, "loss": 2.5705, "step": 11743 }, { "crossentropy": 2.5713175535202026, "epoch": 0.6386253024823948, "grad_norm": 0.0319780558347702, "grad_norm_var": 1.172039770649585e-06, "learning_rate": 0.00019633162091228006, "loss": 2.5713, "step": 11744 }, { "crossentropy": 2.412980318069458, "epoch": 0.6386796813398951, "grad_norm": 0.03134214133024216, "grad_norm_var": 1.1525500410567227e-06, "learning_rate": 0.00019598626669450002, "loss": 2.413, "step": 11745 }, { "crossentropy": 2.53204345703125, "epoch": 0.6387340601973952, "grad_norm": 0.03103451617062092, "grad_norm_var": 1.1534626800157315e-06, "learning_rate": 0.00019564121041717287, "loss": 2.532, "step": 11746 }, { "crossentropy": 2.310787320137024, "epoch": 0.6387884390548955, "grad_norm": 0.03150244429707527, "grad_norm_var": 1.1493706375325138e-06, "learning_rate": 0.00019529645210169866, "loss": 2.3108, "step": 11747 }, { "crossentropy": 2.388272762298584, "epoch": 0.6388428179123956, "grad_norm": 0.03022720105946064, "grad_norm_var": 1.2050310797428876e-06, "learning_rate": 0.00019495199176945978, "loss": 2.3883, "step": 11748 }, { "crossentropy": 2.3497376441955566, "epoch": 0.6388971967698959, "grad_norm": 0.03139021620154381, "grad_norm_var": 1.208861955245818e-06, "learning_rate": 0.00019460782944181798, "loss": 2.3497, "step": 11749 }, { "crossentropy": 2.5704652070999146, "epoch": 0.638951575627396, "grad_norm": 0.03112962655723095, "grad_norm_var": 1.1848154598662063e-06, "learning_rate": 0.0001942639651401179, "loss": 2.5705, "step": 11750 }, { "crossentropy": 2.4943584203720093, "epoch": 0.6390059544848963, "grad_norm": 0.0307014063000679, "grad_norm_var": 1.1824779369354332e-06, "learning_rate": 0.00019392039888568746, "loss": 2.4944, "step": 11751 }, { "crossentropy": 2.5541462898254395, "epoch": 0.6390603333423964, "grad_norm": 0.03157595172524452, "grad_norm_var": 1.1663486493394912e-06, "learning_rate": 0.00019357713069983296, "loss": 2.5541, "step": 11752 }, { "crossentropy": 2.5273866653442383, "epoch": 0.6391147121998967, "grad_norm": 0.030671903863549232, "grad_norm_var": 1.0310970078189572e-06, "learning_rate": 0.00019323416060384346, "loss": 2.5274, "step": 11753 }, { "crossentropy": 2.427505135536194, "epoch": 0.6391690910573968, "grad_norm": 0.031057951971888542, "grad_norm_var": 8.813881322672703e-07, "learning_rate": 0.00019289148861899031, "loss": 2.4275, "step": 11754 }, { "crossentropy": 2.51029896736145, "epoch": 0.6392234699148971, "grad_norm": 0.030657274648547173, "grad_norm_var": 2.8966444329327154e-07, "learning_rate": 0.00019254911476652482, "loss": 2.5103, "step": 11755 }, { "crossentropy": 2.4703367948532104, "epoch": 0.6392778487723972, "grad_norm": 0.031512439250946045, "grad_norm_var": 2.9277037117131137e-07, "learning_rate": 0.00019220703906768166, "loss": 2.4703, "step": 11756 }, { "crossentropy": 2.5015214681625366, "epoch": 0.6393322276298975, "grad_norm": 0.031549666076898575, "grad_norm_var": 2.604954185178587e-07, "learning_rate": 0.00019186526154367557, "loss": 2.5015, "step": 11757 }, { "crossentropy": 2.5673993825912476, "epoch": 0.6393866064873976, "grad_norm": 0.03227580711245537, "grad_norm_var": 2.787614087650566e-07, "learning_rate": 0.00019152378221570288, "loss": 2.5674, "step": 11758 }, { "crossentropy": 2.4950557947158813, "epoch": 0.6394409853448979, "grad_norm": 0.031236251816153526, "grad_norm_var": 2.7121234611389374e-07, "learning_rate": 0.00019118260110494279, "loss": 2.4951, "step": 11759 }, { "crossentropy": 2.460042119026184, "epoch": 0.639495364202398, "grad_norm": 0.029953083023428917, "grad_norm_var": 3.2827029096936987e-07, "learning_rate": 0.00019084171823255503, "loss": 2.46, "step": 11760 }, { "crossentropy": 2.564770460128784, "epoch": 0.6395497430598983, "grad_norm": 0.031109992414712906, "grad_norm_var": 3.2456507237602764e-07, "learning_rate": 0.0001905011336196788, "loss": 2.5648, "step": 11761 }, { "crossentropy": 2.561238646507263, "epoch": 0.6396041219173985, "grad_norm": 0.03066166117787361, "grad_norm_var": 3.3646500732079953e-07, "learning_rate": 0.00019016084728743998, "loss": 2.5612, "step": 11762 }, { "crossentropy": 2.533232092857361, "epoch": 0.6396585007748987, "grad_norm": 0.03141602873802185, "grad_norm_var": 3.3201596378848857e-07, "learning_rate": 0.0001898208592569406, "loss": 2.5332, "step": 11763 }, { "crossentropy": 2.4681918621063232, "epoch": 0.639712879632399, "grad_norm": 0.030945012345910072, "grad_norm_var": 2.835178796425081e-07, "learning_rate": 0.0001894811695492671, "loss": 2.4682, "step": 11764 }, { "crossentropy": 2.471529483795166, "epoch": 0.6397672584898991, "grad_norm": 0.0310161542147398, "grad_norm_var": 2.785499589693629e-07, "learning_rate": 0.0001891417781854865, "loss": 2.4715, "step": 11765 }, { "crossentropy": 2.4412237405776978, "epoch": 0.6398216373473994, "grad_norm": 0.03119521774351597, "grad_norm_var": 2.79148887163729e-07, "learning_rate": 0.00018880268518664756, "loss": 2.4412, "step": 11766 }, { "crossentropy": 2.5607184171676636, "epoch": 0.6398760162048995, "grad_norm": 0.031866177916526794, "grad_norm_var": 3.0266257343332845e-07, "learning_rate": 0.0001884638905737801, "loss": 2.5607, "step": 11767 }, { "crossentropy": 2.5419580936431885, "epoch": 0.6399303950623998, "grad_norm": 0.03149420768022537, "grad_norm_var": 2.986424195378437e-07, "learning_rate": 0.00018812539436789734, "loss": 2.542, "step": 11768 }, { "crossentropy": 2.322876811027527, "epoch": 0.6399847739198999, "grad_norm": 0.031130602583289146, "grad_norm_var": 2.8171594833962625e-07, "learning_rate": 0.00018778719658999078, "loss": 2.3229, "step": 11769 }, { "crossentropy": 2.4500995874404907, "epoch": 0.6400391527774002, "grad_norm": 0.03134864568710327, "grad_norm_var": 2.81788393329665e-07, "learning_rate": 0.00018744929726103644, "loss": 2.4501, "step": 11770 }, { "crossentropy": 2.4131627082824707, "epoch": 0.6400935316349003, "grad_norm": 0.030174409970641136, "grad_norm_var": 3.319794127525638e-07, "learning_rate": 0.00018711169640198976, "loss": 2.4132, "step": 11771 }, { "crossentropy": 2.466831088066101, "epoch": 0.6401479104924006, "grad_norm": 0.030798720195889473, "grad_norm_var": 3.322126956930548e-07, "learning_rate": 0.00018677439403378792, "loss": 2.4668, "step": 11772 }, { "crossentropy": 2.486428737640381, "epoch": 0.6402022893499008, "grad_norm": 0.030608806759119034, "grad_norm_var": 3.356109725289116e-07, "learning_rate": 0.00018643739017735138, "loss": 2.4864, "step": 11773 }, { "crossentropy": 2.5751287937164307, "epoch": 0.640256668207401, "grad_norm": 0.03385024145245552, "grad_norm_var": 7.422137835883316e-07, "learning_rate": 0.00018610068485358123, "loss": 2.5751, "step": 11774 }, { "crossentropy": 2.4712260961532593, "epoch": 0.6403110470649012, "grad_norm": 0.031124884262681007, "grad_norm_var": 7.420842643414416e-07, "learning_rate": 0.00018576427808335739, "loss": 2.4712, "step": 11775 }, { "crossentropy": 2.50466251373291, "epoch": 0.6403654259224014, "grad_norm": 0.03304887190461159, "grad_norm_var": 8.394442143266987e-07, "learning_rate": 0.00018542816988754486, "loss": 2.5047, "step": 11776 }, { "crossentropy": 2.652945399284363, "epoch": 0.6404198047799016, "grad_norm": 0.03299364447593689, "grad_norm_var": 9.979477908819093e-07, "learning_rate": 0.00018509236028698862, "loss": 2.6529, "step": 11777 }, { "crossentropy": 2.467669367790222, "epoch": 0.6404741836374018, "grad_norm": 0.03099236451089382, "grad_norm_var": 9.687179339472214e-07, "learning_rate": 0.00018475684930251536, "loss": 2.4677, "step": 11778 }, { "crossentropy": 2.418474316596985, "epoch": 0.640528562494902, "grad_norm": 0.03218633681535721, "grad_norm_var": 9.971537278958993e-07, "learning_rate": 0.00018442163695493342, "loss": 2.4185, "step": 11779 }, { "crossentropy": 2.458298444747925, "epoch": 0.6405829413524022, "grad_norm": 0.032149218022823334, "grad_norm_var": 9.909063248250317e-07, "learning_rate": 0.00018408672326503117, "loss": 2.4583, "step": 11780 }, { "crossentropy": 2.491107225418091, "epoch": 0.6406373202099024, "grad_norm": 0.03090534172952175, "grad_norm_var": 1.0006496330032388e-06, "learning_rate": 0.0001837521082535809, "loss": 2.4911, "step": 11781 }, { "crossentropy": 2.4554861783981323, "epoch": 0.6406916990674026, "grad_norm": 0.0325702466070652, "grad_norm_var": 1.0415396530904486e-06, "learning_rate": 0.00018341779194133546, "loss": 2.4555, "step": 11782 }, { "crossentropy": 2.564489006996155, "epoch": 0.6407460779249028, "grad_norm": 0.0315091609954834, "grad_norm_var": 1.041722626694772e-06, "learning_rate": 0.00018308377434902656, "loss": 2.5645, "step": 11783 }, { "crossentropy": 2.346512794494629, "epoch": 0.640800456782403, "grad_norm": 0.03177131339907646, "grad_norm_var": 1.039644130874003e-06, "learning_rate": 0.0001827500554973721, "loss": 2.3465, "step": 11784 }, { "crossentropy": 2.4551830291748047, "epoch": 0.6408548356399032, "grad_norm": 0.03087574616074562, "grad_norm_var": 1.0629732443612987e-06, "learning_rate": 0.00018241663540706832, "loss": 2.4552, "step": 11785 }, { "crossentropy": 2.5232352018356323, "epoch": 0.6409092144974035, "grad_norm": 0.03322947025299072, "grad_norm_var": 1.2005330273285546e-06, "learning_rate": 0.0001820835140987931, "loss": 2.5232, "step": 11786 }, { "crossentropy": 2.452847123146057, "epoch": 0.6409635933549036, "grad_norm": 0.030923787504434586, "grad_norm_var": 1.0732769413661243e-06, "learning_rate": 0.00018175069159320602, "loss": 2.4528, "step": 11787 }, { "crossentropy": 2.5576215982437134, "epoch": 0.6410179722124039, "grad_norm": 0.03096870146691799, "grad_norm_var": 1.051344013784187e-06, "learning_rate": 0.00018141816791095, "loss": 2.5576, "step": 11788 }, { "crossentropy": 2.588020443916321, "epoch": 0.641072351069904, "grad_norm": 0.03204037994146347, "grad_norm_var": 9.412270549767505e-07, "learning_rate": 0.00018108594307264636, "loss": 2.588, "step": 11789 }, { "crossentropy": 2.515754461288452, "epoch": 0.6411267299274043, "grad_norm": 0.031069008633494377, "grad_norm_var": 7.186146864330041e-07, "learning_rate": 0.00018075401709889973, "loss": 2.5158, "step": 11790 }, { "crossentropy": 2.461358070373535, "epoch": 0.6411811087849044, "grad_norm": 0.033848319202661514, "grad_norm_var": 9.470526320042953e-07, "learning_rate": 0.0001804223900102958, "loss": 2.4614, "step": 11791 }, { "crossentropy": 2.4903568029403687, "epoch": 0.6412354876424047, "grad_norm": 0.03731432929635048, "grad_norm_var": 2.713341900784788e-06, "learning_rate": 0.0001800910618274021, "loss": 2.4904, "step": 11792 }, { "crossentropy": 2.6223106384277344, "epoch": 0.6412898664999048, "grad_norm": 0.03381400927901268, "grad_norm_var": 2.8412072310366258e-06, "learning_rate": 0.00017976003257076824, "loss": 2.6223, "step": 11793 }, { "crossentropy": 2.4674296379089355, "epoch": 0.6413442453574051, "grad_norm": 0.0315859280526638, "grad_norm_var": 2.7628658702359325e-06, "learning_rate": 0.00017942930226092115, "loss": 2.4674, "step": 11794 }, { "crossentropy": 2.6194897890090942, "epoch": 0.6413986242149052, "grad_norm": 0.031175846233963966, "grad_norm_var": 2.8416722427403057e-06, "learning_rate": 0.00017909887091837606, "loss": 2.6195, "step": 11795 }, { "crossentropy": 2.542070984840393, "epoch": 0.6414530030724055, "grad_norm": 0.03275475278496742, "grad_norm_var": 2.8577097920735874e-06, "learning_rate": 0.00017876873856362442, "loss": 2.5421, "step": 11796 }, { "crossentropy": 2.5486037731170654, "epoch": 0.6415073819299056, "grad_norm": 0.031151792034506798, "grad_norm_var": 2.816588538832182e-06, "learning_rate": 0.00017843890521714034, "loss": 2.5486, "step": 11797 }, { "crossentropy": 2.5291827917099, "epoch": 0.6415617607874059, "grad_norm": 0.03152507543563843, "grad_norm_var": 2.8454842984007924e-06, "learning_rate": 0.00017810937089938027, "loss": 2.5292, "step": 11798 }, { "crossentropy": 2.619279980659485, "epoch": 0.641616139644906, "grad_norm": 0.030749177560210228, "grad_norm_var": 2.953851099657271e-06, "learning_rate": 0.00017778013563078177, "loss": 2.6193, "step": 11799 }, { "crossentropy": 2.5328426361083984, "epoch": 0.6416705185024063, "grad_norm": 0.03150876238942146, "grad_norm_var": 2.9722860212762534e-06, "learning_rate": 0.0001774511994317629, "loss": 2.5328, "step": 11800 }, { "crossentropy": 2.4740920066833496, "epoch": 0.6417248973599065, "grad_norm": 0.031997255980968475, "grad_norm_var": 2.859089941581562e-06, "learning_rate": 0.00017712256232272516, "loss": 2.4741, "step": 11801 }, { "crossentropy": 2.4108797311782837, "epoch": 0.6417792762174067, "grad_norm": 0.03162170946598053, "grad_norm_var": 2.806077752994758e-06, "learning_rate": 0.00017679422432404946, "loss": 2.4109, "step": 11802 }, { "crossentropy": 2.5086631774902344, "epoch": 0.6418336550749069, "grad_norm": 0.031396228820085526, "grad_norm_var": 2.7441685495886338e-06, "learning_rate": 0.0001764661854560995, "loss": 2.5087, "step": 11803 }, { "crossentropy": 2.6703155040740967, "epoch": 0.6418880339324071, "grad_norm": 0.030986206606030464, "grad_norm_var": 2.7414128374083122e-06, "learning_rate": 0.00017613844573922065, "loss": 2.6703, "step": 11804 }, { "crossentropy": 2.543387532234192, "epoch": 0.6419424127899073, "grad_norm": 0.03188798576593399, "grad_norm_var": 2.745267976958264e-06, "learning_rate": 0.00017581100519373672, "loss": 2.5434, "step": 11805 }, { "crossentropy": 2.5508601665496826, "epoch": 0.6419967916474075, "grad_norm": 0.03177657723426819, "grad_norm_var": 2.6746556678382155e-06, "learning_rate": 0.0001754838638399564, "loss": 2.5509, "step": 11806 }, { "crossentropy": 2.4759796857833862, "epoch": 0.6420511705049077, "grad_norm": 0.030219685286283493, "grad_norm_var": 2.6968992927140292e-06, "learning_rate": 0.00017515702169817072, "loss": 2.476, "step": 11807 }, { "crossentropy": 2.535158395767212, "epoch": 0.6421055493624079, "grad_norm": 0.03172625973820686, "grad_norm_var": 6.640795060722585e-07, "learning_rate": 0.00017483047878864678, "loss": 2.5352, "step": 11808 }, { "crossentropy": 2.4787601232528687, "epoch": 0.6421599282199081, "grad_norm": 0.03152409940958023, "grad_norm_var": 3.211164270107425e-07, "learning_rate": 0.00017450423513163893, "loss": 2.4788, "step": 11809 }, { "crossentropy": 2.5739893913269043, "epoch": 0.6422143070774083, "grad_norm": 0.03153190389275551, "grad_norm_var": 3.2049410252366074e-07, "learning_rate": 0.0001741782907473788, "loss": 2.574, "step": 11810 }, { "crossentropy": 2.4678258895874023, "epoch": 0.6422686859349085, "grad_norm": 0.030979428440332413, "grad_norm_var": 3.3063075401926155e-07, "learning_rate": 0.0001738526456560824, "loss": 2.4678, "step": 11811 }, { "crossentropy": 2.5146899223327637, "epoch": 0.6423230647924087, "grad_norm": 0.03031192533671856, "grad_norm_var": 2.814090460683793e-07, "learning_rate": 0.0001735272998779458, "loss": 2.5147, "step": 11812 }, { "crossentropy": 2.555304169654846, "epoch": 0.6423774436499089, "grad_norm": 0.03465421870350838, "grad_norm_var": 9.7613870122964e-07, "learning_rate": 0.00017320225343314566, "loss": 2.5553, "step": 11813 }, { "crossentropy": 2.607425570487976, "epoch": 0.6424318225074092, "grad_norm": 0.0321979857981205, "grad_norm_var": 1.0044656178479026e-06, "learning_rate": 0.00017287750634184252, "loss": 2.6074, "step": 11814 }, { "crossentropy": 2.380406975746155, "epoch": 0.6424862013649093, "grad_norm": 0.03134731575846672, "grad_norm_var": 9.616163282913717e-07, "learning_rate": 0.0001725530586241758, "loss": 2.3804, "step": 11815 }, { "crossentropy": 2.441463589668274, "epoch": 0.6425405802224096, "grad_norm": 0.0313618928194046, "grad_norm_var": 9.648338393362931e-07, "learning_rate": 0.00017222891030026777, "loss": 2.4415, "step": 11816 }, { "crossentropy": 2.5396888256073, "epoch": 0.6425949590799097, "grad_norm": 0.031543515622615814, "grad_norm_var": 9.533679567873265e-07, "learning_rate": 0.00017190506139022122, "loss": 2.5397, "step": 11817 }, { "crossentropy": 2.491994619369507, "epoch": 0.64264933793741, "grad_norm": 0.031107863411307335, "grad_norm_var": 9.661003454323417e-07, "learning_rate": 0.00017158151191412285, "loss": 2.492, "step": 11818 }, { "crossentropy": 2.644187092781067, "epoch": 0.6427037167949101, "grad_norm": 0.031522173434495926, "grad_norm_var": 9.647686418044146e-07, "learning_rate": 0.00017125826189203663, "loss": 2.6442, "step": 11819 }, { "crossentropy": 2.5238877534866333, "epoch": 0.6427580956524104, "grad_norm": 0.0318911075592041, "grad_norm_var": 9.488350371367541e-07, "learning_rate": 0.00017093531134401097, "loss": 2.5239, "step": 11820 }, { "crossentropy": 2.4988791942596436, "epoch": 0.6428124745099105, "grad_norm": 0.030343925580382347, "grad_norm_var": 1.0383470027451227e-06, "learning_rate": 0.00017061266029007538, "loss": 2.4989, "step": 11821 }, { "crossentropy": 2.495864987373352, "epoch": 0.6428668533674108, "grad_norm": 0.031934704631567, "grad_norm_var": 1.0456884798803646e-06, "learning_rate": 0.00017029030875024054, "loss": 2.4959, "step": 11822 }, { "crossentropy": 2.5210964679718018, "epoch": 0.6429212322249109, "grad_norm": 0.03251822292804718, "grad_norm_var": 9.797202861896164e-07, "learning_rate": 0.00016996825674449767, "loss": 2.5211, "step": 11823 }, { "crossentropy": 2.4844213724136353, "epoch": 0.6429756110824112, "grad_norm": 0.031636983156204224, "grad_norm_var": 9.793824945662402e-07, "learning_rate": 0.0001696465042928208, "loss": 2.4844, "step": 11824 }, { "crossentropy": 2.592680335044861, "epoch": 0.6430299899399113, "grad_norm": 0.031015625223517418, "grad_norm_var": 1.004108036290324e-06, "learning_rate": 0.00016932505141516397, "loss": 2.5927, "step": 11825 }, { "crossentropy": 2.478126883506775, "epoch": 0.6430843687974116, "grad_norm": 0.031409673392772675, "grad_norm_var": 1.0064559403321719e-06, "learning_rate": 0.00016900389813146398, "loss": 2.4781, "step": 11826 }, { "crossentropy": 2.4697285890579224, "epoch": 0.6431387476549117, "grad_norm": 0.03377128019928932, "grad_norm_var": 1.2584945791711822e-06, "learning_rate": 0.00016868304446163884, "loss": 2.4697, "step": 11827 }, { "crossentropy": 2.4937174320220947, "epoch": 0.643193126512412, "grad_norm": 0.03180098906159401, "grad_norm_var": 1.1045051606967433e-06, "learning_rate": 0.00016836249042558594, "loss": 2.4937, "step": 11828 }, { "crossentropy": 2.4410072565078735, "epoch": 0.6432475053699122, "grad_norm": 0.03143011033535004, "grad_norm_var": 5.609954714506551e-07, "learning_rate": 0.00016804223604318825, "loss": 2.441, "step": 11829 }, { "crossentropy": 2.5342483520507812, "epoch": 0.6433018842274124, "grad_norm": 0.03199603781104088, "grad_norm_var": 5.495184409389272e-07, "learning_rate": 0.00016772228133430544, "loss": 2.5342, "step": 11830 }, { "crossentropy": 2.5358736515045166, "epoch": 0.6433562630849126, "grad_norm": 0.032756101340055466, "grad_norm_var": 6.139882619803401e-07, "learning_rate": 0.00016740262631878168, "loss": 2.5359, "step": 11831 }, { "crossentropy": 2.3871783018112183, "epoch": 0.6434106419424128, "grad_norm": 0.03081568330526352, "grad_norm_var": 6.610828684750949e-07, "learning_rate": 0.00016708327101644106, "loss": 2.3872, "step": 11832 }, { "crossentropy": 2.5040359497070312, "epoch": 0.643465020799913, "grad_norm": 0.030350947752594948, "grad_norm_var": 7.777756947416683e-07, "learning_rate": 0.0001667642154470911, "loss": 2.504, "step": 11833 }, { "crossentropy": 2.6019890308380127, "epoch": 0.6435193996574132, "grad_norm": 0.03017154522240162, "grad_norm_var": 8.994814614755875e-07, "learning_rate": 0.00016644545963051817, "loss": 2.602, "step": 11834 }, { "crossentropy": 2.440820813179016, "epoch": 0.6435737785149134, "grad_norm": 0.0307466983795166, "grad_norm_var": 9.435956452904491e-07, "learning_rate": 0.00016612700358649147, "loss": 2.4408, "step": 11835 }, { "crossentropy": 2.477092742919922, "epoch": 0.6436281573724136, "grad_norm": 0.03135734423995018, "grad_norm_var": 9.361903011854503e-07, "learning_rate": 0.0001658088473347613, "loss": 2.4771, "step": 11836 }, { "crossentropy": 2.463627338409424, "epoch": 0.6436825362299138, "grad_norm": 0.031135568395256996, "grad_norm_var": 8.529639542124874e-07, "learning_rate": 0.00016549099089505914, "loss": 2.4636, "step": 11837 }, { "crossentropy": 2.591731548309326, "epoch": 0.643736915087414, "grad_norm": 0.030987724661827087, "grad_norm_var": 8.608127691727789e-07, "learning_rate": 0.00016517343428709976, "loss": 2.5917, "step": 11838 }, { "crossentropy": 2.5495996475219727, "epoch": 0.6437912939449142, "grad_norm": 0.03145867586135864, "grad_norm_var": 7.862521909427396e-07, "learning_rate": 0.00016485617753057513, "loss": 2.5496, "step": 11839 }, { "crossentropy": 2.4526891708374023, "epoch": 0.6438456728024144, "grad_norm": 0.030720243230462074, "grad_norm_var": 8.131799539907858e-07, "learning_rate": 0.00016453922064516293, "loss": 2.4527, "step": 11840 }, { "crossentropy": 2.5783313512802124, "epoch": 0.6439000516599146, "grad_norm": 0.03158752992749214, "grad_norm_var": 8.065794132825858e-07, "learning_rate": 0.00016422256365052125, "loss": 2.5783, "step": 11841 }, { "crossentropy": 2.552538752555847, "epoch": 0.6439544305174149, "grad_norm": 0.03149867057800293, "grad_norm_var": 8.071179204600931e-07, "learning_rate": 0.00016390620656628663, "loss": 2.5525, "step": 11842 }, { "crossentropy": 2.396149516105652, "epoch": 0.644008809374915, "grad_norm": 0.03314320743083954, "grad_norm_var": 6.341634960294398e-07, "learning_rate": 0.0001635901494120806, "loss": 2.3961, "step": 11843 }, { "crossentropy": 2.447044849395752, "epoch": 0.6440631882324153, "grad_norm": 0.03120698593556881, "grad_norm_var": 6.222649995757477e-07, "learning_rate": 0.00016327439220750474, "loss": 2.447, "step": 11844 }, { "crossentropy": 2.5854997634887695, "epoch": 0.6441175670899154, "grad_norm": 0.031219787895679474, "grad_norm_var": 6.223679302190388e-07, "learning_rate": 0.00016295893497214164, "loss": 2.5855, "step": 11845 }, { "crossentropy": 2.474799871444702, "epoch": 0.6441719459474157, "grad_norm": 0.031453847885131836, "grad_norm_var": 5.92016912752619e-07, "learning_rate": 0.00016264377772555572, "loss": 2.4748, "step": 11846 }, { "crossentropy": 2.556595206260681, "epoch": 0.6442263248049158, "grad_norm": 0.03050980158150196, "grad_norm_var": 4.6772519343082907e-07, "learning_rate": 0.00016232892048729296, "loss": 2.5566, "step": 11847 }, { "crossentropy": 2.472302556037903, "epoch": 0.6442807036624161, "grad_norm": 0.03184178099036217, "grad_norm_var": 4.880966805069382e-07, "learning_rate": 0.00016201436327688057, "loss": 2.4723, "step": 11848 }, { "crossentropy": 2.488587498664856, "epoch": 0.6443350825199162, "grad_norm": 0.03050682321190834, "grad_norm_var": 4.7172179466491566e-07, "learning_rate": 0.00016170010611382736, "loss": 2.4886, "step": 11849 }, { "crossentropy": 2.4282604455947876, "epoch": 0.6443894613774165, "grad_norm": 0.030398497357964516, "grad_norm_var": 4.431648408270561e-07, "learning_rate": 0.0001613861490176216, "loss": 2.4283, "step": 11850 }, { "crossentropy": 2.4345463514328003, "epoch": 0.6444438402349166, "grad_norm": 0.030834974721074104, "grad_norm_var": 4.3789478669092886e-07, "learning_rate": 0.0001610724920077361, "loss": 2.4345, "step": 11851 }, { "crossentropy": 2.459665536880493, "epoch": 0.6444982190924169, "grad_norm": 0.031501781195402145, "grad_norm_var": 4.414326749482833e-07, "learning_rate": 0.00016075913510362415, "loss": 2.4597, "step": 11852 }, { "crossentropy": 2.5254099369049072, "epoch": 0.644552597949917, "grad_norm": 0.030038263648748398, "grad_norm_var": 5.334836693682143e-07, "learning_rate": 0.00016044607832471858, "loss": 2.5254, "step": 11853 }, { "crossentropy": 2.5056761503219604, "epoch": 0.6446069768074173, "grad_norm": 0.031871113926172256, "grad_norm_var": 5.593994979560899e-07, "learning_rate": 0.00016013332169043492, "loss": 2.5057, "step": 11854 }, { "crossentropy": 2.561493396759033, "epoch": 0.6446613556649174, "grad_norm": 0.033129602670669556, "grad_norm_var": 7.83286695322247e-07, "learning_rate": 0.00015982086522017103, "loss": 2.5615, "step": 11855 }, { "crossentropy": 2.398344874382019, "epoch": 0.6447157345224177, "grad_norm": 0.031016845256090164, "grad_norm_var": 7.642188794815264e-07, "learning_rate": 0.0001595087089333047, "loss": 2.3983, "step": 11856 }, { "crossentropy": 2.3735342025756836, "epoch": 0.6447701133799179, "grad_norm": 0.031112443655729294, "grad_norm_var": 7.639107955793867e-07, "learning_rate": 0.00015919685284919495, "loss": 2.3735, "step": 11857 }, { "crossentropy": 2.523048520088196, "epoch": 0.6448244922374181, "grad_norm": 0.030369950458407402, "grad_norm_var": 8.181937640723645e-07, "learning_rate": 0.00015888529698718346, "loss": 2.523, "step": 11858 }, { "crossentropy": 2.6750558614730835, "epoch": 0.6448788710949183, "grad_norm": 0.03136986494064331, "grad_norm_var": 5.69400575107246e-07, "learning_rate": 0.00015857404136659258, "loss": 2.6751, "step": 11859 }, { "crossentropy": 2.5822900533676147, "epoch": 0.6449332499524185, "grad_norm": 0.0319858081638813, "grad_norm_var": 6.133428697753403e-07, "learning_rate": 0.00015826308600672633, "loss": 2.5823, "step": 11860 }, { "crossentropy": 2.5397650003433228, "epoch": 0.6449876288099187, "grad_norm": 0.03427925705909729, "grad_norm_var": 1.2074264488237096e-06, "learning_rate": 0.00015795243092686873, "loss": 2.5398, "step": 11861 }, { "crossentropy": 2.623094320297241, "epoch": 0.6450420076674189, "grad_norm": 0.03019915707409382, "grad_norm_var": 1.2949335173747932e-06, "learning_rate": 0.00015764207614628767, "loss": 2.6231, "step": 11862 }, { "crossentropy": 2.558510661125183, "epoch": 0.6450963865249191, "grad_norm": 0.03203052654862404, "grad_norm_var": 1.27714479456383e-06, "learning_rate": 0.00015733202168423055, "loss": 2.5585, "step": 11863 }, { "crossentropy": 2.511999487876892, "epoch": 0.6451507653824193, "grad_norm": 0.03263663500547409, "grad_norm_var": 1.3628778154033012e-06, "learning_rate": 0.00015702226755992643, "loss": 2.512, "step": 11864 }, { "crossentropy": 2.4992505311965942, "epoch": 0.6452051442399195, "grad_norm": 0.030933253467082977, "grad_norm_var": 1.32032666218899e-06, "learning_rate": 0.0001567128137925855, "loss": 2.4993, "step": 11865 }, { "crossentropy": 2.618740200996399, "epoch": 0.6452595230974197, "grad_norm": 0.03223923593759537, "grad_norm_var": 1.266232293506174e-06, "learning_rate": 0.00015640366040140186, "loss": 2.6187, "step": 11866 }, { "crossentropy": 2.4977645874023438, "epoch": 0.6453139019549199, "grad_norm": 0.031610891222953796, "grad_norm_var": 1.225045711722381e-06, "learning_rate": 0.0001560948074055468, "loss": 2.4978, "step": 11867 }, { "crossentropy": 2.4635878801345825, "epoch": 0.6453682808124201, "grad_norm": 0.030888913199305534, "grad_norm_var": 1.2602480377508906e-06, "learning_rate": 0.00015578625482417618, "loss": 2.4636, "step": 11868 }, { "crossentropy": 2.330121397972107, "epoch": 0.6454226596699203, "grad_norm": 0.03089769370853901, "grad_norm_var": 1.1266509355164465e-06, "learning_rate": 0.0001554780026764252, "loss": 2.3301, "step": 11869 }, { "crossentropy": 2.4685521125793457, "epoch": 0.6454770385274206, "grad_norm": 0.03066977858543396, "grad_norm_var": 1.183147579505069e-06, "learning_rate": 0.00015517005098141247, "loss": 2.4686, "step": 11870 }, { "crossentropy": 2.638952612876892, "epoch": 0.6455314173849207, "grad_norm": 0.03230813890695572, "grad_norm_var": 1.0562122088449145e-06, "learning_rate": 0.00015486239975823657, "loss": 2.639, "step": 11871 }, { "crossentropy": 2.487361431121826, "epoch": 0.645585796242421, "grad_norm": 0.03166563808917999, "grad_norm_var": 1.0377598765010445e-06, "learning_rate": 0.00015455504902597616, "loss": 2.4874, "step": 11872 }, { "crossentropy": 2.358868718147278, "epoch": 0.6456401750999211, "grad_norm": 0.03742704913020134, "grad_norm_var": 3.140599986590995e-06, "learning_rate": 0.00015424799880369488, "loss": 2.3589, "step": 11873 }, { "crossentropy": 2.535948872566223, "epoch": 0.6456945539574214, "grad_norm": 0.03105095587670803, "grad_norm_var": 3.0243464396316363e-06, "learning_rate": 0.00015394124911043583, "loss": 2.5359, "step": 11874 }, { "crossentropy": 2.513093590736389, "epoch": 0.6457489328149215, "grad_norm": 0.0313749760389328, "grad_norm_var": 3.0239104363430705e-06, "learning_rate": 0.00015363479996522212, "loss": 2.5131, "step": 11875 }, { "crossentropy": 2.4217381477355957, "epoch": 0.6458033116724218, "grad_norm": 0.03061692789196968, "grad_norm_var": 3.14587287100204e-06, "learning_rate": 0.00015332865138705909, "loss": 2.4217, "step": 11876 }, { "crossentropy": 2.449709415435791, "epoch": 0.6458576905299219, "grad_norm": 0.03547172248363495, "grad_norm_var": 3.608773797099338e-06, "learning_rate": 0.00015302280339493546, "loss": 2.4497, "step": 11877 }, { "crossentropy": 2.4898579120635986, "epoch": 0.6459120693874222, "grad_norm": 0.03299630433320999, "grad_norm_var": 3.425645136247837e-06, "learning_rate": 0.00015271725600781827, "loss": 2.4899, "step": 11878 }, { "crossentropy": 2.5102676153182983, "epoch": 0.6459664482449223, "grad_norm": 0.03098699077963829, "grad_norm_var": 3.5139694290527707e-06, "learning_rate": 0.00015241200924465793, "loss": 2.5103, "step": 11879 }, { "crossentropy": 2.547580361366272, "epoch": 0.6460208271024226, "grad_norm": 0.03229553624987602, "grad_norm_var": 3.4973328637804967e-06, "learning_rate": 0.00015210706312438537, "loss": 2.5476, "step": 11880 }, { "crossentropy": 2.530828595161438, "epoch": 0.6460752059599227, "grad_norm": 0.03156584873795509, "grad_norm_var": 3.4248085261378284e-06, "learning_rate": 0.00015180241766591328, "loss": 2.5308, "step": 11881 }, { "crossentropy": 2.5153579711914062, "epoch": 0.646129584817423, "grad_norm": 0.030080856755375862, "grad_norm_var": 3.684293728446695e-06, "learning_rate": 0.0001514980728881349, "loss": 2.5154, "step": 11882 }, { "crossentropy": 2.565251588821411, "epoch": 0.6461839636749231, "grad_norm": 0.03059525042772293, "grad_norm_var": 3.800679971445395e-06, "learning_rate": 0.0001511940288099256, "loss": 2.5653, "step": 11883 }, { "crossentropy": 2.5311721563339233, "epoch": 0.6462383425324234, "grad_norm": 0.032093629240989685, "grad_norm_var": 3.7240339320942452e-06, "learning_rate": 0.0001508902854501426, "loss": 2.5312, "step": 11884 }, { "crossentropy": 2.4868870973587036, "epoch": 0.6462927213899236, "grad_norm": 0.03080681711435318, "grad_norm_var": 3.737980287686106e-06, "learning_rate": 0.00015058684282762358, "loss": 2.4869, "step": 11885 }, { "crossentropy": 2.5517903566360474, "epoch": 0.6463471002474238, "grad_norm": 0.03180556744337082, "grad_norm_var": 3.61709878268403e-06, "learning_rate": 0.00015028370096118682, "loss": 2.5518, "step": 11886 }, { "crossentropy": 2.5066901445388794, "epoch": 0.646401479104924, "grad_norm": 0.03255150094628334, "grad_norm_var": 3.628482505082381e-06, "learning_rate": 0.00014998085986963282, "loss": 2.5067, "step": 11887 }, { "crossentropy": 2.488897204399109, "epoch": 0.6464558579624242, "grad_norm": 0.03233309090137482, "grad_norm_var": 3.61886303042111e-06, "learning_rate": 0.00014967831957174606, "loss": 2.4889, "step": 11888 }, { "crossentropy": 2.4449069499969482, "epoch": 0.6465102368199244, "grad_norm": 0.03122134879231453, "grad_norm_var": 1.6414678562832556e-06, "learning_rate": 0.000149376080086287, "loss": 2.4449, "step": 11889 }, { "crossentropy": 2.4954347610473633, "epoch": 0.6465646156774246, "grad_norm": 0.030744172632694244, "grad_norm_var": 1.6755537866027276e-06, "learning_rate": 0.0001490741414320007, "loss": 2.4954, "step": 11890 }, { "crossentropy": 2.4566335678100586, "epoch": 0.6466189945349248, "grad_norm": 0.03152081370353699, "grad_norm_var": 1.6701491136511855e-06, "learning_rate": 0.00014877250362761442, "loss": 2.4566, "step": 11891 }, { "crossentropy": 2.584742546081543, "epoch": 0.646673373392425, "grad_norm": 0.031886789947748184, "grad_norm_var": 1.582406230386787e-06, "learning_rate": 0.00014847116669183425, "loss": 2.5847, "step": 11892 }, { "crossentropy": 2.474522352218628, "epoch": 0.6467277522499252, "grad_norm": 0.030431760475039482, "grad_norm_var": 7.091654371937002e-07, "learning_rate": 0.00014817013064334918, "loss": 2.4745, "step": 11893 }, { "crossentropy": 2.3972564935684204, "epoch": 0.6467821311074254, "grad_norm": 0.03278925269842148, "grad_norm_var": 6.703920791555536e-07, "learning_rate": 0.00014786939550082978, "loss": 2.3973, "step": 11894 }, { "crossentropy": 2.520316004753113, "epoch": 0.6468365099649256, "grad_norm": 0.03137359395623207, "grad_norm_var": 6.542261044760467e-07, "learning_rate": 0.0001475689612829262, "loss": 2.5203, "step": 11895 }, { "crossentropy": 2.5358080863952637, "epoch": 0.6468908888224258, "grad_norm": 0.03222960606217384, "grad_norm_var": 6.475571156872954e-07, "learning_rate": 0.00014726882800827234, "loss": 2.5358, "step": 11896 }, { "crossentropy": 2.4763846397399902, "epoch": 0.646945267679926, "grad_norm": 0.031326815485954285, "grad_norm_var": 6.490890587212393e-07, "learning_rate": 0.00014696899569548172, "loss": 2.4764, "step": 11897 }, { "crossentropy": 2.5786569118499756, "epoch": 0.6469996465374263, "grad_norm": 0.030351780354976654, "grad_norm_var": 6.028847738479322e-07, "learning_rate": 0.00014666946436314832, "loss": 2.5787, "step": 11898 }, { "crossentropy": 2.472594976425171, "epoch": 0.6470540253949264, "grad_norm": 0.031404491513967514, "grad_norm_var": 5.457761233158918e-07, "learning_rate": 0.00014637023402985116, "loss": 2.4726, "step": 11899 }, { "crossentropy": 2.3915927410125732, "epoch": 0.6471084042524267, "grad_norm": 0.03213423490524292, "grad_norm_var": 5.487983958233813e-07, "learning_rate": 0.00014607130471414653, "loss": 2.3916, "step": 11900 }, { "crossentropy": 2.6206635236740112, "epoch": 0.6471627831099268, "grad_norm": 0.03357753902673721, "grad_norm_var": 7.514732689160488e-07, "learning_rate": 0.0001457726764345746, "loss": 2.6207, "step": 11901 }, { "crossentropy": 2.490963339805603, "epoch": 0.6472171619674271, "grad_norm": 0.03228076919913292, "grad_norm_var": 7.703654430460018e-07, "learning_rate": 0.00014547434920965553, "loss": 2.491, "step": 11902 }, { "crossentropy": 2.6153255701065063, "epoch": 0.6472715408249272, "grad_norm": 0.03142315894365311, "grad_norm_var": 7.308369003903529e-07, "learning_rate": 0.00014517632305789118, "loss": 2.6153, "step": 11903 }, { "crossentropy": 2.4100658893585205, "epoch": 0.6473259196824275, "grad_norm": 0.03168405964970589, "grad_norm_var": 7.014547070050784e-07, "learning_rate": 0.00014487859799776516, "loss": 2.4101, "step": 11904 }, { "crossentropy": 2.464538812637329, "epoch": 0.6473802985399276, "grad_norm": 0.0318705253303051, "grad_norm_var": 6.907985680721367e-07, "learning_rate": 0.00014458117404774208, "loss": 2.4645, "step": 11905 }, { "crossentropy": 2.484839916229248, "epoch": 0.6474346773974279, "grad_norm": 0.032302599400281906, "grad_norm_var": 6.461963809253756e-07, "learning_rate": 0.00014428405122626832, "loss": 2.4848, "step": 11906 }, { "crossentropy": 2.408018469810486, "epoch": 0.647489056254928, "grad_norm": 0.03147134184837341, "grad_norm_var": 6.481034427393972e-07, "learning_rate": 0.00014398722955177024, "loss": 2.408, "step": 11907 }, { "crossentropy": 2.584690570831299, "epoch": 0.6475434351124283, "grad_norm": 0.03159219026565552, "grad_norm_var": 6.494762214059492e-07, "learning_rate": 0.00014369070904265757, "loss": 2.5847, "step": 11908 }, { "crossentropy": 2.596389055252075, "epoch": 0.6475978139699284, "grad_norm": 0.03854332119226456, "grad_norm_var": 3.319609621139398e-06, "learning_rate": 0.00014339448971731895, "loss": 2.5964, "step": 11909 }, { "crossentropy": 2.550838589668274, "epoch": 0.6476521928274287, "grad_norm": 0.03185033053159714, "grad_norm_var": 3.3099790409958527e-06, "learning_rate": 0.0001430985715941263, "loss": 2.5508, "step": 11910 }, { "crossentropy": 2.639872908592224, "epoch": 0.6477065716849288, "grad_norm": 0.03410736098885536, "grad_norm_var": 3.4709158692580155e-06, "learning_rate": 0.0001428029546914328, "loss": 2.6399, "step": 11911 }, { "crossentropy": 2.541129946708679, "epoch": 0.6477609505424291, "grad_norm": 0.03259459137916565, "grad_norm_var": 3.4717095972710185e-06, "learning_rate": 0.0001425076390275709, "loss": 2.5411, "step": 11912 }, { "crossentropy": 2.4558043479919434, "epoch": 0.6478153293999294, "grad_norm": 0.030444810166954994, "grad_norm_var": 3.6473837573654793e-06, "learning_rate": 0.00014221262462085717, "loss": 2.4558, "step": 11913 }, { "crossentropy": 2.5372445583343506, "epoch": 0.6478697082574295, "grad_norm": 0.0319882370531559, "grad_norm_var": 3.3783067221502716e-06, "learning_rate": 0.00014191791148958688, "loss": 2.5372, "step": 11914 }, { "crossentropy": 2.4560309648513794, "epoch": 0.6479240871149298, "grad_norm": 0.030865279957652092, "grad_norm_var": 3.471957811340918e-06, "learning_rate": 0.0001416234996520388, "loss": 2.456, "step": 11915 }, { "crossentropy": 2.537088394165039, "epoch": 0.6479784659724299, "grad_norm": 0.030926721170544624, "grad_norm_var": 3.609201250071127e-06, "learning_rate": 0.0001413293891264722, "loss": 2.5371, "step": 11916 }, { "crossentropy": 2.409734010696411, "epoch": 0.6480328448299302, "grad_norm": 0.030972782522439957, "grad_norm_var": 3.6052482164329687e-06, "learning_rate": 0.00014103557993112693, "loss": 2.4097, "step": 11917 }, { "crossentropy": 2.5775033235549927, "epoch": 0.6480872236874303, "grad_norm": 0.03148325905203819, "grad_norm_var": 3.6345374313197317e-06, "learning_rate": 0.00014074207208422508, "loss": 2.5775, "step": 11918 }, { "crossentropy": 2.555099844932556, "epoch": 0.6481416025449306, "grad_norm": 0.03134406730532646, "grad_norm_var": 3.6424091674347544e-06, "learning_rate": 0.00014044886560396984, "loss": 2.5551, "step": 11919 }, { "crossentropy": 2.4951549768447876, "epoch": 0.6481959814024307, "grad_norm": 0.03173615783452988, "grad_norm_var": 3.6394978396661935e-06, "learning_rate": 0.00014015596050854452, "loss": 2.4952, "step": 11920 }, { "crossentropy": 2.5457844734191895, "epoch": 0.648250360259931, "grad_norm": 0.03020692616701126, "grad_norm_var": 3.870213596633337e-06, "learning_rate": 0.00013986335681611618, "loss": 2.5458, "step": 11921 }, { "crossentropy": 2.5431694984436035, "epoch": 0.6483047391174311, "grad_norm": 0.03211221471428871, "grad_norm_var": 3.8654797949746874e-06, "learning_rate": 0.00013957105454483199, "loss": 2.5432, "step": 11922 }, { "crossentropy": 2.509108781814575, "epoch": 0.6483591179749314, "grad_norm": 0.030176235362887383, "grad_norm_var": 4.064186053517882e-06, "learning_rate": 0.00013927905371281858, "loss": 2.5091, "step": 11923 }, { "crossentropy": 2.6311020851135254, "epoch": 0.6484134968324315, "grad_norm": 0.030836904421448708, "grad_norm_var": 4.134264528301762e-06, "learning_rate": 0.000138987354338187, "loss": 2.6311, "step": 11924 }, { "crossentropy": 2.5083316564559937, "epoch": 0.6484678756899318, "grad_norm": 0.03176169842481613, "grad_norm_var": 9.897521609508365e-07, "learning_rate": 0.00013869595643902833, "loss": 2.5083, "step": 11925 }, { "crossentropy": 2.45854651927948, "epoch": 0.648522254547432, "grad_norm": 0.03129353001713753, "grad_norm_var": 9.803714290454543e-07, "learning_rate": 0.0001384048600334137, "loss": 2.4585, "step": 11926 }, { "crossentropy": 2.5212273597717285, "epoch": 0.6485766334049322, "grad_norm": 0.03246312960982323, "grad_norm_var": 5.619794362261698e-07, "learning_rate": 0.00013811406513939694, "loss": 2.5212, "step": 11927 }, { "crossentropy": 2.5680755376815796, "epoch": 0.6486310122624324, "grad_norm": 0.032095182687044144, "grad_norm_var": 4.93055411221914e-07, "learning_rate": 0.0001378235717750137, "loss": 2.5681, "step": 11928 }, { "crossentropy": 2.5054965019226074, "epoch": 0.6486853911199326, "grad_norm": 0.03154180943965912, "grad_norm_var": 4.4403161757536336e-07, "learning_rate": 0.00013753337995827896, "loss": 2.5055, "step": 11929 }, { "crossentropy": 2.4648523330688477, "epoch": 0.6487397699774328, "grad_norm": 0.03150287643074989, "grad_norm_var": 4.182773619028357e-07, "learning_rate": 0.00013724348970719168, "loss": 2.4649, "step": 11930 }, { "crossentropy": 2.568085193634033, "epoch": 0.648794148834933, "grad_norm": 0.03183295205235481, "grad_norm_var": 4.1652946317914603e-07, "learning_rate": 0.0001369539010397286, "loss": 2.5681, "step": 11931 }, { "crossentropy": 2.4928548336029053, "epoch": 0.6488485276924332, "grad_norm": 0.03213881328701973, "grad_norm_var": 4.330117179369523e-07, "learning_rate": 0.0001366646139738509, "loss": 2.4929, "step": 11932 }, { "crossentropy": 2.478292942047119, "epoch": 0.6489029065499334, "grad_norm": 0.034548696130514145, "grad_norm_var": 9.95781047880238e-07, "learning_rate": 0.00013637562852750085, "loss": 2.4783, "step": 11933 }, { "crossentropy": 2.5001360177993774, "epoch": 0.6489572854074336, "grad_norm": 0.030033253133296967, "grad_norm_var": 1.1675747584302097e-06, "learning_rate": 0.00013608694471859916, "loss": 2.5001, "step": 11934 }, { "crossentropy": 2.429801344871521, "epoch": 0.6490116642649338, "grad_norm": 0.03186434879899025, "grad_norm_var": 1.1666327934242136e-06, "learning_rate": 0.00013579856256504985, "loss": 2.4298, "step": 11935 }, { "crossentropy": 2.4845118522644043, "epoch": 0.649066043122434, "grad_norm": 0.030393069609999657, "grad_norm_var": 1.2610895562845663e-06, "learning_rate": 0.0001355104820847397, "loss": 2.4845, "step": 11936 }, { "crossentropy": 2.534820795059204, "epoch": 0.6491204219799342, "grad_norm": 0.03130262345075607, "grad_norm_var": 1.1398954659778594e-06, "learning_rate": 0.00013522270329553444, "loss": 2.5348, "step": 11937 }, { "crossentropy": 2.5429489612579346, "epoch": 0.6491748008374344, "grad_norm": 0.03129865229129791, "grad_norm_var": 1.127716574232301e-06, "learning_rate": 0.00013493522621528086, "loss": 2.5429, "step": 11938 }, { "crossentropy": 2.687673807144165, "epoch": 0.6492291796949347, "grad_norm": 0.032391443848609924, "grad_norm_var": 1.0234181771024327e-06, "learning_rate": 0.00013464805086180976, "loss": 2.6877, "step": 11939 }, { "crossentropy": 2.4346712827682495, "epoch": 0.6492835585524348, "grad_norm": 0.03082496114075184, "grad_norm_var": 1.0248113695039234e-06, "learning_rate": 0.00013436117725293018, "loss": 2.4347, "step": 11940 }, { "crossentropy": 2.4797372817993164, "epoch": 0.6493379374099351, "grad_norm": 0.03081272915005684, "grad_norm_var": 1.0739769545251368e-06, "learning_rate": 0.00013407460540643457, "loss": 2.4797, "step": 11941 }, { "crossentropy": 2.4454582929611206, "epoch": 0.6493923162674352, "grad_norm": 0.03172193467617035, "grad_norm_var": 1.0653069158116092e-06, "learning_rate": 0.00013378833534009483, "loss": 2.4455, "step": 11942 }, { "crossentropy": 2.5412118434906006, "epoch": 0.6494466951249355, "grad_norm": 0.031737860292196274, "grad_norm_var": 1.0217661088517128e-06, "learning_rate": 0.00013350236707166508, "loss": 2.5412, "step": 11943 }, { "crossentropy": 2.5480610132217407, "epoch": 0.6495010739824356, "grad_norm": 0.031262971460819244, "grad_norm_var": 1.0131656748036192e-06, "learning_rate": 0.0001332167006188828, "loss": 2.5481, "step": 11944 }, { "crossentropy": 2.424274444580078, "epoch": 0.6495554528399359, "grad_norm": 0.033619221299886703, "grad_norm_var": 1.273544059280975e-06, "learning_rate": 0.00013293133599946328, "loss": 2.4243, "step": 11945 }, { "crossentropy": 2.3746196031570435, "epoch": 0.649609831697436, "grad_norm": 0.030338168144226074, "grad_norm_var": 1.3897789986944479e-06, "learning_rate": 0.00013264627323110402, "loss": 2.3746, "step": 11946 }, { "crossentropy": 2.493609666824341, "epoch": 0.6496642105549363, "grad_norm": 0.030181776732206345, "grad_norm_var": 1.5160702552654535e-06, "learning_rate": 0.00013236151233148586, "loss": 2.4936, "step": 11947 }, { "crossentropy": 2.485264539718628, "epoch": 0.6497185894124364, "grad_norm": 0.03205413371324539, "grad_norm_var": 1.509637859112752e-06, "learning_rate": 0.00013207705331826802, "loss": 2.4853, "step": 11948 }, { "crossentropy": 2.506609559059143, "epoch": 0.6497729682699367, "grad_norm": 0.03271486237645149, "grad_norm_var": 9.802781956079477e-07, "learning_rate": 0.00013179289620909306, "loss": 2.5066, "step": 11949 }, { "crossentropy": 2.524135947227478, "epoch": 0.6498273471274368, "grad_norm": 0.031457312405109406, "grad_norm_var": 8.457103446826245e-07, "learning_rate": 0.0001315090410215841, "loss": 2.5241, "step": 11950 }, { "crossentropy": 2.4779727458953857, "epoch": 0.6498817259849371, "grad_norm": 0.03186262771487236, "grad_norm_var": 8.456265766033459e-07, "learning_rate": 0.00013122548777334597, "loss": 2.478, "step": 11951 }, { "crossentropy": 2.545111298561096, "epoch": 0.6499361048424372, "grad_norm": 0.03377271071076393, "grad_norm_var": 1.0614187368055179e-06, "learning_rate": 0.00013094223648196402, "loss": 2.5451, "step": 11952 }, { "crossentropy": 2.4982376098632812, "epoch": 0.6499904836999375, "grad_norm": 0.032291021198034286, "grad_norm_var": 1.06883970047455e-06, "learning_rate": 0.0001306592871650042, "loss": 2.4982, "step": 11953 }, { "crossentropy": 2.5364145040512085, "epoch": 0.6500448625574377, "grad_norm": 0.03138374537229538, "grad_norm_var": 1.063928587067978e-06, "learning_rate": 0.0001303766398400158, "loss": 2.5364, "step": 11954 }, { "crossentropy": 2.5160950422286987, "epoch": 0.6500992414149379, "grad_norm": 0.03158261626958847, "grad_norm_var": 1.038521867035647e-06, "learning_rate": 0.00013009429452452925, "loss": 2.5161, "step": 11955 }, { "crossentropy": 2.4831995964050293, "epoch": 0.6501536202724381, "grad_norm": 0.031114721670746803, "grad_norm_var": 1.0089516348851425e-06, "learning_rate": 0.0001298122512360539, "loss": 2.4832, "step": 11956 }, { "crossentropy": 2.4171451330184937, "epoch": 0.6502079991299383, "grad_norm": 0.03113655000925064, "grad_norm_var": 9.752848157696155e-07, "learning_rate": 0.0001295305099920818, "loss": 2.4171, "step": 11957 }, { "crossentropy": 2.5432299375534058, "epoch": 0.6502623779874385, "grad_norm": 0.03170347958803177, "grad_norm_var": 9.754108782302016e-07, "learning_rate": 0.00012924907081008686, "loss": 2.5432, "step": 11958 }, { "crossentropy": 2.5270577669143677, "epoch": 0.6503167568449387, "grad_norm": 0.030145496129989624, "grad_norm_var": 1.1393015824748364e-06, "learning_rate": 0.0001289679337075239, "loss": 2.5271, "step": 11959 }, { "crossentropy": 2.5076621770858765, "epoch": 0.6503711357024389, "grad_norm": 0.031165307387709618, "grad_norm_var": 1.1451177644859755e-06, "learning_rate": 0.0001286870987018285, "loss": 2.5077, "step": 11960 }, { "crossentropy": 2.617357015609741, "epoch": 0.6504255145599391, "grad_norm": 0.03164995089173317, "grad_norm_var": 8.724679766633492e-07, "learning_rate": 0.0001284065658104172, "loss": 2.6174, "step": 11961 }, { "crossentropy": 2.4878289699554443, "epoch": 0.6504798934174393, "grad_norm": 0.03138300031423569, "grad_norm_var": 7.740138876742575e-07, "learning_rate": 0.0001281263350506895, "loss": 2.4878, "step": 11962 }, { "crossentropy": 2.542169451713562, "epoch": 0.6505342722749395, "grad_norm": 0.03167444095015526, "grad_norm_var": 6.310178604938821e-07, "learning_rate": 0.00012784640644002365, "loss": 2.5422, "step": 11963 }, { "crossentropy": 2.4992780685424805, "epoch": 0.6505886511324397, "grad_norm": 0.03170526772737503, "grad_norm_var": 6.218378363641308e-07, "learning_rate": 0.00012756677999578247, "loss": 2.4993, "step": 11964 }, { "crossentropy": 2.541271686553955, "epoch": 0.65064302998994, "grad_norm": 0.031143805012106895, "grad_norm_var": 5.575324020020879e-07, "learning_rate": 0.00012728745573530598, "loss": 2.5413, "step": 11965 }, { "crossentropy": 2.6089723110198975, "epoch": 0.6506974088474401, "grad_norm": 0.031838834285736084, "grad_norm_var": 5.6073196923304e-07, "learning_rate": 0.00012700843367591973, "loss": 2.609, "step": 11966 }, { "crossentropy": 2.519213318824768, "epoch": 0.6507517877049404, "grad_norm": 0.031321119517087936, "grad_norm_var": 5.598874117396553e-07, "learning_rate": 0.0001267297138349266, "loss": 2.5192, "step": 11967 }, { "crossentropy": 2.4433093070983887, "epoch": 0.6508061665624405, "grad_norm": 0.031473271548748016, "grad_norm_var": 2.1294974363026722e-07, "learning_rate": 0.00012645129622961327, "loss": 2.4433, "step": 11968 }, { "crossentropy": 2.6271215677261353, "epoch": 0.6508605454199408, "grad_norm": 0.03047877922654152, "grad_norm_var": 2.0763539676202774e-07, "learning_rate": 0.00012617318087724706, "loss": 2.6271, "step": 11969 }, { "crossentropy": 2.574034094810486, "epoch": 0.6509149242774409, "grad_norm": 0.03140563890337944, "grad_norm_var": 2.0789150392042503e-07, "learning_rate": 0.00012589536779507638, "loss": 2.574, "step": 11970 }, { "crossentropy": 2.550704836845398, "epoch": 0.6509693031349412, "grad_norm": 0.030493801459670067, "grad_norm_var": 2.4206695272514327e-07, "learning_rate": 0.0001256178570003308, "loss": 2.5507, "step": 11971 }, { "crossentropy": 2.481122136116028, "epoch": 0.6510236819924413, "grad_norm": 0.03160715475678444, "grad_norm_var": 2.4902392530173843e-07, "learning_rate": 0.000125340648510221, "loss": 2.4811, "step": 11972 }, { "crossentropy": 2.546297073364258, "epoch": 0.6510780608499416, "grad_norm": 0.031357768923044205, "grad_norm_var": 2.481354443182772e-07, "learning_rate": 0.00012506374234193984, "loss": 2.5463, "step": 11973 }, { "crossentropy": 2.5139294862747192, "epoch": 0.6511324397074417, "grad_norm": 0.030640028417110443, "grad_norm_var": 2.593666141311914e-07, "learning_rate": 0.00012478713851266088, "loss": 2.5139, "step": 11974 }, { "crossentropy": 2.5119411945343018, "epoch": 0.651186818564942, "grad_norm": 0.030307099223136902, "grad_norm_var": 2.3789534820971696e-07, "learning_rate": 0.00012451083703953814, "loss": 2.5119, "step": 11975 }, { "crossentropy": 2.5059990882873535, "epoch": 0.6512411974224421, "grad_norm": 0.031182166188955307, "grad_norm_var": 2.377725726786124e-07, "learning_rate": 0.00012423483793970734, "loss": 2.506, "step": 11976 }, { "crossentropy": 2.396799683570862, "epoch": 0.6512955762799424, "grad_norm": 0.03025166131556034, "grad_norm_var": 2.8147011121649963e-07, "learning_rate": 0.0001239591412302865, "loss": 2.3968, "step": 11977 }, { "crossentropy": 2.4813108444213867, "epoch": 0.6513499551374425, "grad_norm": 0.03110310807824135, "grad_norm_var": 2.7735342008430344e-07, "learning_rate": 0.0001236837469283747, "loss": 2.4813, "step": 11978 }, { "crossentropy": 2.6039615869522095, "epoch": 0.6514043339949428, "grad_norm": 0.030220573768019676, "grad_norm_var": 3.0275846723155455e-07, "learning_rate": 0.00012340865505104992, "loss": 2.604, "step": 11979 }, { "crossentropy": 2.517484426498413, "epoch": 0.651458712852443, "grad_norm": 0.030740657821297646, "grad_norm_var": 2.7446620956838156e-07, "learning_rate": 0.0001231338656153741, "loss": 2.5175, "step": 11980 }, { "crossentropy": 2.5099921226501465, "epoch": 0.6515130917099432, "grad_norm": 0.0311303548514843, "grad_norm_var": 2.741709185164846e-07, "learning_rate": 0.00012285937863838915, "loss": 2.51, "step": 11981 }, { "crossentropy": 2.5616894960403442, "epoch": 0.6515674705674434, "grad_norm": 0.03189616650342941, "grad_norm_var": 2.810026838262018e-07, "learning_rate": 0.00012258519413711866, "loss": 2.5617, "step": 11982 }, { "crossentropy": 2.563698172569275, "epoch": 0.6516218494249436, "grad_norm": 0.03072289191186428, "grad_norm_var": 2.758087983165443e-07, "learning_rate": 0.00012231131212856684, "loss": 2.5637, "step": 11983 }, { "crossentropy": 2.528808832168579, "epoch": 0.6516762282824438, "grad_norm": 0.030861854553222656, "grad_norm_var": 2.555525817047316e-07, "learning_rate": 0.00012203773262972007, "loss": 2.5288, "step": 11984 }, { "crossentropy": 2.403262138366699, "epoch": 0.651730607139944, "grad_norm": 0.03196147456765175, "grad_norm_var": 3.096830305636487e-07, "learning_rate": 0.0001217644556575459, "loss": 2.4033, "step": 11985 }, { "crossentropy": 2.5562041997909546, "epoch": 0.6517849859974442, "grad_norm": 0.031013812869787216, "grad_norm_var": 2.977025383022001e-07, "learning_rate": 0.00012149148122899245, "loss": 2.5562, "step": 11986 }, { "crossentropy": 2.5409995317459106, "epoch": 0.6518393648549444, "grad_norm": 0.03226259723305702, "grad_norm_var": 3.8136976878780373e-07, "learning_rate": 0.00012121880936098783, "loss": 2.541, "step": 11987 }, { "crossentropy": 2.6342610120773315, "epoch": 0.6518937437124446, "grad_norm": 0.03191962465643883, "grad_norm_var": 4.0948849076016815e-07, "learning_rate": 0.00012094644007044519, "loss": 2.6343, "step": 11988 }, { "crossentropy": 2.567679286003113, "epoch": 0.6519481225699448, "grad_norm": 0.031391292810440063, "grad_norm_var": 4.1071878698843017e-07, "learning_rate": 0.00012067437337425547, "loss": 2.5677, "step": 11989 }, { "crossentropy": 2.384467840194702, "epoch": 0.652002501427445, "grad_norm": 0.030313583090901375, "grad_norm_var": 4.3741453374921724e-07, "learning_rate": 0.00012040260928929181, "loss": 2.3845, "step": 11990 }, { "crossentropy": 2.4995899200439453, "epoch": 0.6520568802849452, "grad_norm": 0.030712710693478584, "grad_norm_var": 4.0590107161717073e-07, "learning_rate": 0.00012013114783240908, "loss": 2.4996, "step": 11991 }, { "crossentropy": 2.570154070854187, "epoch": 0.6521112591424454, "grad_norm": 0.03098706156015396, "grad_norm_var": 4.062801543936248e-07, "learning_rate": 0.00011985998902044382, "loss": 2.5702, "step": 11992 }, { "crossentropy": 2.5899200439453125, "epoch": 0.6521656379999456, "grad_norm": 0.03187210112810135, "grad_norm_var": 3.8859647510124976e-07, "learning_rate": 0.00011958913287021145, "loss": 2.5899, "step": 11993 }, { "crossentropy": 2.538103938102722, "epoch": 0.6522200168574458, "grad_norm": 0.0309748537838459, "grad_norm_var": 3.911851213604049e-07, "learning_rate": 0.00011931857939851132, "loss": 2.5381, "step": 11994 }, { "crossentropy": 2.5155677795410156, "epoch": 0.652274395714946, "grad_norm": 0.03077438473701477, "grad_norm_var": 3.3903989927515595e-07, "learning_rate": 0.00011904832862212222, "loss": 2.5156, "step": 11995 }, { "crossentropy": 2.5499988794326782, "epoch": 0.6523287745724462, "grad_norm": 0.03170168772339821, "grad_norm_var": 3.35218417001172e-07, "learning_rate": 0.00011877838055780576, "loss": 2.55, "step": 11996 }, { "crossentropy": 2.485957622528076, "epoch": 0.6523831534299465, "grad_norm": 0.030889462679624557, "grad_norm_var": 3.436847055533389e-07, "learning_rate": 0.0001185087352223041, "loss": 2.486, "step": 11997 }, { "crossentropy": 2.4976868629455566, "epoch": 0.6524375322874466, "grad_norm": 0.031025564298033714, "grad_norm_var": 3.1790336647245093e-07, "learning_rate": 0.0001182393926323383, "loss": 2.4977, "step": 11998 }, { "crossentropy": 2.4413195848464966, "epoch": 0.6524919111449469, "grad_norm": 0.03050350770354271, "grad_norm_var": 3.352055943408773e-07, "learning_rate": 0.00011797035280461443, "loss": 2.4413, "step": 11999 }, { "crossentropy": 2.381976366043091, "epoch": 0.652546290002447, "grad_norm": 0.03262916952371597, "grad_norm_var": 4.512439760133753e-07, "learning_rate": 0.00011770161575581861, "loss": 2.382, "step": 12000 }, { "crossentropy": 2.4660017490386963, "epoch": 0.6526006688599473, "grad_norm": 0.030252063646912575, "grad_norm_var": 4.850031021202498e-07, "learning_rate": 0.00011743318150261639, "loss": 2.466, "step": 12001 }, { "crossentropy": 2.503072142601013, "epoch": 0.6526550477174474, "grad_norm": 0.03150906041264534, "grad_norm_var": 4.879410935780075e-07, "learning_rate": 0.00011716505006165557, "loss": 2.5031, "step": 12002 }, { "crossentropy": 2.6206424236297607, "epoch": 0.6527094265749477, "grad_norm": 0.03157652169466019, "grad_norm_var": 4.2312260052536583e-07, "learning_rate": 0.00011689722144956672, "loss": 2.6206, "step": 12003 }, { "crossentropy": 2.4418087005615234, "epoch": 0.6527638054324478, "grad_norm": 0.030285414308309555, "grad_norm_var": 4.309563682848456e-07, "learning_rate": 0.00011662969568295933, "loss": 2.4418, "step": 12004 }, { "crossentropy": 2.5011746883392334, "epoch": 0.6528181842899481, "grad_norm": 0.030563589185476303, "grad_norm_var": 4.4023721600394695e-07, "learning_rate": 0.00011636247277842571, "loss": 2.5012, "step": 12005 }, { "crossentropy": 2.534572958946228, "epoch": 0.6528725631474482, "grad_norm": 0.03153000771999359, "grad_norm_var": 4.1560236690763434e-07, "learning_rate": 0.00011609555275253869, "loss": 2.5346, "step": 12006 }, { "crossentropy": 2.6150046586990356, "epoch": 0.6529269420049485, "grad_norm": 0.03144833445549011, "grad_norm_var": 4.102898668815908e-07, "learning_rate": 0.0001158289356218517, "loss": 2.615, "step": 12007 }, { "crossentropy": 2.53887939453125, "epoch": 0.6529813208624486, "grad_norm": 0.0313631147146225, "grad_norm_var": 4.105737840676485e-07, "learning_rate": 0.00011556262140290152, "loss": 2.5389, "step": 12008 }, { "crossentropy": 2.529401183128357, "epoch": 0.6530356997199489, "grad_norm": 0.043806422501802444, "grad_norm_var": 1.0411752959944843e-05, "learning_rate": 0.00011529661011220272, "loss": 2.5294, "step": 12009 }, { "crossentropy": 2.5543748140335083, "epoch": 0.653090078577449, "grad_norm": 0.03228745236992836, "grad_norm_var": 1.0352784368358274e-05, "learning_rate": 0.00011503090176625486, "loss": 2.5544, "step": 12010 }, { "crossentropy": 2.661415219306946, "epoch": 0.6531444574349493, "grad_norm": 0.032222673296928406, "grad_norm_var": 1.0245448834562965e-05, "learning_rate": 0.00011476549638153644, "loss": 2.6614, "step": 12011 }, { "crossentropy": 2.527276039123535, "epoch": 0.6531988362924495, "grad_norm": 0.03184743970632553, "grad_norm_var": 1.0239043152986069e-05, "learning_rate": 0.00011450039397450706, "loss": 2.5273, "step": 12012 }, { "crossentropy": 2.467210292816162, "epoch": 0.6532532151499497, "grad_norm": 0.03204335644841194, "grad_norm_var": 1.0134671614618098e-05, "learning_rate": 0.00011423559456160803, "loss": 2.4672, "step": 12013 }, { "crossentropy": 2.388607144355774, "epoch": 0.6533075940074499, "grad_norm": 0.03170989453792572, "grad_norm_var": 1.0058527406824391e-05, "learning_rate": 0.00011397109815926287, "loss": 2.3886, "step": 12014 }, { "crossentropy": 2.559667468070984, "epoch": 0.6533619728649501, "grad_norm": 0.031352151185274124, "grad_norm_var": 9.908903973719153e-06, "learning_rate": 0.00011370690478387513, "loss": 2.5597, "step": 12015 }, { "crossentropy": 2.4723376035690308, "epoch": 0.6534163517224503, "grad_norm": 0.031767670065164566, "grad_norm_var": 9.914799496426096e-06, "learning_rate": 0.00011344301445182947, "loss": 2.4723, "step": 12016 }, { "crossentropy": 2.5378626585006714, "epoch": 0.6534707305799505, "grad_norm": 0.03113068826496601, "grad_norm_var": 9.73217397760502e-06, "learning_rate": 0.00011317942717949225, "loss": 2.5379, "step": 12017 }, { "crossentropy": 2.57921040058136, "epoch": 0.6535251094374507, "grad_norm": 0.030918406322598457, "grad_norm_var": 9.814514750612624e-06, "learning_rate": 0.00011291614298321095, "loss": 2.5792, "step": 12018 }, { "crossentropy": 2.4243929386138916, "epoch": 0.6535794882949509, "grad_norm": 0.03062846139073372, "grad_norm_var": 9.954663676509969e-06, "learning_rate": 0.00011265316187931419, "loss": 2.4244, "step": 12019 }, { "crossentropy": 2.4224908351898193, "epoch": 0.6536338671524511, "grad_norm": 0.030921969562768936, "grad_norm_var": 9.819054686889019e-06, "learning_rate": 0.00011239048388411222, "loss": 2.4225, "step": 12020 }, { "crossentropy": 2.4789470434188843, "epoch": 0.6536882460099513, "grad_norm": 0.032533541321754456, "grad_norm_var": 9.626170701994055e-06, "learning_rate": 0.00011212810901389536, "loss": 2.4789, "step": 12021 }, { "crossentropy": 2.360925555229187, "epoch": 0.6537426248674515, "grad_norm": 0.032855354249477386, "grad_norm_var": 9.592027993094351e-06, "learning_rate": 0.00011186603728493671, "loss": 2.3609, "step": 12022 }, { "crossentropy": 2.521086573600769, "epoch": 0.6537970037249518, "grad_norm": 0.0325675830245018, "grad_norm_var": 9.524227507210168e-06, "learning_rate": 0.00011160426871348939, "loss": 2.5211, "step": 12023 }, { "crossentropy": 2.6087599992752075, "epoch": 0.6538513825824519, "grad_norm": 0.0317600779235363, "grad_norm_var": 9.474047654392005e-06, "learning_rate": 0.0001113428033157865, "loss": 2.6088, "step": 12024 }, { "crossentropy": 2.5287798643112183, "epoch": 0.6539057614399522, "grad_norm": 0.031058775261044502, "grad_norm_var": 4.505975578890231e-07, "learning_rate": 0.00011108164110804675, "loss": 2.5288, "step": 12025 }, { "crossentropy": 2.4729472398757935, "epoch": 0.6539601402974523, "grad_norm": 0.031883228570222855, "grad_norm_var": 4.3051415487131647e-07, "learning_rate": 0.00011082078210646496, "loss": 2.4729, "step": 12026 }, { "crossentropy": 2.5886491537094116, "epoch": 0.6540145191549526, "grad_norm": 0.045003242790699005, "grad_norm_var": 1.1529988545745995e-05, "learning_rate": 0.0001105602263272204, "loss": 2.5886, "step": 12027 }, { "crossentropy": 2.5921801328659058, "epoch": 0.6540688980124527, "grad_norm": 0.031196728348731995, "grad_norm_var": 1.1612971273530997e-05, "learning_rate": 0.00011029997378647182, "loss": 2.5922, "step": 12028 }, { "crossentropy": 2.5217297077178955, "epoch": 0.654123276869953, "grad_norm": 0.03377583250403404, "grad_norm_var": 1.1704736838925273e-05, "learning_rate": 0.00011004002450036022, "loss": 2.5217, "step": 12029 }, { "crossentropy": 2.478240728378296, "epoch": 0.6541776557274531, "grad_norm": 0.029651392251253128, "grad_norm_var": 1.2204679454579079e-05, "learning_rate": 0.00010978037848500765, "loss": 2.4782, "step": 12030 }, { "crossentropy": 2.5115652084350586, "epoch": 0.6542320345849534, "grad_norm": 0.03116788901388645, "grad_norm_var": 1.2233474488530736e-05, "learning_rate": 0.00010952103575651684, "loss": 2.5116, "step": 12031 }, { "crossentropy": 2.524099111557007, "epoch": 0.6542864134424535, "grad_norm": 0.031851284205913544, "grad_norm_var": 1.2226568647696659e-05, "learning_rate": 0.00010926199633097155, "loss": 2.5241, "step": 12032 }, { "crossentropy": 2.644127130508423, "epoch": 0.6543407922999538, "grad_norm": 0.03170352056622505, "grad_norm_var": 1.2147722097889076e-05, "learning_rate": 0.0001090032602244384, "loss": 2.6441, "step": 12033 }, { "crossentropy": 2.5290539264678955, "epoch": 0.6543951711574539, "grad_norm": 0.0314178466796875, "grad_norm_var": 1.2060166110096128e-05, "learning_rate": 0.00010874482745296288, "loss": 2.5291, "step": 12034 }, { "crossentropy": 2.4317039251327515, "epoch": 0.6544495500149542, "grad_norm": 0.03046349808573723, "grad_norm_var": 1.2102999616089387e-05, "learning_rate": 0.00010848669803257272, "loss": 2.4317, "step": 12035 }, { "crossentropy": 2.525315761566162, "epoch": 0.6545039288724543, "grad_norm": 0.03134160861372948, "grad_norm_var": 1.2026370173611605e-05, "learning_rate": 0.00010822887197927788, "loss": 2.5253, "step": 12036 }, { "crossentropy": 2.5948572158813477, "epoch": 0.6545583077299546, "grad_norm": 0.03253341466188431, "grad_norm_var": 1.2026369852415065e-05, "learning_rate": 0.00010797134930906727, "loss": 2.5949, "step": 12037 }, { "crossentropy": 2.4687143564224243, "epoch": 0.6546126865874548, "grad_norm": 0.03177841380238533, "grad_norm_var": 1.2049906943311685e-05, "learning_rate": 0.00010771413003791253, "loss": 2.4687, "step": 12038 }, { "crossentropy": 2.527394652366638, "epoch": 0.654667065444955, "grad_norm": 0.031199730932712555, "grad_norm_var": 1.2144880317715006e-05, "learning_rate": 0.00010745721418176646, "loss": 2.5274, "step": 12039 }, { "crossentropy": 2.547108769416809, "epoch": 0.6547214443024552, "grad_norm": 0.030871054157614708, "grad_norm_var": 1.2265586894842046e-05, "learning_rate": 0.00010720060175656299, "loss": 2.5471, "step": 12040 }, { "crossentropy": 2.528740167617798, "epoch": 0.6547758231599554, "grad_norm": 0.03245120868086815, "grad_norm_var": 1.2155192387379443e-05, "learning_rate": 0.0001069442927782166, "loss": 2.5287, "step": 12041 }, { "crossentropy": 2.611554980278015, "epoch": 0.6548302020174556, "grad_norm": 0.03162244334816933, "grad_norm_var": 1.2177172508801478e-05, "learning_rate": 0.00010668828726262347, "loss": 2.6116, "step": 12042 }, { "crossentropy": 2.556369662284851, "epoch": 0.6548845808749558, "grad_norm": 0.031903546303510666, "grad_norm_var": 8.486584356675574e-07, "learning_rate": 0.00010643258522566035, "loss": 2.5564, "step": 12043 }, { "crossentropy": 2.5332342386245728, "epoch": 0.654938959732456, "grad_norm": 0.03138990327715874, "grad_norm_var": 8.416832969015038e-07, "learning_rate": 0.00010617718668318621, "loss": 2.5332, "step": 12044 }, { "crossentropy": 2.563472032546997, "epoch": 0.6549933385899562, "grad_norm": 0.030559072270989418, "grad_norm_var": 5.423897263381276e-07, "learning_rate": 0.0001059220916510406, "loss": 2.5635, "step": 12045 }, { "crossentropy": 2.49215304851532, "epoch": 0.6550477174474564, "grad_norm": 0.03175783157348633, "grad_norm_var": 3.372707050917998e-07, "learning_rate": 0.00010566730014504367, "loss": 2.4922, "step": 12046 }, { "crossentropy": 2.4387680292129517, "epoch": 0.6551020963049566, "grad_norm": 0.03305380046367645, "grad_norm_var": 4.7585838404556877e-07, "learning_rate": 0.00010541281218099829, "loss": 2.4388, "step": 12047 }, { "crossentropy": 2.5571913719177246, "epoch": 0.6551564751624568, "grad_norm": 0.03124896064400673, "grad_norm_var": 4.798490639649003e-07, "learning_rate": 0.00010515862777468687, "loss": 2.5572, "step": 12048 }, { "crossentropy": 2.4251168966293335, "epoch": 0.655210854019957, "grad_norm": 0.042754534631967545, "grad_norm_var": 8.293199696340383e-06, "learning_rate": 0.00010490474694187402, "loss": 2.4251, "step": 12049 }, { "crossentropy": 2.499030351638794, "epoch": 0.6552652328774572, "grad_norm": 0.03038603626191616, "grad_norm_var": 8.477205022387038e-06, "learning_rate": 0.0001046511696983049, "loss": 2.499, "step": 12050 }, { "crossentropy": 2.4963817596435547, "epoch": 0.6553196117349575, "grad_norm": 0.030930886045098305, "grad_norm_var": 8.382194091702266e-06, "learning_rate": 0.0001043978960597064, "loss": 2.4964, "step": 12051 }, { "crossentropy": 2.4821276664733887, "epoch": 0.6553739905924576, "grad_norm": 0.029283994808793068, "grad_norm_var": 8.892290466613638e-06, "learning_rate": 0.00010414492604178649, "loss": 2.4821, "step": 12052 }, { "crossentropy": 2.6286622285842896, "epoch": 0.6554283694499579, "grad_norm": 0.03495369106531143, "grad_norm_var": 9.395745790504574e-06, "learning_rate": 0.00010389225966023319, "loss": 2.6287, "step": 12053 }, { "crossentropy": 2.4443979263305664, "epoch": 0.655482748307458, "grad_norm": 0.03407733142375946, "grad_norm_var": 9.578728037632498e-06, "learning_rate": 0.00010363989693071785, "loss": 2.4444, "step": 12054 }, { "crossentropy": 2.523869276046753, "epoch": 0.6555371271649583, "grad_norm": 0.031054407358169556, "grad_norm_var": 9.603358273630814e-06, "learning_rate": 0.00010338783786889127, "loss": 2.5239, "step": 12055 }, { "crossentropy": 2.520257353782654, "epoch": 0.6555915060224584, "grad_norm": 0.032555434852838516, "grad_norm_var": 9.438724369788727e-06, "learning_rate": 0.00010313608249038598, "loss": 2.5203, "step": 12056 }, { "crossentropy": 2.485978603363037, "epoch": 0.6556458848799587, "grad_norm": 0.030407393351197243, "grad_norm_var": 9.712806121416741e-06, "learning_rate": 0.000102884630810815, "loss": 2.486, "step": 12057 }, { "crossentropy": 2.515317440032959, "epoch": 0.6557002637374588, "grad_norm": 0.03155028074979782, "grad_norm_var": 9.720335923893416e-06, "learning_rate": 0.00010263348284577368, "loss": 2.5153, "step": 12058 }, { "crossentropy": 2.5194475650787354, "epoch": 0.6557546425949591, "grad_norm": 0.0316246822476387, "grad_norm_var": 9.74241694835878e-06, "learning_rate": 0.0001023826386108384, "loss": 2.5194, "step": 12059 }, { "crossentropy": 2.590216279029846, "epoch": 0.6558090214524592, "grad_norm": 0.03120088204741478, "grad_norm_var": 9.768828645071572e-06, "learning_rate": 0.00010213209812156566, "loss": 2.5902, "step": 12060 }, { "crossentropy": 2.457423210144043, "epoch": 0.6558634003099595, "grad_norm": 0.03228479623794556, "grad_norm_var": 9.545762518449406e-06, "learning_rate": 0.00010188186139349353, "loss": 2.4574, "step": 12061 }, { "crossentropy": 2.506393551826477, "epoch": 0.6559177791674597, "grad_norm": 0.030578432604670525, "grad_norm_var": 9.740806908986585e-06, "learning_rate": 0.00010163192844214242, "loss": 2.5064, "step": 12062 }, { "crossentropy": 2.6458630561828613, "epoch": 0.6559721580249599, "grad_norm": 0.03250124305486679, "grad_norm_var": 9.709628479344006e-06, "learning_rate": 0.00010138229928301212, "loss": 2.6459, "step": 12063 }, { "crossentropy": 2.5586044788360596, "epoch": 0.6560265368824602, "grad_norm": 0.03137911483645439, "grad_norm_var": 9.691804447690257e-06, "learning_rate": 0.00010113297393158416, "loss": 2.5586, "step": 12064 }, { "crossentropy": 2.443920373916626, "epoch": 0.6560809157399603, "grad_norm": 0.030848948284983635, "grad_norm_var": 2.02683765629415e-06, "learning_rate": 0.00010088395240332282, "loss": 2.4439, "step": 12065 }, { "crossentropy": 2.5485687255859375, "epoch": 0.6561352945974606, "grad_norm": 0.03150228038430214, "grad_norm_var": 1.9238720974077873e-06, "learning_rate": 0.00010063523471367076, "loss": 2.5486, "step": 12066 }, { "crossentropy": 2.5748358964920044, "epoch": 0.6561896734549607, "grad_norm": 0.03099120408296585, "grad_norm_var": 1.9181482989615954e-06, "learning_rate": 0.00010038682087805395, "loss": 2.5748, "step": 12067 }, { "crossentropy": 2.523136854171753, "epoch": 0.656244052312461, "grad_norm": 0.031351663172245026, "grad_norm_var": 1.5262788330626011e-06, "learning_rate": 0.00010013871091187843, "loss": 2.5231, "step": 12068 }, { "crossentropy": 2.579119324684143, "epoch": 0.6562984311699611, "grad_norm": 0.030493084341287613, "grad_norm_var": 8.964887114587857e-07, "learning_rate": 9.989090483053186e-05, "loss": 2.5791, "step": 12069 }, { "crossentropy": 2.4693280458450317, "epoch": 0.6563528100274614, "grad_norm": 0.03148121014237404, "grad_norm_var": 4.3426631889050397e-07, "learning_rate": 9.964340264938365e-05, "loss": 2.4693, "step": 12070 }, { "crossentropy": 2.45425808429718, "epoch": 0.6564071888849615, "grad_norm": 0.03138657286763191, "grad_norm_var": 4.2750315591871755e-07, "learning_rate": 9.939620438378261e-05, "loss": 2.4543, "step": 12071 }, { "crossentropy": 2.5655206441879272, "epoch": 0.6564615677424618, "grad_norm": 0.032457780092954636, "grad_norm_var": 4.128408440981357e-07, "learning_rate": 9.914931004905924e-05, "loss": 2.5655, "step": 12072 }, { "crossentropy": 2.4362785816192627, "epoch": 0.6565159465999619, "grad_norm": 0.03181631863117218, "grad_norm_var": 3.546717755887237e-07, "learning_rate": 9.8902719660528e-05, "loss": 2.4363, "step": 12073 }, { "crossentropy": 2.545019745826721, "epoch": 0.6565703254574622, "grad_norm": 0.03205028921365738, "grad_norm_var": 3.759473928011588e-07, "learning_rate": 9.86564332334805e-05, "loss": 2.545, "step": 12074 }, { "crossentropy": 2.5220004320144653, "epoch": 0.6566247043149623, "grad_norm": 0.031677208840847015, "grad_norm_var": 3.770155924965231e-07, "learning_rate": 9.841045078319121e-05, "loss": 2.522, "step": 12075 }, { "crossentropy": 2.4255034923553467, "epoch": 0.6566790831724626, "grad_norm": 0.03079853765666485, "grad_norm_var": 4.031830624830751e-07, "learning_rate": 9.816477232491572e-05, "loss": 2.4255, "step": 12076 }, { "crossentropy": 2.5892012119293213, "epoch": 0.6567334620299627, "grad_norm": 0.03133903816342354, "grad_norm_var": 3.569601627131791e-07, "learning_rate": 9.791939787389126e-05, "loss": 2.5892, "step": 12077 }, { "crossentropy": 2.5210641622543335, "epoch": 0.656787840887463, "grad_norm": 0.031204015016555786, "grad_norm_var": 3.115734465750086e-07, "learning_rate": 9.767432744533621e-05, "loss": 2.5211, "step": 12078 }, { "crossentropy": 2.501199722290039, "epoch": 0.6568422197449632, "grad_norm": 0.03114272467792034, "grad_norm_var": 2.37392770230021e-07, "learning_rate": 9.742956105444789e-05, "loss": 2.5012, "step": 12079 }, { "crossentropy": 2.4349924325942993, "epoch": 0.6568965986024634, "grad_norm": 0.03287959471344948, "grad_norm_var": 3.7993143202050677e-07, "learning_rate": 9.718509871640801e-05, "loss": 2.435, "step": 12080 }, { "crossentropy": 2.5130046606063843, "epoch": 0.6569509774599636, "grad_norm": 0.030708851292729378, "grad_norm_var": 3.9264292849675436e-07, "learning_rate": 9.69409404463778e-05, "loss": 2.513, "step": 12081 }, { "crossentropy": 2.458271622657776, "epoch": 0.6570053563174638, "grad_norm": 0.031494952738285065, "grad_norm_var": 3.926001133350374e-07, "learning_rate": 9.669708625949903e-05, "loss": 2.4583, "step": 12082 }, { "crossentropy": 2.5486682653427124, "epoch": 0.657059735174964, "grad_norm": 0.031150581315159798, "grad_norm_var": 3.843411173274728e-07, "learning_rate": 9.645353617089569e-05, "loss": 2.5487, "step": 12083 }, { "crossentropy": 2.5141783952713013, "epoch": 0.6571141140324642, "grad_norm": 0.03084113821387291, "grad_norm_var": 4.083134519609783e-07, "learning_rate": 9.62102901956724e-05, "loss": 2.5142, "step": 12084 }, { "crossentropy": 2.5579440593719482, "epoch": 0.6571684928899644, "grad_norm": 0.031097320839762688, "grad_norm_var": 3.554388573995115e-07, "learning_rate": 9.596734834891486e-05, "loss": 2.5579, "step": 12085 }, { "crossentropy": 2.472770094871521, "epoch": 0.6572228717474646, "grad_norm": 0.031334392726421356, "grad_norm_var": 3.565741264428679e-07, "learning_rate": 9.572471064569044e-05, "loss": 2.4728, "step": 12086 }, { "crossentropy": 2.524721622467041, "epoch": 0.6572772506049648, "grad_norm": 0.032942164689302444, "grad_norm_var": 4.923356485573981e-07, "learning_rate": 9.548237710104769e-05, "loss": 2.5247, "step": 12087 }, { "crossentropy": 2.3848401308059692, "epoch": 0.657331629462465, "grad_norm": 0.03230401128530502, "grad_norm_var": 4.753745561844458e-07, "learning_rate": 9.524034773001511e-05, "loss": 2.3848, "step": 12088 }, { "crossentropy": 2.4725393056869507, "epoch": 0.6573860083199652, "grad_norm": 0.03265663981437683, "grad_norm_var": 5.494794488800399e-07, "learning_rate": 9.49986225476035e-05, "loss": 2.4725, "step": 12089 }, { "crossentropy": 2.5146418809890747, "epoch": 0.6574403871774654, "grad_norm": 0.031115151941776276, "grad_norm_var": 5.481575023490951e-07, "learning_rate": 9.475720156880418e-05, "loss": 2.5146, "step": 12090 }, { "crossentropy": 2.5132973194122314, "epoch": 0.6574947660349656, "grad_norm": 0.032358430325984955, "grad_norm_var": 5.893610627691655e-07, "learning_rate": 9.451608480859019e-05, "loss": 2.5133, "step": 12091 }, { "crossentropy": 2.673735499382019, "epoch": 0.6575491448924659, "grad_norm": 0.0326237753033638, "grad_norm_var": 6.060671503149244e-07, "learning_rate": 9.427527228191568e-05, "loss": 2.6737, "step": 12092 }, { "crossentropy": 2.473752737045288, "epoch": 0.657603523749966, "grad_norm": 0.03113812766969204, "grad_norm_var": 6.182473509159231e-07, "learning_rate": 9.403476400371424e-05, "loss": 2.4738, "step": 12093 }, { "crossentropy": 2.4333138465881348, "epoch": 0.6576579026074663, "grad_norm": 0.03126084432005882, "grad_norm_var": 6.147895660137675e-07, "learning_rate": 9.379455998890341e-05, "loss": 2.4333, "step": 12094 }, { "crossentropy": 2.4698286056518555, "epoch": 0.6577122814649664, "grad_norm": 0.030962156131863594, "grad_norm_var": 6.300165670925501e-07, "learning_rate": 9.355466025238013e-05, "loss": 2.4698, "step": 12095 }, { "crossentropy": 2.4428553581237793, "epoch": 0.6577666603224667, "grad_norm": 0.03126492351293564, "grad_norm_var": 5.345444486055806e-07, "learning_rate": 9.331506480902253e-05, "loss": 2.4429, "step": 12096 }, { "crossentropy": 2.4859708547592163, "epoch": 0.6578210391799668, "grad_norm": 0.030275361612439156, "grad_norm_var": 5.965443608452047e-07, "learning_rate": 9.30757736736898e-05, "loss": 2.486, "step": 12097 }, { "crossentropy": 2.3691024780273438, "epoch": 0.6578754180374671, "grad_norm": 0.03184075281023979, "grad_norm_var": 6.014223662522174e-07, "learning_rate": 9.283678686122288e-05, "loss": 2.3691, "step": 12098 }, { "crossentropy": 2.496378779411316, "epoch": 0.6579297968949672, "grad_norm": 0.03163208067417145, "grad_norm_var": 5.888021626145509e-07, "learning_rate": 9.259810438644378e-05, "loss": 2.4964, "step": 12099 }, { "crossentropy": 2.5480321645736694, "epoch": 0.6579841757524675, "grad_norm": 0.03188269957900047, "grad_norm_var": 5.508081721015504e-07, "learning_rate": 9.235972626415457e-05, "loss": 2.548, "step": 12100 }, { "crossentropy": 2.3667646646499634, "epoch": 0.6580385546099676, "grad_norm": 0.030171331018209457, "grad_norm_var": 6.748647528589339e-07, "learning_rate": 9.212165250914061e-05, "loss": 2.3668, "step": 12101 }, { "crossentropy": 2.474611520767212, "epoch": 0.6580929334674679, "grad_norm": 0.029971692711114883, "grad_norm_var": 8.410325052298952e-07, "learning_rate": 9.188388313616514e-05, "loss": 2.4746, "step": 12102 }, { "crossentropy": 2.4402366876602173, "epoch": 0.658147312324968, "grad_norm": 0.03081715852022171, "grad_norm_var": 7.217320847909773e-07, "learning_rate": 9.164641815997631e-05, "loss": 2.4402, "step": 12103 }, { "crossentropy": 2.5131125450134277, "epoch": 0.6582016911824683, "grad_norm": 0.03208131715655327, "grad_norm_var": 6.977575078554679e-07, "learning_rate": 9.140925759530017e-05, "loss": 2.5131, "step": 12104 }, { "crossentropy": 2.5173178911209106, "epoch": 0.6582560700399684, "grad_norm": 0.03145045042037964, "grad_norm_var": 5.830954003841708e-07, "learning_rate": 9.117240145684547e-05, "loss": 2.5173, "step": 12105 }, { "crossentropy": 2.496525526046753, "epoch": 0.6583104488974687, "grad_norm": 0.03159676864743233, "grad_norm_var": 5.855368063816761e-07, "learning_rate": 9.093584975930213e-05, "loss": 2.4965, "step": 12106 }, { "crossentropy": 2.466199040412903, "epoch": 0.6583648277549689, "grad_norm": 0.032007213681936264, "grad_norm_var": 5.452262380923097e-07, "learning_rate": 9.069960251734066e-05, "loss": 2.4662, "step": 12107 }, { "crossentropy": 2.584971070289612, "epoch": 0.6584192066124691, "grad_norm": 0.03241465240716934, "grad_norm_var": 5.113564683853807e-07, "learning_rate": 9.046365974561266e-05, "loss": 2.585, "step": 12108 }, { "crossentropy": 2.468191146850586, "epoch": 0.6584735854699693, "grad_norm": 0.030665770173072815, "grad_norm_var": 5.353686412078983e-07, "learning_rate": 9.022802145875198e-05, "loss": 2.4682, "step": 12109 }, { "crossentropy": 2.516178846359253, "epoch": 0.6585279643274695, "grad_norm": 0.03234059736132622, "grad_norm_var": 6.071405776718215e-07, "learning_rate": 8.999268767137136e-05, "loss": 2.5162, "step": 12110 }, { "crossentropy": 2.6085115671157837, "epoch": 0.6585823431849697, "grad_norm": 0.02972804568707943, "grad_norm_var": 7.638341065379269e-07, "learning_rate": 8.975765839806693e-05, "loss": 2.6085, "step": 12111 }, { "crossentropy": 2.380630373954773, "epoch": 0.6586367220424699, "grad_norm": 0.03222479298710823, "grad_norm_var": 8.222020198201189e-07, "learning_rate": 8.95229336534148e-05, "loss": 2.3806, "step": 12112 }, { "crossentropy": 2.4142669439315796, "epoch": 0.6586911008999701, "grad_norm": 0.03099249117076397, "grad_norm_var": 7.545741449411133e-07, "learning_rate": 8.928851345197164e-05, "loss": 2.4143, "step": 12113 }, { "crossentropy": 2.513527512550354, "epoch": 0.6587454797574703, "grad_norm": 0.030879560858011246, "grad_norm_var": 7.511675980291972e-07, "learning_rate": 8.905439780827751e-05, "loss": 2.5135, "step": 12114 }, { "crossentropy": 2.5157532691955566, "epoch": 0.6587998586149705, "grad_norm": 0.03128378465771675, "grad_norm_var": 7.434921766274448e-07, "learning_rate": 8.882058673685079e-05, "loss": 2.5158, "step": 12115 }, { "crossentropy": 2.5275402069091797, "epoch": 0.6588542374724707, "grad_norm": 0.03150250390172005, "grad_norm_var": 7.220637188821872e-07, "learning_rate": 8.858708025219264e-05, "loss": 2.5275, "step": 12116 }, { "crossentropy": 2.559166193008423, "epoch": 0.6589086163299709, "grad_norm": 0.03170054033398628, "grad_norm_var": 6.466512049435949e-07, "learning_rate": 8.835387836878484e-05, "loss": 2.5592, "step": 12117 }, { "crossentropy": 2.473705768585205, "epoch": 0.6589629951874711, "grad_norm": 0.029396191239356995, "grad_norm_var": 7.733887065750067e-07, "learning_rate": 8.812098110109024e-05, "loss": 2.4737, "step": 12118 }, { "crossentropy": 2.4518851041793823, "epoch": 0.6590173740449713, "grad_norm": 0.031837888062000275, "grad_norm_var": 7.703960014172432e-07, "learning_rate": 8.788838846355341e-05, "loss": 2.4519, "step": 12119 }, { "crossentropy": 2.491671323776245, "epoch": 0.6590717529024716, "grad_norm": 0.031763967126607895, "grad_norm_var": 7.470750570277388e-07, "learning_rate": 8.765610047059891e-05, "loss": 2.4917, "step": 12120 }, { "crossentropy": 2.510919451713562, "epoch": 0.6591261317599717, "grad_norm": 0.03148817643523216, "grad_norm_var": 7.476110597967022e-07, "learning_rate": 8.7424117136633e-05, "loss": 2.5109, "step": 12121 }, { "crossentropy": 2.4648349285125732, "epoch": 0.659180510617472, "grad_norm": 0.030508605763316154, "grad_norm_var": 7.878356174561279e-07, "learning_rate": 8.719243847604363e-05, "loss": 2.4648, "step": 12122 }, { "crossentropy": 2.4402562379837036, "epoch": 0.6592348894749721, "grad_norm": 0.03130792826414108, "grad_norm_var": 7.520788438901271e-07, "learning_rate": 8.69610645031993e-05, "loss": 2.4403, "step": 12123 }, { "crossentropy": 2.585310697555542, "epoch": 0.6592892683324724, "grad_norm": 0.030416538938879967, "grad_norm_var": 6.919174610819923e-07, "learning_rate": 8.672999523244851e-05, "loss": 2.5853, "step": 12124 }, { "crossentropy": 2.507297992706299, "epoch": 0.6593436471899725, "grad_norm": 0.03169890493154526, "grad_norm_var": 6.950465722639632e-07, "learning_rate": 8.649923067812371e-05, "loss": 2.5073, "step": 12125 }, { "crossentropy": 2.4377723932266235, "epoch": 0.6593980260474728, "grad_norm": 0.03346986696124077, "grad_norm_var": 9.477071134122532e-07, "learning_rate": 8.626877085453566e-05, "loss": 2.4378, "step": 12126 }, { "crossentropy": 2.4638657569885254, "epoch": 0.6594524049049729, "grad_norm": 0.031365808099508286, "grad_norm_var": 7.80275410591534e-07, "learning_rate": 8.603861577597683e-05, "loss": 2.4639, "step": 12127 }, { "crossentropy": 2.495169162750244, "epoch": 0.6595067837624732, "grad_norm": 0.03126854449510574, "grad_norm_var": 7.277831507150004e-07, "learning_rate": 8.58087654567219e-05, "loss": 2.4952, "step": 12128 }, { "crossentropy": 2.473283886909485, "epoch": 0.6595611626199733, "grad_norm": 0.030727827921509743, "grad_norm_var": 7.431918823077536e-07, "learning_rate": 8.557921991102613e-05, "loss": 2.4733, "step": 12129 }, { "crossentropy": 2.5332332849502563, "epoch": 0.6596155414774736, "grad_norm": 0.030139388516545296, "grad_norm_var": 8.177948199026078e-07, "learning_rate": 8.534997915312537e-05, "loss": 2.5332, "step": 12130 }, { "crossentropy": 2.490962028503418, "epoch": 0.6596699203349737, "grad_norm": 0.03189269080758095, "grad_norm_var": 8.443374702788234e-07, "learning_rate": 8.512104319723713e-05, "loss": 2.491, "step": 12131 }, { "crossentropy": 2.4627695083618164, "epoch": 0.659724299192474, "grad_norm": 0.03145609796047211, "grad_norm_var": 8.430974083694735e-07, "learning_rate": 8.489241205755949e-05, "loss": 2.4628, "step": 12132 }, { "crossentropy": 2.5653880834579468, "epoch": 0.6597786780499741, "grad_norm": 0.03112870827317238, "grad_norm_var": 8.312750710474011e-07, "learning_rate": 8.466408574827167e-05, "loss": 2.5654, "step": 12133 }, { "crossentropy": 2.5409780740737915, "epoch": 0.6598330569074744, "grad_norm": 0.031509313732385635, "grad_norm_var": 5.903851773361145e-07, "learning_rate": 8.443606428353568e-05, "loss": 2.541, "step": 12134 }, { "crossentropy": 2.426719069480896, "epoch": 0.6598874357649746, "grad_norm": 0.03214796259999275, "grad_norm_var": 6.155826398094218e-07, "learning_rate": 8.420834767749075e-05, "loss": 2.4267, "step": 12135 }, { "crossentropy": 2.6302489042282104, "epoch": 0.6599418146224748, "grad_norm": 0.03210432082414627, "grad_norm_var": 6.396507399872645e-07, "learning_rate": 8.398093594426226e-05, "loss": 2.6302, "step": 12136 }, { "crossentropy": 2.495979070663452, "epoch": 0.659996193479975, "grad_norm": 0.03190857172012329, "grad_norm_var": 6.548308741700925e-07, "learning_rate": 8.375382909795282e-05, "loss": 2.496, "step": 12137 }, { "crossentropy": 2.4933571815490723, "epoch": 0.6600505723374752, "grad_norm": 0.029996884986758232, "grad_norm_var": 7.347927610930338e-07, "learning_rate": 8.352702715264727e-05, "loss": 2.4934, "step": 12138 }, { "crossentropy": 2.5414708852767944, "epoch": 0.6601049511949754, "grad_norm": 0.03157259523868561, "grad_norm_var": 7.35614321573097e-07, "learning_rate": 8.330053012241156e-05, "loss": 2.5415, "step": 12139 }, { "crossentropy": 2.373990058898926, "epoch": 0.6601593300524756, "grad_norm": 0.030713247135281563, "grad_norm_var": 7.012107799292746e-07, "learning_rate": 8.307433802129394e-05, "loss": 2.374, "step": 12140 }, { "crossentropy": 2.553290843963623, "epoch": 0.6602137089099758, "grad_norm": 0.031044742092490196, "grad_norm_var": 7.057052961652408e-07, "learning_rate": 8.284845086332094e-05, "loss": 2.5533, "step": 12141 }, { "crossentropy": 2.5234066247940063, "epoch": 0.660268087767476, "grad_norm": 0.03144017979502678, "grad_norm_var": 4.0381222729059656e-07, "learning_rate": 8.262286866250356e-05, "loss": 2.5234, "step": 12142 }, { "crossentropy": 2.5515804290771484, "epoch": 0.6603224666249762, "grad_norm": 0.031443748623132706, "grad_norm_var": 4.0512461536624093e-07, "learning_rate": 8.239759143283065e-05, "loss": 2.5516, "step": 12143 }, { "crossentropy": 2.514548897743225, "epoch": 0.6603768454824764, "grad_norm": 0.03197864443063736, "grad_norm_var": 4.354674028770548e-07, "learning_rate": 8.21726191882749e-05, "loss": 2.5145, "step": 12144 }, { "crossentropy": 2.487273097038269, "epoch": 0.6604312243399766, "grad_norm": 0.03198118880391121, "grad_norm_var": 4.338017771771466e-07, "learning_rate": 8.194795194278848e-05, "loss": 2.4873, "step": 12145 }, { "crossentropy": 2.470495104789734, "epoch": 0.6604856031974768, "grad_norm": 0.033048491925001144, "grad_norm_var": 4.723523982770681e-07, "learning_rate": 8.17235897103047e-05, "loss": 2.4705, "step": 12146 }, { "crossentropy": 2.448529005050659, "epoch": 0.660539982054977, "grad_norm": 0.030357476323843002, "grad_norm_var": 5.567692966547729e-07, "learning_rate": 8.149953250473852e-05, "loss": 2.4485, "step": 12147 }, { "crossentropy": 2.478013515472412, "epoch": 0.6605943609124773, "grad_norm": 0.030735263600945473, "grad_norm_var": 5.924557922279879e-07, "learning_rate": 8.127578033998662e-05, "loss": 2.478, "step": 12148 }, { "crossentropy": 2.5672683715820312, "epoch": 0.6606487397699774, "grad_norm": 0.0328889936208725, "grad_norm_var": 7.120104472110289e-07, "learning_rate": 8.1052333229924e-05, "loss": 2.5673, "step": 12149 }, { "crossentropy": 2.4508769512176514, "epoch": 0.6607031186274777, "grad_norm": 0.03067883662879467, "grad_norm_var": 7.601171069177796e-07, "learning_rate": 8.082919118841015e-05, "loss": 2.4509, "step": 12150 }, { "crossentropy": 2.437758207321167, "epoch": 0.6607574974849778, "grad_norm": 0.031113851815462112, "grad_norm_var": 7.379662611698649e-07, "learning_rate": 8.060635422928397e-05, "loss": 2.4378, "step": 12151 }, { "crossentropy": 2.4655343294143677, "epoch": 0.6608118763424781, "grad_norm": 0.030601399019360542, "grad_norm_var": 7.456038331773533e-07, "learning_rate": 8.038382236636555e-05, "loss": 2.4655, "step": 12152 }, { "crossentropy": 2.5215879678726196, "epoch": 0.6608662551999782, "grad_norm": 0.030801111832261086, "grad_norm_var": 7.388937096971012e-07, "learning_rate": 8.016159561345548e-05, "loss": 2.5216, "step": 12153 }, { "crossentropy": 2.3166263103485107, "epoch": 0.6609206340574785, "grad_norm": 0.030961010605096817, "grad_norm_var": 6.327149253584445e-07, "learning_rate": 7.993967398433611e-05, "loss": 2.3166, "step": 12154 }, { "crossentropy": 2.493411898612976, "epoch": 0.6609750129149786, "grad_norm": 0.03153624013066292, "grad_norm_var": 6.316460612433456e-07, "learning_rate": 7.971805749277195e-05, "loss": 2.4934, "step": 12155 }, { "crossentropy": 2.506806254386902, "epoch": 0.6610293917724789, "grad_norm": 0.03180548921227455, "grad_norm_var": 6.159846147169342e-07, "learning_rate": 7.949674615250647e-05, "loss": 2.5068, "step": 12156 }, { "crossentropy": 2.569537878036499, "epoch": 0.661083770629979, "grad_norm": 0.0318208746612072, "grad_norm_var": 6.167620362183708e-07, "learning_rate": 7.927573997726534e-05, "loss": 2.5695, "step": 12157 }, { "crossentropy": 2.532911539077759, "epoch": 0.6611381494874793, "grad_norm": 0.0315915010869503, "grad_norm_var": 6.180041133886534e-07, "learning_rate": 7.905503898075483e-05, "loss": 2.5329, "step": 12158 }, { "crossentropy": 2.5513625144958496, "epoch": 0.6611925283449794, "grad_norm": 0.03093639202415943, "grad_norm_var": 6.351245186897766e-07, "learning_rate": 7.8834643176664e-05, "loss": 2.5514, "step": 12159 }, { "crossentropy": 2.481860399246216, "epoch": 0.6612469072024797, "grad_norm": 0.03125143051147461, "grad_norm_var": 6.147174369934185e-07, "learning_rate": 7.861455257865969e-05, "loss": 2.4819, "step": 12160 }, { "crossentropy": 2.4004653692245483, "epoch": 0.6613012860599798, "grad_norm": 0.030876660719513893, "grad_norm_var": 6.027010143067982e-07, "learning_rate": 7.839476720039262e-05, "loss": 2.4005, "step": 12161 }, { "crossentropy": 2.4973835945129395, "epoch": 0.6613556649174801, "grad_norm": 0.03047812543809414, "grad_norm_var": 4.207812178456573e-07, "learning_rate": 7.81752870554947e-05, "loss": 2.4974, "step": 12162 }, { "crossentropy": 2.59416925907135, "epoch": 0.6614100437749802, "grad_norm": 0.03191908821463585, "grad_norm_var": 4.077294365477929e-07, "learning_rate": 7.795611215757614e-05, "loss": 2.5942, "step": 12163 }, { "crossentropy": 2.51647686958313, "epoch": 0.6614644226324805, "grad_norm": 0.03177589923143387, "grad_norm_var": 4.040240428477189e-07, "learning_rate": 7.77372425202305e-05, "loss": 2.5165, "step": 12164 }, { "crossentropy": 2.402194619178772, "epoch": 0.6615188014899807, "grad_norm": 0.03002559393644333, "grad_norm_var": 3.1546158299524303e-07, "learning_rate": 7.751867815703195e-05, "loss": 2.4022, "step": 12165 }, { "crossentropy": 2.4561270475387573, "epoch": 0.6615731803474809, "grad_norm": 0.030559156090021133, "grad_norm_var": 3.2364945030820673e-07, "learning_rate": 7.730041908153629e-05, "loss": 2.4561, "step": 12166 }, { "crossentropy": 2.4799976348876953, "epoch": 0.6616275592049811, "grad_norm": 0.031832754611968994, "grad_norm_var": 3.545597273922936e-07, "learning_rate": 7.708246530727881e-05, "loss": 2.48, "step": 12167 }, { "crossentropy": 2.5109448432922363, "epoch": 0.6616819380624813, "grad_norm": 0.04225409775972366, "grad_norm_var": 7.95259488546284e-06, "learning_rate": 7.686481684777757e-05, "loss": 2.5109, "step": 12168 }, { "crossentropy": 2.556591749191284, "epoch": 0.6617363169199815, "grad_norm": 0.03088781051337719, "grad_norm_var": 7.940343352763092e-06, "learning_rate": 7.664747371653014e-05, "loss": 2.5566, "step": 12169 }, { "crossentropy": 2.5687047243118286, "epoch": 0.6617906957774817, "grad_norm": 0.031224530190229416, "grad_norm_var": 7.91144500850201e-06, "learning_rate": 7.643043592701681e-05, "loss": 2.5687, "step": 12170 }, { "crossentropy": 2.4674935340881348, "epoch": 0.6618450746349819, "grad_norm": 0.031227312982082367, "grad_norm_var": 7.933360186778103e-06, "learning_rate": 7.62137034926974e-05, "loss": 2.4675, "step": 12171 }, { "crossentropy": 2.360262393951416, "epoch": 0.6618994534924821, "grad_norm": 0.031645070761442184, "grad_norm_var": 7.937079258852713e-06, "learning_rate": 7.599727642701337e-05, "loss": 2.3603, "step": 12172 }, { "crossentropy": 2.472288131713867, "epoch": 0.6619538323499823, "grad_norm": 0.031145131215453148, "grad_norm_var": 7.972220057867008e-06, "learning_rate": 7.578115474338843e-05, "loss": 2.4723, "step": 12173 }, { "crossentropy": 2.5250638723373413, "epoch": 0.6620082112074825, "grad_norm": 0.03201191499829292, "grad_norm_var": 7.96866954480741e-06, "learning_rate": 7.556533845522462e-05, "loss": 2.5251, "step": 12174 }, { "crossentropy": 2.474207639694214, "epoch": 0.6620625900649827, "grad_norm": 0.03203096240758896, "grad_norm_var": 7.906101904682726e-06, "learning_rate": 7.534982757590791e-05, "loss": 2.4742, "step": 12175 }, { "crossentropy": 2.5691810846328735, "epoch": 0.662116968922483, "grad_norm": 0.031466398388147354, "grad_norm_var": 7.889064997649148e-06, "learning_rate": 7.513462211880317e-05, "loss": 2.5692, "step": 12176 }, { "crossentropy": 2.5343027114868164, "epoch": 0.6621713477799831, "grad_norm": 0.030986761674284935, "grad_norm_var": 7.87391861226741e-06, "learning_rate": 7.491972209725807e-05, "loss": 2.5343, "step": 12177 }, { "crossentropy": 2.5708603858947754, "epoch": 0.6622257266374834, "grad_norm": 0.03259309008717537, "grad_norm_var": 7.733654746540465e-06, "learning_rate": 7.470512752460024e-05, "loss": 2.5709, "step": 12178 }, { "crossentropy": 2.3864930868148804, "epoch": 0.6622801054949835, "grad_norm": 0.03143906593322754, "grad_norm_var": 7.75957726475231e-06, "learning_rate": 7.44908384141385e-05, "loss": 2.3865, "step": 12179 }, { "crossentropy": 2.5706875324249268, "epoch": 0.6623344843524838, "grad_norm": 0.031425587832927704, "grad_norm_var": 7.780941876984665e-06, "learning_rate": 7.42768547791628e-05, "loss": 2.5707, "step": 12180 }, { "crossentropy": 2.487541675567627, "epoch": 0.6623888632099839, "grad_norm": 0.03181140497326851, "grad_norm_var": 7.498900501053451e-06, "learning_rate": 7.406317663294415e-05, "loss": 2.4875, "step": 12181 }, { "crossentropy": 2.612210750579834, "epoch": 0.6624432420674842, "grad_norm": 0.031435828655958176, "grad_norm_var": 7.359951488345113e-06, "learning_rate": 7.38498039887353e-05, "loss": 2.6122, "step": 12182 }, { "crossentropy": 2.4492642879486084, "epoch": 0.6624976209249843, "grad_norm": 0.030970115214586258, "grad_norm_var": 7.450265841200476e-06, "learning_rate": 7.363673685976846e-05, "loss": 2.4493, "step": 12183 }, { "crossentropy": 2.5676172971725464, "epoch": 0.6625519997824846, "grad_norm": 0.0309364702552557, "grad_norm_var": 2.2318030236760192e-07, "learning_rate": 7.34239752592586e-05, "loss": 2.5676, "step": 12184 }, { "crossentropy": 2.477867603302002, "epoch": 0.6626063786399847, "grad_norm": 0.03203874081373215, "grad_norm_var": 2.1933897066544044e-07, "learning_rate": 7.32115192004007e-05, "loss": 2.4779, "step": 12185 }, { "crossentropy": 2.427003264427185, "epoch": 0.662660757497485, "grad_norm": 0.0309099480509758, "grad_norm_var": 2.380966369144027e-07, "learning_rate": 7.299936869637147e-05, "loss": 2.427, "step": 12186 }, { "crossentropy": 2.55128812789917, "epoch": 0.6627151363549851, "grad_norm": 0.031034128740429878, "grad_norm_var": 2.475718061426916e-07, "learning_rate": 7.27875237603276e-05, "loss": 2.5513, "step": 12187 }, { "crossentropy": 2.447197198867798, "epoch": 0.6627695152124854, "grad_norm": 0.03146008402109146, "grad_norm_var": 2.459483754157476e-07, "learning_rate": 7.257598440540802e-05, "loss": 2.4472, "step": 12188 }, { "crossentropy": 2.480378031730652, "epoch": 0.6628238940699855, "grad_norm": 0.03174244239926338, "grad_norm_var": 2.4149990069005587e-07, "learning_rate": 7.236475064473169e-05, "loss": 2.4804, "step": 12189 }, { "crossentropy": 2.464174509048462, "epoch": 0.6628782729274858, "grad_norm": 0.03092947229743004, "grad_norm_var": 2.4349001426179387e-07, "learning_rate": 7.215382249139979e-05, "loss": 2.4642, "step": 12190 }, { "crossentropy": 2.5918043851852417, "epoch": 0.662932651784986, "grad_norm": 0.032334715127944946, "grad_norm_var": 2.727592269066801e-07, "learning_rate": 7.194319995849353e-05, "loss": 2.5918, "step": 12191 }, { "crossentropy": 2.436894416809082, "epoch": 0.6629870306424862, "grad_norm": 0.03099505417048931, "grad_norm_var": 2.8684834144637506e-07, "learning_rate": 7.173288305907577e-05, "loss": 2.4369, "step": 12192 }, { "crossentropy": 2.566877603530884, "epoch": 0.6630414094999864, "grad_norm": 0.031294792890548706, "grad_norm_var": 2.741561992427536e-07, "learning_rate": 7.152287180619e-05, "loss": 2.5669, "step": 12193 }, { "crossentropy": 2.548299551010132, "epoch": 0.6630957883574866, "grad_norm": 0.03201945498585701, "grad_norm_var": 2.080149407059865e-07, "learning_rate": 7.131316621286078e-05, "loss": 2.5483, "step": 12194 }, { "crossentropy": 2.4671010971069336, "epoch": 0.6631501672149868, "grad_norm": 0.03171497583389282, "grad_norm_var": 2.1334246847770113e-07, "learning_rate": 7.110376629209381e-05, "loss": 2.4671, "step": 12195 }, { "crossentropy": 2.4887927770614624, "epoch": 0.663204546072487, "grad_norm": 0.030948776751756668, "grad_norm_var": 2.285205335596779e-07, "learning_rate": 7.08946720568765e-05, "loss": 2.4888, "step": 12196 }, { "crossentropy": 2.5184764862060547, "epoch": 0.6632589249299872, "grad_norm": 0.032135434448719025, "grad_norm_var": 2.523807017382976e-07, "learning_rate": 7.068588352017569e-05, "loss": 2.5185, "step": 12197 }, { "crossentropy": 2.55422580242157, "epoch": 0.6633133037874874, "grad_norm": 0.03236657381057739, "grad_norm_var": 3.0708844761458075e-07, "learning_rate": 7.047740069494102e-05, "loss": 2.5542, "step": 12198 }, { "crossentropy": 2.490317463874817, "epoch": 0.6633676826449876, "grad_norm": 0.03131701797246933, "grad_norm_var": 2.905886952778245e-07, "learning_rate": 7.026922359410215e-05, "loss": 2.4903, "step": 12199 }, { "crossentropy": 2.5458728075027466, "epoch": 0.6634220615024878, "grad_norm": 0.03281549736857414, "grad_norm_var": 3.672865869741929e-07, "learning_rate": 7.006135223057041e-05, "loss": 2.5459, "step": 12200 }, { "crossentropy": 2.5191861391067505, "epoch": 0.663476440359988, "grad_norm": 0.032634835690259933, "grad_norm_var": 4.220948031592268e-07, "learning_rate": 6.985378661723662e-05, "loss": 2.5192, "step": 12201 }, { "crossentropy": 2.5627527236938477, "epoch": 0.6635308192174882, "grad_norm": 0.031215041875839233, "grad_norm_var": 3.971639797542096e-07, "learning_rate": 6.964652676697547e-05, "loss": 2.5628, "step": 12202 }, { "crossentropy": 2.473870038986206, "epoch": 0.6635851980749884, "grad_norm": 0.03519562631845474, "grad_norm_var": 1.1184553150083195e-06, "learning_rate": 6.943957269263946e-05, "loss": 2.4739, "step": 12203 }, { "crossentropy": 2.4353121519088745, "epoch": 0.6636395769324887, "grad_norm": 0.030527004972100258, "grad_norm_var": 1.2331971691921055e-06, "learning_rate": 6.923292440706496e-05, "loss": 2.4353, "step": 12204 }, { "crossentropy": 2.5111664533615112, "epoch": 0.6636939557899888, "grad_norm": 0.03344307839870453, "grad_norm_var": 1.3812535680963906e-06, "learning_rate": 6.902658192306732e-05, "loss": 2.5112, "step": 12205 }, { "crossentropy": 2.4936275482177734, "epoch": 0.6637483346474891, "grad_norm": 0.03152775391936302, "grad_norm_var": 1.318789556610002e-06, "learning_rate": 6.88205452534435e-05, "loss": 2.4936, "step": 12206 }, { "crossentropy": 2.5301254987716675, "epoch": 0.6638027135049892, "grad_norm": 0.030934790149331093, "grad_norm_var": 1.3844650517484766e-06, "learning_rate": 6.861481441097328e-05, "loss": 2.5301, "step": 12207 }, { "crossentropy": 2.569833755493164, "epoch": 0.6638570923624895, "grad_norm": 0.03326452895998955, "grad_norm_var": 1.4195703864479266e-06, "learning_rate": 6.840938940841368e-05, "loss": 2.5698, "step": 12208 }, { "crossentropy": 2.4199057817459106, "epoch": 0.6639114712199896, "grad_norm": 0.030873984098434448, "grad_norm_var": 1.4749578149064169e-06, "learning_rate": 6.820427025850618e-05, "loss": 2.4199, "step": 12209 }, { "crossentropy": 2.474139928817749, "epoch": 0.6639658500774899, "grad_norm": 0.03068990260362625, "grad_norm_var": 1.5923433149704624e-06, "learning_rate": 6.799945697397225e-05, "loss": 2.4741, "step": 12210 }, { "crossentropy": 2.517812490463257, "epoch": 0.6640202289349901, "grad_norm": 0.03125152736902237, "grad_norm_var": 1.6218536791239488e-06, "learning_rate": 6.779494956751397e-05, "loss": 2.5178, "step": 12211 }, { "crossentropy": 2.5132588148117065, "epoch": 0.6640746077924903, "grad_norm": 0.029974311590194702, "grad_norm_var": 1.8108141301037898e-06, "learning_rate": 6.759074805181398e-05, "loss": 2.5133, "step": 12212 }, { "crossentropy": 2.5490658283233643, "epoch": 0.6641289866499905, "grad_norm": 0.02956419810652733, "grad_norm_var": 2.138308880406577e-06, "learning_rate": 6.738685243953768e-05, "loss": 2.5491, "step": 12213 }, { "crossentropy": 2.3344058990478516, "epoch": 0.6641833655074907, "grad_norm": 0.030691616237163544, "grad_norm_var": 2.170309978223363e-06, "learning_rate": 6.718326274333053e-05, "loss": 2.3344, "step": 12214 }, { "crossentropy": 2.570660948753357, "epoch": 0.664237744364991, "grad_norm": 0.03126521781086922, "grad_norm_var": 2.172570593082732e-06, "learning_rate": 6.697997897581854e-05, "loss": 2.5707, "step": 12215 }, { "crossentropy": 2.536836624145508, "epoch": 0.6642921232224911, "grad_norm": 0.032709673047065735, "grad_norm_var": 2.1563571076699723e-06, "learning_rate": 6.677700114960827e-05, "loss": 2.5368, "step": 12216 }, { "crossentropy": 2.408719062805176, "epoch": 0.6643465020799914, "grad_norm": 0.030861472710967064, "grad_norm_var": 2.110633048341354e-06, "learning_rate": 6.657432927728968e-05, "loss": 2.4087, "step": 12217 }, { "crossentropy": 2.587390422821045, "epoch": 0.6644008809374915, "grad_norm": 0.03359228000044823, "grad_norm_var": 2.3737185798656935e-06, "learning_rate": 6.637196337143214e-05, "loss": 2.5874, "step": 12218 }, { "crossentropy": 2.508865237236023, "epoch": 0.6644552597949918, "grad_norm": 0.031121892854571342, "grad_norm_var": 1.4839454685943697e-06, "learning_rate": 6.616990344458562e-05, "loss": 2.5089, "step": 12219 }, { "crossentropy": 2.5308021306991577, "epoch": 0.6645096386524919, "grad_norm": 0.031586404889822006, "grad_norm_var": 1.4317201082084668e-06, "learning_rate": 6.596814950928176e-05, "loss": 2.5308, "step": 12220 }, { "crossentropy": 2.5242650508880615, "epoch": 0.6645640175099922, "grad_norm": 0.03188605234026909, "grad_norm_var": 1.171451184960948e-06, "learning_rate": 6.576670157803333e-05, "loss": 2.5243, "step": 12221 }, { "crossentropy": 2.5665169954299927, "epoch": 0.6646183963674923, "grad_norm": 0.03737375885248184, "grad_norm_var": 3.4364610979273667e-06, "learning_rate": 6.556555966333366e-05, "loss": 2.5665, "step": 12222 }, { "crossentropy": 2.4639073610305786, "epoch": 0.6646727752249926, "grad_norm": 0.031375642865896225, "grad_norm_var": 3.402006349906746e-06, "learning_rate": 6.536472377765778e-05, "loss": 2.4639, "step": 12223 }, { "crossentropy": 2.459602952003479, "epoch": 0.6647271540824927, "grad_norm": 0.031294647604227066, "grad_norm_var": 3.248094777842427e-06, "learning_rate": 6.516419393346129e-05, "loss": 2.4596, "step": 12224 }, { "crossentropy": 2.5746445655822754, "epoch": 0.664781532939993, "grad_norm": 0.030016781762242317, "grad_norm_var": 3.3806600828346073e-06, "learning_rate": 6.49639701431809e-05, "loss": 2.5746, "step": 12225 }, { "crossentropy": 2.4885066747665405, "epoch": 0.6648359117974931, "grad_norm": 0.03148386627435684, "grad_norm_var": 3.3259942850290358e-06, "learning_rate": 6.476405241923389e-05, "loss": 2.4885, "step": 12226 }, { "crossentropy": 2.5430309772491455, "epoch": 0.6648902906549934, "grad_norm": 0.031623028218746185, "grad_norm_var": 3.3159679413508145e-06, "learning_rate": 6.456444077401924e-05, "loss": 2.543, "step": 12227 }, { "crossentropy": 2.411508083343506, "epoch": 0.6649446695124935, "grad_norm": 0.030657067894935608, "grad_norm_var": 3.1924391904662817e-06, "learning_rate": 6.436513521991649e-05, "loss": 2.4115, "step": 12228 }, { "crossentropy": 2.5568459033966064, "epoch": 0.6649990483699938, "grad_norm": 0.030910266563296318, "grad_norm_var": 2.9234395426134055e-06, "learning_rate": 6.416613576928687e-05, "loss": 2.5568, "step": 12229 }, { "crossentropy": 2.507359743118286, "epoch": 0.6650534272274939, "grad_norm": 0.030808845534920692, "grad_norm_var": 2.9073160297608208e-06, "learning_rate": 6.39674424344716e-05, "loss": 2.5074, "step": 12230 }, { "crossentropy": 2.4402012825012207, "epoch": 0.6651078060849942, "grad_norm": 0.031498655676841736, "grad_norm_var": 2.8945301899201223e-06, "learning_rate": 6.376905522779419e-05, "loss": 2.4402, "step": 12231 }, { "crossentropy": 2.5199220180511475, "epoch": 0.6651621849424944, "grad_norm": 0.02983732521533966, "grad_norm_var": 3.061800817186081e-06, "learning_rate": 6.357097416155755e-05, "loss": 2.5199, "step": 12232 }, { "crossentropy": 2.4774945974349976, "epoch": 0.6652165637999946, "grad_norm": 0.0307188481092453, "grad_norm_var": 3.0775062941231755e-06, "learning_rate": 6.337319924804686e-05, "loss": 2.4775, "step": 12233 }, { "crossentropy": 2.5050140619277954, "epoch": 0.6652709426574948, "grad_norm": 0.03287485986948013, "grad_norm_var": 2.920209154188188e-06, "learning_rate": 6.317573049952786e-05, "loss": 2.505, "step": 12234 }, { "crossentropy": 2.5467180013656616, "epoch": 0.665325321514995, "grad_norm": 0.03281962499022484, "grad_norm_var": 2.999653569746282e-06, "learning_rate": 6.29785679282474e-05, "loss": 2.5467, "step": 12235 }, { "crossentropy": 2.5074622631073, "epoch": 0.6653797003724952, "grad_norm": 0.03169542923569679, "grad_norm_var": 2.999139778133208e-06, "learning_rate": 6.278171154643353e-05, "loss": 2.5075, "step": 12236 }, { "crossentropy": 2.48027241230011, "epoch": 0.6654340792299954, "grad_norm": 0.03022869862616062, "grad_norm_var": 3.1252093774302692e-06, "learning_rate": 6.258516136629477e-05, "loss": 2.4803, "step": 12237 }, { "crossentropy": 2.548077702522278, "epoch": 0.6654884580874956, "grad_norm": 0.03212622180581093, "grad_norm_var": 7.897813610972341e-07, "learning_rate": 6.238891740002194e-05, "loss": 2.5481, "step": 12238 }, { "crossentropy": 2.5049967765808105, "epoch": 0.6655428369449958, "grad_norm": 0.031470704823732376, "grad_norm_var": 7.919625896373831e-07, "learning_rate": 6.21929796597842e-05, "loss": 2.505, "step": 12239 }, { "crossentropy": 2.4687607288360596, "epoch": 0.665597215802496, "grad_norm": 0.030664071440696716, "grad_norm_var": 8.134012951511889e-07, "learning_rate": 6.199734815773461e-05, "loss": 2.4688, "step": 12240 }, { "crossentropy": 2.4671878814697266, "epoch": 0.6656515946599962, "grad_norm": 0.030605150386691093, "grad_norm_var": 7.410661622797577e-07, "learning_rate": 6.180202290600622e-05, "loss": 2.4672, "step": 12241 }, { "crossentropy": 2.5362095832824707, "epoch": 0.6657059735174964, "grad_norm": 0.030485836789011955, "grad_norm_var": 7.723878656820757e-07, "learning_rate": 6.160700391671215e-05, "loss": 2.5362, "step": 12242 }, { "crossentropy": 2.614673614501953, "epoch": 0.6657603523749966, "grad_norm": 0.03294984996318817, "grad_norm_var": 9.591930827073606e-07, "learning_rate": 6.141229120194713e-05, "loss": 2.6147, "step": 12243 }, { "crossentropy": 2.432049512863159, "epoch": 0.6658147312324968, "grad_norm": 0.030264170840382576, "grad_norm_var": 1.0010533122814524e-06, "learning_rate": 6.12178847737882e-05, "loss": 2.432, "step": 12244 }, { "crossentropy": 2.491620898246765, "epoch": 0.665869110089997, "grad_norm": 0.03146076202392578, "grad_norm_var": 9.95247516937187e-07, "learning_rate": 6.102378464429126e-05, "loss": 2.4916, "step": 12245 }, { "crossentropy": 2.5013128519058228, "epoch": 0.6659234889474972, "grad_norm": 0.03143361955881119, "grad_norm_var": 9.802439769733088e-07, "learning_rate": 6.0829990825495005e-05, "loss": 2.5013, "step": 12246 }, { "crossentropy": 2.416205048561096, "epoch": 0.6659778678049975, "grad_norm": 0.03130638599395752, "grad_norm_var": 9.77996602578394e-07, "learning_rate": 6.0636503329417614e-05, "loss": 2.4162, "step": 12247 }, { "crossentropy": 2.4526145458221436, "epoch": 0.6660322466624976, "grad_norm": 0.032966699451208115, "grad_norm_var": 9.760655320304248e-07, "learning_rate": 6.044332216805948e-05, "loss": 2.4526, "step": 12248 }, { "crossentropy": 2.4696028232574463, "epoch": 0.6660866255199979, "grad_norm": 0.03150569275021553, "grad_norm_var": 9.323430056782749e-07, "learning_rate": 6.025044735340157e-05, "loss": 2.4696, "step": 12249 }, { "crossentropy": 2.5047227144241333, "epoch": 0.666141004377498, "grad_norm": 0.031761012971401215, "grad_norm_var": 8.136614340477818e-07, "learning_rate": 6.0057878897404885e-05, "loss": 2.5047, "step": 12250 }, { "crossentropy": 2.42490553855896, "epoch": 0.6661953832349983, "grad_norm": 0.03142745420336723, "grad_norm_var": 6.868719473612783e-07, "learning_rate": 5.986561681201375e-05, "loss": 2.4249, "step": 12251 }, { "crossentropy": 2.459627628326416, "epoch": 0.6662497620924984, "grad_norm": 0.03082822822034359, "grad_norm_var": 6.993661511083311e-07, "learning_rate": 5.96736611091514e-05, "loss": 2.4596, "step": 12252 }, { "crossentropy": 2.4370557069778442, "epoch": 0.6663041409499987, "grad_norm": 0.03210786357522011, "grad_norm_var": 6.409296848928007e-07, "learning_rate": 5.9482011800722214e-05, "loss": 2.4371, "step": 12253 }, { "crossentropy": 2.425788164138794, "epoch": 0.6663585198074988, "grad_norm": 0.030980611220002174, "grad_norm_var": 6.212276200896154e-07, "learning_rate": 5.9290668898612785e-05, "loss": 2.4258, "step": 12254 }, { "crossentropy": 2.4243624210357666, "epoch": 0.6664128986649991, "grad_norm": 0.03048107959330082, "grad_norm_var": 6.716080077640482e-07, "learning_rate": 5.9099632414690295e-05, "loss": 2.4244, "step": 12255 }, { "crossentropy": 2.6050597429275513, "epoch": 0.6664672775224992, "grad_norm": 0.03141028806567192, "grad_norm_var": 6.404738525278911e-07, "learning_rate": 5.890890236080193e-05, "loss": 2.6051, "step": 12256 }, { "crossentropy": 2.5399487018585205, "epoch": 0.6665216563799995, "grad_norm": 0.0326915979385376, "grad_norm_var": 6.98825830601823e-07, "learning_rate": 5.871847874877656e-05, "loss": 2.5399, "step": 12257 }, { "crossentropy": 2.5337876081466675, "epoch": 0.6665760352374996, "grad_norm": 0.030246997252106667, "grad_norm_var": 7.348091186978733e-07, "learning_rate": 5.8528361590424736e-05, "loss": 2.5338, "step": 12258 }, { "crossentropy": 2.5596554279327393, "epoch": 0.6666304140949999, "grad_norm": 0.03216675668954849, "grad_norm_var": 6.205944019735948e-07, "learning_rate": 5.833855089753648e-05, "loss": 2.5597, "step": 12259 }, { "crossentropy": 2.5672695636749268, "epoch": 0.6666847929525, "grad_norm": 0.03160477802157402, "grad_norm_var": 5.227531692255626e-07, "learning_rate": 5.8149046681885146e-05, "loss": 2.5673, "step": 12260 }, { "crossentropy": 2.4846203327178955, "epoch": 0.6667391718100003, "grad_norm": 0.030965298414230347, "grad_norm_var": 5.422563188995158e-07, "learning_rate": 5.795984895522188e-05, "loss": 2.4846, "step": 12261 }, { "crossentropy": 2.4972003698349, "epoch": 0.6667935506675005, "grad_norm": 0.030727488920092583, "grad_norm_var": 5.78989415782891e-07, "learning_rate": 5.77709577292812e-05, "loss": 2.4972, "step": 12262 }, { "crossentropy": 2.4752371311187744, "epoch": 0.6668479295250007, "grad_norm": 0.0314023531973362, "grad_norm_var": 5.777447984469612e-07, "learning_rate": 5.758237301577873e-05, "loss": 2.4752, "step": 12263 }, { "crossentropy": 2.5261898040771484, "epoch": 0.6669023083825009, "grad_norm": 0.030667975544929504, "grad_norm_var": 4.4456130521701384e-07, "learning_rate": 5.739409482640956e-05, "loss": 2.5262, "step": 12264 }, { "crossentropy": 2.367875576019287, "epoch": 0.6669566872400011, "grad_norm": 0.032922856509685516, "grad_norm_var": 6.068777580579919e-07, "learning_rate": 5.72061231728499e-05, "loss": 2.3679, "step": 12265 }, { "crossentropy": 2.563101291656494, "epoch": 0.6670110660975013, "grad_norm": 0.030698629096150398, "grad_norm_var": 6.262159055400729e-07, "learning_rate": 5.7018458066759317e-05, "loss": 2.5631, "step": 12266 }, { "crossentropy": 2.5167049169540405, "epoch": 0.6670654449550015, "grad_norm": 0.030670806765556335, "grad_norm_var": 6.524832050347832e-07, "learning_rate": 5.6831099519775164e-05, "loss": 2.5167, "step": 12267 }, { "crossentropy": 2.385651111602783, "epoch": 0.6671198238125017, "grad_norm": 0.0303034745156765, "grad_norm_var": 7.017121442191015e-07, "learning_rate": 5.664404754351759e-05, "loss": 2.3857, "step": 12268 }, { "crossentropy": 2.481018304824829, "epoch": 0.6671742026700019, "grad_norm": 0.03151283413171768, "grad_norm_var": 6.560226048122567e-07, "learning_rate": 5.645730214958733e-05, "loss": 2.481, "step": 12269 }, { "crossentropy": 2.5506608486175537, "epoch": 0.6672285815275021, "grad_norm": 0.030490778386592865, "grad_norm_var": 6.863832300358156e-07, "learning_rate": 5.627086334956677e-05, "loss": 2.5507, "step": 12270 }, { "crossentropy": 2.5415422916412354, "epoch": 0.6672829603850023, "grad_norm": 0.03256787359714508, "grad_norm_var": 7.626248659432181e-07, "learning_rate": 5.608473115501833e-05, "loss": 2.5415, "step": 12271 }, { "crossentropy": 2.653144598007202, "epoch": 0.6673373392425025, "grad_norm": 0.032434143126010895, "grad_norm_var": 8.410584343083398e-07, "learning_rate": 5.5898905577485e-05, "loss": 2.6531, "step": 12272 }, { "crossentropy": 2.514439344406128, "epoch": 0.6673917181000028, "grad_norm": 0.03215721622109413, "grad_norm_var": 7.654297855213029e-07, "learning_rate": 5.571338662849257e-05, "loss": 2.5144, "step": 12273 }, { "crossentropy": 2.478214979171753, "epoch": 0.6674460969575029, "grad_norm": 0.03347141295671463, "grad_norm_var": 9.426332854283366e-07, "learning_rate": 5.552817431954682e-05, "loss": 2.4782, "step": 12274 }, { "crossentropy": 2.4019081592559814, "epoch": 0.6675004758150032, "grad_norm": 0.03056526556611061, "grad_norm_var": 9.707628429153046e-07, "learning_rate": 5.534326866213357e-05, "loss": 2.4019, "step": 12275 }, { "crossentropy": 2.3950936794281006, "epoch": 0.6675548546725033, "grad_norm": 0.03246543928980827, "grad_norm_var": 1.0350845269415631e-06, "learning_rate": 5.515866966772032e-05, "loss": 2.3951, "step": 12276 }, { "crossentropy": 2.3879584074020386, "epoch": 0.6676092335300036, "grad_norm": 0.03075874224305153, "grad_norm_var": 1.0525182858845775e-06, "learning_rate": 5.497437734775734e-05, "loss": 2.388, "step": 12277 }, { "crossentropy": 2.4216896295547485, "epoch": 0.6676636123875037, "grad_norm": 0.030150622129440308, "grad_norm_var": 1.1318565459883379e-06, "learning_rate": 5.4790391713672724e-05, "loss": 2.4217, "step": 12278 }, { "crossentropy": 2.596803665161133, "epoch": 0.667717991245004, "grad_norm": 0.031008966267108917, "grad_norm_var": 1.1441602929580882e-06, "learning_rate": 5.460671277687734e-05, "loss": 2.5968, "step": 12279 }, { "crossentropy": 2.448435068130493, "epoch": 0.6677723701025041, "grad_norm": 0.03134117275476456, "grad_norm_var": 1.1042708418969634e-06, "learning_rate": 5.442334054876319e-05, "loss": 2.4484, "step": 12280 }, { "crossentropy": 2.438589096069336, "epoch": 0.6678267489600044, "grad_norm": 0.03074287623167038, "grad_norm_var": 9.790015722007147e-07, "learning_rate": 5.4240275040702835e-05, "loss": 2.4386, "step": 12281 }, { "crossentropy": 2.5220839977264404, "epoch": 0.6678811278175045, "grad_norm": 0.03191963955760002, "grad_norm_var": 9.687795852029077e-07, "learning_rate": 5.405751626404998e-05, "loss": 2.5221, "step": 12282 }, { "crossentropy": 2.476736068725586, "epoch": 0.6679355066750048, "grad_norm": 0.03173670172691345, "grad_norm_var": 9.347229685002327e-07, "learning_rate": 5.387506423013833e-05, "loss": 2.4767, "step": 12283 }, { "crossentropy": 2.6062084436416626, "epoch": 0.6679898855325049, "grad_norm": 0.031048517674207687, "grad_norm_var": 8.528691572637822e-07, "learning_rate": 5.369291895028383e-05, "loss": 2.6062, "step": 12284 }, { "crossentropy": 2.5039254426956177, "epoch": 0.6680442643900052, "grad_norm": 0.031003709882497787, "grad_norm_var": 8.697775453327927e-07, "learning_rate": 5.351108043578412e-05, "loss": 2.5039, "step": 12285 }, { "crossentropy": 2.4182302951812744, "epoch": 0.6680986432475053, "grad_norm": 0.03154842182993889, "grad_norm_var": 7.985778912478203e-07, "learning_rate": 5.332954869791462e-05, "loss": 2.4182, "step": 12286 }, { "crossentropy": 2.5153279304504395, "epoch": 0.6681530221050056, "grad_norm": 0.031791459769010544, "grad_norm_var": 7.316629709291343e-07, "learning_rate": 5.3148323747934657e-05, "loss": 2.5153, "step": 12287 }, { "crossentropy": 2.384541153907776, "epoch": 0.6682074009625057, "grad_norm": 0.03085050731897354, "grad_norm_var": 6.930656193309658e-07, "learning_rate": 5.296740559708413e-05, "loss": 2.3845, "step": 12288 }, { "crossentropy": 2.4865533113479614, "epoch": 0.668261779820006, "grad_norm": 0.03191174194216728, "grad_norm_var": 6.723767790724085e-07, "learning_rate": 5.278679425658295e-05, "loss": 2.4866, "step": 12289 }, { "crossentropy": 2.5273159742355347, "epoch": 0.6683161586775062, "grad_norm": 0.03216268867254257, "grad_norm_var": 4.1704491076880596e-07, "learning_rate": 5.2606489737632715e-05, "loss": 2.5273, "step": 12290 }, { "crossentropy": 2.5211181640625, "epoch": 0.6683705375350064, "grad_norm": 0.03113977238535881, "grad_norm_var": 3.8040371489276443e-07, "learning_rate": 5.242649205141503e-05, "loss": 2.5211, "step": 12291 }, { "crossentropy": 2.431359648704529, "epoch": 0.6684249163925066, "grad_norm": 0.030926024541258812, "grad_norm_var": 2.993222481441436e-07, "learning_rate": 5.2246801209093754e-05, "loss": 2.4314, "step": 12292 }, { "crossentropy": 2.6258517503738403, "epoch": 0.6684792952500068, "grad_norm": 0.032276302576065063, "grad_norm_var": 3.4333166802021953e-07, "learning_rate": 5.206741722181385e-05, "loss": 2.6259, "step": 12293 }, { "crossentropy": 2.6181458234786987, "epoch": 0.668533674107507, "grad_norm": 0.03044155426323414, "grad_norm_var": 3.0219585188945064e-07, "learning_rate": 5.188834010069865e-05, "loss": 2.6181, "step": 12294 }, { "crossentropy": 2.5713804960250854, "epoch": 0.6685880529650072, "grad_norm": 0.03096720762550831, "grad_norm_var": 3.0429066959768103e-07, "learning_rate": 5.170956985685593e-05, "loss": 2.5714, "step": 12295 }, { "crossentropy": 2.2923104763031006, "epoch": 0.6686424318225074, "grad_norm": 0.031446341425180435, "grad_norm_var": 3.046756135326265e-07, "learning_rate": 5.153110650137294e-05, "loss": 2.2923, "step": 12296 }, { "crossentropy": 2.4826242923736572, "epoch": 0.6686968106800076, "grad_norm": 0.03037174418568611, "grad_norm_var": 3.442968606433833e-07, "learning_rate": 5.1352950045316924e-05, "loss": 2.4826, "step": 12297 }, { "crossentropy": 2.50512433052063, "epoch": 0.6687511895375078, "grad_norm": 0.030730009078979492, "grad_norm_var": 3.41821745648479e-07, "learning_rate": 5.117510049973684e-05, "loss": 2.5051, "step": 12298 }, { "crossentropy": 2.427735924720764, "epoch": 0.668805568395008, "grad_norm": 0.03065590374171734, "grad_norm_var": 3.4786937019188705e-07, "learning_rate": 5.099755787566385e-05, "loss": 2.4277, "step": 12299 }, { "crossentropy": 2.368615508079529, "epoch": 0.6688599472525082, "grad_norm": 0.031592223793268204, "grad_norm_var": 3.550380074555474e-07, "learning_rate": 5.082032218410804e-05, "loss": 2.3686, "step": 12300 }, { "crossentropy": 2.4562333822250366, "epoch": 0.6689143261100085, "grad_norm": 0.031903866678476334, "grad_norm_var": 3.775038286568203e-07, "learning_rate": 5.064339343606173e-05, "loss": 2.4562, "step": 12301 }, { "crossentropy": 2.5230813026428223, "epoch": 0.6689687049675086, "grad_norm": 0.031973715871572495, "grad_norm_var": 4.0319401565490815e-07, "learning_rate": 5.04667716424978e-05, "loss": 2.5231, "step": 12302 }, { "crossentropy": 2.503637194633484, "epoch": 0.6690230838250089, "grad_norm": 0.03238016739487648, "grad_norm_var": 4.6175864845676706e-07, "learning_rate": 5.029045681437083e-05, "loss": 2.5036, "step": 12303 }, { "crossentropy": 2.549073815345764, "epoch": 0.669077462682509, "grad_norm": 0.030743563547730446, "grad_norm_var": 4.697114620058936e-07, "learning_rate": 5.01144489626143e-05, "loss": 2.5491, "step": 12304 }, { "crossentropy": 2.6025593280792236, "epoch": 0.6691318415400093, "grad_norm": 0.032246463000774384, "grad_norm_var": 5.017204220391132e-07, "learning_rate": 4.993874809814558e-05, "loss": 2.6026, "step": 12305 }, { "crossentropy": 2.5525048971176147, "epoch": 0.6691862203975094, "grad_norm": 0.03099719062447548, "grad_norm_var": 4.638006068189883e-07, "learning_rate": 4.97633542318604e-05, "loss": 2.5525, "step": 12306 }, { "crossentropy": 2.4965380430221558, "epoch": 0.6692405992550097, "grad_norm": 0.032020553946495056, "grad_norm_var": 4.935282259696845e-07, "learning_rate": 4.958826737463728e-05, "loss": 2.4965, "step": 12307 }, { "crossentropy": 2.383024215698242, "epoch": 0.6692949781125098, "grad_norm": 0.03038123808801174, "grad_norm_var": 5.432052053701142e-07, "learning_rate": 4.941348753733421e-05, "loss": 2.383, "step": 12308 }, { "crossentropy": 2.574028491973877, "epoch": 0.6693493569700101, "grad_norm": 0.03084649331867695, "grad_norm_var": 4.887625257479767e-07, "learning_rate": 4.923901473079084e-05, "loss": 2.574, "step": 12309 }, { "crossentropy": 2.538468599319458, "epoch": 0.6694037358275102, "grad_norm": 0.03125014156103134, "grad_norm_var": 4.444993544267985e-07, "learning_rate": 4.906484896582908e-05, "loss": 2.5385, "step": 12310 }, { "crossentropy": 2.4775726795196533, "epoch": 0.6694581146850105, "grad_norm": 0.03593188896775246, "grad_norm_var": 1.776838127312321e-06, "learning_rate": 4.8890990253249166e-05, "loss": 2.4776, "step": 12311 }, { "crossentropy": 2.4033961296081543, "epoch": 0.6695124935425106, "grad_norm": 0.030846336856484413, "grad_norm_var": 1.810988770785379e-06, "learning_rate": 4.87174386038336e-05, "loss": 2.4034, "step": 12312 }, { "crossentropy": 2.415904402732849, "epoch": 0.6695668724000109, "grad_norm": 0.03060351498425007, "grad_norm_var": 1.7777966570597357e-06, "learning_rate": 4.854419402834709e-05, "loss": 2.4159, "step": 12313 }, { "crossentropy": 2.5679851770401, "epoch": 0.669621251257511, "grad_norm": 0.031505484133958817, "grad_norm_var": 1.7286375919416581e-06, "learning_rate": 4.837125653753327e-05, "loss": 2.568, "step": 12314 }, { "crossentropy": 2.54453182220459, "epoch": 0.6696756301150113, "grad_norm": 0.03232697769999504, "grad_norm_var": 1.6889324297000245e-06, "learning_rate": 4.819862614211745e-05, "loss": 2.5445, "step": 12315 }, { "crossentropy": 2.625622272491455, "epoch": 0.6697300089725114, "grad_norm": 0.031433042138814926, "grad_norm_var": 1.6932676123047046e-06, "learning_rate": 4.8026302852806625e-05, "loss": 2.6256, "step": 12316 }, { "crossentropy": 2.4602441787719727, "epoch": 0.6697843878300117, "grad_norm": 0.031099513173103333, "grad_norm_var": 1.713117872058573e-06, "learning_rate": 4.78542866802878e-05, "loss": 2.4602, "step": 12317 }, { "crossentropy": 2.545243263244629, "epoch": 0.6698387666875119, "grad_norm": 0.030819226056337357, "grad_norm_var": 1.7483827734232714e-06, "learning_rate": 4.76825776352291e-05, "loss": 2.5452, "step": 12318 }, { "crossentropy": 2.6495449542999268, "epoch": 0.6698931455450121, "grad_norm": 0.03124665841460228, "grad_norm_var": 1.709186359022382e-06, "learning_rate": 4.751117572828034e-05, "loss": 2.6495, "step": 12319 }, { "crossentropy": 2.4659738540649414, "epoch": 0.6699475244025123, "grad_norm": 0.030885742977261543, "grad_norm_var": 1.6957564170891887e-06, "learning_rate": 4.734008097007025e-05, "loss": 2.466, "step": 12320 }, { "crossentropy": 2.5394366979599, "epoch": 0.6700019032600125, "grad_norm": 0.03625661879777908, "grad_norm_var": 3.085245677072199e-06, "learning_rate": 4.716929337121201e-05, "loss": 2.5394, "step": 12321 }, { "crossentropy": 2.5142226219177246, "epoch": 0.6700562821175127, "grad_norm": 0.03031449019908905, "grad_norm_var": 3.1854651035603898e-06, "learning_rate": 4.699881294229602e-05, "loss": 2.5142, "step": 12322 }, { "crossentropy": 2.6196160316467285, "epoch": 0.6701106609750129, "grad_norm": 0.03205075487494469, "grad_norm_var": 3.1866699818810004e-06, "learning_rate": 4.682863969389606e-05, "loss": 2.6196, "step": 12323 }, { "crossentropy": 2.438969850540161, "epoch": 0.6701650398325131, "grad_norm": 0.030913077294826508, "grad_norm_var": 3.1081815116933953e-06, "learning_rate": 4.665877363656646e-05, "loss": 2.439, "step": 12324 }, { "crossentropy": 2.4962358474731445, "epoch": 0.6702194186900133, "grad_norm": 0.031425587832927704, "grad_norm_var": 3.0577864896467183e-06, "learning_rate": 4.6489214780841e-05, "loss": 2.4962, "step": 12325 }, { "crossentropy": 2.4187926054000854, "epoch": 0.6702737975475135, "grad_norm": 0.03094284050166607, "grad_norm_var": 3.0864974968458634e-06, "learning_rate": 4.631996313723686e-05, "loss": 2.4188, "step": 12326 }, { "crossentropy": 2.4916781187057495, "epoch": 0.6703281764050137, "grad_norm": 0.03249278664588928, "grad_norm_var": 1.9253648101651954e-06, "learning_rate": 4.6151018716250045e-05, "loss": 2.4917, "step": 12327 }, { "crossentropy": 2.514703154563904, "epoch": 0.6703825552625139, "grad_norm": 0.03229961171746254, "grad_norm_var": 1.9166245557314937e-06, "learning_rate": 4.5982381528358875e-05, "loss": 2.5147, "step": 12328 }, { "crossentropy": 2.5472062826156616, "epoch": 0.6704369341200141, "grad_norm": 0.0315304659307003, "grad_norm_var": 1.8393202796326835e-06, "learning_rate": 4.581405158402108e-05, "loss": 2.5472, "step": 12329 }, { "crossentropy": 2.424813747406006, "epoch": 0.6704913129775143, "grad_norm": 0.03148673474788666, "grad_norm_var": 1.8398820975868216e-06, "learning_rate": 4.5646028893678306e-05, "loss": 2.4248, "step": 12330 }, { "crossentropy": 2.559836983680725, "epoch": 0.6705456918350146, "grad_norm": 0.03334646299481392, "grad_norm_var": 1.9873137223397687e-06, "learning_rate": 4.5478313467748334e-05, "loss": 2.5598, "step": 12331 }, { "crossentropy": 2.5176732540130615, "epoch": 0.6706000706925147, "grad_norm": 0.03237629309296608, "grad_norm_var": 1.9987855567865065e-06, "learning_rate": 4.5310905316635065e-05, "loss": 2.5177, "step": 12332 }, { "crossentropy": 2.5194350481033325, "epoch": 0.670654449550015, "grad_norm": 0.031160464510321617, "grad_norm_var": 1.992976122576932e-06, "learning_rate": 4.514380445072075e-05, "loss": 2.5194, "step": 12333 }, { "crossentropy": 2.5233715772628784, "epoch": 0.6707088284075151, "grad_norm": 0.03275228664278984, "grad_norm_var": 1.961688807421017e-06, "learning_rate": 4.497701088036765e-05, "loss": 2.5234, "step": 12334 }, { "crossentropy": 2.573469400405884, "epoch": 0.6707632072650154, "grad_norm": 0.032536182552576065, "grad_norm_var": 1.941669896122567e-06, "learning_rate": 4.481052461592028e-05, "loss": 2.5735, "step": 12335 }, { "crossentropy": 2.5646969079971313, "epoch": 0.6708175861225155, "grad_norm": 0.030317872762680054, "grad_norm_var": 2.0498375320194644e-06, "learning_rate": 4.4644345667705364e-05, "loss": 2.5647, "step": 12336 }, { "crossentropy": 2.457290530204773, "epoch": 0.6708719649800158, "grad_norm": 0.031157799065113068, "grad_norm_var": 7.894848556321461e-07, "learning_rate": 4.447847404602745e-05, "loss": 2.4573, "step": 12337 }, { "crossentropy": 2.5528091192245483, "epoch": 0.6709263438375159, "grad_norm": 0.03294801339507103, "grad_norm_var": 7.385603251687407e-07, "learning_rate": 4.431290976117497e-05, "loss": 2.5528, "step": 12338 }, { "crossentropy": 2.4951752424240112, "epoch": 0.6709807226950162, "grad_norm": 0.030933108180761337, "grad_norm_var": 7.879929701331533e-07, "learning_rate": 4.414765282341582e-05, "loss": 2.4952, "step": 12339 }, { "crossentropy": 2.532106041908264, "epoch": 0.6710351015525163, "grad_norm": 0.03275931999087334, "grad_norm_var": 7.854769885772457e-07, "learning_rate": 4.398270324299847e-05, "loss": 2.5321, "step": 12340 }, { "crossentropy": 2.5622247457504272, "epoch": 0.6710894804100166, "grad_norm": 0.0318526029586792, "grad_norm_var": 7.696282802146607e-07, "learning_rate": 4.3818061030154176e-05, "loss": 2.5622, "step": 12341 }, { "crossentropy": 2.549006223678589, "epoch": 0.6711438592675167, "grad_norm": 0.030920829623937607, "grad_norm_var": 7.725580159456304e-07, "learning_rate": 4.3653726195092006e-05, "loss": 2.549, "step": 12342 }, { "crossentropy": 2.501095771789551, "epoch": 0.671198238125017, "grad_norm": 0.030613841488957405, "grad_norm_var": 8.520739909721742e-07, "learning_rate": 4.348969874800546e-05, "loss": 2.5011, "step": 12343 }, { "crossentropy": 2.53959321975708, "epoch": 0.6712526169825171, "grad_norm": 0.03169124573469162, "grad_norm_var": 8.356524037253857e-07, "learning_rate": 4.332597869906751e-05, "loss": 2.5396, "step": 12344 }, { "crossentropy": 2.5345253944396973, "epoch": 0.6713069958400174, "grad_norm": 0.032424721866846085, "grad_norm_var": 8.565992367203766e-07, "learning_rate": 4.316256605843061e-05, "loss": 2.5345, "step": 12345 }, { "crossentropy": 2.6129610538482666, "epoch": 0.6713613746975176, "grad_norm": 0.03142629563808441, "grad_norm_var": 8.595926432763389e-07, "learning_rate": 4.299946083622941e-05, "loss": 2.613, "step": 12346 }, { "crossentropy": 2.4702885150909424, "epoch": 0.6714157535550178, "grad_norm": 0.030450159683823586, "grad_norm_var": 7.96747867557003e-07, "learning_rate": 4.283666304258138e-05, "loss": 2.4703, "step": 12347 }, { "crossentropy": 2.4905049800872803, "epoch": 0.671470132412518, "grad_norm": 0.03110978566110134, "grad_norm_var": 7.735196298485602e-07, "learning_rate": 4.2674172687581227e-05, "loss": 2.4905, "step": 12348 }, { "crossentropy": 2.540353298187256, "epoch": 0.6715245112700182, "grad_norm": 0.030148420482873917, "grad_norm_var": 8.922444358802253e-07, "learning_rate": 4.2511989781307546e-05, "loss": 2.5404, "step": 12349 }, { "crossentropy": 2.4952536821365356, "epoch": 0.6715788901275184, "grad_norm": 0.031327616423368454, "grad_norm_var": 7.817247661290876e-07, "learning_rate": 4.235011433381841e-05, "loss": 2.4953, "step": 12350 }, { "crossentropy": 2.513311266899109, "epoch": 0.6716332689850186, "grad_norm": 0.030337166041135788, "grad_norm_var": 7.548146406591149e-07, "learning_rate": 4.218854635515301e-05, "loss": 2.5133, "step": 12351 }, { "crossentropy": 2.4880993366241455, "epoch": 0.6716876478425188, "grad_norm": 0.030690090730786324, "grad_norm_var": 7.159141407058733e-07, "learning_rate": 4.202728585533222e-05, "loss": 2.4881, "step": 12352 }, { "crossentropy": 2.530319929122925, "epoch": 0.671742026700019, "grad_norm": 0.03159063309431076, "grad_norm_var": 7.19449035967099e-07, "learning_rate": 4.186633284435581e-05, "loss": 2.5303, "step": 12353 }, { "crossentropy": 2.4668549299240112, "epoch": 0.6717964055575192, "grad_norm": 0.030784985050559044, "grad_norm_var": 5.442139850472671e-07, "learning_rate": 4.170568733220803e-05, "loss": 2.4669, "step": 12354 }, { "crossentropy": 2.5445823669433594, "epoch": 0.6718507844150194, "grad_norm": 0.030706092715263367, "grad_norm_var": 5.55250167384016e-07, "learning_rate": 4.1545349328850903e-05, "loss": 2.5446, "step": 12355 }, { "crossentropy": 2.43026340007782, "epoch": 0.6719051632725196, "grad_norm": 0.03045518323779106, "grad_norm_var": 4.009827006432361e-07, "learning_rate": 4.138531884422758e-05, "loss": 2.4303, "step": 12356 }, { "crossentropy": 2.5929282903671265, "epoch": 0.6719595421300198, "grad_norm": 0.03350517898797989, "grad_norm_var": 7.522418275376542e-07, "learning_rate": 4.1225595888264024e-05, "loss": 2.5929, "step": 12357 }, { "crossentropy": 2.534767985343933, "epoch": 0.67201392098752, "grad_norm": 0.03184470906853676, "grad_norm_var": 7.790352811481849e-07, "learning_rate": 4.106618047086674e-05, "loss": 2.5348, "step": 12358 }, { "crossentropy": 2.535860776901245, "epoch": 0.6720682998450203, "grad_norm": 0.03354676440358162, "grad_norm_var": 1.0897359311173282e-06, "learning_rate": 4.090707260192117e-05, "loss": 2.5359, "step": 12359 }, { "crossentropy": 2.5713521242141724, "epoch": 0.6721226787025205, "grad_norm": 0.03207497298717499, "grad_norm_var": 1.1149942573312453e-06, "learning_rate": 4.0748272291296075e-05, "loss": 2.5714, "step": 12360 }, { "crossentropy": 2.4502060413360596, "epoch": 0.6721770575600207, "grad_norm": 0.03069324605166912, "grad_norm_var": 1.0661276063594067e-06, "learning_rate": 4.0589779548839137e-05, "loss": 2.4502, "step": 12361 }, { "crossentropy": 2.4593639373779297, "epoch": 0.6722314364175209, "grad_norm": 0.0313110388815403, "grad_norm_var": 1.0649126039426742e-06, "learning_rate": 4.043159438438026e-05, "loss": 2.4594, "step": 12362 }, { "crossentropy": 2.3883628845214844, "epoch": 0.6722858152750211, "grad_norm": 0.030273541808128357, "grad_norm_var": 1.0865455305699343e-06, "learning_rate": 4.027371680773106e-05, "loss": 2.3884, "step": 12363 }, { "crossentropy": 2.6047303676605225, "epoch": 0.6723401941325213, "grad_norm": 0.030889810994267464, "grad_norm_var": 1.0944145102705893e-06, "learning_rate": 4.011614682868092e-05, "loss": 2.6047, "step": 12364 }, { "crossentropy": 2.5454108715057373, "epoch": 0.6723945729900215, "grad_norm": 0.030565544962882996, "grad_norm_var": 1.0433991795906483e-06, "learning_rate": 3.995888445700369e-05, "loss": 2.5454, "step": 12365 }, { "crossentropy": 2.4713908433914185, "epoch": 0.6724489518475217, "grad_norm": 0.037174057215452194, "grad_norm_var": 3.2111421688618227e-06, "learning_rate": 3.980192970245211e-05, "loss": 2.4714, "step": 12366 }, { "crossentropy": 2.5487807989120483, "epoch": 0.6725033307050219, "grad_norm": 0.03080471232533455, "grad_norm_var": 3.142795620130386e-06, "learning_rate": 3.964528257476063e-05, "loss": 2.5488, "step": 12367 }, { "crossentropy": 2.444890022277832, "epoch": 0.6725577095625221, "grad_norm": 0.030706778168678284, "grad_norm_var": 3.140606234531398e-06, "learning_rate": 3.948894308364426e-05, "loss": 2.4449, "step": 12368 }, { "crossentropy": 2.509722352027893, "epoch": 0.6726120884200223, "grad_norm": 0.032103393226861954, "grad_norm_var": 3.1507271818376753e-06, "learning_rate": 3.9332911238798565e-05, "loss": 2.5097, "step": 12369 }, { "crossentropy": 2.5203773975372314, "epoch": 0.6726664672775226, "grad_norm": 0.03015722893178463, "grad_norm_var": 3.2532001036486194e-06, "learning_rate": 3.917718704990081e-05, "loss": 2.5204, "step": 12370 }, { "crossentropy": 2.495420813560486, "epoch": 0.6727208461350227, "grad_norm": 0.03078414686024189, "grad_norm_var": 3.243489280514653e-06, "learning_rate": 3.902177052660938e-05, "loss": 2.4954, "step": 12371 }, { "crossentropy": 2.40668261051178, "epoch": 0.672775224992523, "grad_norm": 0.03025107830762863, "grad_norm_var": 3.2794426416874374e-06, "learning_rate": 3.886666167856267e-05, "loss": 2.4067, "step": 12372 }, { "crossentropy": 2.4999260902404785, "epoch": 0.6728296038500231, "grad_norm": 0.030712967738509178, "grad_norm_var": 3.0827063164034674e-06, "learning_rate": 3.871186051538023e-05, "loss": 2.4999, "step": 12373 }, { "crossentropy": 2.46524441242218, "epoch": 0.6728839827075234, "grad_norm": 0.031019868329167366, "grad_norm_var": 3.0865896142335977e-06, "learning_rate": 3.855736704666269e-05, "loss": 2.4652, "step": 12374 }, { "crossentropy": 2.4064871072769165, "epoch": 0.6729383615650235, "grad_norm": 0.0316777303814888, "grad_norm_var": 2.7803589170679694e-06, "learning_rate": 3.840318128199183e-05, "loss": 2.4065, "step": 12375 }, { "crossentropy": 2.542370319366455, "epoch": 0.6729927404225238, "grad_norm": 0.030327031388878822, "grad_norm_var": 2.7965289989890916e-06, "learning_rate": 3.8249303230929453e-05, "loss": 2.5424, "step": 12376 }, { "crossentropy": 2.5068827867507935, "epoch": 0.6730471192800239, "grad_norm": 0.03271503001451492, "grad_norm_var": 2.9111496766667222e-06, "learning_rate": 3.8095732903020154e-05, "loss": 2.5069, "step": 12377 }, { "crossentropy": 2.5774956941604614, "epoch": 0.6731014981375242, "grad_norm": 0.0332036092877388, "grad_norm_var": 3.127169880540644e-06, "learning_rate": 3.7942470307787415e-05, "loss": 2.5775, "step": 12378 }, { "crossentropy": 2.5192497968673706, "epoch": 0.6731558769950243, "grad_norm": 0.03203696757555008, "grad_norm_var": 3.042464197637364e-06, "learning_rate": 3.778951545473641e-05, "loss": 2.5192, "step": 12379 }, { "crossentropy": 2.5335575342178345, "epoch": 0.6732102558525246, "grad_norm": 0.03122684545814991, "grad_norm_var": 3.0189694652758836e-06, "learning_rate": 3.7636868353353446e-05, "loss": 2.5336, "step": 12380 }, { "crossentropy": 2.48418128490448, "epoch": 0.6732646347100247, "grad_norm": 0.031649112701416016, "grad_norm_var": 2.944099334480782e-06, "learning_rate": 3.74845290131054e-05, "loss": 2.4842, "step": 12381 }, { "crossentropy": 2.5744175910949707, "epoch": 0.673319013567525, "grad_norm": 0.029831688851118088, "grad_norm_var": 9.147546975084297e-07, "learning_rate": 3.733249744344025e-05, "loss": 2.5744, "step": 12382 }, { "crossentropy": 2.5509183406829834, "epoch": 0.6733733924250251, "grad_norm": 0.03105771169066429, "grad_norm_var": 9.054036362435292e-07, "learning_rate": 3.718077365378658e-05, "loss": 2.5509, "step": 12383 }, { "crossentropy": 2.328508973121643, "epoch": 0.6734277712825254, "grad_norm": 0.030636616051197052, "grad_norm_var": 9.104780845502202e-07, "learning_rate": 3.7029357653555197e-05, "loss": 2.3285, "step": 12384 }, { "crossentropy": 2.496167302131653, "epoch": 0.6734821501400255, "grad_norm": 0.03121192567050457, "grad_norm_var": 8.54187419402338e-07, "learning_rate": 3.68782494521358e-05, "loss": 2.4962, "step": 12385 }, { "crossentropy": 2.6528279781341553, "epoch": 0.6735365289975258, "grad_norm": 0.033294156193733215, "grad_norm_var": 1.0513709585768226e-06, "learning_rate": 3.672744905890035e-05, "loss": 2.6528, "step": 12386 }, { "crossentropy": 2.5163416862487793, "epoch": 0.673590907855026, "grad_norm": 0.035118766129016876, "grad_norm_var": 1.8973263726173532e-06, "learning_rate": 3.6576956483200806e-05, "loss": 2.5163, "step": 12387 }, { "crossentropy": 2.5172756910324097, "epoch": 0.6736452867125262, "grad_norm": 0.03105483204126358, "grad_norm_var": 1.790656866691243e-06, "learning_rate": 3.642677173437137e-05, "loss": 2.5173, "step": 12388 }, { "crossentropy": 2.5119564533233643, "epoch": 0.6736996655700264, "grad_norm": 0.03106720559298992, "grad_norm_var": 1.7531354270189457e-06, "learning_rate": 3.627689482172625e-05, "loss": 2.512, "step": 12389 }, { "crossentropy": 2.551500916481018, "epoch": 0.6737540444275266, "grad_norm": 0.03158599138259888, "grad_norm_var": 1.7221624517363883e-06, "learning_rate": 3.612732575456024e-05, "loss": 2.5515, "step": 12390 }, { "crossentropy": 2.5624701976776123, "epoch": 0.6738084232850268, "grad_norm": 0.0306762233376503, "grad_norm_var": 1.7919577875528021e-06, "learning_rate": 3.59780645421498e-05, "loss": 2.5625, "step": 12391 }, { "crossentropy": 2.421278715133667, "epoch": 0.673862802142527, "grad_norm": 0.031586870551109314, "grad_norm_var": 1.6658435049769626e-06, "learning_rate": 3.582911119375143e-05, "loss": 2.4213, "step": 12392 }, { "crossentropy": 2.5107470750808716, "epoch": 0.6739171810000272, "grad_norm": 0.030693553388118744, "grad_norm_var": 1.660354303032356e-06, "learning_rate": 3.5680465718603836e-05, "loss": 2.5107, "step": 12393 }, { "crossentropy": 2.57294762134552, "epoch": 0.6739715598575274, "grad_norm": 0.030677082017064095, "grad_norm_var": 1.5260963667299472e-06, "learning_rate": 3.553212812592577e-05, "loss": 2.5729, "step": 12394 }, { "crossentropy": 2.532137155532837, "epoch": 0.6740259387150276, "grad_norm": 0.0321025513112545, "grad_norm_var": 1.5313855914984738e-06, "learning_rate": 3.538409842491597e-05, "loss": 2.5321, "step": 12395 }, { "crossentropy": 2.5877431631088257, "epoch": 0.6740803175725278, "grad_norm": 0.03096088580787182, "grad_norm_var": 1.5443207660742013e-06, "learning_rate": 3.523637662475654e-05, "loss": 2.5877, "step": 12396 }, { "crossentropy": 2.5341111421585083, "epoch": 0.674134696430028, "grad_norm": 0.03146408125758171, "grad_norm_var": 1.54155625031917e-06, "learning_rate": 3.5088962734608485e-05, "loss": 2.5341, "step": 12397 }, { "crossentropy": 2.379533290863037, "epoch": 0.6741890752875282, "grad_norm": 0.03179315850138664, "grad_norm_var": 1.3617205844713871e-06, "learning_rate": 3.4941856763613386e-05, "loss": 2.3795, "step": 12398 }, { "crossentropy": 2.5250535011291504, "epoch": 0.6742434541450284, "grad_norm": 0.033901821821928024, "grad_norm_var": 1.6762934355748471e-06, "learning_rate": 3.479505872089617e-05, "loss": 2.5251, "step": 12399 }, { "crossentropy": 2.586768627166748, "epoch": 0.6742978330025287, "grad_norm": 0.03243587538599968, "grad_norm_var": 1.6141377335963011e-06, "learning_rate": 3.464856861556009e-05, "loss": 2.5868, "step": 12400 }, { "crossentropy": 2.50847852230072, "epoch": 0.6743522118600288, "grad_norm": 0.02992001175880432, "grad_norm_var": 1.8286333821410644e-06, "learning_rate": 3.450238645669068e-05, "loss": 2.5085, "step": 12401 }, { "crossentropy": 2.5419938564300537, "epoch": 0.6744065907175291, "grad_norm": 0.030938906595110893, "grad_norm_var": 1.6969541124750112e-06, "learning_rate": 3.4356512253354014e-05, "loss": 2.542, "step": 12402 }, { "crossentropy": 2.5724096298217773, "epoch": 0.6744609695750292, "grad_norm": 0.030112236738204956, "grad_norm_var": 9.303931013227019e-07, "learning_rate": 3.4210946014596736e-05, "loss": 2.5724, "step": 12403 }, { "crossentropy": 2.549988031387329, "epoch": 0.6745153484325295, "grad_norm": 0.03065953589975834, "grad_norm_var": 9.53645395067532e-07, "learning_rate": 3.4065687749446626e-05, "loss": 2.55, "step": 12404 }, { "crossentropy": 2.3469024896621704, "epoch": 0.6745697272900296, "grad_norm": 0.030788421630859375, "grad_norm_var": 9.666357556193174e-07, "learning_rate": 3.392073746691371e-05, "loss": 2.3469, "step": 12405 }, { "crossentropy": 2.5207279920578003, "epoch": 0.6746241061475299, "grad_norm": 0.031092209741473198, "grad_norm_var": 9.60976637685605e-07, "learning_rate": 3.3776095175986344e-05, "loss": 2.5207, "step": 12406 }, { "crossentropy": 2.5029473304748535, "epoch": 0.67467848500503, "grad_norm": 0.030674582347273827, "grad_norm_var": 9.610996594489977e-07, "learning_rate": 3.3631760885635135e-05, "loss": 2.5029, "step": 12407 }, { "crossentropy": 2.55386483669281, "epoch": 0.6747328638625303, "grad_norm": 0.031116019934415817, "grad_norm_var": 9.53029419412515e-07, "learning_rate": 3.348773460481291e-05, "loss": 2.5539, "step": 12408 }, { "crossentropy": 2.522372007369995, "epoch": 0.6747872427200304, "grad_norm": 0.03249149024486542, "grad_norm_var": 1.0316958466945077e-06, "learning_rate": 3.334401634245032e-05, "loss": 2.5224, "step": 12409 }, { "crossentropy": 2.512652039527893, "epoch": 0.6748416215775307, "grad_norm": 0.030916789546608925, "grad_norm_var": 1.014721053374928e-06, "learning_rate": 3.320060610746134e-05, "loss": 2.5127, "step": 12410 }, { "crossentropy": 2.4405187368392944, "epoch": 0.6748960004350308, "grad_norm": 0.03163360059261322, "grad_norm_var": 9.805067525787506e-07, "learning_rate": 3.3057503908741074e-05, "loss": 2.4405, "step": 12411 }, { "crossentropy": 2.5310139656066895, "epoch": 0.6749503792925311, "grad_norm": 0.02994273416697979, "grad_norm_var": 1.092177558248882e-06, "learning_rate": 3.291470975516353e-05, "loss": 2.531, "step": 12412 }, { "crossentropy": 2.4949535131454468, "epoch": 0.6750047581500312, "grad_norm": 0.031449683010578156, "grad_norm_var": 1.091765307988044e-06, "learning_rate": 3.277222365558386e-05, "loss": 2.495, "step": 12413 }, { "crossentropy": 2.5676889419555664, "epoch": 0.6750591370075315, "grad_norm": 0.030661309137940407, "grad_norm_var": 1.0886094490013353e-06, "learning_rate": 3.2630045618841086e-05, "loss": 2.5677, "step": 12414 }, { "crossentropy": 2.412574887275696, "epoch": 0.6751135158650317, "grad_norm": 0.030724186450242996, "grad_norm_var": 5.626669618961303e-07, "learning_rate": 3.2488175653751504e-05, "loss": 2.4126, "step": 12415 }, { "crossentropy": 2.5827313661575317, "epoch": 0.6751678947225319, "grad_norm": 0.03166412189602852, "grad_norm_var": 4.492946903852096e-07, "learning_rate": 3.234661376911363e-05, "loss": 2.5827, "step": 12416 }, { "crossentropy": 2.416644334793091, "epoch": 0.6752222735800321, "grad_norm": 0.032159872353076935, "grad_norm_var": 4.62982159411572e-07, "learning_rate": 3.220535997370821e-05, "loss": 2.4166, "step": 12417 }, { "crossentropy": 2.501905679702759, "epoch": 0.6752766524375323, "grad_norm": 0.030353432521224022, "grad_norm_var": 4.941793909396617e-07, "learning_rate": 3.2064414276294364e-05, "loss": 2.5019, "step": 12418 }, { "crossentropy": 2.46274995803833, "epoch": 0.6753310312950325, "grad_norm": 0.030570901930332184, "grad_norm_var": 4.5135363290458155e-07, "learning_rate": 3.192377668561397e-05, "loss": 2.4627, "step": 12419 }, { "crossentropy": 2.577233910560608, "epoch": 0.6753854101525327, "grad_norm": 0.03263550624251366, "grad_norm_var": 5.908811893384191e-07, "learning_rate": 3.178344721038895e-05, "loss": 2.5772, "step": 12420 }, { "crossentropy": 2.625405430793762, "epoch": 0.6754397890100329, "grad_norm": 0.0315123125910759, "grad_norm_var": 5.858686458868126e-07, "learning_rate": 3.164342585932289e-05, "loss": 2.6254, "step": 12421 }, { "crossentropy": 2.600546717643738, "epoch": 0.6754941678675331, "grad_norm": 0.03030395694077015, "grad_norm_var": 6.38650663325561e-07, "learning_rate": 3.150371264109997e-05, "loss": 2.6005, "step": 12422 }, { "crossentropy": 2.5652421712875366, "epoch": 0.6755485467250333, "grad_norm": 0.031201234087347984, "grad_norm_var": 6.208002664898997e-07, "learning_rate": 3.1364307564384354e-05, "loss": 2.5652, "step": 12423 }, { "crossentropy": 2.524590492248535, "epoch": 0.6756029255825335, "grad_norm": 0.03144829347729683, "grad_norm_var": 6.2360027714356e-07, "learning_rate": 3.122521063782191e-05, "loss": 2.5246, "step": 12424 }, { "crossentropy": 2.4637818336486816, "epoch": 0.6756573044400337, "grad_norm": 0.03213796764612198, "grad_norm_var": 5.719182214744545e-07, "learning_rate": 3.108642187004018e-05, "loss": 2.4638, "step": 12425 }, { "crossentropy": 2.5901843309402466, "epoch": 0.675711683297534, "grad_norm": 0.03202952444553375, "grad_norm_var": 6.062112450025963e-07, "learning_rate": 3.094794126964562e-05, "loss": 2.5902, "step": 12426 }, { "crossentropy": 2.539790391921997, "epoch": 0.6757660621550341, "grad_norm": 0.030867479741573334, "grad_norm_var": 6.064470467259982e-07, "learning_rate": 3.080976884522691e-05, "loss": 2.5398, "step": 12427 }, { "crossentropy": 2.4119452238082886, "epoch": 0.6758204410125344, "grad_norm": 0.031358204782009125, "grad_norm_var": 4.889306615877183e-07, "learning_rate": 3.067190460535385e-05, "loss": 2.4119, "step": 12428 }, { "crossentropy": 2.4544517993927, "epoch": 0.6758748198700345, "grad_norm": 0.0320022888481617, "grad_norm_var": 5.177651032190269e-07, "learning_rate": 3.0534348558576285e-05, "loss": 2.4545, "step": 12429 }, { "crossentropy": 2.5435508489608765, "epoch": 0.6759291987275348, "grad_norm": 0.03075423836708069, "grad_norm_var": 5.097478835939708e-07, "learning_rate": 3.0397100713425718e-05, "loss": 2.5436, "step": 12430 }, { "crossentropy": 2.509778618812561, "epoch": 0.6759835775850349, "grad_norm": 0.031599514186382294, "grad_norm_var": 4.836953503046012e-07, "learning_rate": 3.026016107841312e-05, "loss": 2.5098, "step": 12431 }, { "crossentropy": 2.4885623455047607, "epoch": 0.6760379564425352, "grad_norm": 0.03126528859138489, "grad_norm_var": 4.802525863372888e-07, "learning_rate": 3.0123529662032244e-05, "loss": 2.4886, "step": 12432 }, { "crossentropy": 2.597862482070923, "epoch": 0.6760923353000353, "grad_norm": 0.031052855774760246, "grad_norm_var": 4.4284173081887703e-07, "learning_rate": 2.9987206472756877e-05, "loss": 2.5979, "step": 12433 }, { "crossentropy": 2.504811644554138, "epoch": 0.6761467141575356, "grad_norm": 0.03084103763103485, "grad_norm_var": 3.9497093049082646e-07, "learning_rate": 2.9851191519041365e-05, "loss": 2.5048, "step": 12434 }, { "crossentropy": 2.4107621908187866, "epoch": 0.6762010930150357, "grad_norm": 0.0317392498254776, "grad_norm_var": 3.591068835749004e-07, "learning_rate": 2.971548480932118e-05, "loss": 2.4108, "step": 12435 }, { "crossentropy": 2.478598713874817, "epoch": 0.676255471872536, "grad_norm": 0.0303396787494421, "grad_norm_var": 3.1700828519853666e-07, "learning_rate": 2.9580086352012924e-05, "loss": 2.4786, "step": 12436 }, { "crossentropy": 2.484283685684204, "epoch": 0.6763098507300361, "grad_norm": 0.03156415373086929, "grad_norm_var": 3.1879364407825377e-07, "learning_rate": 2.944499615551377e-05, "loss": 2.4843, "step": 12437 }, { "crossentropy": 2.5556485652923584, "epoch": 0.6763642295875364, "grad_norm": 0.030675923451781273, "grad_norm_var": 2.789563192079347e-07, "learning_rate": 2.9310214228202014e-05, "loss": 2.5556, "step": 12438 }, { "crossentropy": 2.536497473716736, "epoch": 0.6764186084450365, "grad_norm": 0.03084881231188774, "grad_norm_var": 2.9158579774346494e-07, "learning_rate": 2.9175740578436528e-05, "loss": 2.5365, "step": 12439 }, { "crossentropy": 2.393996000289917, "epoch": 0.6764729873025368, "grad_norm": 0.03086450882256031, "grad_norm_var": 3.0000300355042856e-07, "learning_rate": 2.9041575214557303e-05, "loss": 2.394, "step": 12440 }, { "crossentropy": 2.52784526348114, "epoch": 0.676527366160037, "grad_norm": 0.03347127512097359, "grad_norm_var": 5.696262354498716e-07, "learning_rate": 2.8907718144884908e-05, "loss": 2.5278, "step": 12441 }, { "crossentropy": 2.45962917804718, "epoch": 0.6765817450175372, "grad_norm": 0.03073790669441223, "grad_norm_var": 5.533602083995781e-07, "learning_rate": 2.877416937772159e-05, "loss": 2.4596, "step": 12442 }, { "crossentropy": 2.506385326385498, "epoch": 0.6766361238750374, "grad_norm": 0.03268539533019066, "grad_norm_var": 6.674590466141176e-07, "learning_rate": 2.8640928921349618e-05, "loss": 2.5064, "step": 12443 }, { "crossentropy": 2.53579318523407, "epoch": 0.6766905027325376, "grad_norm": 0.032155267894268036, "grad_norm_var": 7.067072156205848e-07, "learning_rate": 2.850799678403293e-05, "loss": 2.5358, "step": 12444 }, { "crossentropy": 2.386649966239929, "epoch": 0.6767448815900378, "grad_norm": 0.030692879110574722, "grad_norm_var": 7.108683756440817e-07, "learning_rate": 2.8375372974014936e-05, "loss": 2.3867, "step": 12445 }, { "crossentropy": 2.5656001567840576, "epoch": 0.676799260447538, "grad_norm": 0.031021054834127426, "grad_norm_var": 6.948170344717175e-07, "learning_rate": 2.824305749952072e-05, "loss": 2.5656, "step": 12446 }, { "crossentropy": 2.4824053049087524, "epoch": 0.6768536393050382, "grad_norm": 0.032943930476903915, "grad_norm_var": 8.530161474178006e-07, "learning_rate": 2.8111050368757605e-05, "loss": 2.4824, "step": 12447 }, { "crossentropy": 2.4271514415740967, "epoch": 0.6769080181625384, "grad_norm": 0.032587286084890366, "grad_norm_var": 9.330011855462894e-07, "learning_rate": 2.7979351589911273e-05, "loss": 2.4272, "step": 12448 }, { "crossentropy": 2.440232038497925, "epoch": 0.6769623970200386, "grad_norm": 0.030394455417990685, "grad_norm_var": 1.0005614354877586e-06, "learning_rate": 2.7847961171150737e-05, "loss": 2.4402, "step": 12449 }, { "crossentropy": 2.4803318977355957, "epoch": 0.6770167758775388, "grad_norm": 0.03051440790295601, "grad_norm_var": 1.0347376194564176e-06, "learning_rate": 2.7716879120623927e-05, "loss": 2.4803, "step": 12450 }, { "crossentropy": 2.4576419591903687, "epoch": 0.677071154735039, "grad_norm": 0.030414776876568794, "grad_norm_var": 1.0936958227324475e-06, "learning_rate": 2.7586105446460452e-05, "loss": 2.4576, "step": 12451 }, { "crossentropy": 2.5449177026748657, "epoch": 0.6771255335925392, "grad_norm": 0.031983427703380585, "grad_norm_var": 1.0368668234894684e-06, "learning_rate": 2.7455640156771045e-05, "loss": 2.5449, "step": 12452 }, { "crossentropy": 2.411687731742859, "epoch": 0.6771799124500394, "grad_norm": 0.030686600133776665, "grad_norm_var": 1.0742407638550012e-06, "learning_rate": 2.732548325964701e-05, "loss": 2.4117, "step": 12453 }, { "crossentropy": 2.4021443128585815, "epoch": 0.6772342913075396, "grad_norm": 0.029963530600070953, "grad_norm_var": 1.1763864965067447e-06, "learning_rate": 2.7195634763160227e-05, "loss": 2.4021, "step": 12454 }, { "crossentropy": 2.6066354513168335, "epoch": 0.6772886701650398, "grad_norm": 0.03054150938987732, "grad_norm_var": 1.2037602475491991e-06, "learning_rate": 2.706609467536425e-05, "loss": 2.6066, "step": 12455 }, { "crossentropy": 2.5022692680358887, "epoch": 0.6773430490225401, "grad_norm": 0.030471062287688255, "grad_norm_var": 1.239094763604472e-06, "learning_rate": 2.6936863004293212e-05, "loss": 2.5023, "step": 12456 }, { "crossentropy": 2.504518985748291, "epoch": 0.6773974278800402, "grad_norm": 0.03169187158346176, "grad_norm_var": 9.287355055550539e-07, "learning_rate": 2.68079397579607e-05, "loss": 2.5045, "step": 12457 }, { "crossentropy": 2.4985493421554565, "epoch": 0.6774518067375405, "grad_norm": 0.03103499300777912, "grad_norm_var": 9.152410830262723e-07, "learning_rate": 2.6679324944363646e-05, "loss": 2.4985, "step": 12458 }, { "crossentropy": 2.447691559791565, "epoch": 0.6775061855950406, "grad_norm": 0.031327325850725174, "grad_norm_var": 7.68135500033235e-07, "learning_rate": 2.655101857147846e-05, "loss": 2.4477, "step": 12459 }, { "crossentropy": 2.587104082107544, "epoch": 0.6775605644525409, "grad_norm": 0.030197609215974808, "grad_norm_var": 7.456637470672761e-07, "learning_rate": 2.6423020647262096e-05, "loss": 2.5871, "step": 12460 }, { "crossentropy": 2.4350287914276123, "epoch": 0.677614943310041, "grad_norm": 0.03494390472769737, "grad_norm_var": 1.6845040802510816e-06, "learning_rate": 2.6295331179653215e-05, "loss": 2.435, "step": 12461 }, { "crossentropy": 2.482403516769409, "epoch": 0.6776693221675413, "grad_norm": 0.030160997062921524, "grad_norm_var": 1.7621336259576823e-06, "learning_rate": 2.6167950176571032e-05, "loss": 2.4824, "step": 12462 }, { "crossentropy": 2.533428907394409, "epoch": 0.6777237010250414, "grad_norm": 0.031292278319597244, "grad_norm_var": 1.5576342022288604e-06, "learning_rate": 2.604087764591534e-05, "loss": 2.5334, "step": 12463 }, { "crossentropy": 2.4945300817489624, "epoch": 0.6777780798825417, "grad_norm": 0.031547993421554565, "grad_norm_var": 1.4242942853948664e-06, "learning_rate": 2.5914113595567058e-05, "loss": 2.4945, "step": 12464 }, { "crossentropy": 2.5059133768081665, "epoch": 0.6778324587400418, "grad_norm": 0.030736222863197327, "grad_norm_var": 1.4006775825414764e-06, "learning_rate": 2.578765803338823e-05, "loss": 2.5059, "step": 12465 }, { "crossentropy": 2.582773804664612, "epoch": 0.6778868375975421, "grad_norm": 0.031195485964417458, "grad_norm_var": 1.37701069292521e-06, "learning_rate": 2.566151096722147e-05, "loss": 2.5828, "step": 12466 }, { "crossentropy": 2.625348687171936, "epoch": 0.6779412164550422, "grad_norm": 0.031140638515353203, "grad_norm_var": 1.3400570969000162e-06, "learning_rate": 2.553567240489052e-05, "loss": 2.6253, "step": 12467 }, { "crossentropy": 2.637515425682068, "epoch": 0.6779955953125425, "grad_norm": 0.032000746577978134, "grad_norm_var": 1.3419259888050219e-06, "learning_rate": 2.541014235419914e-05, "loss": 2.6375, "step": 12468 }, { "crossentropy": 2.4978615045547485, "epoch": 0.6780499741700426, "grad_norm": 0.0297517329454422, "grad_norm_var": 1.4584624135882223e-06, "learning_rate": 2.5284920822932767e-05, "loss": 2.4979, "step": 12469 }, { "crossentropy": 2.488903760910034, "epoch": 0.6781043530275429, "grad_norm": 0.03199175372719765, "grad_norm_var": 1.4015075541018787e-06, "learning_rate": 2.5160007818857967e-05, "loss": 2.4889, "step": 12470 }, { "crossentropy": 2.485970377922058, "epoch": 0.678158731885043, "grad_norm": 0.0318736732006073, "grad_norm_var": 1.3862903951800468e-06, "learning_rate": 2.503540334972132e-05, "loss": 2.486, "step": 12471 }, { "crossentropy": 2.5418410301208496, "epoch": 0.6782131107425433, "grad_norm": 0.031109414994716644, "grad_norm_var": 1.3382349514271987e-06, "learning_rate": 2.491110742325109e-05, "loss": 2.5418, "step": 12472 }, { "crossentropy": 2.4759124517440796, "epoch": 0.6782674896000435, "grad_norm": 0.03244462236762047, "grad_norm_var": 1.40547400270193e-06, "learning_rate": 2.4787120047155e-05, "loss": 2.4759, "step": 12473 }, { "crossentropy": 2.42154061794281, "epoch": 0.6783218684575437, "grad_norm": 0.029840858653187752, "grad_norm_var": 1.5561888073071697e-06, "learning_rate": 2.4663441229124117e-05, "loss": 2.4215, "step": 12474 }, { "crossentropy": 2.488437533378601, "epoch": 0.6783762473150439, "grad_norm": 0.03114096075296402, "grad_norm_var": 1.558853491241208e-06, "learning_rate": 2.4540070976827865e-05, "loss": 2.4884, "step": 12475 }, { "crossentropy": 2.4832884073257446, "epoch": 0.6784306261725441, "grad_norm": 0.0317976176738739, "grad_norm_var": 1.4760919548094012e-06, "learning_rate": 2.441700929791735e-05, "loss": 2.4833, "step": 12476 }, { "crossentropy": 2.5225954055786133, "epoch": 0.6784850050300443, "grad_norm": 0.03200575336813927, "grad_norm_var": 6.412299897290505e-07, "learning_rate": 2.42942562000259e-05, "loss": 2.5226, "step": 12477 }, { "crossentropy": 2.4948389530181885, "epoch": 0.6785393838875445, "grad_norm": 0.030378082767128944, "grad_norm_var": 6.125988191996168e-07, "learning_rate": 2.4171811690765212e-05, "loss": 2.4948, "step": 12478 }, { "crossentropy": 2.5343024730682373, "epoch": 0.6785937627450447, "grad_norm": 0.030429411679506302, "grad_norm_var": 6.560505023511798e-07, "learning_rate": 2.4049675777730317e-05, "loss": 2.5343, "step": 12479 }, { "crossentropy": 2.5679609775543213, "epoch": 0.6786481416025449, "grad_norm": 0.03144802525639534, "grad_norm_var": 6.521907614570957e-07, "learning_rate": 2.3927848468495163e-05, "loss": 2.568, "step": 12480 }, { "crossentropy": 2.577619433403015, "epoch": 0.6787025204600451, "grad_norm": 0.03234337642788887, "grad_norm_var": 7.131047776311184e-07, "learning_rate": 2.380632977061592e-05, "loss": 2.5776, "step": 12481 }, { "crossentropy": 2.4769259691238403, "epoch": 0.6787568993175453, "grad_norm": 0.03028826229274273, "grad_norm_var": 7.778847586620633e-07, "learning_rate": 2.3685119691628232e-05, "loss": 2.4769, "step": 12482 }, { "crossentropy": 2.456426739692688, "epoch": 0.6788112781750455, "grad_norm": 0.03056568279862404, "grad_norm_var": 8.068571740939134e-07, "learning_rate": 2.3564218239049974e-05, "loss": 2.4564, "step": 12483 }, { "crossentropy": 2.415749669075012, "epoch": 0.6788656570325458, "grad_norm": 0.030396681278944016, "grad_norm_var": 7.992180801723632e-07, "learning_rate": 2.3443625420380143e-05, "loss": 2.4157, "step": 12484 }, { "crossentropy": 2.5932459831237793, "epoch": 0.6789200358900459, "grad_norm": 0.033134136348962784, "grad_norm_var": 9.004038750353433e-07, "learning_rate": 2.3323341243096653e-05, "loss": 2.5932, "step": 12485 }, { "crossentropy": 2.589587092399597, "epoch": 0.6789744147475462, "grad_norm": 0.03042399138212204, "grad_norm_var": 9.144937673090931e-07, "learning_rate": 2.320336571465964e-05, "loss": 2.5896, "step": 12486 }, { "crossentropy": 2.4906961917877197, "epoch": 0.6790287936050463, "grad_norm": 0.0310558769851923, "grad_norm_var": 8.857022405402033e-07, "learning_rate": 2.3083698842510383e-05, "loss": 2.4907, "step": 12487 }, { "crossentropy": 2.586916446685791, "epoch": 0.6790831724625466, "grad_norm": 0.03142322972416878, "grad_norm_var": 8.891058106548962e-07, "learning_rate": 2.2964340634069602e-05, "loss": 2.5869, "step": 12488 }, { "crossentropy": 2.5690903663635254, "epoch": 0.6791375513200467, "grad_norm": 0.03168056905269623, "grad_norm_var": 7.982663133551093e-07, "learning_rate": 2.2845291096740827e-05, "loss": 2.5691, "step": 12489 }, { "crossentropy": 2.4835307598114014, "epoch": 0.679191930177547, "grad_norm": 0.031201492995023727, "grad_norm_var": 6.770108846358558e-07, "learning_rate": 2.2726550237906486e-05, "loss": 2.4835, "step": 12490 }, { "crossentropy": 2.631275177001953, "epoch": 0.6792463090350471, "grad_norm": 0.03382835537195206, "grad_norm_var": 1.0957446100766232e-06, "learning_rate": 2.2608118064931795e-05, "loss": 2.6313, "step": 12491 }, { "crossentropy": 2.5171018838882446, "epoch": 0.6793006878925474, "grad_norm": 0.032954517751932144, "grad_norm_var": 1.2407243195721676e-06, "learning_rate": 2.2489994585160877e-05, "loss": 2.5171, "step": 12492 }, { "crossentropy": 2.358250141143799, "epoch": 0.6793550667500475, "grad_norm": 0.030527321621775627, "grad_norm_var": 1.2721857129101173e-06, "learning_rate": 2.23721798059201e-05, "loss": 2.3583, "step": 12493 }, { "crossentropy": 2.539830803871155, "epoch": 0.6794094456075478, "grad_norm": 0.030808735638856888, "grad_norm_var": 1.22625014974489e-06, "learning_rate": 2.2254673734515284e-05, "loss": 2.5398, "step": 12494 }, { "crossentropy": 2.499160408973694, "epoch": 0.6794638244650479, "grad_norm": 0.0303048025816679, "grad_norm_var": 1.2434603774219103e-06, "learning_rate": 2.2137476378236153e-05, "loss": 2.4992, "step": 12495 }, { "crossentropy": 2.466990113258362, "epoch": 0.6795182033225482, "grad_norm": 0.033415537327528, "grad_norm_var": 1.4982480607865037e-06, "learning_rate": 2.202058774434912e-05, "loss": 2.467, "step": 12496 }, { "crossentropy": 2.4308329820632935, "epoch": 0.6795725821800483, "grad_norm": 0.03265293687582016, "grad_norm_var": 1.5381379082153473e-06, "learning_rate": 2.190400784010449e-05, "loss": 2.4308, "step": 12497 }, { "crossentropy": 2.529665470123291, "epoch": 0.6796269610375486, "grad_norm": 0.0319027304649353, "grad_norm_var": 1.4312947986653274e-06, "learning_rate": 2.178773667273204e-05, "loss": 2.5297, "step": 12498 }, { "crossentropy": 2.4337655305862427, "epoch": 0.6796813398950488, "grad_norm": 0.03118913248181343, "grad_norm_var": 1.3660933875670923e-06, "learning_rate": 2.1671774249442665e-05, "loss": 2.4338, "step": 12499 }, { "crossentropy": 2.4923834800720215, "epoch": 0.679735718752549, "grad_norm": 0.03053521178662777, "grad_norm_var": 1.3435658235259422e-06, "learning_rate": 2.1556120577429506e-05, "loss": 2.4924, "step": 12500 }, { "crossentropy": 2.5401453971862793, "epoch": 0.6797900976100492, "grad_norm": 0.03147701919078827, "grad_norm_var": 1.1960931054849056e-06, "learning_rate": 2.1440775663863487e-05, "loss": 2.5401, "step": 12501 }, { "crossentropy": 2.467963695526123, "epoch": 0.6798444764675494, "grad_norm": 0.031445037573575974, "grad_norm_var": 1.1030098320476597e-06, "learning_rate": 2.132573951589889e-05, "loss": 2.468, "step": 12502 }, { "crossentropy": 2.567864179611206, "epoch": 0.6798988553250496, "grad_norm": 0.03066680207848549, "grad_norm_var": 1.1433002812439302e-06, "learning_rate": 2.1211012140671115e-05, "loss": 2.5679, "step": 12503 }, { "crossentropy": 2.411859631538391, "epoch": 0.6799532341825498, "grad_norm": 0.030072513967752457, "grad_norm_var": 1.2938164485342402e-06, "learning_rate": 2.1096593545294473e-05, "loss": 2.4119, "step": 12504 }, { "crossentropy": 2.4957960844039917, "epoch": 0.68000761304005, "grad_norm": 0.03353695943951607, "grad_norm_var": 1.5436450730730988e-06, "learning_rate": 2.0982483736864953e-05, "loss": 2.4958, "step": 12505 }, { "crossentropy": 2.427446722984314, "epoch": 0.6800619918975502, "grad_norm": 0.03137286379933357, "grad_norm_var": 1.5350623402198553e-06, "learning_rate": 2.086868272246023e-05, "loss": 2.4274, "step": 12506 }, { "crossentropy": 2.540867805480957, "epoch": 0.6801163707550504, "grad_norm": 0.03104923665523529, "grad_norm_var": 1.2173206573248685e-06, "learning_rate": 2.0755190509137434e-05, "loss": 2.5409, "step": 12507 }, { "crossentropy": 2.5938658714294434, "epoch": 0.6801707496125506, "grad_norm": 0.030261995270848274, "grad_norm_var": 1.1462603626864627e-06, "learning_rate": 2.064200710393538e-05, "loss": 2.5939, "step": 12508 }, { "crossentropy": 2.554632782936096, "epoch": 0.6802251284700509, "grad_norm": 0.030506419017910957, "grad_norm_var": 1.14851409194483e-06, "learning_rate": 2.052913251387456e-05, "loss": 2.5546, "step": 12509 }, { "crossentropy": 2.429065465927124, "epoch": 0.680279507327551, "grad_norm": 0.036163460463285446, "grad_norm_var": 2.572079835239799e-06, "learning_rate": 2.0416566745953823e-05, "loss": 2.4291, "step": 12510 }, { "crossentropy": 2.4748090505599976, "epoch": 0.6803338861850513, "grad_norm": 0.03181714564561844, "grad_norm_var": 2.4418514107962664e-06, "learning_rate": 2.0304309807155364e-05, "loss": 2.4748, "step": 12511 }, { "crossentropy": 2.4568440914154053, "epoch": 0.6803882650425515, "grad_norm": 0.03038354218006134, "grad_norm_var": 2.344735807914425e-06, "learning_rate": 2.0192361704441388e-05, "loss": 2.4568, "step": 12512 }, { "crossentropy": 2.6717731952667236, "epoch": 0.6804426439000517, "grad_norm": 0.03052860125899315, "grad_norm_var": 2.318509716423102e-06, "learning_rate": 2.008072244475412e-05, "loss": 2.6718, "step": 12513 }, { "crossentropy": 2.463109254837036, "epoch": 0.6804970227575519, "grad_norm": 0.030171798542141914, "grad_norm_var": 2.3970792155292643e-06, "learning_rate": 1.996939203501802e-05, "loss": 2.4631, "step": 12514 }, { "crossentropy": 2.553165912628174, "epoch": 0.6805514016150521, "grad_norm": 0.031342122703790665, "grad_norm_var": 2.395798951307313e-06, "learning_rate": 1.9858370482137012e-05, "loss": 2.5532, "step": 12515 }, { "crossentropy": 2.479183793067932, "epoch": 0.6806057804725523, "grad_norm": 0.030405569821596146, "grad_norm_var": 2.410642584366749e-06, "learning_rate": 1.9747657792996697e-05, "loss": 2.4792, "step": 12516 }, { "crossentropy": 2.576900362968445, "epoch": 0.6806601593300525, "grad_norm": 0.032394494861364365, "grad_norm_var": 2.4818408904929973e-06, "learning_rate": 1.963725397446381e-05, "loss": 2.5769, "step": 12517 }, { "crossentropy": 2.5758315324783325, "epoch": 0.6807145381875527, "grad_norm": 0.03262811899185181, "grad_norm_var": 2.5792000966113796e-06, "learning_rate": 1.9527159033385644e-05, "loss": 2.5758, "step": 12518 }, { "crossentropy": 2.4878705739974976, "epoch": 0.6807689170450529, "grad_norm": 0.03228765353560448, "grad_norm_var": 2.5727649717116464e-06, "learning_rate": 1.941737297658952e-05, "loss": 2.4879, "step": 12519 }, { "crossentropy": 2.547038435935974, "epoch": 0.6808232959025531, "grad_norm": 0.03293807432055473, "grad_norm_var": 2.5185445096994325e-06, "learning_rate": 1.930789581088388e-05, "loss": 2.547, "step": 12520 }, { "crossentropy": 2.554770588874817, "epoch": 0.6808776747600533, "grad_norm": 0.03148544579744339, "grad_norm_var": 2.2891691765604724e-06, "learning_rate": 1.9198727543059958e-05, "loss": 2.5548, "step": 12521 }, { "crossentropy": 2.3923981189727783, "epoch": 0.6809320536175535, "grad_norm": 0.03102901391685009, "grad_norm_var": 2.307363409105821e-06, "learning_rate": 1.9089868179886783e-05, "loss": 2.3924, "step": 12522 }, { "crossentropy": 2.520526647567749, "epoch": 0.6809864324750537, "grad_norm": 0.03114631026983261, "grad_norm_var": 2.300991453045113e-06, "learning_rate": 1.8981317728116732e-05, "loss": 2.5205, "step": 12523 }, { "crossentropy": 2.598833203315735, "epoch": 0.6810408113325539, "grad_norm": 0.03190108761191368, "grad_norm_var": 2.1779960166502423e-06, "learning_rate": 1.8873076194481087e-05, "loss": 2.5988, "step": 12524 }, { "crossentropy": 2.5030816793441772, "epoch": 0.6810951901900542, "grad_norm": 0.0322481207549572, "grad_norm_var": 2.091442270392834e-06, "learning_rate": 1.8765143585693923e-05, "loss": 2.5031, "step": 12525 }, { "crossentropy": 2.6368002891540527, "epoch": 0.6811495690475543, "grad_norm": 0.0363549180328846, "grad_norm_var": 2.20500969703574e-06, "learning_rate": 1.8657519908448217e-05, "loss": 2.6368, "step": 12526 }, { "crossentropy": 2.5169352293014526, "epoch": 0.6812039479050546, "grad_norm": 0.031158028170466423, "grad_norm_var": 2.2320943114971833e-06, "learning_rate": 1.8550205169418632e-05, "loss": 2.5169, "step": 12527 }, { "crossentropy": 2.5010565519332886, "epoch": 0.6812583267625547, "grad_norm": 0.037284642457962036, "grad_norm_var": 3.928156229577905e-06, "learning_rate": 1.844319937526151e-05, "loss": 2.5011, "step": 12528 }, { "crossentropy": 2.4270925521850586, "epoch": 0.681312705620055, "grad_norm": 0.03185318037867546, "grad_norm_var": 3.7414784649724183e-06, "learning_rate": 1.8336502532613208e-05, "loss": 2.4271, "step": 12529 }, { "crossentropy": 2.4780919551849365, "epoch": 0.6813670844775551, "grad_norm": 0.030513176694512367, "grad_norm_var": 3.6523802824936125e-06, "learning_rate": 1.82301146480901e-05, "loss": 2.4781, "step": 12530 }, { "crossentropy": 2.4891204833984375, "epoch": 0.6814214633350554, "grad_norm": 0.03104838728904724, "grad_norm_var": 3.695703834491572e-06, "learning_rate": 1.81240357282908e-05, "loss": 2.4891, "step": 12531 }, { "crossentropy": 2.5758817195892334, "epoch": 0.6814758421925555, "grad_norm": 0.03025776520371437, "grad_norm_var": 3.7342508359066895e-06, "learning_rate": 1.8018265779794483e-05, "loss": 2.5759, "step": 12532 }, { "crossentropy": 2.3807860612869263, "epoch": 0.6815302210500558, "grad_norm": 0.030570533126592636, "grad_norm_var": 3.915069487931188e-06, "learning_rate": 1.7912804809160356e-05, "loss": 2.3808, "step": 12533 }, { "crossentropy": 2.5648778676986694, "epoch": 0.6815845999075559, "grad_norm": 0.031244292855262756, "grad_norm_var": 3.950048561601387e-06, "learning_rate": 1.7807652822929843e-05, "loss": 2.5649, "step": 12534 }, { "crossentropy": 2.5226956605911255, "epoch": 0.6816389787650562, "grad_norm": 0.031482942402362823, "grad_norm_var": 3.96851337840739e-06, "learning_rate": 1.770280982762329e-05, "loss": 2.5227, "step": 12535 }, { "crossentropy": 2.3515515327453613, "epoch": 0.6816933576225563, "grad_norm": 0.03328811004757881, "grad_norm_var": 4.018447543606519e-06, "learning_rate": 1.7598275829743272e-05, "loss": 2.3516, "step": 12536 }, { "crossentropy": 2.5098408460617065, "epoch": 0.6817477364800566, "grad_norm": 0.030328962951898575, "grad_norm_var": 4.189726928091478e-06, "learning_rate": 1.7494050835774045e-05, "loss": 2.5098, "step": 12537 }, { "crossentropy": 2.4173697233200073, "epoch": 0.6818021153375567, "grad_norm": 0.0314042903482914, "grad_norm_var": 4.150852429533834e-06, "learning_rate": 1.7390134852177664e-05, "loss": 2.4174, "step": 12538 }, { "crossentropy": 2.40418803691864, "epoch": 0.681856494195057, "grad_norm": 0.031229276210069656, "grad_norm_var": 4.141780422193866e-06, "learning_rate": 1.7286527885400082e-05, "loss": 2.4042, "step": 12539 }, { "crossentropy": 2.522895336151123, "epoch": 0.6819108730525572, "grad_norm": 0.03268791362643242, "grad_norm_var": 4.1689972740827546e-06, "learning_rate": 1.718322994186672e-05, "loss": 2.5229, "step": 12540 }, { "crossentropy": 2.5058796405792236, "epoch": 0.6819652519100574, "grad_norm": 0.030857715755701065, "grad_norm_var": 4.254885440820958e-06, "learning_rate": 1.708024102798411e-05, "loss": 2.5059, "step": 12541 }, { "crossentropy": 2.5209758281707764, "epoch": 0.6820196307675576, "grad_norm": 0.030542805790901184, "grad_norm_var": 2.970228888030518e-06, "learning_rate": 1.6977561150139376e-05, "loss": 2.521, "step": 12542 }, { "crossentropy": 2.488642692565918, "epoch": 0.6820740096250578, "grad_norm": 0.0320938304066658, "grad_norm_var": 2.9686297906470016e-06, "learning_rate": 1.6875190314700194e-05, "loss": 2.4886, "step": 12543 }, { "crossentropy": 2.59785795211792, "epoch": 0.682128388482558, "grad_norm": 0.03157434239983559, "grad_norm_var": 7.302300773169654e-07, "learning_rate": 1.677312852801649e-05, "loss": 2.5979, "step": 12544 }, { "crossentropy": 2.517266035079956, "epoch": 0.6821827673400582, "grad_norm": 0.03150765970349312, "grad_norm_var": 7.127180658724238e-07, "learning_rate": 1.6671375796417087e-05, "loss": 2.5173, "step": 12545 }, { "crossentropy": 2.4466229677200317, "epoch": 0.6822371461975584, "grad_norm": 0.030350077897310257, "grad_norm_var": 7.312629681648896e-07, "learning_rate": 1.656993212621305e-05, "loss": 2.4466, "step": 12546 }, { "crossentropy": 2.3048421144485474, "epoch": 0.6822915250550586, "grad_norm": 0.03043689951300621, "grad_norm_var": 7.734600463324877e-07, "learning_rate": 1.646879752369601e-05, "loss": 2.3048, "step": 12547 }, { "crossentropy": 2.4908379316329956, "epoch": 0.6823459039125588, "grad_norm": 0.03150341659784317, "grad_norm_var": 7.071209140007671e-07, "learning_rate": 1.6367971995138176e-05, "loss": 2.4908, "step": 12548 }, { "crossentropy": 2.613402843475342, "epoch": 0.682400282770059, "grad_norm": 0.0312562920153141, "grad_norm_var": 6.680821008889574e-07, "learning_rate": 1.6267455546791765e-05, "loss": 2.6134, "step": 12549 }, { "crossentropy": 2.494328737258911, "epoch": 0.6824546616275592, "grad_norm": 0.03506576269865036, "grad_norm_var": 1.5209348431692467e-06, "learning_rate": 1.6167248184891793e-05, "loss": 2.4943, "step": 12550 }, { "crossentropy": 2.5527645349502563, "epoch": 0.6825090404850594, "grad_norm": 0.030584804713726044, "grad_norm_var": 1.585445473177976e-06, "learning_rate": 1.6067349915652728e-05, "loss": 2.5528, "step": 12551 }, { "crossentropy": 2.5378684997558594, "epoch": 0.6825634193425596, "grad_norm": 0.029688473790884018, "grad_norm_var": 1.5584383574675115e-06, "learning_rate": 1.5967760745270176e-05, "loss": 2.5379, "step": 12552 }, { "crossentropy": 2.5265259742736816, "epoch": 0.6826177982000599, "grad_norm": 0.030871687456965446, "grad_norm_var": 1.5051668564717053e-06, "learning_rate": 1.586848067992086e-05, "loss": 2.5265, "step": 12553 }, { "crossentropy": 2.4902063608169556, "epoch": 0.68267217705756, "grad_norm": 0.03123856708407402, "grad_norm_var": 1.5057600460944744e-06, "learning_rate": 1.5769509725760967e-05, "loss": 2.4902, "step": 12554 }, { "crossentropy": 2.4850562810897827, "epoch": 0.6827265559150603, "grad_norm": 0.029992060735821724, "grad_norm_var": 1.6202047699878075e-06, "learning_rate": 1.5670847888929475e-05, "loss": 2.4851, "step": 12555 }, { "crossentropy": 2.53721284866333, "epoch": 0.6827809347725604, "grad_norm": 0.03204088658094406, "grad_norm_var": 1.5236812484086446e-06, "learning_rate": 1.5572495175545376e-05, "loss": 2.5372, "step": 12556 }, { "crossentropy": 2.5396745204925537, "epoch": 0.6828353136300607, "grad_norm": 0.030951786786317825, "grad_norm_var": 1.5196234174272294e-06, "learning_rate": 1.5474451591707684e-05, "loss": 2.5397, "step": 12557 }, { "crossentropy": 2.536535143852234, "epoch": 0.6828896924875608, "grad_norm": 0.03249642625451088, "grad_norm_var": 1.5788456628676797e-06, "learning_rate": 1.5376717143497642e-05, "loss": 2.5365, "step": 12558 }, { "crossentropy": 2.4041388034820557, "epoch": 0.6829440713450611, "grad_norm": 0.030056117102503777, "grad_norm_var": 1.6371673594735378e-06, "learning_rate": 1.527929183697707e-05, "loss": 2.4041, "step": 12559 }, { "crossentropy": 2.5145013332366943, "epoch": 0.6829984502025612, "grad_norm": 0.031825073063373566, "grad_norm_var": 1.6527433689401353e-06, "learning_rate": 1.5182175678186694e-05, "loss": 2.5145, "step": 12560 }, { "crossentropy": 2.467860460281372, "epoch": 0.6830528290600615, "grad_norm": 0.03126941993832588, "grad_norm_var": 1.647840066948851e-06, "learning_rate": 1.5085368673150579e-05, "loss": 2.4679, "step": 12561 }, { "crossentropy": 2.4998371601104736, "epoch": 0.6831072079175616, "grad_norm": 0.030633028596639633, "grad_norm_var": 1.6197704718745222e-06, "learning_rate": 1.4988870827872814e-05, "loss": 2.4998, "step": 12562 }, { "crossentropy": 2.4212284088134766, "epoch": 0.6831615867750619, "grad_norm": 0.032516058534383774, "grad_norm_var": 1.666090356684209e-06, "learning_rate": 1.48926821483375e-05, "loss": 2.4212, "step": 12563 }, { "crossentropy": 2.5432615280151367, "epoch": 0.683215965632562, "grad_norm": 0.030682647600769997, "grad_norm_var": 1.694071492573899e-06, "learning_rate": 1.4796802640510976e-05, "loss": 2.5433, "step": 12564 }, { "crossentropy": 2.4681050777435303, "epoch": 0.6832703444900623, "grad_norm": 0.030928336083889008, "grad_norm_var": 1.7037136434545792e-06, "learning_rate": 1.4701232310338486e-05, "loss": 2.4681, "step": 12565 }, { "crossentropy": 2.5223543643951416, "epoch": 0.6833247233475624, "grad_norm": 0.032898902893066406, "grad_norm_var": 9.099275728026319e-07, "learning_rate": 1.4605971163748067e-05, "loss": 2.5224, "step": 12566 }, { "crossentropy": 2.5647541284561157, "epoch": 0.6833791022050627, "grad_norm": 0.03186221793293953, "grad_norm_var": 9.127293375608507e-07, "learning_rate": 1.4511019206647214e-05, "loss": 2.5648, "step": 12567 }, { "crossentropy": 2.505710482597351, "epoch": 0.6834334810625629, "grad_norm": 0.029784848913550377, "grad_norm_var": 8.932830091694001e-07, "learning_rate": 1.4416376444924551e-05, "loss": 2.5057, "step": 12568 }, { "crossentropy": 2.566911458969116, "epoch": 0.6834878599200631, "grad_norm": 0.030368957668542862, "grad_norm_var": 9.346389845408624e-07, "learning_rate": 1.4322042884450937e-05, "loss": 2.5669, "step": 12569 }, { "crossentropy": 2.50493323802948, "epoch": 0.6835422387775633, "grad_norm": 0.03097495250403881, "grad_norm_var": 9.383853250758637e-07, "learning_rate": 1.4228018531076137e-05, "loss": 2.5049, "step": 12570 }, { "crossentropy": 2.545214295387268, "epoch": 0.6835966176350635, "grad_norm": 0.03130074217915535, "grad_norm_var": 8.337601850866641e-07, "learning_rate": 1.4134303390631042e-05, "loss": 2.5452, "step": 12571 }, { "crossentropy": 2.581295609474182, "epoch": 0.6836509964925637, "grad_norm": 0.03146873787045479, "grad_norm_var": 7.967008384873742e-07, "learning_rate": 1.4040897468927672e-05, "loss": 2.5813, "step": 12572 }, { "crossentropy": 2.3396687507629395, "epoch": 0.6837053753500639, "grad_norm": 0.030279187485575676, "grad_norm_var": 8.51821248525628e-07, "learning_rate": 1.3947800771760277e-05, "loss": 2.3397, "step": 12573 }, { "crossentropy": 2.491612195968628, "epoch": 0.6837597542075641, "grad_norm": 0.03157109022140503, "grad_norm_var": 7.465091771513179e-07, "learning_rate": 1.3855013304901465e-05, "loss": 2.4916, "step": 12574 }, { "crossentropy": 2.5454012155532837, "epoch": 0.6838141330650643, "grad_norm": 0.031120574101805687, "grad_norm_var": 6.61893561358405e-07, "learning_rate": 1.3762535074106076e-05, "loss": 2.5454, "step": 12575 }, { "crossentropy": 2.5173639059066772, "epoch": 0.6838685119225645, "grad_norm": 0.03234810009598732, "grad_norm_var": 7.213403613390377e-07, "learning_rate": 1.367036608510952e-05, "loss": 2.5174, "step": 12576 }, { "crossentropy": 2.5623661279678345, "epoch": 0.6839228907800647, "grad_norm": 0.030632926151156425, "grad_norm_var": 7.450539293083272e-07, "learning_rate": 1.3578506343628338e-05, "loss": 2.5624, "step": 12577 }, { "crossentropy": 2.4208627939224243, "epoch": 0.6839772696375649, "grad_norm": 0.031221378594636917, "grad_norm_var": 7.213717323483725e-07, "learning_rate": 1.3486955855359085e-05, "loss": 2.4209, "step": 12578 }, { "crossentropy": 2.4866628646850586, "epoch": 0.6840316484950651, "grad_norm": 0.03230341151356697, "grad_norm_var": 6.882299409181776e-07, "learning_rate": 1.3395714625979994e-05, "loss": 2.4867, "step": 12579 }, { "crossentropy": 2.4875634908676147, "epoch": 0.6840860273525653, "grad_norm": 0.03138194978237152, "grad_norm_var": 6.673681027803315e-07, "learning_rate": 1.3304782661149872e-05, "loss": 2.4876, "step": 12580 }, { "crossentropy": 2.5108343362808228, "epoch": 0.6841404062100656, "grad_norm": 0.029843352735042572, "grad_norm_var": 7.915110997875798e-07, "learning_rate": 1.3214159966508099e-05, "loss": 2.5108, "step": 12581 }, { "crossentropy": 2.4721707105636597, "epoch": 0.6841947850675657, "grad_norm": 0.03462162986397743, "grad_norm_var": 1.3649145691988258e-06, "learning_rate": 1.3123846547674623e-05, "loss": 2.4722, "step": 12582 }, { "crossentropy": 2.484043002128601, "epoch": 0.684249163925066, "grad_norm": 0.03214159607887268, "grad_norm_var": 1.3900743532240979e-06, "learning_rate": 1.3033842410251074e-05, "loss": 2.484, "step": 12583 }, { "crossentropy": 2.616098999977112, "epoch": 0.6843035427825661, "grad_norm": 0.03077973611652851, "grad_norm_var": 1.2462783386292155e-06, "learning_rate": 1.2944147559819097e-05, "loss": 2.6161, "step": 12584 }, { "crossentropy": 2.495565176010132, "epoch": 0.6843579216400664, "grad_norm": 0.03169623762369156, "grad_norm_var": 1.1743795767912469e-06, "learning_rate": 1.2854762001942022e-05, "loss": 2.4956, "step": 12585 }, { "crossentropy": 2.545638084411621, "epoch": 0.6844123004975665, "grad_norm": 0.030525002628564835, "grad_norm_var": 1.21735349043565e-06, "learning_rate": 1.2765685742162636e-05, "loss": 2.5456, "step": 12586 }, { "crossentropy": 2.5356364250183105, "epoch": 0.6844666793550668, "grad_norm": 0.031719353049993515, "grad_norm_var": 1.2198505159837872e-06, "learning_rate": 1.2676918786005965e-05, "loss": 2.5356, "step": 12587 }, { "crossentropy": 2.419213891029358, "epoch": 0.6845210582125669, "grad_norm": 0.030281150713562965, "grad_norm_var": 1.3095268221086405e-06, "learning_rate": 1.2588461138977603e-05, "loss": 2.4192, "step": 12588 }, { "crossentropy": 2.501156806945801, "epoch": 0.6845754370700672, "grad_norm": 0.030445020645856857, "grad_norm_var": 1.2863710842668404e-06, "learning_rate": 1.2500312806562608e-05, "loss": 2.5012, "step": 12589 }, { "crossentropy": 2.343172073364258, "epoch": 0.6846298159275673, "grad_norm": 0.02973085269331932, "grad_norm_var": 1.459611778728322e-06, "learning_rate": 1.2412473794228274e-05, "loss": 2.3432, "step": 12590 }, { "crossentropy": 2.427201509475708, "epoch": 0.6846841947850676, "grad_norm": 0.03436354547739029, "grad_norm_var": 2.0395415009184877e-06, "learning_rate": 1.2324944107423019e-05, "loss": 2.4272, "step": 12591 }, { "crossentropy": 2.4725492000579834, "epoch": 0.6847385736425677, "grad_norm": 0.031277842819690704, "grad_norm_var": 1.9904217917208424e-06, "learning_rate": 1.223772375157417e-05, "loss": 2.4725, "step": 12592 }, { "crossentropy": 2.5241870880126953, "epoch": 0.684792952500068, "grad_norm": 0.030545877292752266, "grad_norm_var": 2.0002082846591e-06, "learning_rate": 1.2150812732091842e-05, "loss": 2.5242, "step": 12593 }, { "crossentropy": 2.4147889614105225, "epoch": 0.6848473313575681, "grad_norm": 0.02941003069281578, "grad_norm_var": 2.2556232730666734e-06, "learning_rate": 1.2064211054365614e-05, "loss": 2.4148, "step": 12594 }, { "crossentropy": 2.3998851776123047, "epoch": 0.6849017102150684, "grad_norm": 0.030028151348233223, "grad_norm_var": 2.2798255330513475e-06, "learning_rate": 1.19779187237673e-05, "loss": 2.3999, "step": 12595 }, { "crossentropy": 2.552398681640625, "epoch": 0.6849560890725686, "grad_norm": 0.03277803212404251, "grad_norm_var": 2.4402642990380605e-06, "learning_rate": 1.1891935745648175e-05, "loss": 2.5524, "step": 12596 }, { "crossentropy": 2.4985740184783936, "epoch": 0.6850104679300688, "grad_norm": 0.030905747786164284, "grad_norm_var": 2.309892443769209e-06, "learning_rate": 1.1806262125340083e-05, "loss": 2.4986, "step": 12597 }, { "crossentropy": 2.4366918802261353, "epoch": 0.685064846787569, "grad_norm": 0.03288798779249191, "grad_norm_var": 1.736433185875102e-06, "learning_rate": 1.1720897868157664e-05, "loss": 2.4367, "step": 12598 }, { "crossentropy": 2.4857534170150757, "epoch": 0.6851192256450692, "grad_norm": 0.03160274401307106, "grad_norm_var": 1.6883497012166176e-06, "learning_rate": 1.1635842979394462e-05, "loss": 2.4858, "step": 12599 }, { "crossentropy": 2.4625574350357056, "epoch": 0.6851736045025694, "grad_norm": 0.029838401824235916, "grad_norm_var": 1.7947325735359042e-06, "learning_rate": 1.15510974643257e-05, "loss": 2.4626, "step": 12600 }, { "crossentropy": 2.5141454935073853, "epoch": 0.6852279833600696, "grad_norm": 0.03145324066281319, "grad_norm_var": 1.7799880317940127e-06, "learning_rate": 1.1466661328207173e-05, "loss": 2.5141, "step": 12601 }, { "crossentropy": 2.4969170093536377, "epoch": 0.6852823622175698, "grad_norm": 0.030533012002706528, "grad_norm_var": 1.7793651114502336e-06, "learning_rate": 1.138253457627525e-05, "loss": 2.4969, "step": 12602 }, { "crossentropy": 2.4888921976089478, "epoch": 0.68533674107507, "grad_norm": 0.03129172325134277, "grad_norm_var": 1.7561967191922467e-06, "learning_rate": 1.1298717213747977e-05, "loss": 2.4889, "step": 12603 }, { "crossentropy": 2.57928729057312, "epoch": 0.6853911199325702, "grad_norm": 0.032092973589897156, "grad_norm_var": 1.7669728656372613e-06, "learning_rate": 1.1215209245822866e-05, "loss": 2.5793, "step": 12604 }, { "crossentropy": 2.6926275491714478, "epoch": 0.6854454987900704, "grad_norm": 0.031238123774528503, "grad_norm_var": 1.7265472080728635e-06, "learning_rate": 1.113201067767966e-05, "loss": 2.6926, "step": 12605 }, { "crossentropy": 2.490976929664612, "epoch": 0.6854998776475706, "grad_norm": 0.030994342640042305, "grad_norm_var": 1.5706276057486068e-06, "learning_rate": 1.1049121514478122e-05, "loss": 2.491, "step": 12606 }, { "crossentropy": 2.465858221054077, "epoch": 0.6855542565050708, "grad_norm": 0.030191797763109207, "grad_norm_var": 9.696580517933503e-07, "learning_rate": 1.0966541761359139e-05, "loss": 2.4659, "step": 12607 }, { "crossentropy": 2.3750970363616943, "epoch": 0.685608635362571, "grad_norm": 0.03133748471736908, "grad_norm_var": 9.715580289215246e-07, "learning_rate": 1.0884271423443614e-05, "loss": 2.3751, "step": 12608 }, { "crossentropy": 2.5697765350341797, "epoch": 0.6856630142200713, "grad_norm": 0.030846748501062393, "grad_norm_var": 9.561657073034092e-07, "learning_rate": 1.0802310505834135e-05, "loss": 2.5698, "step": 12609 }, { "crossentropy": 2.4383312463760376, "epoch": 0.6857173930775714, "grad_norm": 0.030804038047790527, "grad_norm_var": 7.65477161589649e-07, "learning_rate": 1.0720659013613854e-05, "loss": 2.4383, "step": 12610 }, { "crossentropy": 2.4997756481170654, "epoch": 0.6857717719350717, "grad_norm": 0.029564308002591133, "grad_norm_var": 8.499467199098483e-07, "learning_rate": 1.0639316951847056e-05, "loss": 2.4998, "step": 12611 }, { "crossentropy": 2.4157110452651978, "epoch": 0.6858261507925718, "grad_norm": 0.03109208680689335, "grad_norm_var": 6.610756383954956e-07, "learning_rate": 1.0558284325578038e-05, "loss": 2.4157, "step": 12612 }, { "crossentropy": 2.4697169065475464, "epoch": 0.6858805296500721, "grad_norm": 0.030079998075962067, "grad_norm_var": 7.18712413704187e-07, "learning_rate": 1.047756113983278e-05, "loss": 2.4697, "step": 12613 }, { "crossentropy": 2.5591907501220703, "epoch": 0.6859349085075722, "grad_norm": 0.030905762687325478, "grad_norm_var": 4.628054194155684e-07, "learning_rate": 1.0397147399617279e-05, "loss": 2.5592, "step": 12614 }, { "crossentropy": 2.6536790132522583, "epoch": 0.6859892873650725, "grad_norm": 0.032373614609241486, "grad_norm_var": 5.756007938094327e-07, "learning_rate": 1.031704310991921e-05, "loss": 2.6537, "step": 12615 }, { "crossentropy": 2.4534738063812256, "epoch": 0.6860436662225726, "grad_norm": 0.03078783117234707, "grad_norm_var": 4.956706499735132e-07, "learning_rate": 1.0237248275706269e-05, "loss": 2.4535, "step": 12616 }, { "crossentropy": 2.5836658477783203, "epoch": 0.6860980450800729, "grad_norm": 0.03058934584259987, "grad_norm_var": 4.871357064403404e-07, "learning_rate": 1.0157762901926715e-05, "loss": 2.5837, "step": 12617 }, { "crossentropy": 2.4718791246414185, "epoch": 0.686152423937573, "grad_norm": 0.030958129093050957, "grad_norm_var": 4.7648432241129135e-07, "learning_rate": 1.0078586993511051e-05, "loss": 2.4719, "step": 12618 }, { "crossentropy": 2.4921551942825317, "epoch": 0.6862068027950733, "grad_norm": 0.03056398220360279, "grad_norm_var": 4.7611314058676413e-07, "learning_rate": 9.99972055536924e-06, "loss": 2.4922, "step": 12619 }, { "crossentropy": 2.4959933757781982, "epoch": 0.6862611816525734, "grad_norm": 0.031050831079483032, "grad_norm_var": 3.7840410738187016e-07, "learning_rate": 9.921163592392923e-06, "loss": 2.496, "step": 12620 }, { "crossentropy": 2.4415688514709473, "epoch": 0.6863155605100737, "grad_norm": 0.0302695594727993, "grad_norm_var": 3.851249499495311e-07, "learning_rate": 9.842916109453203e-06, "loss": 2.4416, "step": 12621 }, { "crossentropy": 2.4869649410247803, "epoch": 0.6863699393675738, "grad_norm": 0.03185269236564636, "grad_norm_var": 4.562052087985745e-07, "learning_rate": 9.764978111403978e-06, "loss": 2.487, "step": 12622 }, { "crossentropy": 2.3979049921035767, "epoch": 0.6864243182250741, "grad_norm": 0.03139756992459297, "grad_norm_var": 4.4458804598629156e-07, "learning_rate": 9.687349603078043e-06, "loss": 2.3979, "step": 12623 }, { "crossentropy": 2.5055456161499023, "epoch": 0.6864786970825743, "grad_norm": 0.029917405918240547, "grad_norm_var": 4.886675063184848e-07, "learning_rate": 9.610030589290442e-06, "loss": 2.5055, "step": 12624 }, { "crossentropy": 2.548196315765381, "epoch": 0.6865330759400745, "grad_norm": 0.031101301312446594, "grad_norm_var": 4.937653894386406e-07, "learning_rate": 9.533021074836224e-06, "loss": 2.5482, "step": 12625 }, { "crossentropy": 2.5292506217956543, "epoch": 0.6865874547975747, "grad_norm": 0.030559318140149117, "grad_norm_var": 4.984135323871444e-07, "learning_rate": 9.45632106449157e-06, "loss": 2.5293, "step": 12626 }, { "crossentropy": 2.542325019836426, "epoch": 0.6866418336550749, "grad_norm": 0.03147381544113159, "grad_norm_var": 4.0749706369778e-07, "learning_rate": 9.379930563012673e-06, "loss": 2.5423, "step": 12627 }, { "crossentropy": 2.560035824775696, "epoch": 0.6866962125125751, "grad_norm": 0.03466436639428139, "grad_norm_var": 1.2794977052582936e-06, "learning_rate": 9.303849575138523e-06, "loss": 2.56, "step": 12628 }, { "crossentropy": 2.4873801469802856, "epoch": 0.6867505913700753, "grad_norm": 0.029803361743688583, "grad_norm_var": 1.3240830146657446e-06, "learning_rate": 9.228078105586457e-06, "loss": 2.4874, "step": 12629 }, { "crossentropy": 2.51607608795166, "epoch": 0.6868049702275755, "grad_norm": 0.031094307079911232, "grad_norm_var": 1.3203708875239583e-06, "learning_rate": 9.152616159056048e-06, "loss": 2.5161, "step": 12630 }, { "crossentropy": 2.500751256942749, "epoch": 0.6868593490850757, "grad_norm": 0.03264102712273598, "grad_norm_var": 1.3683402262232649e-06, "learning_rate": 9.077463740226888e-06, "loss": 2.5008, "step": 12631 }, { "crossentropy": 2.4456902742385864, "epoch": 0.6869137279425759, "grad_norm": 0.031140243634581566, "grad_norm_var": 1.3581306842382878e-06, "learning_rate": 9.002620853761356e-06, "loss": 2.4457, "step": 12632 }, { "crossentropy": 2.498579740524292, "epoch": 0.6869681068000761, "grad_norm": 0.02973208948969841, "grad_norm_var": 1.4729826466427486e-06, "learning_rate": 8.928087504299632e-06, "loss": 2.4986, "step": 12633 }, { "crossentropy": 2.5081353187561035, "epoch": 0.6870224856575763, "grad_norm": 0.03351064398884773, "grad_norm_var": 1.8187192320582296e-06, "learning_rate": 8.853863696464681e-06, "loss": 2.5081, "step": 12634 }, { "crossentropy": 2.497692108154297, "epoch": 0.6870768645150765, "grad_norm": 0.03063979558646679, "grad_norm_var": 1.8116558257328841e-06, "learning_rate": 8.779949434860047e-06, "loss": 2.4977, "step": 12635 }, { "crossentropy": 2.3754764795303345, "epoch": 0.6871312433725767, "grad_norm": 0.030533460900187492, "grad_norm_var": 1.8457820273907906e-06, "learning_rate": 8.706344724069281e-06, "loss": 2.3755, "step": 12636 }, { "crossentropy": 2.503700017929077, "epoch": 0.687185622230077, "grad_norm": 0.030713863670825958, "grad_norm_var": 1.7988126850633039e-06, "learning_rate": 8.63304956865818e-06, "loss": 2.5037, "step": 12637 }, { "crossentropy": 2.5032843351364136, "epoch": 0.6872400010875771, "grad_norm": 0.03190973773598671, "grad_norm_var": 1.8032316359439225e-06, "learning_rate": 8.560063973171439e-06, "loss": 2.5033, "step": 12638 }, { "crossentropy": 2.533935308456421, "epoch": 0.6872943799450774, "grad_norm": 0.030796758830547333, "grad_norm_var": 1.8181381221553382e-06, "learning_rate": 8.487387942136548e-06, "loss": 2.5339, "step": 12639 }, { "crossentropy": 2.5007392168045044, "epoch": 0.6873487588025775, "grad_norm": 0.030113162472844124, "grad_norm_var": 1.7853736491872662e-06, "learning_rate": 8.415021480059903e-06, "loss": 2.5007, "step": 12640 }, { "crossentropy": 2.534903645515442, "epoch": 0.6874031376600778, "grad_norm": 0.03173971176147461, "grad_norm_var": 1.795916177515372e-06, "learning_rate": 8.342964591430135e-06, "loss": 2.5349, "step": 12641 }, { "crossentropy": 2.4795161485671997, "epoch": 0.6874575165175779, "grad_norm": 0.030515462160110474, "grad_norm_var": 1.8004645885066056e-06, "learning_rate": 8.271217280715893e-06, "loss": 2.4795, "step": 12642 }, { "crossentropy": 2.5860108137130737, "epoch": 0.6875118953750782, "grad_norm": 0.03036537393927574, "grad_norm_var": 1.8536150256096324e-06, "learning_rate": 8.199779552366948e-06, "loss": 2.586, "step": 12643 }, { "crossentropy": 2.4026925563812256, "epoch": 0.6875662742325783, "grad_norm": 0.03212357312440872, "grad_norm_var": 1.0985643925229359e-06, "learning_rate": 8.128651410813647e-06, "loss": 2.4027, "step": 12644 }, { "crossentropy": 2.574649691581726, "epoch": 0.6876206530900786, "grad_norm": 0.03140941634774208, "grad_norm_var": 9.85158523897804e-07, "learning_rate": 8.057832860467463e-06, "loss": 2.5746, "step": 12645 }, { "crossentropy": 2.519094228744507, "epoch": 0.6876750319475787, "grad_norm": 0.03130393475294113, "grad_norm_var": 9.853375686064494e-07, "learning_rate": 7.987323905720989e-06, "loss": 2.5191, "step": 12646 }, { "crossentropy": 2.4675196409225464, "epoch": 0.687729410805079, "grad_norm": 0.03111467882990837, "grad_norm_var": 8.375289731423739e-07, "learning_rate": 7.917124550946287e-06, "loss": 2.4675, "step": 12647 }, { "crossentropy": 2.4263226985931396, "epoch": 0.6877837896625791, "grad_norm": 0.03129926323890686, "grad_norm_var": 8.398806583607369e-07, "learning_rate": 7.84723480049765e-06, "loss": 2.4263, "step": 12648 }, { "crossentropy": 2.5038437843322754, "epoch": 0.6878381685200794, "grad_norm": 0.042114365845918655, "grad_norm_var": 8.141252705375411e-06, "learning_rate": 7.777654658708833e-06, "loss": 2.5038, "step": 12649 }, { "crossentropy": 2.4293644428253174, "epoch": 0.6878925473775795, "grad_norm": 0.03712102770805359, "grad_norm_var": 9.737192095937925e-06, "learning_rate": 7.70838412989583e-06, "loss": 2.4294, "step": 12650 }, { "crossentropy": 2.5330153703689575, "epoch": 0.6879469262350798, "grad_norm": 0.030616769567131996, "grad_norm_var": 9.741749243095477e-06, "learning_rate": 7.639423218355202e-06, "loss": 2.533, "step": 12651 }, { "crossentropy": 2.5371737480163574, "epoch": 0.68800130509258, "grad_norm": 0.032110292464494705, "grad_norm_var": 9.565289311072069e-06, "learning_rate": 7.5707719283624185e-06, "loss": 2.5372, "step": 12652 }, { "crossentropy": 2.4510642290115356, "epoch": 0.6880556839500802, "grad_norm": 0.030483990907669067, "grad_norm_var": 9.614462197322868e-06, "learning_rate": 7.502430264176296e-06, "loss": 2.4511, "step": 12653 }, { "crossentropy": 2.4987640380859375, "epoch": 0.6881100628075804, "grad_norm": 0.030503464862704277, "grad_norm_var": 9.791755266119389e-06, "learning_rate": 7.434398230034556e-06, "loss": 2.4988, "step": 12654 }, { "crossentropy": 2.52022385597229, "epoch": 0.6881644416650806, "grad_norm": 0.030308077111840248, "grad_norm_var": 9.892131380606397e-06, "learning_rate": 7.366675830157155e-06, "loss": 2.5202, "step": 12655 }, { "crossentropy": 2.445887327194214, "epoch": 0.6882188205225808, "grad_norm": 0.031616099178791046, "grad_norm_var": 9.639638762528448e-06, "learning_rate": 7.299263068744067e-06, "loss": 2.4459, "step": 12656 }, { "crossentropy": 2.5732951164245605, "epoch": 0.688273199380081, "grad_norm": 0.03069424442946911, "grad_norm_var": 9.76815386765356e-06, "learning_rate": 7.232159949975281e-06, "loss": 2.5733, "step": 12657 }, { "crossentropy": 2.5371519327163696, "epoch": 0.6883275782375813, "grad_norm": 0.0322098471224308, "grad_norm_var": 9.588199556763895e-06, "learning_rate": 7.165366478014135e-06, "loss": 2.5372, "step": 12658 }, { "crossentropy": 2.496555209159851, "epoch": 0.6883819570950814, "grad_norm": 0.031516872346401215, "grad_norm_var": 9.387529864352852e-06, "learning_rate": 7.0988826570012045e-06, "loss": 2.4966, "step": 12659 }, { "crossentropy": 2.5207576751708984, "epoch": 0.6884363359525817, "grad_norm": 0.031841062009334564, "grad_norm_var": 9.39856564597796e-06, "learning_rate": 7.032708491061523e-06, "loss": 2.5208, "step": 12660 }, { "crossentropy": 2.4988720417022705, "epoch": 0.6884907148100818, "grad_norm": 0.03257680684328079, "grad_norm_var": 9.350339611817008e-06, "learning_rate": 6.966843984297921e-06, "loss": 2.4989, "step": 12661 }, { "crossentropy": 2.401450037956238, "epoch": 0.6885450936675821, "grad_norm": 0.05325079336762428, "grad_norm_var": 3.642427673797941e-05, "learning_rate": 6.901289140795463e-06, "loss": 2.4015, "step": 12662 }, { "crossentropy": 2.5666390657424927, "epoch": 0.6885994725250822, "grad_norm": 0.0329851359128952, "grad_norm_var": 3.5995406391491534e-05, "learning_rate": 6.8360439646203425e-06, "loss": 2.5666, "step": 12663 }, { "crossentropy": 2.5410549640655518, "epoch": 0.6886538513825825, "grad_norm": 0.029631314799189568, "grad_norm_var": 3.6731659882185256e-05, "learning_rate": 6.771108459818765e-06, "loss": 2.5411, "step": 12664 }, { "crossentropy": 2.41961669921875, "epoch": 0.6887082302400827, "grad_norm": 0.029834864661097527, "grad_norm_var": 3.24181340844112e-05, "learning_rate": 6.70648263041751e-06, "loss": 2.4196, "step": 12665 }, { "crossentropy": 2.4579333066940308, "epoch": 0.6887626090975829, "grad_norm": 0.03176885098218918, "grad_norm_var": 3.1236442390792284e-05, "learning_rate": 6.642166480425038e-06, "loss": 2.4579, "step": 12666 }, { "crossentropy": 2.5387649536132812, "epoch": 0.6888169879550831, "grad_norm": 0.03145819902420044, "grad_norm_var": 3.1055749253457034e-05, "learning_rate": 6.578160013830381e-06, "loss": 2.5388, "step": 12667 }, { "crossentropy": 2.5315014123916626, "epoch": 0.6888713668125833, "grad_norm": 0.030751457437872887, "grad_norm_var": 3.127334968950832e-05, "learning_rate": 6.5144632346025855e-06, "loss": 2.5315, "step": 12668 }, { "crossentropy": 2.4421842098236084, "epoch": 0.6889257456700835, "grad_norm": 0.030413664877414703, "grad_norm_var": 3.129340120648499e-05, "learning_rate": 6.4510761466923804e-06, "loss": 2.4422, "step": 12669 }, { "crossentropy": 2.5335875749588013, "epoch": 0.6889801245275837, "grad_norm": 0.03202551230788231, "grad_norm_var": 3.101575487557497e-05, "learning_rate": 6.387998754030511e-06, "loss": 2.5336, "step": 12670 }, { "crossentropy": 2.5631848573684692, "epoch": 0.6890345033850839, "grad_norm": 0.030798552557826042, "grad_norm_var": 3.087566281496251e-05, "learning_rate": 6.325231060529957e-06, "loss": 2.5632, "step": 12671 }, { "crossentropy": 2.4901280403137207, "epoch": 0.6890888822425841, "grad_norm": 0.0313342846930027, "grad_norm_var": 3.0921761323167646e-05, "learning_rate": 6.262773070083161e-06, "loss": 2.4901, "step": 12672 }, { "crossentropy": 2.4468973875045776, "epoch": 0.6891432611000843, "grad_norm": 0.03242690488696098, "grad_norm_var": 3.06475880335781e-05, "learning_rate": 6.200624786563136e-06, "loss": 2.4469, "step": 12673 }, { "crossentropy": 2.428927183151245, "epoch": 0.6891976399575845, "grad_norm": 0.03150526434183121, "grad_norm_var": 3.073419852384999e-05, "learning_rate": 6.1387862138251315e-06, "loss": 2.4289, "step": 12674 }, { "crossentropy": 2.4448814392089844, "epoch": 0.6892520188150847, "grad_norm": 0.03247016295790672, "grad_norm_var": 3.063330936406468e-05, "learning_rate": 6.077257355703303e-06, "loss": 2.4449, "step": 12675 }, { "crossentropy": 2.4879891872406006, "epoch": 0.689306397672585, "grad_norm": 0.032741740345954895, "grad_norm_var": 3.056680363606127e-05, "learning_rate": 6.0160382160140415e-06, "loss": 2.488, "step": 12676 }, { "crossentropy": 2.563620448112488, "epoch": 0.6893607765300851, "grad_norm": 0.03158358484506607, "grad_norm_var": 3.066772960378609e-05, "learning_rate": 5.9551287985548655e-06, "loss": 2.5636, "step": 12677 }, { "crossentropy": 2.484247922897339, "epoch": 0.6894151553875854, "grad_norm": 0.031907688826322556, "grad_norm_var": 9.725130104623036e-07, "learning_rate": 5.894529107102198e-06, "loss": 2.4842, "step": 12678 }, { "crossentropy": 2.6261075735092163, "epoch": 0.6894695342450855, "grad_norm": 0.03202291205525398, "grad_norm_var": 8.369331654034067e-07, "learning_rate": 5.834239145414699e-06, "loss": 2.6261, "step": 12679 }, { "crossentropy": 2.483777403831482, "epoch": 0.6895239131025858, "grad_norm": 0.03208254650235176, "grad_norm_var": 6.287891736252479e-07, "learning_rate": 5.7742589172321555e-06, "loss": 2.4838, "step": 12680 }, { "crossentropy": 2.5684951543807983, "epoch": 0.6895782919600859, "grad_norm": 0.030546247959136963, "grad_norm_var": 4.958021010160555e-07, "learning_rate": 5.714588426273259e-06, "loss": 2.5685, "step": 12681 }, { "crossentropy": 2.506978750228882, "epoch": 0.6896326708175862, "grad_norm": 0.030422642827033997, "grad_norm_var": 5.814267738113906e-07, "learning_rate": 5.655227676240049e-06, "loss": 2.507, "step": 12682 }, { "crossentropy": 2.4585092067718506, "epoch": 0.6896870496750863, "grad_norm": 0.031704436987638474, "grad_norm_var": 5.828356708314474e-07, "learning_rate": 5.596176670812914e-06, "loss": 2.4585, "step": 12683 }, { "crossentropy": 2.5070852041244507, "epoch": 0.6897414285325866, "grad_norm": 0.030389118939638138, "grad_norm_var": 6.294318706981239e-07, "learning_rate": 5.5374354136550344e-06, "loss": 2.5071, "step": 12684 }, { "crossentropy": 2.557254910469055, "epoch": 0.6897958073900867, "grad_norm": 0.03147102892398834, "grad_norm_var": 5.428478752942846e-07, "learning_rate": 5.479003908409052e-06, "loss": 2.5573, "step": 12685 }, { "crossentropy": 2.4826297760009766, "epoch": 0.689850186247587, "grad_norm": 0.03000427596271038, "grad_norm_var": 6.806912356132599e-07, "learning_rate": 5.420882158698737e-06, "loss": 2.4826, "step": 12686 }, { "crossentropy": 2.584039092063904, "epoch": 0.6899045651050871, "grad_norm": 0.03169365972280502, "grad_norm_var": 6.514417520226869e-07, "learning_rate": 5.363070168128425e-06, "loss": 2.584, "step": 12687 }, { "crossentropy": 2.530279278755188, "epoch": 0.6899589439625874, "grad_norm": 0.0317964106798172, "grad_norm_var": 6.533980823305071e-07, "learning_rate": 5.305567940284694e-06, "loss": 2.5303, "step": 12688 }, { "crossentropy": 2.5314308404922485, "epoch": 0.6900133228200875, "grad_norm": 0.03160854056477547, "grad_norm_var": 5.993579466388356e-07, "learning_rate": 5.2483754787330255e-06, "loss": 2.5314, "step": 12689 }, { "crossentropy": 2.5671424865722656, "epoch": 0.6900677016775878, "grad_norm": 0.031166432425379753, "grad_norm_var": 6.061551191598643e-07, "learning_rate": 5.191492787020024e-06, "loss": 2.5671, "step": 12690 }, { "crossentropy": 2.5157190561294556, "epoch": 0.690122080535088, "grad_norm": 0.031329236924648285, "grad_norm_var": 5.362331672308906e-07, "learning_rate": 5.134919868674537e-06, "loss": 2.5157, "step": 12691 }, { "crossentropy": 2.493574619293213, "epoch": 0.6901764593925882, "grad_norm": 0.03035411611199379, "grad_norm_var": 4.667899264264771e-07, "learning_rate": 5.0786567272043116e-06, "loss": 2.4936, "step": 12692 }, { "crossentropy": 2.543147921562195, "epoch": 0.6902308382500884, "grad_norm": 0.030710948631167412, "grad_norm_var": 4.7617290616975595e-07, "learning_rate": 5.02270336609878e-06, "loss": 2.5431, "step": 12693 }, { "crossentropy": 2.5558841228485107, "epoch": 0.6902852171075886, "grad_norm": 0.02999970130622387, "grad_norm_var": 5.238269491362098e-07, "learning_rate": 4.9670597888290535e-06, "loss": 2.5559, "step": 12694 }, { "crossentropy": 2.3434898853302, "epoch": 0.6903395959650888, "grad_norm": 0.03062896430492401, "grad_norm_var": 4.7027927998066534e-07, "learning_rate": 4.9117259988445964e-06, "loss": 2.3435, "step": 12695 }, { "crossentropy": 2.526820182800293, "epoch": 0.690393974822589, "grad_norm": 0.02998666651546955, "grad_norm_var": 4.4070404918206533e-07, "learning_rate": 4.856701999578772e-06, "loss": 2.5268, "step": 12696 }, { "crossentropy": 2.524345636367798, "epoch": 0.6904483536800892, "grad_norm": 0.030994832515716553, "grad_norm_var": 4.343189068209263e-07, "learning_rate": 4.801987794442741e-06, "loss": 2.5243, "step": 12697 }, { "crossentropy": 2.505065679550171, "epoch": 0.6905027325375894, "grad_norm": 0.031174752861261368, "grad_norm_var": 4.226743453240491e-07, "learning_rate": 4.747583386830456e-06, "loss": 2.5051, "step": 12698 }, { "crossentropy": 2.507229447364807, "epoch": 0.6905571113950896, "grad_norm": 0.03143530339002609, "grad_norm_var": 3.9970969925906946e-07, "learning_rate": 4.6934887801164395e-06, "loss": 2.5072, "step": 12699 }, { "crossentropy": 2.475939154624939, "epoch": 0.6906114902525898, "grad_norm": 0.02985927276313305, "grad_norm_var": 4.548663907120882e-07, "learning_rate": 4.6397039776546745e-06, "loss": 2.4759, "step": 12700 }, { "crossentropy": 2.506360173225403, "epoch": 0.69066586911009, "grad_norm": 0.03061758540570736, "grad_norm_var": 4.340886483153768e-07, "learning_rate": 4.586228982781382e-06, "loss": 2.5064, "step": 12701 }, { "crossentropy": 2.4659069776535034, "epoch": 0.6907202479675902, "grad_norm": 0.030812092125415802, "grad_norm_var": 3.8539306281322694e-07, "learning_rate": 4.533063798813352e-06, "loss": 2.4659, "step": 12702 }, { "crossentropy": 2.5170745849609375, "epoch": 0.6907746268250904, "grad_norm": 0.030920185148715973, "grad_norm_var": 3.3944235633403866e-07, "learning_rate": 4.480208429046839e-06, "loss": 2.5171, "step": 12703 }, { "crossentropy": 2.413001775741577, "epoch": 0.6908290056825906, "grad_norm": 0.03009972721338272, "grad_norm_var": 3.023641216732056e-07, "learning_rate": 4.427662876760885e-06, "loss": 2.413, "step": 12704 }, { "crossentropy": 2.571622848510742, "epoch": 0.6908833845400908, "grad_norm": 0.031608764082193375, "grad_norm_var": 3.02390273150023e-07, "learning_rate": 4.375427145213995e-06, "loss": 2.5716, "step": 12705 }, { "crossentropy": 2.5291887521743774, "epoch": 0.690937763397591, "grad_norm": 0.03061065636575222, "grad_norm_var": 2.894406139231434e-07, "learning_rate": 4.323501237645245e-06, "loss": 2.5292, "step": 12706 }, { "crossentropy": 2.471206545829773, "epoch": 0.6909921422550912, "grad_norm": 0.03136945515871048, "grad_norm_var": 2.9293511658491875e-07, "learning_rate": 4.2718851572759455e-06, "loss": 2.4712, "step": 12707 }, { "crossentropy": 2.6341546773910522, "epoch": 0.6910465211125915, "grad_norm": 0.03159742057323456, "grad_norm_var": 3.3238534691582926e-07, "learning_rate": 4.2205789073063165e-06, "loss": 2.6342, "step": 12708 }, { "crossentropy": 2.520006537437439, "epoch": 0.6911008999700916, "grad_norm": 0.03078770637512207, "grad_norm_var": 3.320812153469275e-07, "learning_rate": 4.169582490918811e-06, "loss": 2.52, "step": 12709 }, { "crossentropy": 2.5244816541671753, "epoch": 0.6911552788275919, "grad_norm": 0.030981875956058502, "grad_norm_var": 2.8999867111804275e-07, "learning_rate": 4.11889591127701e-06, "loss": 2.5245, "step": 12710 }, { "crossentropy": 2.537869691848755, "epoch": 0.691209657685092, "grad_norm": 0.03054921329021454, "grad_norm_var": 2.9267030650873687e-07, "learning_rate": 4.068519171522844e-06, "loss": 2.5379, "step": 12711 }, { "crossentropy": 2.494687557220459, "epoch": 0.6912640365425923, "grad_norm": 0.03328994661569595, "grad_norm_var": 5.997585094284861e-07, "learning_rate": 4.018452274781592e-06, "loss": 2.4947, "step": 12712 }, { "crossentropy": 2.5444239377975464, "epoch": 0.6913184154000924, "grad_norm": 0.030249323695898056, "grad_norm_var": 6.394120349478924e-07, "learning_rate": 3.968695224158547e-06, "loss": 2.5444, "step": 12713 }, { "crossentropy": 2.594771146774292, "epoch": 0.6913727942575927, "grad_norm": 0.031222455203533173, "grad_norm_var": 6.406803338235376e-07, "learning_rate": 3.919248022739019e-06, "loss": 2.5948, "step": 12714 }, { "crossentropy": 2.487107038497925, "epoch": 0.6914271731150928, "grad_norm": 0.030908264219760895, "grad_norm_var": 6.274996239483682e-07, "learning_rate": 3.870110673589444e-06, "loss": 2.4871, "step": 12715 }, { "crossentropy": 2.5196391344070435, "epoch": 0.6914815519725931, "grad_norm": 0.03116670250892639, "grad_norm_var": 5.411018674775573e-07, "learning_rate": 3.82128317975794e-06, "loss": 2.5196, "step": 12716 }, { "crossentropy": 2.501866102218628, "epoch": 0.6915359308300932, "grad_norm": 0.03258019685745239, "grad_norm_var": 6.688282681589861e-07, "learning_rate": 3.7727655442726384e-06, "loss": 2.5019, "step": 12717 }, { "crossentropy": 2.4856507778167725, "epoch": 0.6915903096875935, "grad_norm": 0.031152993440628052, "grad_norm_var": 6.597269089377341e-07, "learning_rate": 3.7245577701428002e-06, "loss": 2.4857, "step": 12718 }, { "crossentropy": 2.430757761001587, "epoch": 0.6916446885450936, "grad_norm": 0.03041115775704384, "grad_norm_var": 6.944664579144512e-07, "learning_rate": 3.6766598603577007e-06, "loss": 2.4308, "step": 12719 }, { "crossentropy": 2.5662986040115356, "epoch": 0.6916990674025939, "grad_norm": 0.033367790281772614, "grad_norm_var": 8.992718611871744e-07, "learning_rate": 3.629071817887741e-06, "loss": 2.5663, "step": 12720 }, { "crossentropy": 2.572746515274048, "epoch": 0.691753446260094, "grad_norm": 0.03173376992344856, "grad_norm_var": 9.042969363549548e-07, "learning_rate": 3.581793645685005e-06, "loss": 2.5727, "step": 12721 }, { "crossentropy": 2.4595812559127808, "epoch": 0.6918078251175943, "grad_norm": 0.03013957478106022, "grad_norm_var": 9.660931762492703e-07, "learning_rate": 3.5348253466810364e-06, "loss": 2.4596, "step": 12722 }, { "crossentropy": 2.5196667909622192, "epoch": 0.6918622039750945, "grad_norm": 0.032792821526527405, "grad_norm_var": 1.097501723920285e-06, "learning_rate": 3.48816692378906e-06, "loss": 2.5197, "step": 12723 }, { "crossentropy": 2.4222936630249023, "epoch": 0.6919165828325947, "grad_norm": 0.030420072376728058, "grad_norm_var": 1.1583568439205478e-06, "learning_rate": 3.441818379902317e-06, "loss": 2.4223, "step": 12724 }, { "crossentropy": 2.5462065935134888, "epoch": 0.6919709616900949, "grad_norm": 0.031121067702770233, "grad_norm_var": 1.1398820901239628e-06, "learning_rate": 3.3957797178962855e-06, "loss": 2.5462, "step": 12725 }, { "crossentropy": 2.463660478591919, "epoch": 0.6920253405475951, "grad_norm": 0.0306264478713274, "grad_norm_var": 1.166666325629062e-06, "learning_rate": 3.3500509406253486e-06, "loss": 2.4637, "step": 12726 }, { "crossentropy": 2.5190317630767822, "epoch": 0.6920797194050953, "grad_norm": 0.029914721846580505, "grad_norm_var": 1.260270052509671e-06, "learning_rate": 3.3046320509261264e-06, "loss": 2.519, "step": 12727 }, { "crossentropy": 2.460067391395569, "epoch": 0.6921340982625955, "grad_norm": 0.0298831295222044, "grad_norm_var": 1.0901929127101004e-06, "learning_rate": 3.259523051615254e-06, "loss": 2.4601, "step": 12728 }, { "crossentropy": 2.415026068687439, "epoch": 0.6921884771200957, "grad_norm": 0.030727049335837364, "grad_norm_var": 1.0499112138184774e-06, "learning_rate": 3.214723945489939e-06, "loss": 2.415, "step": 12729 }, { "crossentropy": 2.5784475803375244, "epoch": 0.6922428559775959, "grad_norm": 0.031023701652884483, "grad_norm_var": 1.0500761519848286e-06, "learning_rate": 3.1702347353290694e-06, "loss": 2.5784, "step": 12730 }, { "crossentropy": 2.394230604171753, "epoch": 0.6922972348350961, "grad_norm": 0.03138098493218422, "grad_norm_var": 1.0505022752551787e-06, "learning_rate": 3.1260554238921046e-06, "loss": 2.3942, "step": 12731 }, { "crossentropy": 2.5623570680618286, "epoch": 0.6923516136925963, "grad_norm": 0.030218832194805145, "grad_norm_var": 1.104878194932189e-06, "learning_rate": 3.0821860139179646e-06, "loss": 2.5624, "step": 12732 }, { "crossentropy": 2.391826033592224, "epoch": 0.6924059925500965, "grad_norm": 0.03159424290060997, "grad_norm_var": 9.70178947955882e-07, "learning_rate": 3.0386265081283615e-06, "loss": 2.3918, "step": 12733 }, { "crossentropy": 2.453009247779846, "epoch": 0.6924603714075968, "grad_norm": 0.02996027283370495, "grad_norm_var": 1.0398126362410123e-06, "learning_rate": 2.9953769092244677e-06, "loss": 2.453, "step": 12734 }, { "crossentropy": 2.428253173828125, "epoch": 0.6925147502650969, "grad_norm": 0.03148160129785538, "grad_norm_var": 1.0334900010700958e-06, "learning_rate": 2.952437219888582e-06, "loss": 2.4283, "step": 12735 }, { "crossentropy": 2.5683624744415283, "epoch": 0.6925691291225972, "grad_norm": 0.031937744468450546, "grad_norm_var": 7.14432242175135e-07, "learning_rate": 2.909807442784129e-06, "loss": 2.5684, "step": 12736 }, { "crossentropy": 2.4607222080230713, "epoch": 0.6926235079800973, "grad_norm": 0.03201723098754883, "grad_norm_var": 7.496528500651005e-07, "learning_rate": 2.867487580553996e-06, "loss": 2.4607, "step": 12737 }, { "crossentropy": 2.5176422595977783, "epoch": 0.6926778868375976, "grad_norm": 0.0305464006960392, "grad_norm_var": 7.159028929685686e-07, "learning_rate": 2.8254776358238587e-06, "loss": 2.5176, "step": 12738 }, { "crossentropy": 2.4514538049697876, "epoch": 0.6927322656950977, "grad_norm": 0.030659999698400497, "grad_norm_var": 4.840890022234177e-07, "learning_rate": 2.7837776111988568e-06, "loss": 2.4515, "step": 12739 }, { "crossentropy": 2.530973792076111, "epoch": 0.692786644552598, "grad_norm": 0.03129059076309204, "grad_norm_var": 4.821778197843743e-07, "learning_rate": 2.7423875092646987e-06, "loss": 2.531, "step": 12740 }, { "crossentropy": 2.537309169769287, "epoch": 0.6928410234100981, "grad_norm": 0.03177559748291969, "grad_norm_var": 5.283332847886285e-07, "learning_rate": 2.701307332589331e-06, "loss": 2.5373, "step": 12741 }, { "crossentropy": 2.581383466720581, "epoch": 0.6928954022675984, "grad_norm": 0.03061482310295105, "grad_norm_var": 5.288275862443126e-07, "learning_rate": 2.6605370837190502e-06, "loss": 2.5814, "step": 12742 }, { "crossentropy": 2.318603038787842, "epoch": 0.6929497811250985, "grad_norm": 0.029347660019993782, "grad_norm_var": 6.263827190887966e-07, "learning_rate": 2.620076765183499e-06, "loss": 2.3186, "step": 12743 }, { "crossentropy": 2.5867536067962646, "epoch": 0.6930041599825988, "grad_norm": 0.031635385006666183, "grad_norm_var": 5.798329803368788e-07, "learning_rate": 2.5799263794923368e-06, "loss": 2.5868, "step": 12744 }, { "crossentropy": 2.5657827854156494, "epoch": 0.6930585388400989, "grad_norm": 0.031059924513101578, "grad_norm_var": 5.740554762315251e-07, "learning_rate": 2.5400859291346835e-06, "loss": 2.5658, "step": 12745 }, { "crossentropy": 2.513761878013611, "epoch": 0.6931129176975992, "grad_norm": 0.03184027969837189, "grad_norm_var": 6.146024497374891e-07, "learning_rate": 2.5005554165813406e-06, "loss": 2.5138, "step": 12746 }, { "crossentropy": 2.500722646713257, "epoch": 0.6931672965550993, "grad_norm": 0.03013082966208458, "grad_norm_var": 6.629623748997948e-07, "learning_rate": 2.4613348442847905e-06, "loss": 2.5007, "step": 12747 }, { "crossentropy": 2.4664775133132935, "epoch": 0.6932216754125996, "grad_norm": 0.03171273693442345, "grad_norm_var": 6.454610983377383e-07, "learning_rate": 2.422424214676977e-06, "loss": 2.4665, "step": 12748 }, { "crossentropy": 2.460361957550049, "epoch": 0.6932760542700998, "grad_norm": 0.030503608286380768, "grad_norm_var": 6.479804030681361e-07, "learning_rate": 2.383823530171525e-06, "loss": 2.4604, "step": 12749 }, { "crossentropy": 2.584271192550659, "epoch": 0.6933304331276, "grad_norm": 0.03750148043036461, "grad_norm_var": 3.1245592847373246e-06, "learning_rate": 2.34553279316152e-06, "loss": 2.5843, "step": 12750 }, { "crossentropy": 2.5501173734664917, "epoch": 0.6933848119851002, "grad_norm": 0.03195887431502342, "grad_norm_var": 3.1374030013003033e-06, "learning_rate": 2.3075520060228393e-06, "loss": 2.5501, "step": 12751 }, { "crossentropy": 2.4557902812957764, "epoch": 0.6934391908426004, "grad_norm": 0.03116939775645733, "grad_norm_var": 3.132868826707877e-06, "learning_rate": 2.269881171109711e-06, "loss": 2.4558, "step": 12752 }, { "crossentropy": 2.507534146308899, "epoch": 0.6934935697001006, "grad_norm": 0.0322829931974411, "grad_norm_var": 3.156132084826344e-06, "learning_rate": 2.2325202907597097e-06, "loss": 2.5075, "step": 12753 }, { "crossentropy": 2.5360074043273926, "epoch": 0.6935479485576008, "grad_norm": 0.03099272958934307, "grad_norm_var": 3.111719742494032e-06, "learning_rate": 2.1954693672893155e-06, "loss": 2.536, "step": 12754 }, { "crossentropy": 2.449878692626953, "epoch": 0.693602327415101, "grad_norm": 0.03065178170800209, "grad_norm_var": 3.1126770390891516e-06, "learning_rate": 2.158728402996135e-06, "loss": 2.4499, "step": 12755 }, { "crossentropy": 2.3794039487838745, "epoch": 0.6936567062726012, "grad_norm": 0.02995975688099861, "grad_norm_var": 3.26572841826433e-06, "learning_rate": 2.1222974001594564e-06, "loss": 2.3794, "step": 12756 }, { "crossentropy": 2.5506671667099, "epoch": 0.6937110851301014, "grad_norm": 0.030833303928375244, "grad_norm_var": 3.279827489633064e-06, "learning_rate": 2.086176361038583e-06, "loss": 2.5507, "step": 12757 }, { "crossentropy": 2.5071122646331787, "epoch": 0.6937654639876016, "grad_norm": 0.03000260889530182, "grad_norm_var": 3.36630275636165e-06, "learning_rate": 2.050365287872835e-06, "loss": 2.5071, "step": 12758 }, { "crossentropy": 2.453134775161743, "epoch": 0.6938198428451018, "grad_norm": 0.031112490221858025, "grad_norm_var": 3.0900397075644583e-06, "learning_rate": 2.014864182884324e-06, "loss": 2.4531, "step": 12759 }, { "crossentropy": 2.524025797843933, "epoch": 0.693874221702602, "grad_norm": 0.0314997136592865, "grad_norm_var": 3.0880041353956484e-06, "learning_rate": 1.9796730482746216e-06, "loss": 2.524, "step": 12760 }, { "crossentropy": 2.483076810836792, "epoch": 0.6939286005601022, "grad_norm": 0.030882805585861206, "grad_norm_var": 3.0991952611202626e-06, "learning_rate": 1.9447918862253167e-06, "loss": 2.4831, "step": 12761 }, { "crossentropy": 2.50284481048584, "epoch": 0.6939829794176025, "grad_norm": 0.030996939167380333, "grad_norm_var": 3.0986047088075544e-06, "learning_rate": 1.9102206989007885e-06, "loss": 2.5028, "step": 12762 }, { "crossentropy": 2.490221858024597, "epoch": 0.6940373582751026, "grad_norm": 0.030077828094363213, "grad_norm_var": 3.1076575035056715e-06, "learning_rate": 1.8759594884443231e-06, "loss": 2.4902, "step": 12763 }, { "crossentropy": 2.501746654510498, "epoch": 0.6940917371326029, "grad_norm": 0.031480614095926285, "grad_norm_var": 3.1008411757100686e-06, "learning_rate": 1.8420082569808872e-06, "loss": 2.5017, "step": 12764 }, { "crossentropy": 2.558900237083435, "epoch": 0.694146115990103, "grad_norm": 0.03162495419383049, "grad_norm_var": 3.050015248050999e-06, "learning_rate": 1.8083670066171288e-06, "loss": 2.5589, "step": 12765 }, { "crossentropy": 2.485729932785034, "epoch": 0.6942004948476033, "grad_norm": 0.03057153895497322, "grad_norm_var": 4.500828806410963e-07, "learning_rate": 1.7750357394380468e-06, "loss": 2.4857, "step": 12766 }, { "crossentropy": 2.506337881088257, "epoch": 0.6942548737051034, "grad_norm": 0.030654877424240112, "grad_norm_var": 3.9071100988867917e-07, "learning_rate": 1.7420144575114315e-06, "loss": 2.5063, "step": 12767 }, { "crossentropy": 2.4927898645401, "epoch": 0.6943092525626037, "grad_norm": 0.03107772395014763, "grad_norm_var": 3.882446199409787e-07, "learning_rate": 1.7093031628850896e-06, "loss": 2.4928, "step": 12768 }, { "crossentropy": 2.5099189281463623, "epoch": 0.6943636314201038, "grad_norm": 0.031989432871341705, "grad_norm_var": 3.4023887309775345e-07, "learning_rate": 1.6769018575885086e-06, "loss": 2.5099, "step": 12769 }, { "crossentropy": 2.4417728185653687, "epoch": 0.6944180102776041, "grad_norm": 0.031009703874588013, "grad_norm_var": 3.404654630221089e-07, "learning_rate": 1.644810543630082e-06, "loss": 2.4418, "step": 12770 }, { "crossentropy": 2.553871750831604, "epoch": 0.6944723891351042, "grad_norm": 0.02995530515909195, "grad_norm_var": 3.939846961043079e-07, "learning_rate": 1.6130292230004395e-06, "loss": 2.5539, "step": 12771 }, { "crossentropy": 2.4420297145843506, "epoch": 0.6945267679926045, "grad_norm": 0.030296429991722107, "grad_norm_var": 3.6074260729179223e-07, "learning_rate": 1.5815578976707822e-06, "loss": 2.442, "step": 12772 }, { "crossentropy": 2.373917579650879, "epoch": 0.6945811468501046, "grad_norm": 0.03039873018860817, "grad_norm_var": 3.752019992134346e-07, "learning_rate": 1.5503965695928824e-06, "loss": 2.3739, "step": 12773 }, { "crossentropy": 2.442262887954712, "epoch": 0.6946355257076049, "grad_norm": 0.03137406334280968, "grad_norm_var": 3.3744076506450093e-07, "learning_rate": 1.519545240699638e-06, "loss": 2.4423, "step": 12774 }, { "crossentropy": 2.4947314262390137, "epoch": 0.694689904565105, "grad_norm": 0.029667412862181664, "grad_norm_var": 4.3427765595304273e-07, "learning_rate": 1.4890039129039634e-06, "loss": 2.4947, "step": 12775 }, { "crossentropy": 2.6171733140945435, "epoch": 0.6947442834226053, "grad_norm": 0.030686480924487114, "grad_norm_var": 4.048786174629578e-07, "learning_rate": 1.4587725880998993e-06, "loss": 2.6172, "step": 12776 }, { "crossentropy": 2.5203115940093994, "epoch": 0.6947986622801055, "grad_norm": 0.030301207676529884, "grad_norm_var": 4.1933101352016553e-07, "learning_rate": 1.4288512681631672e-06, "loss": 2.5203, "step": 12777 }, { "crossentropy": 2.4497263431549072, "epoch": 0.6948530411376057, "grad_norm": 0.030048778280615807, "grad_norm_var": 4.4559050899407813e-07, "learning_rate": 1.3992399549483948e-06, "loss": 2.4497, "step": 12778 }, { "crossentropy": 2.608184576034546, "epoch": 0.6949074199951059, "grad_norm": 0.030128763988614082, "grad_norm_var": 4.4152081034690094e-07, "learning_rate": 1.3699386502930012e-06, "loss": 2.6082, "step": 12779 }, { "crossentropy": 2.405240058898926, "epoch": 0.6949617988526061, "grad_norm": 0.030702941119670868, "grad_norm_var": 3.988054343605768e-07, "learning_rate": 1.3409473560133111e-06, "loss": 2.4052, "step": 12780 }, { "crossentropy": 2.4463740587234497, "epoch": 0.6950161777101063, "grad_norm": 0.029790323227643967, "grad_norm_var": 3.7203220522688485e-07, "learning_rate": 1.3122660739078863e-06, "loss": 2.4464, "step": 12781 }, { "crossentropy": 2.609679698944092, "epoch": 0.6950705565676065, "grad_norm": 0.031044702976942062, "grad_norm_var": 3.879606395917745e-07, "learning_rate": 1.2838948057558585e-06, "loss": 2.6097, "step": 12782 }, { "crossentropy": 2.503589391708374, "epoch": 0.6951249354251067, "grad_norm": 0.030045315623283386, "grad_norm_var": 4.043200215073413e-07, "learning_rate": 1.2558335533158217e-06, "loss": 2.5036, "step": 12783 }, { "crossentropy": 2.547238349914551, "epoch": 0.6951793142826069, "grad_norm": 0.031184010207653046, "grad_norm_var": 4.127550885135578e-07, "learning_rate": 1.2280823183291601e-06, "loss": 2.5472, "step": 12784 }, { "crossentropy": 2.6108230352401733, "epoch": 0.6952336931401071, "grad_norm": 0.03161076456308365, "grad_norm_var": 3.484846275223722e-07, "learning_rate": 1.2006411025161645e-06, "loss": 2.6108, "step": 12785 }, { "crossentropy": 2.4185991287231445, "epoch": 0.6952880719976073, "grad_norm": 0.031145907938480377, "grad_norm_var": 3.586225893644947e-07, "learning_rate": 1.173509907579362e-06, "loss": 2.4186, "step": 12786 }, { "crossentropy": 2.461664319038391, "epoch": 0.6953424508551075, "grad_norm": 0.030911598354578018, "grad_norm_var": 3.4328957106559363e-07, "learning_rate": 1.1466887352007405e-06, "loss": 2.4617, "step": 12787 }, { "crossentropy": 2.5209447145462036, "epoch": 0.6953968297126077, "grad_norm": 0.030997272580862045, "grad_norm_var": 3.4715453116547446e-07, "learning_rate": 1.120177587044524e-06, "loss": 2.5209, "step": 12788 }, { "crossentropy": 2.6553510427474976, "epoch": 0.6954512085701079, "grad_norm": 0.033145032823085785, "grad_norm_var": 7.348106856930181e-07, "learning_rate": 1.093976464753843e-06, "loss": 2.6554, "step": 12789 }, { "crossentropy": 2.477663278579712, "epoch": 0.6955055874276082, "grad_norm": 0.031109819188714027, "grad_norm_var": 7.189150657463395e-07, "learning_rate": 1.068085369954619e-06, "loss": 2.4777, "step": 12790 }, { "crossentropy": 2.4959765672683716, "epoch": 0.6955599662851083, "grad_norm": 0.029927603900432587, "grad_norm_var": 6.844607988285792e-07, "learning_rate": 1.0425043042522343e-06, "loss": 2.496, "step": 12791 }, { "crossentropy": 2.378997325897217, "epoch": 0.6956143451426086, "grad_norm": 0.030535168945789337, "grad_norm_var": 6.881574367451418e-07, "learning_rate": 1.0172332692331976e-06, "loss": 2.379, "step": 12792 }, { "crossentropy": 2.5071009397506714, "epoch": 0.6956687240001087, "grad_norm": 0.03095882013440132, "grad_norm_var": 6.723868206318182e-07, "learning_rate": 9.922722664645889e-07, "loss": 2.5071, "step": 12793 }, { "crossentropy": 2.5060616731643677, "epoch": 0.695723102857609, "grad_norm": 0.031008651480078697, "grad_norm_var": 6.299338497820026e-07, "learning_rate": 9.676212974951692e-07, "loss": 2.5061, "step": 12794 }, { "crossentropy": 2.5282329320907593, "epoch": 0.6957774817151091, "grad_norm": 0.03168962523341179, "grad_norm_var": 6.236902240094877e-07, "learning_rate": 9.432803638526055e-07, "loss": 2.5282, "step": 12795 }, { "crossentropy": 2.5615652799606323, "epoch": 0.6958318605726094, "grad_norm": 0.031020907685160637, "grad_norm_var": 6.179250896924295e-07, "learning_rate": 9.192494670479113e-07, "loss": 2.5616, "step": 12796 }, { "crossentropy": 2.4668549299240112, "epoch": 0.6958862394301095, "grad_norm": 0.029772507026791573, "grad_norm_var": 6.208371439696155e-07, "learning_rate": 8.955286085698955e-07, "loss": 2.4669, "step": 12797 }, { "crossentropy": 2.575287342071533, "epoch": 0.6959406182876098, "grad_norm": 0.030878575518727303, "grad_norm_var": 6.217209655381365e-07, "learning_rate": 8.721177898912691e-07, "loss": 2.5753, "step": 12798 }, { "crossentropy": 2.511898994445801, "epoch": 0.6959949971451099, "grad_norm": 0.030762149021029472, "grad_norm_var": 5.629389498380359e-07, "learning_rate": 8.490170124630936e-07, "loss": 2.5119, "step": 12799 }, { "crossentropy": 2.435673952102661, "epoch": 0.6960493760026102, "grad_norm": 0.030775168910622597, "grad_norm_var": 5.65598329627552e-07, "learning_rate": 8.262262777175566e-07, "loss": 2.4357, "step": 12800 }, { "crossentropy": 2.4011411666870117, "epoch": 0.6961037548601103, "grad_norm": 0.031630855053663254, "grad_norm_var": 5.672178472414333e-07, "learning_rate": 8.037455870696376e-07, "loss": 2.4011, "step": 12801 }, { "crossentropy": 2.6244184970855713, "epoch": 0.6961581337176106, "grad_norm": 0.031502898782491684, "grad_norm_var": 5.813258116908835e-07, "learning_rate": 7.81574941912111e-07, "loss": 2.6244, "step": 12802 }, { "crossentropy": 2.5250091552734375, "epoch": 0.6962125125751107, "grad_norm": 0.03121660277247429, "grad_norm_var": 5.819522200236111e-07, "learning_rate": 7.597143436205434e-07, "loss": 2.525, "step": 12803 }, { "crossentropy": 2.562278985977173, "epoch": 0.696266891432611, "grad_norm": 0.03188113495707512, "grad_norm_var": 6.235944387859704e-07, "learning_rate": 7.381637935510722e-07, "loss": 2.5623, "step": 12804 }, { "crossentropy": 2.451061725616455, "epoch": 0.6963212702901111, "grad_norm": 0.03071591816842556, "grad_norm_var": 3.3439530643294045e-07, "learning_rate": 7.169232930392955e-07, "loss": 2.4511, "step": 12805 }, { "crossentropy": 2.555885672569275, "epoch": 0.6963756491476114, "grad_norm": 0.06974086165428162, "grad_norm_var": 9.436992394371162e-05, "learning_rate": 6.959928434036033e-07, "loss": 2.5559, "step": 12806 }, { "crossentropy": 2.498378872871399, "epoch": 0.6964300280051117, "grad_norm": 0.030121009796857834, "grad_norm_var": 9.428333412114021e-05, "learning_rate": 6.753724459412913e-07, "loss": 2.4984, "step": 12807 }, { "crossentropy": 2.406861901283264, "epoch": 0.6964844068626118, "grad_norm": 0.03244832530617714, "grad_norm_var": 9.378432747188512e-05, "learning_rate": 6.550621019318914e-07, "loss": 2.4069, "step": 12808 }, { "crossentropy": 2.532208800315857, "epoch": 0.6965387857201121, "grad_norm": 0.03032192401587963, "grad_norm_var": 9.402613362957946e-05, "learning_rate": 6.350618126343966e-07, "loss": 2.5322, "step": 12809 }, { "crossentropy": 2.484745502471924, "epoch": 0.6965931645776122, "grad_norm": 0.031934257596731186, "grad_norm_var": 9.377616871626564e-05, "learning_rate": 6.153715792894809e-07, "loss": 2.4847, "step": 12810 }, { "crossentropy": 2.5241129398345947, "epoch": 0.6966475434351125, "grad_norm": 0.030669966712594032, "grad_norm_var": 9.409078568412456e-05, "learning_rate": 5.959914031183899e-07, "loss": 2.5241, "step": 12811 }, { "crossentropy": 2.3663700819015503, "epoch": 0.6967019222926126, "grad_norm": 0.030840516090393066, "grad_norm_var": 9.415153477394809e-05, "learning_rate": 5.769212853223848e-07, "loss": 2.3664, "step": 12812 }, { "crossentropy": 2.5784263610839844, "epoch": 0.6967563011501129, "grad_norm": 0.030320895835757256, "grad_norm_var": 9.390137971089205e-05, "learning_rate": 5.581612270855185e-07, "loss": 2.5784, "step": 12813 }, { "crossentropy": 2.51546311378479, "epoch": 0.696810680007613, "grad_norm": 0.030416084453463554, "grad_norm_var": 9.407547882571466e-05, "learning_rate": 5.397112295701945e-07, "loss": 2.5155, "step": 12814 }, { "crossentropy": 2.486360192298889, "epoch": 0.6968650588651133, "grad_norm": 0.0342702642083168, "grad_norm_var": 9.358453795864848e-05, "learning_rate": 5.215712939210527e-07, "loss": 2.4864, "step": 12815 }, { "crossentropy": 2.553286910057068, "epoch": 0.6969194377226134, "grad_norm": 0.03111201710999012, "grad_norm_var": 9.346137048053399e-05, "learning_rate": 5.037414212633041e-07, "loss": 2.5533, "step": 12816 }, { "crossentropy": 2.469987988471985, "epoch": 0.6969738165801137, "grad_norm": 0.030444417148828506, "grad_norm_var": 9.387611096819242e-05, "learning_rate": 4.862216127021756e-07, "loss": 2.47, "step": 12817 }, { "crossentropy": 2.4770960807800293, "epoch": 0.6970281954376139, "grad_norm": 0.03047013282775879, "grad_norm_var": 9.423462240479337e-05, "learning_rate": 4.69011869324576e-07, "loss": 2.4771, "step": 12818 }, { "crossentropy": 2.5668699741363525, "epoch": 0.6970825742951141, "grad_norm": 0.03125254064798355, "grad_norm_var": 9.42234849126998e-05, "learning_rate": 4.5211219219742914e-07, "loss": 2.5669, "step": 12819 }, { "crossentropy": 2.536862850189209, "epoch": 0.6971369531526143, "grad_norm": 0.03131062537431717, "grad_norm_var": 9.437153656653886e-05, "learning_rate": 4.355225823698961e-07, "loss": 2.5369, "step": 12820 }, { "crossentropy": 2.5393368005752563, "epoch": 0.6971913320101145, "grad_norm": 0.04131999611854553, "grad_norm_var": 9.742864967946888e-05, "learning_rate": 4.192430408700432e-07, "loss": 2.5393, "step": 12821 }, { "crossentropy": 2.6071587800979614, "epoch": 0.6972457108676147, "grad_norm": 0.031175950542092323, "grad_norm_var": 7.56497718427974e-06, "learning_rate": 4.032735687070632e-07, "loss": 2.6072, "step": 12822 }, { "crossentropy": 2.452293038368225, "epoch": 0.6973000897251149, "grad_norm": 0.030727645382285118, "grad_norm_var": 7.454048811453501e-06, "learning_rate": 3.8761416687238537e-07, "loss": 2.4523, "step": 12823 }, { "crossentropy": 2.492980718612671, "epoch": 0.6973544685826151, "grad_norm": 0.030006960034370422, "grad_norm_var": 7.6203178714419655e-06, "learning_rate": 3.7226483633689967e-07, "loss": 2.493, "step": 12824 }, { "crossentropy": 2.5377594232559204, "epoch": 0.6974088474401153, "grad_norm": 0.030641861259937286, "grad_norm_var": 7.5695441493747274e-06, "learning_rate": 3.5722557805206725e-07, "loss": 2.5378, "step": 12825 }, { "crossentropy": 2.4937695264816284, "epoch": 0.6974632262976155, "grad_norm": 0.029926234856247902, "grad_norm_var": 7.754050990058435e-06, "learning_rate": 3.4249639295103053e-07, "loss": 2.4938, "step": 12826 }, { "crossentropy": 2.511237859725952, "epoch": 0.6975176051551157, "grad_norm": 0.03070162609219551, "grad_norm_var": 7.75037079933603e-06, "learning_rate": 3.2807728194750307e-07, "loss": 2.5112, "step": 12827 }, { "crossentropy": 2.5285112857818604, "epoch": 0.6975719840126159, "grad_norm": 0.03102201037108898, "grad_norm_var": 7.735052223120408e-06, "learning_rate": 3.1396824593521446e-07, "loss": 2.5285, "step": 12828 }, { "crossentropy": 2.5045100450515747, "epoch": 0.6976263628701161, "grad_norm": 0.03231267258524895, "grad_norm_var": 7.651287997646691e-06, "learning_rate": 3.0016928578902037e-07, "loss": 2.5045, "step": 12829 }, { "crossentropy": 2.3847371339797974, "epoch": 0.6976807417276163, "grad_norm": 0.03038495033979416, "grad_norm_var": 7.656655309624587e-06, "learning_rate": 2.8668040236545787e-07, "loss": 2.3847, "step": 12830 }, { "crossentropy": 2.4703927040100098, "epoch": 0.6977351205851166, "grad_norm": 0.030520664528012276, "grad_norm_var": 7.246626434673683e-06, "learning_rate": 2.735015965010801e-07, "loss": 2.4704, "step": 12831 }, { "crossentropy": 2.575659990310669, "epoch": 0.6977894994426167, "grad_norm": 0.030702853575348854, "grad_norm_var": 7.2759728639425205e-06, "learning_rate": 2.60632869012456e-07, "loss": 2.5757, "step": 12832 }, { "crossentropy": 2.4953795671463013, "epoch": 0.697843878300117, "grad_norm": 0.0301943551748991, "grad_norm_var": 7.312827689721898e-06, "learning_rate": 2.480742206978359e-07, "loss": 2.4954, "step": 12833 }, { "crossentropy": 2.4829254150390625, "epoch": 0.6978982571576171, "grad_norm": 0.03038131073117256, "grad_norm_var": 7.324533790830839e-06, "learning_rate": 2.3582565233715158e-07, "loss": 2.4829, "step": 12834 }, { "crossentropy": 2.4352495670318604, "epoch": 0.6979526360151174, "grad_norm": 0.03194994106888771, "grad_norm_var": 7.3401607669982036e-06, "learning_rate": 2.238871646886853e-07, "loss": 2.4352, "step": 12835 }, { "crossentropy": 2.5611979961395264, "epoch": 0.6980070148726175, "grad_norm": 0.03181931748986244, "grad_norm_var": 7.346542898064888e-06, "learning_rate": 2.1225875849351095e-07, "loss": 2.5612, "step": 12836 }, { "crossentropy": 2.4968854188919067, "epoch": 0.6980613937301178, "grad_norm": 0.03850621357560158, "grad_norm_var": 4.152238035038979e-06, "learning_rate": 2.0094043447327348e-07, "loss": 2.4969, "step": 12837 }, { "crossentropy": 2.5406984090805054, "epoch": 0.6981157725876179, "grad_norm": 0.030192852020263672, "grad_norm_var": 4.230333725479401e-06, "learning_rate": 1.8993219332907873e-07, "loss": 2.5407, "step": 12838 }, { "crossentropy": 2.482033371925354, "epoch": 0.6981701514451182, "grad_norm": 0.03022891841828823, "grad_norm_var": 4.2805787847241076e-06, "learning_rate": 1.7923403574371388e-07, "loss": 2.482, "step": 12839 }, { "crossentropy": 2.5389939546585083, "epoch": 0.6982245303026183, "grad_norm": 0.030782155692577362, "grad_norm_var": 4.192933780304553e-06, "learning_rate": 1.688459623810923e-07, "loss": 2.539, "step": 12840 }, { "crossentropy": 2.537272810935974, "epoch": 0.6982789091601186, "grad_norm": 0.031677234917879105, "grad_norm_var": 4.173668451625846e-06, "learning_rate": 1.587679738856984e-07, "loss": 2.5373, "step": 12841 }, { "crossentropy": 2.519510269165039, "epoch": 0.6983332880176187, "grad_norm": 0.03041369840502739, "grad_norm_var": 4.09718713863757e-06, "learning_rate": 1.490000708814776e-07, "loss": 2.5195, "step": 12842 }, { "crossentropy": 2.424663782119751, "epoch": 0.698387666875119, "grad_norm": 0.030201155692338943, "grad_norm_var": 4.156902789257755e-06, "learning_rate": 1.3954225397516673e-07, "loss": 2.4247, "step": 12843 }, { "crossentropy": 2.601308226585388, "epoch": 0.6984420457326191, "grad_norm": 0.03132595121860504, "grad_norm_var": 4.150169024631316e-06, "learning_rate": 1.3039452375351867e-07, "loss": 2.6013, "step": 12844 }, { "crossentropy": 2.491533398628235, "epoch": 0.6984964245901194, "grad_norm": 0.030643315985798836, "grad_norm_var": 4.109988426007867e-06, "learning_rate": 1.215568807833023e-07, "loss": 2.4915, "step": 12845 }, { "crossentropy": 2.475619912147522, "epoch": 0.6985508034476196, "grad_norm": 0.03145747631788254, "grad_norm_var": 4.05884914894485e-06, "learning_rate": 1.1302932561241264e-07, "loss": 2.4756, "step": 12846 }, { "crossentropy": 2.539713501930237, "epoch": 0.6986051823051198, "grad_norm": 0.03288979455828667, "grad_norm_var": 4.159570596999155e-06, "learning_rate": 1.0481185876987099e-07, "loss": 2.5397, "step": 12847 }, { "crossentropy": 2.5221846103668213, "epoch": 0.69865956116262, "grad_norm": 0.030864831060171127, "grad_norm_var": 4.144849466987847e-06, "learning_rate": 9.690448076582481e-08, "loss": 2.5222, "step": 12848 }, { "crossentropy": 2.4572548866271973, "epoch": 0.6987139400201202, "grad_norm": 0.030763601884245872, "grad_norm_var": 4.068240770405462e-06, "learning_rate": 8.930719209043758e-08, "loss": 2.4573, "step": 12849 }, { "crossentropy": 2.4422428607940674, "epoch": 0.6987683188776204, "grad_norm": 0.030497992411255836, "grad_norm_var": 4.051592545896894e-06, "learning_rate": 8.201999321499898e-08, "loss": 2.4422, "step": 12850 }, { "crossentropy": 2.3968082666397095, "epoch": 0.6988226977351206, "grad_norm": 0.0310140922665596, "grad_norm_var": 4.051859727861815e-06, "learning_rate": 7.504288459081465e-08, "loss": 2.3968, "step": 12851 }, { "crossentropy": 2.464958429336548, "epoch": 0.6988770765926208, "grad_norm": 0.03078586235642433, "grad_norm_var": 4.0683987612463146e-06, "learning_rate": 6.837586665087158e-08, "loss": 2.465, "step": 12852 }, { "crossentropy": 2.504406690597534, "epoch": 0.698931455450121, "grad_norm": 0.03147977218031883, "grad_norm_var": 4.874923557639128e-07, "learning_rate": 6.201893980928298e-08, "loss": 2.5044, "step": 12853 }, { "crossentropy": 2.376330018043518, "epoch": 0.6989858343076212, "grad_norm": 0.03099118359386921, "grad_norm_var": 4.466072261855343e-07, "learning_rate": 5.597210445962286e-08, "loss": 2.3763, "step": 12854 }, { "crossentropy": 2.556961417198181, "epoch": 0.6990402131651214, "grad_norm": 0.03107110969722271, "grad_norm_var": 4.042316162953709e-07, "learning_rate": 5.0235360977701764e-08, "loss": 2.557, "step": 12855 }, { "crossentropy": 2.5080372095108032, "epoch": 0.6990945920226216, "grad_norm": 0.03010198287665844, "grad_norm_var": 4.5777274173721083e-07, "learning_rate": 4.480870971823592e-08, "loss": 2.508, "step": 12856 }, { "crossentropy": 2.406705379486084, "epoch": 0.6991489708801218, "grad_norm": 0.030061595141887665, "grad_norm_var": 4.774377039852099e-07, "learning_rate": 3.969215101817802e-08, "loss": 2.4067, "step": 12857 }, { "crossentropy": 2.4672352075576782, "epoch": 0.699203349737622, "grad_norm": 0.03262364864349365, "grad_norm_var": 6.363770603538705e-07, "learning_rate": 3.488568519505186e-08, "loss": 2.4672, "step": 12858 }, { "crossentropy": 2.4460264444351196, "epoch": 0.6992577285951223, "grad_norm": 0.030814243480563164, "grad_norm_var": 5.906166149795133e-07, "learning_rate": 3.038931254695232e-08, "loss": 2.446, "step": 12859 }, { "crossentropy": 2.573065400123596, "epoch": 0.6993121074526224, "grad_norm": 0.03334343060851097, "grad_norm_var": 9.093760258527811e-07, "learning_rate": 2.620303335254537e-08, "loss": 2.5731, "step": 12860 }, { "crossentropy": 2.546080708503723, "epoch": 0.6993664863101227, "grad_norm": 0.03133992850780487, "grad_norm_var": 8.868157453251665e-07, "learning_rate": 2.23268478716232e-08, "loss": 2.5461, "step": 12861 }, { "crossentropy": 2.583012580871582, "epoch": 0.6994208651676228, "grad_norm": 0.032837796956300735, "grad_norm_var": 1.0429240314662724e-06, "learning_rate": 1.87607563445491e-08, "loss": 2.583, "step": 12862 }, { "crossentropy": 2.4035812616348267, "epoch": 0.6994752440251231, "grad_norm": 0.03091614693403244, "grad_norm_var": 8.792183568145003e-07, "learning_rate": 1.5504758992257452e-08, "loss": 2.4036, "step": 12863 }, { "crossentropy": 2.4787272214889526, "epoch": 0.6995296228826232, "grad_norm": 0.03066101297736168, "grad_norm_var": 8.914449913082397e-07, "learning_rate": 1.255885601680884e-08, "loss": 2.4787, "step": 12864 }, { "crossentropy": 2.555733561515808, "epoch": 0.6995840017401235, "grad_norm": 0.03085603564977646, "grad_norm_var": 8.865209543976038e-07, "learning_rate": 9.923047601390068e-09, "loss": 2.5557, "step": 12865 }, { "crossentropy": 2.4216474294662476, "epoch": 0.6996383805976236, "grad_norm": 0.03073730878531933, "grad_norm_var": 8.673096671489664e-07, "learning_rate": 7.597333908648808e-09, "loss": 2.4216, "step": 12866 }, { "crossentropy": 2.496477246284485, "epoch": 0.6996927594551239, "grad_norm": 0.030051009729504585, "grad_norm_var": 9.526451476657427e-07, "learning_rate": 5.581715083469163e-09, "loss": 2.4965, "step": 12867 }, { "crossentropy": 2.599653959274292, "epoch": 0.699747138312624, "grad_norm": 0.030120383948087692, "grad_norm_var": 1.0141428862485424e-06, "learning_rate": 3.8761912507512264e-09, "loss": 2.5997, "step": 12868 }, { "crossentropy": 2.4044268131256104, "epoch": 0.6998015171701243, "grad_norm": 0.030739080160856247, "grad_norm_var": 1.0134356795053757e-06, "learning_rate": 2.480762515966184e-09, "loss": 2.4044, "step": 12869 }, { "crossentropy": 2.515276312828064, "epoch": 0.6998558960276244, "grad_norm": 0.029932815581560135, "grad_norm_var": 1.0958536214227537e-06, "learning_rate": 1.3954289657114316e-09, "loss": 2.5153, "step": 12870 }, { "crossentropy": 2.502491593360901, "epoch": 0.6999102748851247, "grad_norm": 0.03072347678244114, "grad_norm_var": 1.1007118533557045e-06, "learning_rate": 6.201906671554625e-10, "loss": 2.5025, "step": 12871 }, { "crossentropy": 2.4731030464172363, "epoch": 0.6999646537426248, "grad_norm": 0.03838558495044708, "grad_norm_var": 4.4071699387570695e-06, "learning_rate": 1.5504766914808954e-10, "loss": 2.4731, "step": 12872 }, { "crossentropy": 2.446722388267517, "epoch": 0.7000190326001251, "grad_norm": 0.03122061677277088, "grad_norm_var": 4.267456503647991e-06, "learning_rate": 0.0, "loss": 2.4467, "step": 12873 } ], "logging_steps": 1, "max_steps": 12873, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.456135361637679e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }