{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.014395220786698816, "eval_steps": 1000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.4395220786698816e-05, "grad_norm": 0.18359375, "learning_rate": 0.0001, "loss": 0.2384, "loss/crossentropy": 2.463143229484558, "loss/fcd": 0.4892578125, "loss/idx": 18.0, "loss/logits": 0.23836339265108109, "step": 1 }, { "epoch": 2.8790441573397632e-05, "grad_norm": 0.1328125, "learning_rate": 0.0001, "loss": 0.2453, "loss/crossentropy": 2.74690580368042, "loss/fcd": 0.462890625, "loss/idx": 18.0, "loss/logits": 0.2453368902206421, "step": 2 }, { "epoch": 4.3185662360096445e-05, "grad_norm": 0.15625, "learning_rate": 0.0001, "loss": 0.2292, "loss/crossentropy": 2.3877265453338623, "loss/fcd": 0.4130859375, "loss/idx": 18.0, "loss/logits": 0.22919423878192902, "step": 3 }, { "epoch": 5.7580883146795265e-05, "grad_norm": 0.130859375, "learning_rate": 0.0001, "loss": 0.2284, "loss/crossentropy": 2.392206907272339, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.22838981449604034, "step": 4 }, { "epoch": 7.197610393349408e-05, "grad_norm": 0.138671875, "learning_rate": 0.0001, "loss": 0.2237, "loss/crossentropy": 2.1798477172851562, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.22366443276405334, "step": 5 }, { "epoch": 8.637132472019289e-05, "grad_norm": 0.1357421875, "learning_rate": 0.0001, "loss": 0.2644, "loss/crossentropy": 2.492342710494995, "loss/fcd": 0.462890625, "loss/idx": 18.0, "loss/logits": 0.2643834352493286, "step": 6 }, { "epoch": 0.00010076654550689171, "grad_norm": 0.150390625, "learning_rate": 0.0001, "loss": 0.211, "loss/crossentropy": 2.035392999649048, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.21097075939178467, "step": 7 }, { "epoch": 0.00011516176629359053, "grad_norm": 0.13671875, "learning_rate": 0.0001, "loss": 0.2388, "loss/crossentropy": 2.3071805238723755, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.23878887295722961, "step": 8 }, { "epoch": 0.00012955698708028935, "grad_norm": 0.126953125, "learning_rate": 0.0001, "loss": 0.2061, "loss/crossentropy": 2.1987677812576294, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.20613879710435867, "step": 9 }, { "epoch": 0.00014395220786698817, "grad_norm": 0.1240234375, "learning_rate": 0.0001, "loss": 0.2075, "loss/crossentropy": 1.9901325702667236, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.20753345638513565, "step": 10 }, { "epoch": 0.000158347428653687, "grad_norm": 0.1298828125, "learning_rate": 0.0001, "loss": 0.2213, "loss/crossentropy": 2.3090018033981323, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.2212577611207962, "step": 11 }, { "epoch": 0.00017274264944038578, "grad_norm": 0.1513671875, "learning_rate": 0.0001, "loss": 0.2689, "loss/crossentropy": 2.2487552165985107, "loss/fcd": 0.5, "loss/idx": 18.0, "loss/logits": 0.26888714730739594, "step": 12 }, { "epoch": 0.0001871378702270846, "grad_norm": 0.12109375, "learning_rate": 0.0001, "loss": 0.2335, "loss/crossentropy": 2.3826037645339966, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.23347856849431992, "step": 13 }, { "epoch": 0.00020153309101378342, "grad_norm": 0.115234375, "learning_rate": 0.0001, "loss": 0.2299, "loss/crossentropy": 2.524248242378235, "loss/fcd": 0.4560546875, "loss/idx": 18.0, "loss/logits": 0.22988282144069672, "step": 14 }, { "epoch": 0.00021592831180048224, "grad_norm": 0.1279296875, "learning_rate": 0.0001, "loss": 0.2354, "loss/crossentropy": 2.33734929561615, "loss/fcd": 0.4482421875, "loss/idx": 18.0, "loss/logits": 0.23541489243507385, "step": 15 }, { "epoch": 0.00023032353258718106, "grad_norm": 0.12060546875, "grad_norm_var": 0.0002919107675552368, "learning_rate": 0.0001, "loss": 0.2428, "loss/crossentropy": 2.3426687717437744, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.24281759560108185, "step": 16 }, { "epoch": 0.0002447187533738799, "grad_norm": 0.1171875, "grad_norm_var": 0.00014951129754384358, "learning_rate": 0.0001, "loss": 0.2399, "loss/crossentropy": 2.634019374847412, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.23987850546836853, "step": 17 }, { "epoch": 0.0002591139741605787, "grad_norm": 0.1279296875, "grad_norm_var": 0.0001506239175796509, "learning_rate": 0.0001, "loss": 0.2267, "loss/crossentropy": 2.2048473358154297, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.2267211154103279, "step": 18 }, { "epoch": 0.0002735091949472775, "grad_norm": 0.1201171875, "grad_norm_var": 0.0001150439182917277, "learning_rate": 0.0001, "loss": 0.2111, "loss/crossentropy": 2.421955704689026, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.21106208860874176, "step": 19 }, { "epoch": 0.00028790441573397634, "grad_norm": 0.125, "grad_norm_var": 0.00011625985304514567, "learning_rate": 0.0001, "loss": 0.2474, "loss/crossentropy": 2.4863855838775635, "loss/fcd": 0.4990234375, "loss/idx": 18.0, "loss/logits": 0.24741190671920776, "step": 20 }, { "epoch": 0.00030229963652067516, "grad_norm": 0.11962890625, "grad_norm_var": 0.00011513630549112956, "learning_rate": 0.0001, "loss": 0.2185, "loss/crossentropy": 2.2641090154647827, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.21849986910820007, "step": 21 }, { "epoch": 0.000316694857307374, "grad_norm": 0.11669921875, "grad_norm_var": 0.0001184294621149699, "learning_rate": 0.0001, "loss": 0.2309, "loss/crossentropy": 2.614189624786377, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.23093532770872116, "step": 22 }, { "epoch": 0.00033109007809407274, "grad_norm": 0.1123046875, "grad_norm_var": 8.991460005442301e-05, "learning_rate": 0.0001, "loss": 0.2174, "loss/crossentropy": 2.258315682411194, "loss/fcd": 0.408203125, "loss/idx": 18.0, "loss/logits": 0.21735627949237823, "step": 23 }, { "epoch": 0.00034548529888077156, "grad_norm": 0.1279296875, "grad_norm_var": 8.047322432200113e-05, "learning_rate": 0.0001, "loss": 0.2413, "loss/crossentropy": 2.355400562286377, "loss/fcd": 0.466796875, "loss/idx": 18.0, "loss/logits": 0.2412610948085785, "step": 24 }, { "epoch": 0.0003598805196674704, "grad_norm": 0.130859375, "grad_norm_var": 8.29686721165975e-05, "learning_rate": 0.0001, "loss": 0.2334, "loss/crossentropy": 2.4980456829071045, "loss/fcd": 0.4599609375, "loss/idx": 18.0, "loss/logits": 0.23341741412878036, "step": 25 }, { "epoch": 0.0003742757404541692, "grad_norm": 0.11279296875, "grad_norm_var": 9.11712646484375e-05, "learning_rate": 0.0001, "loss": 0.2017, "loss/crossentropy": 2.1927164793014526, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.20174731314182281, "step": 26 }, { "epoch": 0.000388670961240868, "grad_norm": 0.173828125, "grad_norm_var": 0.0002490639686584473, "learning_rate": 0.0001, "loss": 0.253, "loss/crossentropy": 2.5806944370269775, "loss/fcd": 0.4658203125, "loss/idx": 18.0, "loss/logits": 0.2529568448662758, "step": 27 }, { "epoch": 0.00040306618202756684, "grad_norm": 0.11572265625, "grad_norm_var": 0.0002092510461807251, "learning_rate": 0.0001, "loss": 0.2384, "loss/crossentropy": 2.292937397956848, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.2384110689163208, "step": 28 }, { "epoch": 0.00041746140281426566, "grad_norm": 0.12451171875, "grad_norm_var": 0.0002086321512858073, "learning_rate": 0.0001, "loss": 0.257, "loss/crossentropy": 2.4048542976379395, "loss/fcd": 0.482421875, "loss/idx": 18.0, "loss/logits": 0.25698406249284744, "step": 29 }, { "epoch": 0.0004318566236009645, "grad_norm": 0.107421875, "grad_norm_var": 0.00022185643513997395, "learning_rate": 0.0001, "loss": 0.2111, "loss/crossentropy": 2.4948848485946655, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.21111004799604416, "step": 30 }, { "epoch": 0.0004462518443876633, "grad_norm": 0.1142578125, "grad_norm_var": 0.00022597312927246093, "learning_rate": 0.0001, "loss": 0.2299, "loss/crossentropy": 2.233025908470154, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.22986605763435364, "step": 31 }, { "epoch": 0.0004606470651743621, "grad_norm": 0.11669921875, "grad_norm_var": 0.0002281347910563151, "learning_rate": 0.0001, "loss": 0.2272, "loss/crossentropy": 2.448768973350525, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.22723641991615295, "step": 32 }, { "epoch": 0.00047504228596106094, "grad_norm": 0.111328125, "grad_norm_var": 0.0002345720926920573, "learning_rate": 0.0001, "loss": 0.1984, "loss/crossentropy": 2.116120755672455, "loss/fcd": 0.3935546875, "loss/idx": 18.0, "loss/logits": 0.19836096465587616, "step": 33 }, { "epoch": 0.0004894375067477598, "grad_norm": 0.1259765625, "grad_norm_var": 0.00023334821065266927, "learning_rate": 0.0001, "loss": 0.2416, "loss/crossentropy": 2.3083192110061646, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.2415921539068222, "step": 34 }, { "epoch": 0.0005038327275344585, "grad_norm": 0.1337890625, "grad_norm_var": 0.00024124781290690104, "learning_rate": 0.0001, "loss": 0.2936, "loss/crossentropy": 2.6550590991973877, "loss/fcd": 0.529296875, "loss/idx": 18.0, "loss/logits": 0.29357363283634186, "step": 35 }, { "epoch": 0.0005182279483211574, "grad_norm": 0.1025390625, "grad_norm_var": 0.00026692946751912433, "learning_rate": 0.0001, "loss": 0.2041, "loss/crossentropy": 2.341429352760315, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.20406678318977356, "step": 36 }, { "epoch": 0.0005326231691078562, "grad_norm": 0.11376953125, "grad_norm_var": 0.00027064879735310874, "learning_rate": 0.0001, "loss": 0.225, "loss/crossentropy": 2.350203037261963, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.22499807178974152, "step": 37 }, { "epoch": 0.000547018389894555, "grad_norm": 0.11083984375, "grad_norm_var": 0.00027637084325154624, "learning_rate": 0.0001, "loss": 0.222, "loss/crossentropy": 2.209356427192688, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.22195565700531006, "step": 38 }, { "epoch": 0.0005614136106812538, "grad_norm": 0.11376953125, "grad_norm_var": 0.00027482410271962486, "learning_rate": 0.0001, "loss": 0.2432, "loss/crossentropy": 2.6039966344833374, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.24324779212474823, "step": 39 }, { "epoch": 0.0005758088314679527, "grad_norm": 0.11865234375, "grad_norm_var": 0.00027163426081339517, "learning_rate": 0.0001, "loss": 0.2133, "loss/crossentropy": 2.3391385078430176, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.21334318816661835, "step": 40 }, { "epoch": 0.0005902040522546514, "grad_norm": 0.1015625, "grad_norm_var": 0.00028450886408487954, "learning_rate": 0.0001, "loss": 0.2074, "loss/crossentropy": 2.5192357301712036, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.2073945701122284, "step": 41 }, { "epoch": 0.0006045992730413503, "grad_norm": 0.10791015625, "grad_norm_var": 0.0002897739410400391, "learning_rate": 0.0001, "loss": 0.1894, "loss/crossentropy": 2.35784912109375, "loss/fcd": 0.4248046875, "loss/idx": 18.0, "loss/logits": 0.18937092274427414, "step": 42 }, { "epoch": 0.0006189944938280491, "grad_norm": 0.11669921875, "grad_norm_var": 7.068216800689697e-05, "learning_rate": 0.0001, "loss": 0.1925, "loss/crossentropy": 2.0304250717163086, "loss/fcd": 0.40625, "loss/idx": 18.0, "loss/logits": 0.19247674196958542, "step": 43 }, { "epoch": 0.000633389714614748, "grad_norm": 0.111328125, "grad_norm_var": 7.129907608032227e-05, "learning_rate": 0.0001, "loss": 0.2238, "loss/crossentropy": 2.257385492324829, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.22384560853242874, "step": 44 }, { "epoch": 0.0006477849354014467, "grad_norm": 0.10986328125, "grad_norm_var": 6.504058837890625e-05, "learning_rate": 0.0001, "loss": 0.2174, "loss/crossentropy": 2.47000515460968, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.2174428552389145, "step": 45 }, { "epoch": 0.0006621801561881455, "grad_norm": 0.1318359375, "grad_norm_var": 8.242527643839518e-05, "learning_rate": 0.0001, "loss": 0.2356, "loss/crossentropy": 2.77071475982666, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.23558437824249268, "step": 46 }, { "epoch": 0.0006765753769748444, "grad_norm": 0.11669921875, "grad_norm_var": 8.253951867421468e-05, "learning_rate": 0.0001, "loss": 0.2314, "loss/crossentropy": 2.3579763174057007, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.23137739300727844, "step": 47 }, { "epoch": 0.0006909705977615431, "grad_norm": 0.11962890625, "grad_norm_var": 8.366008599599202e-05, "learning_rate": 0.0001, "loss": 0.2147, "loss/crossentropy": 2.4674328565597534, "loss/fcd": 0.48046875, "loss/idx": 18.0, "loss/logits": 0.2146531641483307, "step": 48 }, { "epoch": 0.000705365818548242, "grad_norm": 0.13671875, "grad_norm_var": 0.00011021196842193603, "learning_rate": 0.0001, "loss": 0.2773, "loss/crossentropy": 2.5875381231307983, "loss/fcd": 0.501953125, "loss/idx": 18.0, "loss/logits": 0.27730000019073486, "step": 49 }, { "epoch": 0.0007197610393349408, "grad_norm": 0.10400390625, "grad_norm_var": 0.00011401176452636718, "learning_rate": 0.0001, "loss": 0.1992, "loss/crossentropy": 2.3770352602005005, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.1992211416363716, "step": 50 }, { "epoch": 0.0007341562601216396, "grad_norm": 0.115234375, "grad_norm_var": 9.05315081278483e-05, "learning_rate": 0.0001, "loss": 0.2208, "loss/crossentropy": 2.503299593925476, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.22077593207359314, "step": 51 }, { "epoch": 0.0007485514809083384, "grad_norm": 0.1181640625, "grad_norm_var": 8.09947649637858e-05, "learning_rate": 0.0001, "loss": 0.2448, "loss/crossentropy": 2.5992391109466553, "loss/fcd": 0.478515625, "loss/idx": 18.0, "loss/logits": 0.2447950839996338, "step": 52 }, { "epoch": 0.0007629467016950373, "grad_norm": 0.11083984375, "grad_norm_var": 8.217493693033855e-05, "learning_rate": 0.0001, "loss": 0.2302, "loss/crossentropy": 2.5341001749038696, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.23020881414413452, "step": 53 }, { "epoch": 0.000777341922481736, "grad_norm": 0.11328125, "grad_norm_var": 8.111695448557536e-05, "learning_rate": 0.0001, "loss": 0.242, "loss/crossentropy": 2.594543933868408, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.24200908839702606, "step": 54 }, { "epoch": 0.0007917371432684349, "grad_norm": 0.1142578125, "grad_norm_var": 8.102655410766602e-05, "learning_rate": 0.0001, "loss": 0.2157, "loss/crossentropy": 2.35564386844635, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.21571539342403412, "step": 55 }, { "epoch": 0.0008061323640551337, "grad_norm": 0.140625, "grad_norm_var": 0.00012067854404449463, "learning_rate": 0.0001, "loss": 0.2262, "loss/crossentropy": 2.5845850706100464, "loss/fcd": 0.505859375, "loss/idx": 18.0, "loss/logits": 0.22615493834018707, "step": 56 }, { "epoch": 0.0008205275848418326, "grad_norm": 0.10791015625, "grad_norm_var": 0.00011030832926432292, "learning_rate": 0.0001, "loss": 0.2264, "loss/crossentropy": 2.6225829124450684, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.22639667242765427, "step": 57 }, { "epoch": 0.0008349228056285313, "grad_norm": 0.12158203125, "grad_norm_var": 0.00010507901509602865, "learning_rate": 0.0001, "loss": 0.2332, "loss/crossentropy": 2.49368155002594, "loss/fcd": 0.4404296875, "loss/idx": 18.0, "loss/logits": 0.23318731039762497, "step": 58 }, { "epoch": 0.0008493180264152302, "grad_norm": 0.10986328125, "grad_norm_var": 0.00010922352472941081, "learning_rate": 0.0001, "loss": 0.2161, "loss/crossentropy": 2.365482449531555, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.21612977981567383, "step": 59 }, { "epoch": 0.000863713247201929, "grad_norm": 0.12451171875, "grad_norm_var": 0.00010903577009836832, "learning_rate": 0.0001, "loss": 0.2292, "loss/crossentropy": 2.450873017311096, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.22924820333719254, "step": 60 }, { "epoch": 0.0008781084679886277, "grad_norm": 0.10400390625, "grad_norm_var": 0.0001178810993830363, "learning_rate": 0.0001, "loss": 0.2071, "loss/crossentropy": 2.364640951156616, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.20711997151374817, "step": 61 }, { "epoch": 0.0008925036887753266, "grad_norm": 0.10009765625, "grad_norm_var": 0.00012259483337402345, "learning_rate": 0.0001, "loss": 0.1989, "loss/crossentropy": 2.430219888687134, "loss/fcd": 0.4072265625, "loss/idx": 18.0, "loss/logits": 0.19892004877328873, "step": 62 }, { "epoch": 0.0009068989095620254, "grad_norm": 0.1083984375, "grad_norm_var": 0.00012622574965159098, "learning_rate": 0.0001, "loss": 0.2166, "loss/crossentropy": 2.412087559700012, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.21661554276943207, "step": 63 }, { "epoch": 0.0009212941303487242, "grad_norm": 0.11279296875, "grad_norm_var": 0.00012544691562652588, "learning_rate": 0.0001, "loss": 0.2152, "loss/crossentropy": 2.369842290878296, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.21520362049341202, "step": 64 }, { "epoch": 0.000935689351135423, "grad_norm": 0.1279296875, "grad_norm_var": 0.00010499060153961182, "learning_rate": 0.0001, "loss": 0.2505, "loss/crossentropy": 2.5731316804885864, "loss/fcd": 0.484375, "loss/idx": 18.0, "loss/logits": 0.2505309656262398, "step": 65 }, { "epoch": 0.0009500845719221219, "grad_norm": 0.11572265625, "grad_norm_var": 9.702742099761962e-05, "learning_rate": 0.0001, "loss": 0.2162, "loss/crossentropy": 2.4219590425491333, "loss/fcd": 0.4228515625, "loss/idx": 18.0, "loss/logits": 0.2161625698208809, "step": 66 }, { "epoch": 0.0009644797927088206, "grad_norm": 0.109375, "grad_norm_var": 9.924471378326416e-05, "learning_rate": 0.0001, "loss": 0.1955, "loss/crossentropy": 2.072207987308502, "loss/fcd": 0.4052734375, "loss/idx": 18.0, "loss/logits": 0.19551369547843933, "step": 67 }, { "epoch": 0.0009788750134955195, "grad_norm": 0.10302734375, "grad_norm_var": 0.00010709762573242187, "learning_rate": 0.0001, "loss": 0.2152, "loss/crossentropy": 2.4199079275131226, "loss/fcd": 0.404296875, "loss/idx": 18.0, "loss/logits": 0.2152082547545433, "step": 68 }, { "epoch": 0.0009932702342822183, "grad_norm": 0.1181640625, "grad_norm_var": 0.00010735094547271728, "learning_rate": 0.0001, "loss": 0.1939, "loss/crossentropy": 2.073515832424164, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.19392766803503036, "step": 69 }, { "epoch": 0.001007665455068917, "grad_norm": 0.12109375, "grad_norm_var": 0.00010992586612701416, "learning_rate": 0.0001, "loss": 0.244, "loss/crossentropy": 2.376970887184143, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.2439984604716301, "step": 70 }, { "epoch": 0.001022060675855616, "grad_norm": 0.1201171875, "grad_norm_var": 0.00011152327060699463, "learning_rate": 0.0001, "loss": 0.2341, "loss/crossentropy": 2.329576015472412, "loss/fcd": 0.4716796875, "loss/idx": 18.0, "loss/logits": 0.23405101150274277, "step": 71 }, { "epoch": 0.0010364558966423148, "grad_norm": 0.11083984375, "grad_norm_var": 6.649891535441081e-05, "learning_rate": 0.0001, "loss": 0.2061, "loss/crossentropy": 2.240494966506958, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.20607301592826843, "step": 72 }, { "epoch": 0.0010508511174290136, "grad_norm": 0.115234375, "grad_norm_var": 6.442765394846599e-05, "learning_rate": 0.0001, "loss": 0.2261, "loss/crossentropy": 2.58719003200531, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.22607439756393433, "step": 73 }, { "epoch": 0.0010652463382157123, "grad_norm": 0.10693359375, "grad_norm_var": 6.287793318430583e-05, "learning_rate": 0.0001, "loss": 0.2209, "loss/crossentropy": 2.458608031272888, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.22093002498149872, "step": 74 }, { "epoch": 0.001079641559002411, "grad_norm": 0.1015625, "grad_norm_var": 7.06632932027181e-05, "learning_rate": 0.0001, "loss": 0.189, "loss/crossentropy": 2.4321776628494263, "loss/fcd": 0.40625, "loss/idx": 18.0, "loss/logits": 0.18898583948612213, "step": 75 }, { "epoch": 0.00109403677978911, "grad_norm": 0.11279296875, "grad_norm_var": 6.0458978017171226e-05, "learning_rate": 0.0001, "loss": 0.241, "loss/crossentropy": 2.5812788009643555, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.24104547500610352, "step": 76 }, { "epoch": 0.0011084320005758088, "grad_norm": 0.10986328125, "grad_norm_var": 5.65489133199056e-05, "learning_rate": 0.0001, "loss": 0.203, "loss/crossentropy": 2.408494293689728, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.20304062217473984, "step": 77 }, { "epoch": 0.0011228272213625076, "grad_norm": 0.1337890625, "grad_norm_var": 7.34796126683553e-05, "learning_rate": 0.0001, "loss": 0.2287, "loss/crossentropy": 2.239099144935608, "loss/fcd": 0.46484375, "loss/idx": 18.0, "loss/logits": 0.2286640703678131, "step": 78 }, { "epoch": 0.0011372224421492064, "grad_norm": 0.12353515625, "grad_norm_var": 7.603565851847331e-05, "learning_rate": 0.0001, "loss": 0.2298, "loss/crossentropy": 2.357472777366638, "loss/fcd": 0.48046875, "loss/idx": 18.0, "loss/logits": 0.22975638508796692, "step": 79 }, { "epoch": 0.0011516176629359054, "grad_norm": 0.12158203125, "grad_norm_var": 7.807413736979166e-05, "learning_rate": 0.0001, "loss": 0.2185, "loss/crossentropy": 2.4130557775497437, "loss/fcd": 0.4765625, "loss/idx": 18.0, "loss/logits": 0.21845312416553497, "step": 80 }, { "epoch": 0.0011660128837226041, "grad_norm": 0.11962890625, "grad_norm_var": 6.88701868057251e-05, "learning_rate": 0.0001, "loss": 0.2304, "loss/crossentropy": 2.293164014816284, "loss/fcd": 0.466796875, "loss/idx": 18.0, "loss/logits": 0.2303522452712059, "step": 81 }, { "epoch": 0.0011804081045093029, "grad_norm": 0.10107421875, "grad_norm_var": 8.126795291900635e-05, "learning_rate": 0.0001, "loss": 0.1964, "loss/crossentropy": 2.2688822746276855, "loss/fcd": 0.3935546875, "loss/idx": 18.0, "loss/logits": 0.19636806100606918, "step": 82 }, { "epoch": 0.0011948033252960016, "grad_norm": 0.11376953125, "grad_norm_var": 7.959604263305665e-05, "learning_rate": 0.0001, "loss": 0.2397, "loss/crossentropy": 2.398077368736267, "loss/fcd": 0.4404296875, "loss/idx": 18.0, "loss/logits": 0.23967822641134262, "step": 83 }, { "epoch": 0.0012091985460827006, "grad_norm": 0.099609375, "grad_norm_var": 8.558332920074462e-05, "learning_rate": 0.0001, "loss": 0.2079, "loss/crossentropy": 2.524065375328064, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.20788107812404633, "step": 84 }, { "epoch": 0.0012235937668693994, "grad_norm": 0.1103515625, "grad_norm_var": 8.542438348134359e-05, "learning_rate": 0.0001, "loss": 0.2091, "loss/crossentropy": 2.398527979850769, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.20912020653486252, "step": 85 }, { "epoch": 0.0012379889876560982, "grad_norm": 0.12109375, "grad_norm_var": 8.542438348134359e-05, "learning_rate": 0.0001, "loss": 0.2435, "loss/crossentropy": 2.4105933904647827, "loss/fcd": 0.48828125, "loss/idx": 18.0, "loss/logits": 0.24351391196250916, "step": 86 }, { "epoch": 0.001252384208442797, "grad_norm": 0.12158203125, "grad_norm_var": 8.678038914998373e-05, "learning_rate": 0.0001, "loss": 0.2189, "loss/crossentropy": 2.26534903049469, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.21887247264385223, "step": 87 }, { "epoch": 0.001266779429229496, "grad_norm": 0.1162109375, "grad_norm_var": 8.635421593983968e-05, "learning_rate": 0.0001, "loss": 0.1992, "loss/crossentropy": 2.1426846981048584, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.1991657018661499, "step": 88 }, { "epoch": 0.0012811746500161947, "grad_norm": 0.11279296875, "grad_norm_var": 8.641878763834635e-05, "learning_rate": 0.0001, "loss": 0.2104, "loss/crossentropy": 2.2193171977996826, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.21043668687343597, "step": 89 }, { "epoch": 0.0012955698708028934, "grad_norm": 0.14453125, "grad_norm_var": 0.00013866325219472249, "learning_rate": 0.0001, "loss": 0.2259, "loss/crossentropy": 2.4619998931884766, "loss/fcd": 0.4755859375, "loss/idx": 18.0, "loss/logits": 0.22587314993143082, "step": 90 }, { "epoch": 0.0013099650915895922, "grad_norm": 0.1162109375, "grad_norm_var": 0.00012292762597401936, "learning_rate": 0.0001, "loss": 0.2502, "loss/crossentropy": 2.5881928205490112, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.2501572445034981, "step": 91 }, { "epoch": 0.001324360312376291, "grad_norm": 0.123046875, "grad_norm_var": 0.0001231988271077474, "learning_rate": 0.0001, "loss": 0.2096, "loss/crossentropy": 2.28423535823822, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.20956922322511673, "step": 92 }, { "epoch": 0.00133875553316299, "grad_norm": 0.119140625, "grad_norm_var": 0.0001184612512588501, "learning_rate": 0.0001, "loss": 0.2081, "loss/crossentropy": 2.0630246996879578, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.208104208111763, "step": 93 }, { "epoch": 0.0013531507539496887, "grad_norm": 0.1142578125, "grad_norm_var": 0.00010280509789784749, "learning_rate": 0.0001, "loss": 0.2285, "loss/crossentropy": 2.4672012329101562, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.22853697836399078, "step": 94 }, { "epoch": 0.0013675459747363875, "grad_norm": 0.109375, "grad_norm_var": 0.00010375579198201497, "learning_rate": 0.0001, "loss": 0.2217, "loss/crossentropy": 2.432914614677429, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.22167058289051056, "step": 95 }, { "epoch": 0.0013819411955230862, "grad_norm": 0.103515625, "grad_norm_var": 0.00011195242404937744, "learning_rate": 0.0001, "loss": 0.198, "loss/crossentropy": 2.522903800010681, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.19795683026313782, "step": 96 }, { "epoch": 0.0013963364163097852, "grad_norm": 0.109375, "grad_norm_var": 0.00011272430419921875, "learning_rate": 0.0001, "loss": 0.2049, "loss/crossentropy": 2.149677038192749, "loss/fcd": 0.3984375, "loss/idx": 18.0, "loss/logits": 0.20487764477729797, "step": 97 }, { "epoch": 0.001410731637096484, "grad_norm": 0.10546875, "grad_norm_var": 0.00010592043399810791, "learning_rate": 0.0001, "loss": 0.2057, "loss/crossentropy": 2.467462182044983, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.20570345222949982, "step": 98 }, { "epoch": 0.0014251268578831828, "grad_norm": 0.12890625, "grad_norm_var": 0.00011771519978841146, "learning_rate": 0.0001, "loss": 0.2105, "loss/crossentropy": 2.353211760520935, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.21045749634504318, "step": 99 }, { "epoch": 0.0014395220786698815, "grad_norm": 0.1279296875, "grad_norm_var": 0.00010607639948527019, "learning_rate": 0.0001, "loss": 0.2903, "loss/crossentropy": 2.590612769126892, "loss/fcd": 0.47265625, "loss/idx": 18.0, "loss/logits": 0.290309339761734, "step": 100 }, { "epoch": 0.0014539172994565805, "grad_norm": 0.1259765625, "grad_norm_var": 0.00010594924290974935, "learning_rate": 0.0001, "loss": 0.2467, "loss/crossentropy": 2.3608927726745605, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.24666306376457214, "step": 101 }, { "epoch": 0.0014683125202432793, "grad_norm": 0.1298828125, "grad_norm_var": 0.00011356671651204427, "learning_rate": 0.0001, "loss": 0.2413, "loss/crossentropy": 2.1008136868476868, "loss/fcd": 0.4521484375, "loss/idx": 18.0, "loss/logits": 0.24129530787467957, "step": 102 }, { "epoch": 0.001482707741029978, "grad_norm": 0.11279296875, "grad_norm_var": 0.0001156767209370931, "learning_rate": 0.0001, "loss": 0.2219, "loss/crossentropy": 2.36824232339859, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.2218664586544037, "step": 103 }, { "epoch": 0.0014971029618166768, "grad_norm": 0.1181640625, "grad_norm_var": 0.00011526346206665039, "learning_rate": 0.0001, "loss": 0.2459, "loss/crossentropy": 2.3991124629974365, "loss/fcd": 0.4423828125, "loss/idx": 18.0, "loss/logits": 0.24585890769958496, "step": 104 }, { "epoch": 0.0015114981826033756, "grad_norm": 0.1005859375, "grad_norm_var": 0.00013441145420074464, "learning_rate": 0.0001, "loss": 0.2028, "loss/crossentropy": 2.5206661224365234, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.20282022655010223, "step": 105 }, { "epoch": 0.0015258934033900746, "grad_norm": 0.123046875, "grad_norm_var": 8.746683597564698e-05, "learning_rate": 0.0001, "loss": 0.2339, "loss/crossentropy": 2.294739842414856, "loss/fcd": 0.4619140625, "loss/idx": 18.0, "loss/logits": 0.2339302897453308, "step": 106 }, { "epoch": 0.0015402886241767733, "grad_norm": 0.11181640625, "grad_norm_var": 8.897781372070312e-05, "learning_rate": 0.0001, "loss": 0.2028, "loss/crossentropy": 2.430526852607727, "loss/fcd": 0.404296875, "loss/idx": 18.0, "loss/logits": 0.20277925580739975, "step": 107 }, { "epoch": 0.001554683844963472, "grad_norm": 0.10400390625, "grad_norm_var": 9.490549564361572e-05, "learning_rate": 0.0001, "loss": 0.1685, "loss/crossentropy": 1.9886462688446045, "loss/fcd": 0.4814453125, "loss/idx": 18.0, "loss/logits": 0.16851283982396126, "step": 108 }, { "epoch": 0.0015690790657501708, "grad_norm": 0.11376953125, "grad_norm_var": 9.39329465230306e-05, "learning_rate": 0.0001, "loss": 0.2232, "loss/crossentropy": 2.3031085729599, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.2231953889131546, "step": 109 }, { "epoch": 0.0015834742865368698, "grad_norm": 0.1064453125, "grad_norm_var": 9.844700495402019e-05, "learning_rate": 0.0001, "loss": 0.2145, "loss/crossentropy": 2.4420076608657837, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.2144630402326584, "step": 110 }, { "epoch": 0.0015978695073235686, "grad_norm": 0.10693359375, "grad_norm_var": 0.00010046859582265218, "learning_rate": 0.0001, "loss": 0.1999, "loss/crossentropy": 2.265585422515869, "loss/fcd": 0.3876953125, "loss/idx": 18.0, "loss/logits": 0.19986777007579803, "step": 111 }, { "epoch": 0.0016122647281102674, "grad_norm": 0.11572265625, "grad_norm_var": 9.224812189737956e-05, "learning_rate": 0.0001, "loss": 0.2391, "loss/crossentropy": 2.5880415439605713, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.23910276591777802, "step": 112 }, { "epoch": 0.0016266599488969661, "grad_norm": 0.1181640625, "grad_norm_var": 9.04242197672526e-05, "learning_rate": 0.0001, "loss": 0.2466, "loss/crossentropy": 2.6048234701156616, "loss/fcd": 0.47265625, "loss/idx": 18.0, "loss/logits": 0.24656572192907333, "step": 113 }, { "epoch": 0.0016410551696836651, "grad_norm": 0.134765625, "grad_norm_var": 0.00010449091593424479, "learning_rate": 0.0001, "loss": 0.2386, "loss/crossentropy": 2.1900378465652466, "loss/fcd": 0.484375, "loss/idx": 18.0, "loss/logits": 0.23863784968852997, "step": 114 }, { "epoch": 0.0016554503904703639, "grad_norm": 0.10986328125, "grad_norm_var": 9.802083174387614e-05, "learning_rate": 0.0001, "loss": 0.2123, "loss/crossentropy": 2.547809600830078, "loss/fcd": 0.4423828125, "loss/idx": 18.0, "loss/logits": 0.21230217069387436, "step": 115 }, { "epoch": 0.0016698456112570626, "grad_norm": 0.255859375, "grad_norm_var": 0.0013202657302220663, "learning_rate": 0.0001, "loss": 0.2161, "loss/crossentropy": 2.586913585662842, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.2160758599638939, "step": 116 }, { "epoch": 0.0016842408320437614, "grad_norm": 0.1171875, "grad_norm_var": 0.0013230552275975546, "learning_rate": 0.0001, "loss": 0.2282, "loss/crossentropy": 2.3776031732559204, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.2281685397028923, "step": 117 }, { "epoch": 0.0016986360528304604, "grad_norm": 0.11572265625, "grad_norm_var": 0.0013238906860351563, "learning_rate": 0.0001, "loss": 0.2269, "loss/crossentropy": 2.5417513847351074, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.226931631565094, "step": 118 }, { "epoch": 0.0017130312736171592, "grad_norm": 0.109375, "grad_norm_var": 0.0013291825850804647, "learning_rate": 0.0001, "loss": 0.2172, "loss/crossentropy": 2.416541814804077, "loss/fcd": 0.40625, "loss/idx": 18.0, "loss/logits": 0.2172057330608368, "step": 119 }, { "epoch": 0.001727426494403858, "grad_norm": 0.1103515625, "grad_norm_var": 0.0013376067082087198, "learning_rate": 0.0001, "loss": 0.2088, "loss/crossentropy": 2.3803776502609253, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.20880089700222015, "step": 120 }, { "epoch": 0.0017418217151905567, "grad_norm": 0.11962890625, "grad_norm_var": 0.0013056437174479166, "learning_rate": 0.0001, "loss": 0.2246, "loss/crossentropy": 2.4869107007980347, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.22459682077169418, "step": 121 }, { "epoch": 0.0017562169359772554, "grad_norm": 0.10595703125, "grad_norm_var": 0.0013244539499282838, "learning_rate": 0.0001, "loss": 0.2141, "loss/crossentropy": 2.2889301776885986, "loss/fcd": 0.4130859375, "loss/idx": 18.0, "loss/logits": 0.21405386179685593, "step": 122 }, { "epoch": 0.0017706121567639544, "grad_norm": 0.10888671875, "grad_norm_var": 0.0013290554285049438, "learning_rate": 0.0001, "loss": 0.183, "loss/crossentropy": 2.2636550664901733, "loss/fcd": 0.3916015625, "loss/idx": 18.0, "loss/logits": 0.1829545795917511, "step": 123 }, { "epoch": 0.0017850073775506532, "grad_norm": 0.123046875, "grad_norm_var": 0.0013059258460998535, "learning_rate": 0.0001, "loss": 0.2259, "loss/crossentropy": 2.3760812282562256, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.2258809506893158, "step": 124 }, { "epoch": 0.001799402598337352, "grad_norm": 0.10791015625, "grad_norm_var": 0.001315462589263916, "learning_rate": 0.0001, "loss": 0.2041, "loss/crossentropy": 2.595892906188965, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.20406261831521988, "step": 125 }, { "epoch": 0.0018137978191240507, "grad_norm": 0.126953125, "grad_norm_var": 0.001296854019165039, "learning_rate": 0.0001, "loss": 0.2131, "loss/crossentropy": 2.3521647453308105, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.21306610107421875, "step": 126 }, { "epoch": 0.0018281930399107497, "grad_norm": 0.10205078125, "grad_norm_var": 0.0013095498085021972, "learning_rate": 0.0001, "loss": 0.2135, "loss/crossentropy": 2.5395818948745728, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.21354226768016815, "step": 127 }, { "epoch": 0.0018425882606974485, "grad_norm": 0.10791015625, "grad_norm_var": 0.0013218204180399577, "learning_rate": 0.0001, "loss": 0.1906, "loss/crossentropy": 2.154847741127014, "loss/fcd": 0.3994140625, "loss/idx": 18.0, "loss/logits": 0.19056915491819382, "step": 128 }, { "epoch": 0.0018569834814841472, "grad_norm": 0.11962890625, "grad_norm_var": 0.0013209412495295207, "learning_rate": 0.0001, "loss": 0.2376, "loss/crossentropy": 2.3668060302734375, "loss/fcd": 0.4482421875, "loss/idx": 18.0, "loss/logits": 0.23755235970020294, "step": 129 }, { "epoch": 0.001871378702270846, "grad_norm": 0.1044921875, "grad_norm_var": 0.0013325204451878866, "learning_rate": 0.0001, "loss": 0.2072, "loss/crossentropy": 2.4412275552749634, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.20715615153312683, "step": 130 }, { "epoch": 0.001885773923057545, "grad_norm": 0.11865234375, "grad_norm_var": 0.0013236512740453085, "learning_rate": 0.0001, "loss": 0.2363, "loss/crossentropy": 2.589287519454956, "loss/fcd": 0.4853515625, "loss/idx": 18.0, "loss/logits": 0.2362738400697708, "step": 131 }, { "epoch": 0.0019001691438442438, "grad_norm": 0.1181640625, "grad_norm_var": 5.292793114980062e-05, "learning_rate": 0.0001, "loss": 0.1757, "loss/crossentropy": 2.1394956707954407, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.17568951100111008, "step": 132 }, { "epoch": 0.0019145643646309425, "grad_norm": 0.1044921875, "grad_norm_var": 5.675057570139567e-05, "learning_rate": 0.0001, "loss": 0.2141, "loss/crossentropy": 2.5705530643463135, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.21412815153598785, "step": 133 }, { "epoch": 0.0019289595854176413, "grad_norm": 0.1171875, "grad_norm_var": 5.7474772135416666e-05, "learning_rate": 0.0001, "loss": 0.2091, "loss/crossentropy": 2.2588201761245728, "loss/fcd": 0.400390625, "loss/idx": 18.0, "loss/logits": 0.20908734947443008, "step": 134 }, { "epoch": 0.00194335480620434, "grad_norm": 0.10595703125, "grad_norm_var": 5.976259708404541e-05, "learning_rate": 0.0001, "loss": 0.2167, "loss/crossentropy": 2.432557463645935, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.2166854664683342, "step": 135 }, { "epoch": 0.001957750026991039, "grad_norm": 0.11328125, "grad_norm_var": 5.942881107330322e-05, "learning_rate": 0.0001, "loss": 0.2177, "loss/crossentropy": 2.4058191776275635, "loss/fcd": 0.4443359375, "loss/idx": 18.0, "loss/logits": 0.21774785220623016, "step": 136 }, { "epoch": 0.001972145247777738, "grad_norm": 0.1044921875, "grad_norm_var": 5.98907470703125e-05, "learning_rate": 0.0001, "loss": 0.1946, "loss/crossentropy": 2.441463589668274, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.19459661096334457, "step": 137 }, { "epoch": 0.0019865404685644366, "grad_norm": 0.12353515625, "grad_norm_var": 6.546974182128907e-05, "learning_rate": 0.0001, "loss": 0.2507, "loss/crossentropy": 2.5539783239364624, "loss/fcd": 0.484375, "loss/idx": 18.0, "loss/logits": 0.25072282552719116, "step": 138 }, { "epoch": 0.0020009356893511353, "grad_norm": 0.0986328125, "grad_norm_var": 7.754862308502197e-05, "learning_rate": 0.0001, "loss": 0.2023, "loss/crossentropy": 2.5158984661102295, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.2023158147931099, "step": 139 }, { "epoch": 0.002015330910137834, "grad_norm": 0.109375, "grad_norm_var": 6.959338982899983e-05, "learning_rate": 0.0001, "loss": 0.1937, "loss/crossentropy": 2.2275065183639526, "loss/fcd": 0.4248046875, "loss/idx": 18.0, "loss/logits": 0.19366320967674255, "step": 140 }, { "epoch": 0.002029726130924533, "grad_norm": 0.1171875, "grad_norm_var": 7.063150405883789e-05, "learning_rate": 0.0001, "loss": 0.1863, "loss/crossentropy": 2.422375202178955, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.18625369668006897, "step": 141 }, { "epoch": 0.002044121351711232, "grad_norm": 0.11474609375, "grad_norm_var": 5.560616652170817e-05, "learning_rate": 0.0001, "loss": 0.207, "loss/crossentropy": 2.209444999694824, "loss/fcd": 0.4248046875, "loss/idx": 18.0, "loss/logits": 0.2070077657699585, "step": 142 }, { "epoch": 0.002058516572497931, "grad_norm": 0.11083984375, "grad_norm_var": 4.966954390207927e-05, "learning_rate": 0.0001, "loss": 0.2254, "loss/crossentropy": 2.641687750816345, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.2254098877310753, "step": 143 }, { "epoch": 0.0020729117932846296, "grad_norm": 0.1083984375, "grad_norm_var": 4.943211873372396e-05, "learning_rate": 0.0001, "loss": 0.2174, "loss/crossentropy": 2.4751927852630615, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.2174309641122818, "step": 144 }, { "epoch": 0.0020873070140713284, "grad_norm": 0.10986328125, "grad_norm_var": 4.522005716959635e-05, "learning_rate": 0.0001, "loss": 0.2059, "loss/crossentropy": 2.703999638557434, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.20589765906333923, "step": 145 }, { "epoch": 0.002101702234858027, "grad_norm": 0.11474609375, "grad_norm_var": 4.261235396067301e-05, "learning_rate": 0.0001, "loss": 0.2243, "loss/crossentropy": 2.3885515928268433, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.22432449460029602, "step": 146 }, { "epoch": 0.002116097455644726, "grad_norm": 0.1142578125, "grad_norm_var": 3.983179728190104e-05, "learning_rate": 0.0001, "loss": 0.2524, "loss/crossentropy": 2.471445918083191, "loss/fcd": 0.470703125, "loss/idx": 18.0, "loss/logits": 0.25243769586086273, "step": 147 }, { "epoch": 0.0021304926764314247, "grad_norm": 0.10693359375, "grad_norm_var": 3.784398237864176e-05, "learning_rate": 0.0001, "loss": 0.2059, "loss/crossentropy": 2.4856609106063843, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.20586465299129486, "step": 148 }, { "epoch": 0.0021448878972181234, "grad_norm": 0.1240234375, "grad_norm_var": 4.507601261138916e-05, "learning_rate": 0.0001, "loss": 0.238, "loss/crossentropy": 2.4825209379196167, "loss/fcd": 0.4912109375, "loss/idx": 18.0, "loss/logits": 0.23801030218601227, "step": 149 }, { "epoch": 0.002159283118004822, "grad_norm": 0.11279296875, "grad_norm_var": 4.329681396484375e-05, "learning_rate": 0.0001, "loss": 0.218, "loss/crossentropy": 2.373395562171936, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.21804769337177277, "step": 150 }, { "epoch": 0.0021736783387915214, "grad_norm": 0.1162109375, "grad_norm_var": 4.1857361793518066e-05, "learning_rate": 0.0001, "loss": 0.2016, "loss/crossentropy": 2.242987275123596, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.2015869840979576, "step": 151 }, { "epoch": 0.00218807355957822, "grad_norm": 0.1103515625, "grad_norm_var": 4.2071938514709474e-05, "learning_rate": 0.0001, "loss": 0.2289, "loss/crossentropy": 2.6060279607772827, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.2288510948419571, "step": 152 }, { "epoch": 0.002202468780364919, "grad_norm": 0.10595703125, "grad_norm_var": 4.068613052368164e-05, "learning_rate": 0.0001, "loss": 0.2122, "loss/crossentropy": 2.4911882877349854, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.2122008204460144, "step": 153 }, { "epoch": 0.0022168640011516177, "grad_norm": 0.11279296875, "grad_norm_var": 3.1900405883789065e-05, "learning_rate": 0.0001, "loss": 0.1964, "loss/crossentropy": 2.2283207178115845, "loss/fcd": 0.388671875, "loss/idx": 18.0, "loss/logits": 0.19640249013900757, "step": 154 }, { "epoch": 0.0022312592219383164, "grad_norm": 0.1240234375, "grad_norm_var": 2.7974446614583332e-05, "learning_rate": 0.0001, "loss": 0.2518, "loss/crossentropy": 2.6885886192321777, "loss/fcd": 0.498046875, "loss/idx": 18.0, "loss/logits": 0.251840204000473, "step": 155 }, { "epoch": 0.002245654442725015, "grad_norm": 0.1123046875, "grad_norm_var": 2.698500951131185e-05, "learning_rate": 0.0001, "loss": 0.2217, "loss/crossentropy": 2.3278268575668335, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.22167562693357468, "step": 156 }, { "epoch": 0.002260049663511714, "grad_norm": 0.1171875, "grad_norm_var": 2.698500951131185e-05, "learning_rate": 0.0001, "loss": 0.2134, "loss/crossentropy": 2.2359228134155273, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.21338575333356857, "step": 157 }, { "epoch": 0.0022744448842984127, "grad_norm": 0.1220703125, "grad_norm_var": 3.1589468320210776e-05, "learning_rate": 0.0001, "loss": 0.1983, "loss/crossentropy": 2.1452057361602783, "loss/fcd": 0.40625, "loss/idx": 18.0, "loss/logits": 0.19825652241706848, "step": 158 }, { "epoch": 0.002288840105085112, "grad_norm": 0.11328125, "grad_norm_var": 3.095865249633789e-05, "learning_rate": 0.0001, "loss": 0.2048, "loss/crossentropy": 2.075889527797699, "loss/fcd": 0.400390625, "loss/idx": 18.0, "loss/logits": 0.20479386299848557, "step": 159 }, { "epoch": 0.0023032353258718107, "grad_norm": 0.123046875, "grad_norm_var": 3.3283233642578124e-05, "learning_rate": 0.0001, "loss": 0.206, "loss/crossentropy": 2.2650269269943237, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.20604287087917328, "step": 160 }, { "epoch": 0.0023176305466585095, "grad_norm": 0.11328125, "grad_norm_var": 3.167688846588135e-05, "learning_rate": 0.0001, "loss": 0.2225, "loss/crossentropy": 2.385145902633667, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.22251462936401367, "step": 161 }, { "epoch": 0.0023320257674452082, "grad_norm": 0.11669921875, "grad_norm_var": 3.179609775543213e-05, "learning_rate": 0.0001, "loss": 0.2162, "loss/crossentropy": 2.3363062143325806, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.21623297035694122, "step": 162 }, { "epoch": 0.002346420988231907, "grad_norm": 0.11328125, "grad_norm_var": 3.199477990468343e-05, "learning_rate": 0.0001, "loss": 0.2196, "loss/crossentropy": 2.258102059364319, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.219633050262928, "step": 163 }, { "epoch": 0.0023608162090186058, "grad_norm": 0.11328125, "grad_norm_var": 2.7461846669514975e-05, "learning_rate": 0.0001, "loss": 0.2229, "loss/crossentropy": 2.477385640144348, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.22293243557214737, "step": 164 }, { "epoch": 0.0023752114298053045, "grad_norm": 0.11376953125, "grad_norm_var": 2.260108788808187e-05, "learning_rate": 0.0001, "loss": 0.206, "loss/crossentropy": 2.5965325832366943, "loss/fcd": 0.4345703125, "loss/idx": 18.0, "loss/logits": 0.20604980736970901, "step": 165 }, { "epoch": 0.0023896066505920033, "grad_norm": 0.12158203125, "grad_norm_var": 2.48183806737264e-05, "learning_rate": 0.0001, "loss": 0.2456, "loss/crossentropy": 2.391031265258789, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.24561651051044464, "step": 166 }, { "epoch": 0.002404001871378702, "grad_norm": 0.11083984375, "grad_norm_var": 2.616246541341146e-05, "learning_rate": 0.0001, "loss": 0.2163, "loss/crossentropy": 2.534990668296814, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.21634604781866074, "step": 167 }, { "epoch": 0.0024183970921654013, "grad_norm": 0.130859375, "grad_norm_var": 3.909667332967122e-05, "learning_rate": 0.0001, "loss": 0.2477, "loss/crossentropy": 2.354380965232849, "loss/fcd": 0.482421875, "loss/idx": 18.0, "loss/logits": 0.24768973886966705, "step": 168 }, { "epoch": 0.0024327923129521, "grad_norm": 0.1142578125, "grad_norm_var": 3.171662489573161e-05, "learning_rate": 0.0001, "loss": 0.2411, "loss/crossentropy": 2.430347204208374, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.2411317229270935, "step": 169 }, { "epoch": 0.002447187533738799, "grad_norm": 0.11328125, "grad_norm_var": 3.145535786946615e-05, "learning_rate": 0.0001, "loss": 0.2188, "loss/crossentropy": 2.312503755092621, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.21878328174352646, "step": 170 }, { "epoch": 0.0024615827545254976, "grad_norm": 0.109375, "grad_norm_var": 3.1276543935139975e-05, "learning_rate": 0.0001, "loss": 0.2309, "loss/crossentropy": 2.5175788402557373, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.23088021576404572, "step": 171 }, { "epoch": 0.0024759779753121963, "grad_norm": 0.1123046875, "grad_norm_var": 3.1276543935139975e-05, "learning_rate": 0.0001, "loss": 0.2165, "loss/crossentropy": 2.484018087387085, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.2165074348449707, "step": 172 }, { "epoch": 0.002490373196098895, "grad_norm": 0.1318359375, "grad_norm_var": 4.671414693196615e-05, "learning_rate": 0.0001, "loss": 0.2481, "loss/crossentropy": 2.2699760794639587, "loss/fcd": 0.486328125, "loss/idx": 18.0, "loss/logits": 0.24809680879116058, "step": 173 }, { "epoch": 0.002504768416885594, "grad_norm": 0.10693359375, "grad_norm_var": 5.0933162371317545e-05, "learning_rate": 0.0001, "loss": 0.1955, "loss/crossentropy": 2.2288765907287598, "loss/fcd": 0.3857421875, "loss/idx": 18.0, "loss/logits": 0.19553960859775543, "step": 174 }, { "epoch": 0.0025191636376722926, "grad_norm": 0.1220703125, "grad_norm_var": 5.243519941965739e-05, "learning_rate": 0.0001, "loss": 0.213, "loss/crossentropy": 2.4654963612556458, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.2130081057548523, "step": 175 }, { "epoch": 0.002533558858458992, "grad_norm": 0.1162109375, "grad_norm_var": 4.954238732655843e-05, "learning_rate": 0.0001, "loss": 0.21, "loss/crossentropy": 2.2151373624801636, "loss/fcd": 0.4072265625, "loss/idx": 18.0, "loss/logits": 0.20995519310235977, "step": 176 }, { "epoch": 0.0025479540792456906, "grad_norm": 0.11669921875, "grad_norm_var": 4.892349243164062e-05, "learning_rate": 0.0001, "loss": 0.2092, "loss/crossentropy": 2.4239630699157715, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.20919281244277954, "step": 177 }, { "epoch": 0.0025623493000323893, "grad_norm": 0.11572265625, "grad_norm_var": 4.8951307932535806e-05, "learning_rate": 0.0001, "loss": 0.2638, "loss/crossentropy": 2.718831419944763, "loss/fcd": 0.4755859375, "loss/idx": 18.0, "loss/logits": 0.2638430893421173, "step": 178 }, { "epoch": 0.002576744520819088, "grad_norm": 0.10302734375, "grad_norm_var": 5.977849165598551e-05, "learning_rate": 0.0001, "loss": 0.1942, "loss/crossentropy": 2.4341124296188354, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.1942092925310135, "step": 179 }, { "epoch": 0.002591139741605787, "grad_norm": 0.1142578125, "grad_norm_var": 5.9516231218973795e-05, "learning_rate": 0.0001, "loss": 0.223, "loss/crossentropy": 2.3784589767456055, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.2230425328016281, "step": 180 }, { "epoch": 0.0026055349623924856, "grad_norm": 0.11083984375, "grad_norm_var": 6.085137526194254e-05, "learning_rate": 0.0001, "loss": 0.1995, "loss/crossentropy": 2.1103312969207764, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.1994745284318924, "step": 181 }, { "epoch": 0.0026199301831791844, "grad_norm": 0.1123046875, "grad_norm_var": 5.8869520823160805e-05, "learning_rate": 0.0001, "loss": 0.2228, "loss/crossentropy": 2.173603892326355, "loss/fcd": 0.4228515625, "loss/idx": 18.0, "loss/logits": 0.22283250093460083, "step": 182 }, { "epoch": 0.002634325403965883, "grad_norm": 0.12353515625, "grad_norm_var": 6.18139902750651e-05, "learning_rate": 0.0001, "loss": 0.2399, "loss/crossentropy": 2.3933345079421997, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.23994869738817215, "step": 183 }, { "epoch": 0.002648720624752582, "grad_norm": 0.12890625, "grad_norm_var": 5.8142344156901043e-05, "learning_rate": 0.0001, "loss": 0.2447, "loss/crossentropy": 2.5679067373275757, "loss/fcd": 0.474609375, "loss/idx": 18.0, "loss/logits": 0.24468251317739487, "step": 184 }, { "epoch": 0.002663115845539281, "grad_norm": 0.11279296875, "grad_norm_var": 5.8562556902567545e-05, "learning_rate": 0.0001, "loss": 0.1861, "loss/crossentropy": 1.966173768043518, "loss/fcd": 0.3759765625, "loss/idx": 18.0, "loss/logits": 0.18611325323581696, "step": 185 }, { "epoch": 0.00267751106632598, "grad_norm": 0.1865234375, "grad_norm_var": 0.0003708908955256144, "learning_rate": 0.0001, "loss": 0.3475, "loss/crossentropy": 2.386851668357849, "loss/fcd": 0.560546875, "loss/idx": 18.0, "loss/logits": 0.3474508970975876, "step": 186 }, { "epoch": 0.0026919062871126787, "grad_norm": 0.11279296875, "grad_norm_var": 0.0003666838010152181, "learning_rate": 0.0001, "loss": 0.2127, "loss/crossentropy": 2.4003021717071533, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.21274058520793915, "step": 187 }, { "epoch": 0.0027063015078993774, "grad_norm": 0.15234375, "grad_norm_var": 0.00042354265848795574, "learning_rate": 0.0001, "loss": 0.2758, "loss/crossentropy": 2.218628406524658, "loss/fcd": 0.4873046875, "loss/idx": 18.0, "loss/logits": 0.2757628411054611, "step": 188 }, { "epoch": 0.002720696728686076, "grad_norm": 0.10986328125, "grad_norm_var": 0.00042761067549387615, "learning_rate": 0.0001, "loss": 0.1907, "loss/crossentropy": 2.1557281017303467, "loss/fcd": 0.388671875, "loss/idx": 18.0, "loss/logits": 0.1906721442937851, "step": 189 }, { "epoch": 0.002735091949472775, "grad_norm": 0.10595703125, "grad_norm_var": 0.000429573655128479, "learning_rate": 0.0001, "loss": 0.1882, "loss/crossentropy": 2.047899842262268, "loss/fcd": 0.3857421875, "loss/idx": 18.0, "loss/logits": 0.1881674826145172, "step": 190 }, { "epoch": 0.0027494871702594737, "grad_norm": 0.10302734375, "grad_norm_var": 0.00045076608657836916, "learning_rate": 0.0001, "loss": 0.1987, "loss/crossentropy": 2.2902016639709473, "loss/fcd": 0.3974609375, "loss/idx": 18.0, "loss/logits": 0.1987495943903923, "step": 191 }, { "epoch": 0.0027638823910461725, "grad_norm": 0.107421875, "grad_norm_var": 0.0004603862762451172, "learning_rate": 0.0001, "loss": 0.1967, "loss/crossentropy": 2.296987771987915, "loss/fcd": 0.3837890625, "loss/idx": 18.0, "loss/logits": 0.1967175006866455, "step": 192 }, { "epoch": 0.0027782776118328717, "grad_norm": 0.119140625, "grad_norm_var": 0.00045976539452870685, "learning_rate": 0.0001, "loss": 0.2354, "loss/crossentropy": 2.2293859124183655, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.23544982075691223, "step": 193 }, { "epoch": 0.0027926728326195705, "grad_norm": 0.1044921875, "grad_norm_var": 0.0004739085833231608, "learning_rate": 0.0001, "loss": 0.2093, "loss/crossentropy": 2.3077027797698975, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.20933127403259277, "step": 194 }, { "epoch": 0.0028070680534062692, "grad_norm": 0.11767578125, "grad_norm_var": 0.0004557291666666667, "learning_rate": 0.0001, "loss": 0.2349, "loss/crossentropy": 2.5241353511810303, "loss/fcd": 0.423828125, "loss/idx": 18.0, "loss/logits": 0.23492421209812164, "step": 195 }, { "epoch": 0.002821463274192968, "grad_norm": 0.10888671875, "grad_norm_var": 0.0004617283741633097, "learning_rate": 0.0001, "loss": 0.2089, "loss/crossentropy": 2.2112027406692505, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.20893365144729614, "step": 196 }, { "epoch": 0.0028358584949796668, "grad_norm": 0.095703125, "grad_norm_var": 0.0004940946896870931, "learning_rate": 0.0001, "loss": 0.1738, "loss/crossentropy": 2.3283063173294067, "loss/fcd": 0.392578125, "loss/idx": 18.0, "loss/logits": 0.1738404482603073, "step": 197 }, { "epoch": 0.0028502537157663655, "grad_norm": 0.11279296875, "grad_norm_var": 0.0004936844110488891, "learning_rate": 0.0001, "loss": 0.2259, "loss/crossentropy": 2.4649304151535034, "loss/fcd": 0.4697265625, "loss/idx": 18.0, "loss/logits": 0.22589464485645294, "step": 198 }, { "epoch": 0.0028646489365530643, "grad_norm": 0.142578125, "grad_norm_var": 0.0005282044410705566, "learning_rate": 0.0001, "loss": 0.2334, "loss/crossentropy": 2.4893065690994263, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.23335154354572296, "step": 199 }, { "epoch": 0.002879044157339763, "grad_norm": 0.1044921875, "grad_norm_var": 0.000536648432413737, "learning_rate": 0.0001, "loss": 0.2107, "loss/crossentropy": 2.5291190147399902, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.2106790393590927, "step": 200 }, { "epoch": 0.002893439378126462, "grad_norm": 0.11279296875, "grad_norm_var": 0.000536648432413737, "learning_rate": 0.0001, "loss": 0.2286, "loss/crossentropy": 2.5203051567077637, "loss/fcd": 0.4521484375, "loss/idx": 18.0, "loss/logits": 0.2286214381456375, "step": 201 }, { "epoch": 0.002907834598913161, "grad_norm": 0.11181640625, "grad_norm_var": 0.00020819405714670816, "learning_rate": 0.0001, "loss": 0.2162, "loss/crossentropy": 2.1828808784484863, "loss/fcd": 0.40234375, "loss/idx": 18.0, "loss/logits": 0.21623709797859192, "step": 202 }, { "epoch": 0.00292222981969986, "grad_norm": 0.109375, "grad_norm_var": 0.0002094109853108724, "learning_rate": 0.0001, "loss": 0.1716, "loss/crossentropy": 1.858969271183014, "loss/fcd": 0.5029296875, "loss/idx": 18.0, "loss/logits": 0.17157060280442238, "step": 203 }, { "epoch": 0.0029366250404865585, "grad_norm": 0.11376953125, "grad_norm_var": 0.0001033852497736613, "learning_rate": 0.0001, "loss": 0.2325, "loss/crossentropy": 2.4954288005828857, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.23250436782836914, "step": 204 }, { "epoch": 0.0029510202612732573, "grad_norm": 0.11669921875, "grad_norm_var": 0.00010505417982737223, "learning_rate": 0.0001, "loss": 0.2277, "loss/crossentropy": 2.389811635017395, "loss/fcd": 0.4482421875, "loss/idx": 18.0, "loss/logits": 0.22774703800678253, "step": 205 }, { "epoch": 0.002965415482059956, "grad_norm": 0.1328125, "grad_norm_var": 0.00012969573338826498, "learning_rate": 0.0001, "loss": 0.1985, "loss/crossentropy": 2.144119679927826, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.19851599633693695, "step": 206 }, { "epoch": 0.002979810702846655, "grad_norm": 0.1220703125, "grad_norm_var": 0.00012617011864980062, "learning_rate": 0.0001, "loss": 0.2454, "loss/crossentropy": 2.509921073913574, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.24539965391159058, "step": 207 }, { "epoch": 0.0029942059236333536, "grad_norm": 0.1142578125, "grad_norm_var": 0.0001226097345352173, "learning_rate": 0.0001, "loss": 0.238, "loss/crossentropy": 2.336063265800476, "loss/fcd": 0.478515625, "loss/idx": 18.0, "loss/logits": 0.237995944917202, "step": 208 }, { "epoch": 0.0030086011444200524, "grad_norm": 0.1435546875, "grad_norm_var": 0.0001734723647435506, "learning_rate": 0.0001, "loss": 0.2493, "loss/crossentropy": 2.3922590017318726, "loss/fcd": 0.484375, "loss/idx": 18.0, "loss/logits": 0.24932140111923218, "step": 209 }, { "epoch": 0.003022996365206751, "grad_norm": 0.115234375, "grad_norm_var": 0.0001635064681371053, "learning_rate": 0.0001, "loss": 0.2434, "loss/crossentropy": 2.597308397293091, "loss/fcd": 0.470703125, "loss/idx": 18.0, "loss/logits": 0.24338021874427795, "step": 210 }, { "epoch": 0.0030373915859934503, "grad_norm": 0.1123046875, "grad_norm_var": 0.00016493797302246093, "learning_rate": 0.0001, "loss": 0.2082, "loss/crossentropy": 2.3584909439086914, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.20816650241613388, "step": 211 }, { "epoch": 0.003051786806780149, "grad_norm": 0.10791015625, "grad_norm_var": 0.0001660307248433431, "learning_rate": 0.0001, "loss": 0.2122, "loss/crossentropy": 2.3587781190872192, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.21221671998500824, "step": 212 }, { "epoch": 0.003066182027566848, "grad_norm": 0.1298828125, "grad_norm_var": 0.0001430829366048177, "learning_rate": 0.0001, "loss": 0.2491, "loss/crossentropy": 2.4296464920043945, "loss/fcd": 0.46875, "loss/idx": 18.0, "loss/logits": 0.24906984716653824, "step": 213 }, { "epoch": 0.0030805772483535466, "grad_norm": 0.123046875, "grad_norm_var": 0.00014130969842274984, "learning_rate": 0.0001, "loss": 0.2168, "loss/crossentropy": 2.1808066368103027, "loss/fcd": 0.4248046875, "loss/idx": 18.0, "loss/logits": 0.21684125810861588, "step": 214 }, { "epoch": 0.0030949724691402454, "grad_norm": 0.1025390625, "grad_norm_var": 0.00011850098768870035, "learning_rate": 0.0001, "loss": 0.2052, "loss/crossentropy": 2.3064663410186768, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.20520812273025513, "step": 215 }, { "epoch": 0.003109367689926944, "grad_norm": 0.1162109375, "grad_norm_var": 0.00010748604933420817, "learning_rate": 0.0001, "loss": 0.2414, "loss/crossentropy": 2.4660093784332275, "loss/fcd": 0.4609375, "loss/idx": 18.0, "loss/logits": 0.241433747112751, "step": 216 }, { "epoch": 0.003123762910713643, "grad_norm": 0.11669921875, "grad_norm_var": 0.00010584890842437744, "learning_rate": 0.0001, "loss": 0.2161, "loss/crossentropy": 2.2378053665161133, "loss/fcd": 0.4130859375, "loss/idx": 18.0, "loss/logits": 0.21608934551477432, "step": 217 }, { "epoch": 0.0031381581315003417, "grad_norm": 0.11669921875, "grad_norm_var": 0.00010330577691396078, "learning_rate": 0.0001, "loss": 0.2132, "loss/crossentropy": 2.312962532043457, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.21319198608398438, "step": 218 }, { "epoch": 0.003152553352287041, "grad_norm": 0.11474609375, "grad_norm_var": 9.870529174804688e-05, "learning_rate": 0.0001, "loss": 0.2193, "loss/crossentropy": 2.3573015928268433, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.21930547058582306, "step": 219 }, { "epoch": 0.0031669485730737397, "grad_norm": 0.119140625, "grad_norm_var": 9.701152642567952e-05, "learning_rate": 0.0001, "loss": 0.2496, "loss/crossentropy": 2.6434515714645386, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.24958771467208862, "step": 220 }, { "epoch": 0.0031813437938604384, "grad_norm": 0.1396484375, "grad_norm_var": 0.0001229246457417806, "learning_rate": 0.0001, "loss": 0.2262, "loss/crossentropy": 2.2807798981666565, "loss/fcd": 0.50390625, "loss/idx": 18.0, "loss/logits": 0.22618486732244492, "step": 221 }, { "epoch": 0.003195739014647137, "grad_norm": 0.12060546875, "grad_norm_var": 0.00011207163333892822, "learning_rate": 0.0001, "loss": 0.2143, "loss/crossentropy": 2.350602626800537, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.2142939791083336, "step": 222 }, { "epoch": 0.003210134235433836, "grad_norm": 0.193359375, "grad_norm_var": 0.00045262078444163, "learning_rate": 0.0001, "loss": 0.2248, "loss/crossentropy": 2.7532432079315186, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.22482239454984665, "step": 223 }, { "epoch": 0.0032245294562205347, "grad_norm": 0.1240234375, "grad_norm_var": 0.000445746382077535, "learning_rate": 0.0001, "loss": 0.2287, "loss/crossentropy": 2.397303342819214, "loss/fcd": 0.4541015625, "loss/idx": 18.0, "loss/logits": 0.22872482240200043, "step": 224 }, { "epoch": 0.0032389246770072335, "grad_norm": 0.119140625, "grad_norm_var": 0.00042170584201812745, "learning_rate": 0.0001, "loss": 0.1917, "loss/crossentropy": 2.161116361618042, "loss/fcd": 0.390625, "loss/idx": 18.0, "loss/logits": 0.1917443946003914, "step": 225 }, { "epoch": 0.0032533198977939323, "grad_norm": 0.10986328125, "grad_norm_var": 0.0004292130470275879, "learning_rate": 0.0001, "loss": 0.2043, "loss/crossentropy": 2.131627917289734, "loss/fcd": 0.392578125, "loss/idx": 18.0, "loss/logits": 0.20434105396270752, "step": 226 }, { "epoch": 0.003267715118580631, "grad_norm": 0.12060546875, "grad_norm_var": 0.0004218329985936483, "learning_rate": 0.0001, "loss": 0.218, "loss/crossentropy": 2.5683807134628296, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.21802888065576553, "step": 227 }, { "epoch": 0.0032821103393673302, "grad_norm": 0.134765625, "grad_norm_var": 0.00041150649388631185, "learning_rate": 0.0001, "loss": 0.2244, "loss/crossentropy": 2.4449127912521362, "loss/fcd": 0.4658203125, "loss/idx": 18.0, "loss/logits": 0.2243650108575821, "step": 228 }, { "epoch": 0.003296505560154029, "grad_norm": 0.11376953125, "grad_norm_var": 0.00041737457116444905, "learning_rate": 0.0001, "loss": 0.1937, "loss/crossentropy": 2.1692421436309814, "loss/fcd": 0.3828125, "loss/idx": 18.0, "loss/logits": 0.19374938309192657, "step": 229 }, { "epoch": 0.0033109007809407278, "grad_norm": 0.10986328125, "grad_norm_var": 0.0004300077756245931, "learning_rate": 0.0001, "loss": 0.2099, "loss/crossentropy": 2.1626864671707153, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.20994187891483307, "step": 230 }, { "epoch": 0.0033252960017274265, "grad_norm": 0.1123046875, "grad_norm_var": 0.0004090269406636556, "learning_rate": 0.0001, "loss": 0.2224, "loss/crossentropy": 2.4669238328933716, "loss/fcd": 0.4404296875, "loss/idx": 18.0, "loss/logits": 0.2224324494600296, "step": 231 }, { "epoch": 0.0033396912225141253, "grad_norm": 0.11328125, "grad_norm_var": 0.00041254361470540363, "learning_rate": 0.0001, "loss": 0.241, "loss/crossentropy": 2.534782886505127, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.24097825586795807, "step": 232 }, { "epoch": 0.003354086443300824, "grad_norm": 0.11572265625, "grad_norm_var": 0.000413509209950765, "learning_rate": 0.0001, "loss": 0.229, "loss/crossentropy": 2.4167356491088867, "loss/fcd": 0.4248046875, "loss/idx": 18.0, "loss/logits": 0.22896190732717514, "step": 233 }, { "epoch": 0.003368481664087523, "grad_norm": 0.1220703125, "grad_norm_var": 0.0004103730122248332, "learning_rate": 0.0001, "loss": 0.2494, "loss/crossentropy": 2.544241964817047, "loss/fcd": 0.478515625, "loss/idx": 18.0, "loss/logits": 0.2493698000907898, "step": 234 }, { "epoch": 0.0033828768848742216, "grad_norm": 0.12060546875, "grad_norm_var": 0.0004053423802057902, "learning_rate": 0.0001, "loss": 0.2229, "loss/crossentropy": 2.656595230102539, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.22288895398378372, "step": 235 }, { "epoch": 0.0033972721056609208, "grad_norm": 0.109375, "grad_norm_var": 0.0004180183013280233, "learning_rate": 0.0001, "loss": 0.2, "loss/crossentropy": 2.153246819972992, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.19998866319656372, "step": 236 }, { "epoch": 0.0034116673264476195, "grad_norm": 0.1181640625, "grad_norm_var": 0.0004011462132136027, "learning_rate": 0.0001, "loss": 0.2159, "loss/crossentropy": 2.3706564903259277, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.21588444709777832, "step": 237 }, { "epoch": 0.0034260625472343183, "grad_norm": 0.10986328125, "grad_norm_var": 0.0004108498493830363, "learning_rate": 0.0001, "loss": 0.2155, "loss/crossentropy": 2.377021312713623, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.21553778648376465, "step": 238 }, { "epoch": 0.003440457768021017, "grad_norm": 0.10888671875, "grad_norm_var": 4.942814509073893e-05, "learning_rate": 0.0001, "loss": 0.1947, "loss/crossentropy": 2.1807267665863037, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.19467756152153015, "step": 239 }, { "epoch": 0.003454852988807716, "grad_norm": 0.107421875, "grad_norm_var": 4.976590474446614e-05, "learning_rate": 0.0001, "loss": 0.2297, "loss/crossentropy": 2.5010019540786743, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.22967635095119476, "step": 240 }, { "epoch": 0.0034692482095944146, "grad_norm": 0.12109375, "grad_norm_var": 5.098978678385417e-05, "learning_rate": 0.0001, "loss": 0.2257, "loss/crossentropy": 2.1949596405029297, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.2256992757320404, "step": 241 }, { "epoch": 0.0034836434303811134, "grad_norm": 0.099609375, "grad_norm_var": 6.52382771174113e-05, "learning_rate": 0.0001, "loss": 0.1786, "loss/crossentropy": 2.3066688776016235, "loss/fcd": 0.404296875, "loss/idx": 18.0, "loss/logits": 0.17856091260910034, "step": 242 }, { "epoch": 0.003498038651167812, "grad_norm": 0.1357421875, "grad_norm_var": 9.119908014933268e-05, "learning_rate": 0.0001, "loss": 0.2979, "loss/crossentropy": 2.833424210548401, "loss/fcd": 0.53125, "loss/idx": 18.0, "loss/logits": 0.29794102907180786, "step": 243 }, { "epoch": 0.003512433871954511, "grad_norm": 0.11083984375, "grad_norm_var": 6.642242272694906e-05, "learning_rate": 0.0001, "loss": 0.2084, "loss/crossentropy": 2.4168113470077515, "loss/fcd": 0.3994140625, "loss/idx": 18.0, "loss/logits": 0.2084333300590515, "step": 244 }, { "epoch": 0.00352682909274121, "grad_norm": 0.10546875, "grad_norm_var": 7.130304972330729e-05, "learning_rate": 0.0001, "loss": 0.2081, "loss/crossentropy": 2.4122915267944336, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.20814163982868195, "step": 245 }, { "epoch": 0.003541224313527909, "grad_norm": 0.11376953125, "grad_norm_var": 7.022221883138021e-05, "learning_rate": 0.0001, "loss": 0.2177, "loss/crossentropy": 2.357482075691223, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.2177310660481453, "step": 246 }, { "epoch": 0.0035556195343146076, "grad_norm": 0.1064453125, "grad_norm_var": 7.370313008626302e-05, "learning_rate": 0.0001, "loss": 0.225, "loss/crossentropy": 2.329651951789856, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.22500982880592346, "step": 247 }, { "epoch": 0.0035700147551013064, "grad_norm": 0.1123046875, "grad_norm_var": 7.381041844685872e-05, "learning_rate": 0.0001, "loss": 0.177, "loss/crossentropy": 2.0500356554985046, "loss/fcd": 0.380859375, "loss/idx": 18.0, "loss/logits": 0.1770332083106041, "step": 248 }, { "epoch": 0.003584409975888005, "grad_norm": 0.1123046875, "grad_norm_var": 7.356703281402588e-05, "learning_rate": 0.0001, "loss": 0.1987, "loss/crossentropy": 2.2625420093536377, "loss/fcd": 0.3935546875, "loss/idx": 18.0, "loss/logits": 0.19871972501277924, "step": 249 }, { "epoch": 0.003598805196674704, "grad_norm": 0.10791015625, "grad_norm_var": 6.967782974243164e-05, "learning_rate": 0.0001, "loss": 0.1945, "loss/crossentropy": 2.5878301858901978, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.19449464231729507, "step": 250 }, { "epoch": 0.0036132004174614027, "grad_norm": 0.11962890625, "grad_norm_var": 6.86804453531901e-05, "learning_rate": 0.0001, "loss": 0.218, "loss/crossentropy": 2.4477245807647705, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.21796388924121857, "step": 251 }, { "epoch": 0.0036275956382481015, "grad_norm": 0.12158203125, "grad_norm_var": 7.302661736806234e-05, "learning_rate": 0.0001, "loss": 0.2601, "loss/crossentropy": 2.5919313430786133, "loss/fcd": 0.466796875, "loss/idx": 18.0, "loss/logits": 0.2600754201412201, "step": 252 }, { "epoch": 0.0036419908590348007, "grad_norm": 0.1083984375, "grad_norm_var": 7.251004378000895e-05, "learning_rate": 0.0001, "loss": 0.1969, "loss/crossentropy": 2.3274489641189575, "loss/fcd": 0.3916015625, "loss/idx": 18.0, "loss/logits": 0.1969192698597908, "step": 253 }, { "epoch": 0.0036563860798214994, "grad_norm": 0.115234375, "grad_norm_var": 7.236798604329428e-05, "learning_rate": 0.0001, "loss": 0.2279, "loss/crossentropy": 2.4737610816955566, "loss/fcd": 0.4326171875, "loss/idx": 18.0, "loss/logits": 0.227908656001091, "step": 254 }, { "epoch": 0.003670781300608198, "grad_norm": 0.11669921875, "grad_norm_var": 7.198651631673177e-05, "learning_rate": 0.0001, "loss": 0.2286, "loss/crossentropy": 2.442078709602356, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.2285866141319275, "step": 255 }, { "epoch": 0.003685176521394897, "grad_norm": 0.10986328125, "grad_norm_var": 7.04119602839152e-05, "learning_rate": 0.0001, "loss": 0.2116, "loss/crossentropy": 2.2948302030563354, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.21162152290344238, "step": 256 }, { "epoch": 0.0036995717421815957, "grad_norm": 0.10498046875, "grad_norm_var": 7.044474283854166e-05, "learning_rate": 0.0001, "loss": 0.2117, "loss/crossentropy": 2.3752611875534058, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.21169160306453705, "step": 257 }, { "epoch": 0.0037139669629682945, "grad_norm": 0.1044921875, "grad_norm_var": 6.351073582967122e-05, "learning_rate": 0.0001, "loss": 0.1965, "loss/crossentropy": 2.3940770626068115, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.19652695208787918, "step": 258 }, { "epoch": 0.0037283621837549932, "grad_norm": 0.12451171875, "grad_norm_var": 3.712077935536702e-05, "learning_rate": 0.0001, "loss": 0.2326, "loss/crossentropy": 2.329423666000366, "loss/fcd": 0.482421875, "loss/idx": 18.0, "loss/logits": 0.23256323486566544, "step": 259 }, { "epoch": 0.003742757404541692, "grad_norm": 0.109375, "grad_norm_var": 3.7511189778645836e-05, "learning_rate": 0.0001, "loss": 0.2077, "loss/crossentropy": 2.2093913555145264, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.20770975947380066, "step": 260 }, { "epoch": 0.0037571526253283908, "grad_norm": 0.11474609375, "grad_norm_var": 3.47365935643514e-05, "learning_rate": 0.0001, "loss": 0.2247, "loss/crossentropy": 2.3547682762145996, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.2247237116098404, "step": 261 }, { "epoch": 0.00377154784611509, "grad_norm": 0.10400390625, "grad_norm_var": 3.922681013743083e-05, "learning_rate": 0.0001, "loss": 0.188, "loss/crossentropy": 2.2215335369110107, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.18795417994260788, "step": 262 }, { "epoch": 0.0037859430669017887, "grad_norm": 0.11865234375, "grad_norm_var": 3.945032755533854e-05, "learning_rate": 0.0001, "loss": 0.2405, "loss/crossentropy": 2.5075334310531616, "loss/fcd": 0.462890625, "loss/idx": 18.0, "loss/logits": 0.24054966121912003, "step": 263 }, { "epoch": 0.0038003382876884875, "grad_norm": 0.10107421875, "grad_norm_var": 4.806419213612874e-05, "learning_rate": 0.0001, "loss": 0.1904, "loss/crossentropy": 2.4045649766921997, "loss/fcd": 0.380859375, "loss/idx": 18.0, "loss/logits": 0.1903528869152069, "step": 264 }, { "epoch": 0.0038147335084751863, "grad_norm": 0.1171875, "grad_norm_var": 4.969338575998942e-05, "learning_rate": 0.0001, "loss": 0.2226, "loss/crossentropy": 2.2266829013824463, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.2226012423634529, "step": 265 }, { "epoch": 0.003829128729261885, "grad_norm": 0.11376953125, "grad_norm_var": 4.833439985911051e-05, "learning_rate": 0.0001, "loss": 0.2003, "loss/crossentropy": 2.052451729774475, "loss/fcd": 0.4033203125, "loss/idx": 18.0, "loss/logits": 0.20034398138523102, "step": 266 }, { "epoch": 0.003843523950048584, "grad_norm": 0.10400390625, "grad_norm_var": 4.928807417551676e-05, "learning_rate": 0.0001, "loss": 0.2289, "loss/crossentropy": 2.7160192728042603, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.22888437658548355, "step": 267 }, { "epoch": 0.0038579191708352826, "grad_norm": 0.1201171875, "grad_norm_var": 4.750887552897136e-05, "learning_rate": 0.0001, "loss": 0.2423, "loss/crossentropy": 2.2038062810897827, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.24225647747516632, "step": 268 }, { "epoch": 0.0038723143916219813, "grad_norm": 0.11474609375, "grad_norm_var": 4.7237674395243326e-05, "learning_rate": 0.0001, "loss": 0.2423, "loss/crossentropy": 2.5651720762252808, "loss/fcd": 0.4423828125, "loss/idx": 18.0, "loss/logits": 0.24227841198444366, "step": 269 }, { "epoch": 0.00388670961240868, "grad_norm": 0.11669921875, "grad_norm_var": 4.798571268717448e-05, "learning_rate": 0.0001, "loss": 0.2367, "loss/crossentropy": 2.645506978034973, "loss/fcd": 0.470703125, "loss/idx": 18.0, "loss/logits": 0.23671862483024597, "step": 270 }, { "epoch": 0.0039011048331953793, "grad_norm": 0.10791015625, "grad_norm_var": 4.752079645792643e-05, "learning_rate": 0.0001, "loss": 0.2177, "loss/crossentropy": 2.5453277826309204, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.21773213893175125, "step": 271 }, { "epoch": 0.003915500053982078, "grad_norm": 0.11181640625, "grad_norm_var": 4.729827245076497e-05, "learning_rate": 0.0001, "loss": 0.1938, "loss/crossentropy": 2.4203790426254272, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.19378525018692017, "step": 272 }, { "epoch": 0.003929895274768776, "grad_norm": 0.12255859375, "grad_norm_var": 5.0731499989827474e-05, "learning_rate": 0.0001, "loss": 0.2212, "loss/crossentropy": 2.1389888525009155, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.22124628722667694, "step": 273 }, { "epoch": 0.003944290495555476, "grad_norm": 0.10986328125, "grad_norm_var": 4.654626051584879e-05, "learning_rate": 0.0001, "loss": 0.2305, "loss/crossentropy": 2.364627480506897, "loss/fcd": 0.4599609375, "loss/idx": 18.0, "loss/logits": 0.2305009961128235, "step": 274 }, { "epoch": 0.003958685716342175, "grad_norm": 0.107421875, "grad_norm_var": 3.90013058980306e-05, "learning_rate": 0.0001, "loss": 0.2175, "loss/crossentropy": 2.290530562400818, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.21754977107048035, "step": 275 }, { "epoch": 0.003973080937128873, "grad_norm": 0.10986328125, "grad_norm_var": 3.883739312489828e-05, "learning_rate": 0.0001, "loss": 0.223, "loss/crossentropy": 2.2974144220352173, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.22295525670051575, "step": 276 }, { "epoch": 0.003987476157915572, "grad_norm": 0.10595703125, "grad_norm_var": 4.0625532468159996e-05, "learning_rate": 0.0001, "loss": 0.2162, "loss/crossentropy": 2.5710668563842773, "loss/fcd": 0.4443359375, "loss/idx": 18.0, "loss/logits": 0.21617399901151657, "step": 277 }, { "epoch": 0.004001871378702271, "grad_norm": 0.1123046875, "grad_norm_var": 3.652175267537435e-05, "learning_rate": 0.0001, "loss": 0.2197, "loss/crossentropy": 2.4304351806640625, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.21974685788154602, "step": 278 }, { "epoch": 0.00401626659948897, "grad_norm": 0.126953125, "grad_norm_var": 4.805624485015869e-05, "learning_rate": 0.0001, "loss": 0.2358, "loss/crossentropy": 2.475973963737488, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.2358318790793419, "step": 279 }, { "epoch": 0.004030661820275668, "grad_norm": 0.109375, "grad_norm_var": 3.956158955891927e-05, "learning_rate": 0.0001, "loss": 0.2156, "loss/crossentropy": 2.5783761739730835, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.21563701331615448, "step": 280 }, { "epoch": 0.004045057041062367, "grad_norm": 0.1171875, "grad_norm_var": 3.956158955891927e-05, "learning_rate": 0.0001, "loss": 0.2139, "loss/crossentropy": 2.36005961894989, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.21386945247650146, "step": 281 }, { "epoch": 0.004059452261849066, "grad_norm": 0.1142578125, "grad_norm_var": 3.961622714996338e-05, "learning_rate": 0.0001, "loss": 0.2326, "loss/crossentropy": 2.589225172996521, "loss/fcd": 0.4345703125, "loss/idx": 18.0, "loss/logits": 0.23257827758789062, "step": 282 }, { "epoch": 0.004073847482635765, "grad_norm": 0.10693359375, "grad_norm_var": 3.656446933746338e-05, "learning_rate": 0.0001, "loss": 0.2159, "loss/crossentropy": 2.340222954750061, "loss/fcd": 0.46484375, "loss/idx": 18.0, "loss/logits": 0.21591536700725555, "step": 283 }, { "epoch": 0.004088242703422464, "grad_norm": 0.11376953125, "grad_norm_var": 3.337462743123372e-05, "learning_rate": 0.0001, "loss": 0.238, "loss/crossentropy": 2.484541654586792, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.23801321536302567, "step": 284 }, { "epoch": 0.0041026379242091624, "grad_norm": 0.11279296875, "grad_norm_var": 3.315210342407227e-05, "learning_rate": 0.0001, "loss": 0.2338, "loss/crossentropy": 2.4735066890716553, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.23380715399980545, "step": 285 }, { "epoch": 0.004117033144995862, "grad_norm": 0.1025390625, "grad_norm_var": 3.8424134254455565e-05, "learning_rate": 0.0001, "loss": 0.1958, "loss/crossentropy": 2.296001434326172, "loss/fcd": 0.40625, "loss/idx": 18.0, "loss/logits": 0.19581247121095657, "step": 286 }, { "epoch": 0.00413142836578256, "grad_norm": 0.10498046875, "grad_norm_var": 4.054605960845947e-05, "learning_rate": 0.0001, "loss": 0.2233, "loss/crossentropy": 2.469460368156433, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.22334590554237366, "step": 287 }, { "epoch": 0.004145823586569259, "grad_norm": 0.10986328125, "grad_norm_var": 4.0776530901590984e-05, "learning_rate": 0.0001, "loss": 0.2563, "loss/crossentropy": 2.3161216378211975, "loss/fcd": 0.505859375, "loss/idx": 18.0, "loss/logits": 0.2562841549515724, "step": 288 }, { "epoch": 0.0041602188073559575, "grad_norm": 0.11083984375, "grad_norm_var": 3.233651320139567e-05, "learning_rate": 0.0001, "loss": 0.2132, "loss/crossentropy": 2.571072220802307, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.2131756693124771, "step": 289 }, { "epoch": 0.004174614028142657, "grad_norm": 0.11279296875, "grad_norm_var": 3.245572249094645e-05, "learning_rate": 0.0001, "loss": 0.2147, "loss/crossentropy": 2.3715583086013794, "loss/fcd": 0.3994140625, "loss/idx": 18.0, "loss/logits": 0.21474920213222504, "step": 290 }, { "epoch": 0.004189009248929355, "grad_norm": 0.11083984375, "grad_norm_var": 3.1503041585286457e-05, "learning_rate": 0.0001, "loss": 0.2157, "loss/crossentropy": 2.379094123840332, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.21565410494804382, "step": 291 }, { "epoch": 0.004203404469716054, "grad_norm": 0.154296875, "grad_norm_var": 0.00014622112115224203, "learning_rate": 0.0001, "loss": 0.2908, "loss/crossentropy": 2.696184992790222, "loss/fcd": 0.548828125, "loss/idx": 18.0, "loss/logits": 0.29075586795806885, "step": 292 }, { "epoch": 0.0042177996905027534, "grad_norm": 0.10595703125, "grad_norm_var": 0.00014622112115224203, "learning_rate": 0.0001, "loss": 0.2109, "loss/crossentropy": 2.4592641592025757, "loss/fcd": 0.4365234375, "loss/idx": 18.0, "loss/logits": 0.2109208032488823, "step": 293 }, { "epoch": 0.004232194911289452, "grad_norm": 0.11083984375, "grad_norm_var": 0.00014670689900716147, "learning_rate": 0.0001, "loss": 0.219, "loss/crossentropy": 2.6254968643188477, "loss/fcd": 0.4599609375, "loss/idx": 18.0, "loss/logits": 0.21897459030151367, "step": 294 }, { "epoch": 0.004246590132076151, "grad_norm": 0.12451171875, "grad_norm_var": 0.00014286736647288004, "learning_rate": 0.0001, "loss": 0.2263, "loss/crossentropy": 2.7246745824813843, "loss/fcd": 0.4658203125, "loss/idx": 18.0, "loss/logits": 0.2262566015124321, "step": 295 }, { "epoch": 0.004260985352862849, "grad_norm": 0.1064453125, "grad_norm_var": 0.00014515618483225504, "learning_rate": 0.0001, "loss": 0.2029, "loss/crossentropy": 2.3958386182785034, "loss/fcd": 0.40625, "loss/idx": 18.0, "loss/logits": 0.20287074148654938, "step": 296 }, { "epoch": 0.0042753805736495485, "grad_norm": 0.11083984375, "grad_norm_var": 0.00014470418294270832, "learning_rate": 0.0001, "loss": 0.2364, "loss/crossentropy": 2.457562804222107, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.2363838478922844, "step": 297 }, { "epoch": 0.004289775794436247, "grad_norm": 0.11474609375, "grad_norm_var": 0.0001447826623916626, "learning_rate": 0.0001, "loss": 0.244, "loss/crossentropy": 2.29829204082489, "loss/fcd": 0.4521484375, "loss/idx": 18.0, "loss/logits": 0.24399850517511368, "step": 298 }, { "epoch": 0.004304171015222946, "grad_norm": 0.1025390625, "grad_norm_var": 0.00014972686767578125, "learning_rate": 0.0001, "loss": 0.2147, "loss/crossentropy": 2.6273841857910156, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.21469515562057495, "step": 299 }, { "epoch": 0.004318566236009644, "grad_norm": 0.1201171875, "grad_norm_var": 0.00015286505222320557, "learning_rate": 0.0001, "loss": 0.2202, "loss/crossentropy": 2.4213569164276123, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.22024693340063095, "step": 300 }, { "epoch": 0.0043329614567963436, "grad_norm": 0.11279296875, "grad_norm_var": 0.00015286505222320557, "learning_rate": 0.0001, "loss": 0.2201, "loss/crossentropy": 2.4482584595680237, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.2201283797621727, "step": 301 }, { "epoch": 0.004347356677583043, "grad_norm": 0.1103515625, "grad_norm_var": 0.0001453310251235962, "learning_rate": 0.0001, "loss": 0.1952, "loss/crossentropy": 2.16507089138031, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.1952093541622162, "step": 302 }, { "epoch": 0.004361751898369741, "grad_norm": 0.109375, "grad_norm_var": 0.000141298770904541, "learning_rate": 0.0001, "loss": 0.2006, "loss/crossentropy": 2.2546703815460205, "loss/fcd": 0.40234375, "loss/idx": 18.0, "loss/logits": 0.2005770206451416, "step": 303 }, { "epoch": 0.00437614711915644, "grad_norm": 0.1572265625, "grad_norm_var": 0.00025413731733957924, "learning_rate": 0.0001, "loss": 0.2731, "loss/crossentropy": 2.345265507698059, "loss/fcd": 0.470703125, "loss/idx": 18.0, "loss/logits": 0.27310631424188614, "step": 304 }, { "epoch": 0.004390542339943139, "grad_norm": 0.115234375, "grad_norm_var": 0.0002516428629557292, "learning_rate": 0.0001, "loss": 0.2321, "loss/crossentropy": 2.4603192806243896, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.23207177966833115, "step": 305 }, { "epoch": 0.004404937560729838, "grad_norm": 0.1103515625, "grad_norm_var": 0.00025352537631988524, "learning_rate": 0.0001, "loss": 0.222, "loss/crossentropy": 2.598379373550415, "loss/fcd": 0.423828125, "loss/idx": 18.0, "loss/logits": 0.22200769931077957, "step": 306 }, { "epoch": 0.004419332781516536, "grad_norm": 0.1328125, "grad_norm_var": 0.0002648353576660156, "learning_rate": 0.0001, "loss": 0.248, "loss/crossentropy": 2.2982794046401978, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.2480178400874138, "step": 307 }, { "epoch": 0.004433728002303235, "grad_norm": 0.10888671875, "grad_norm_var": 0.00017789900302886963, "learning_rate": 0.0001, "loss": 0.1998, "loss/crossentropy": 2.2329931259155273, "loss/fcd": 0.388671875, "loss/idx": 18.0, "loss/logits": 0.1997941955924034, "step": 308 }, { "epoch": 0.0044481232230899346, "grad_norm": 0.11328125, "grad_norm_var": 0.00017162561416625977, "learning_rate": 0.0001, "loss": 0.2278, "loss/crossentropy": 2.4267385005950928, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.22776535153388977, "step": 309 }, { "epoch": 0.004462518443876633, "grad_norm": 0.1201171875, "grad_norm_var": 0.00017028550306955972, "learning_rate": 0.0001, "loss": 0.2176, "loss/crossentropy": 2.1391916275024414, "loss/fcd": 0.46484375, "loss/idx": 18.0, "loss/logits": 0.2176017314195633, "step": 310 }, { "epoch": 0.004476913664663332, "grad_norm": 0.11474609375, "grad_norm_var": 0.00016627212365468344, "learning_rate": 0.0001, "loss": 0.2485, "loss/crossentropy": 2.617629051208496, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.24850602447986603, "step": 311 }, { "epoch": 0.00449130888545003, "grad_norm": 0.1025390625, "grad_norm_var": 0.00017232795556386312, "learning_rate": 0.0001, "loss": 0.201, "loss/crossentropy": 2.495308995246887, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.20100131630897522, "step": 312 }, { "epoch": 0.00450570410623673, "grad_norm": 0.11279296875, "grad_norm_var": 0.00017122328281402588, "learning_rate": 0.0001, "loss": 0.2155, "loss/crossentropy": 2.6817585229873657, "loss/fcd": 0.46484375, "loss/idx": 18.0, "loss/logits": 0.2155066430568695, "step": 313 }, { "epoch": 0.004520099327023428, "grad_norm": 0.11083984375, "grad_norm_var": 0.00017289221286773682, "learning_rate": 0.0001, "loss": 0.2089, "loss/crossentropy": 2.4506349563598633, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.20890694856643677, "step": 314 }, { "epoch": 0.004534494547810127, "grad_norm": 0.10791015625, "grad_norm_var": 0.00016514460245768228, "learning_rate": 0.0001, "loss": 0.2212, "loss/crossentropy": 2.4268819093704224, "loss/fcd": 0.4326171875, "loss/idx": 18.0, "loss/logits": 0.2211536467075348, "step": 315 }, { "epoch": 0.0045488897685968255, "grad_norm": 0.1083984375, "grad_norm_var": 0.00016762415568033855, "learning_rate": 0.0001, "loss": 0.1915, "loss/crossentropy": 2.082051396369934, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.19149669259786606, "step": 316 }, { "epoch": 0.004563284989383525, "grad_norm": 0.11669921875, "grad_norm_var": 0.00016717910766601564, "learning_rate": 0.0001, "loss": 0.2085, "loss/crossentropy": 2.178563714027405, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.20848772674798965, "step": 317 }, { "epoch": 0.004577680210170224, "grad_norm": 0.10791015625, "grad_norm_var": 0.00016930003960927327, "learning_rate": 0.0001, "loss": 0.2261, "loss/crossentropy": 2.4262903928756714, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.22605551034212112, "step": 318 }, { "epoch": 0.004592075430956922, "grad_norm": 0.12255859375, "grad_norm_var": 0.00016927321751912435, "learning_rate": 0.0001, "loss": 0.1989, "loss/crossentropy": 2.4706810116767883, "loss/fcd": 0.4443359375, "loss/idx": 18.0, "loss/logits": 0.19887082278728485, "step": 319 }, { "epoch": 0.004606470651743621, "grad_norm": 0.10791015625, "grad_norm_var": 5.278488000233968e-05, "learning_rate": 0.0001, "loss": 0.2139, "loss/crossentropy": 2.34406316280365, "loss/fcd": 0.4365234375, "loss/idx": 18.0, "loss/logits": 0.21389687806367874, "step": 320 }, { "epoch": 0.00462086587253032, "grad_norm": 0.171875, "grad_norm_var": 0.0002678145964940389, "learning_rate": 0.0001, "loss": 0.314, "loss/crossentropy": 2.252693295478821, "loss/fcd": 0.548828125, "loss/idx": 18.0, "loss/logits": 0.31398655474185944, "step": 321 }, { "epoch": 0.004635261093317019, "grad_norm": 0.10498046875, "grad_norm_var": 0.0002742727597554525, "learning_rate": 0.0001, "loss": 0.205, "loss/crossentropy": 2.3450491428375244, "loss/fcd": 0.4248046875, "loss/idx": 18.0, "loss/logits": 0.20497491210699081, "step": 322 }, { "epoch": 0.004649656314103717, "grad_norm": 0.130859375, "grad_norm_var": 0.0002702673276265462, "learning_rate": 0.0001, "loss": 0.2742, "loss/crossentropy": 2.6299513578414917, "loss/fcd": 0.5146484375, "loss/idx": 18.0, "loss/logits": 0.27415700256824493, "step": 323 }, { "epoch": 0.0046640515348904165, "grad_norm": 0.1064453125, "grad_norm_var": 0.0002730836470921834, "learning_rate": 0.0001, "loss": 0.2158, "loss/crossentropy": 2.512497067451477, "loss/fcd": 0.4248046875, "loss/idx": 18.0, "loss/logits": 0.21584390848875046, "step": 324 }, { "epoch": 0.004678446755677115, "grad_norm": 0.1064453125, "grad_norm_var": 0.0002787023782730103, "learning_rate": 0.0001, "loss": 0.2, "loss/crossentropy": 2.319981098175049, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.20001471787691116, "step": 325 }, { "epoch": 0.004692841976463814, "grad_norm": 0.12890625, "grad_norm_var": 0.00028857290744781493, "learning_rate": 0.0001, "loss": 0.2693, "loss/crossentropy": 2.4298349618911743, "loss/fcd": 0.48046875, "loss/idx": 18.0, "loss/logits": 0.2693277597427368, "step": 326 }, { "epoch": 0.004707237197250513, "grad_norm": 0.1318359375, "grad_norm_var": 0.0003031412760416667, "learning_rate": 0.0001, "loss": 0.249, "loss/crossentropy": 2.555938482284546, "loss/fcd": 0.4873046875, "loss/idx": 18.0, "loss/logits": 0.2489527463912964, "step": 327 }, { "epoch": 0.0047216324180372115, "grad_norm": 0.1201171875, "grad_norm_var": 0.00028754870096842446, "learning_rate": 0.0001, "loss": 0.2233, "loss/crossentropy": 2.3032290935516357, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.2232954055070877, "step": 328 }, { "epoch": 0.004736027638823911, "grad_norm": 0.11181640625, "grad_norm_var": 0.0002883553504943848, "learning_rate": 0.0001, "loss": 0.2243, "loss/crossentropy": 2.3655673265457153, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.2242700606584549, "step": 329 }, { "epoch": 0.004750422859610609, "grad_norm": 0.12109375, "grad_norm_var": 0.00028449594974517823, "learning_rate": 0.0001, "loss": 0.2429, "loss/crossentropy": 2.330072522163391, "loss/fcd": 0.4365234375, "loss/idx": 18.0, "loss/logits": 0.24292638152837753, "step": 330 }, { "epoch": 0.004764818080397308, "grad_norm": 0.111328125, "grad_norm_var": 0.0002801219622294108, "learning_rate": 0.0001, "loss": 0.2179, "loss/crossentropy": 2.2494866847991943, "loss/fcd": 0.4326171875, "loss/idx": 18.0, "loss/logits": 0.21786177903413773, "step": 331 }, { "epoch": 0.004779213301184007, "grad_norm": 0.1181640625, "grad_norm_var": 0.00027185678482055664, "learning_rate": 0.0001, "loss": 0.2415, "loss/crossentropy": 2.792868733406067, "loss/fcd": 0.4716796875, "loss/idx": 18.0, "loss/logits": 0.24153122305870056, "step": 332 }, { "epoch": 0.004793608521970706, "grad_norm": 0.10986328125, "grad_norm_var": 0.0002777258555094401, "learning_rate": 0.0001, "loss": 0.2367, "loss/crossentropy": 2.573932647705078, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.23671025037765503, "step": 333 }, { "epoch": 0.004808003742757404, "grad_norm": 0.09912109375, "grad_norm_var": 0.0002961436907450358, "learning_rate": 0.0001, "loss": 0.2002, "loss/crossentropy": 2.5787216424942017, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.20024622231721878, "step": 334 }, { "epoch": 0.004822398963544103, "grad_norm": 0.1005859375, "grad_norm_var": 0.00031576852003733315, "learning_rate": 0.0001, "loss": 0.2067, "loss/crossentropy": 2.5130008459091187, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.2067384421825409, "step": 335 }, { "epoch": 0.0048367941843308025, "grad_norm": 0.1962890625, "grad_norm_var": 0.0006899476051330566, "learning_rate": 0.0001, "loss": 0.2185, "loss/crossentropy": 2.2556002140045166, "loss/fcd": 0.494140625, "loss/idx": 18.0, "loss/logits": 0.2184857428073883, "step": 336 }, { "epoch": 0.004851189405117501, "grad_norm": 0.1064453125, "grad_norm_var": 0.0005320707956949869, "learning_rate": 0.0001, "loss": 0.2083, "loss/crossentropy": 2.421205759048462, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.20834489911794662, "step": 337 }, { "epoch": 0.0048655846259042, "grad_norm": 0.12109375, "grad_norm_var": 0.0005181382099787394, "learning_rate": 0.0001, "loss": 0.2009, "loss/crossentropy": 2.079905390739441, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.20086795836687088, "step": 338 }, { "epoch": 0.004879979846690898, "grad_norm": 0.115234375, "grad_norm_var": 0.0005108267068862915, "learning_rate": 0.0001, "loss": 0.2437, "loss/crossentropy": 2.571584105491638, "loss/fcd": 0.4423828125, "loss/idx": 18.0, "loss/logits": 0.2436518296599388, "step": 339 }, { "epoch": 0.004894375067477598, "grad_norm": 0.130859375, "grad_norm_var": 0.0005070517460505167, "learning_rate": 0.0001, "loss": 0.252, "loss/crossentropy": 2.3673810958862305, "loss/fcd": 0.525390625, "loss/idx": 18.0, "loss/logits": 0.2520231306552887, "step": 340 }, { "epoch": 0.004908770288264296, "grad_norm": 0.1162109375, "grad_norm_var": 0.0004946142435073853, "learning_rate": 0.0001, "loss": 0.1946, "loss/crossentropy": 1.9378909468650818, "loss/fcd": 0.384765625, "loss/idx": 18.0, "loss/logits": 0.19459272176027298, "step": 341 }, { "epoch": 0.004923165509050995, "grad_norm": 0.10791015625, "grad_norm_var": 0.0005005518595377604, "learning_rate": 0.0001, "loss": 0.2064, "loss/crossentropy": 2.391346573829651, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.20641817897558212, "step": 342 }, { "epoch": 0.004937560729837694, "grad_norm": 0.10693359375, "grad_norm_var": 0.0004995892445246379, "learning_rate": 0.0001, "loss": 0.213, "loss/crossentropy": 2.4029276371002197, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.2129564881324768, "step": 343 }, { "epoch": 0.004951955950624393, "grad_norm": 0.12353515625, "grad_norm_var": 0.0005011399586995443, "learning_rate": 0.0001, "loss": 0.2122, "loss/crossentropy": 2.3750810623168945, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.21220777183771133, "step": 344 }, { "epoch": 0.004966351171411092, "grad_norm": 0.1142578125, "grad_norm_var": 0.0004993269840876262, "learning_rate": 0.0001, "loss": 0.2483, "loss/crossentropy": 2.713660955429077, "loss/fcd": 0.4853515625, "loss/idx": 18.0, "loss/logits": 0.24831371009349823, "step": 345 }, { "epoch": 0.00498074639219779, "grad_norm": 0.1025390625, "grad_norm_var": 0.0005148798227310181, "learning_rate": 0.0001, "loss": 0.2107, "loss/crossentropy": 2.550423502922058, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.21066032350063324, "step": 346 }, { "epoch": 0.004995141612984489, "grad_norm": 0.10693359375, "grad_norm_var": 0.000519716739654541, "learning_rate": 0.0001, "loss": 0.2061, "loss/crossentropy": 2.5207024812698364, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.2061041295528412, "step": 347 }, { "epoch": 0.005009536833771188, "grad_norm": 0.11865234375, "grad_norm_var": 0.0005197912454605102, "learning_rate": 0.0001, "loss": 0.2402, "loss/crossentropy": 2.3946497440338135, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.24020669609308243, "step": 348 }, { "epoch": 0.005023932054557887, "grad_norm": 0.099609375, "grad_norm_var": 0.000536501407623291, "learning_rate": 0.0001, "loss": 0.2128, "loss/crossentropy": 2.516977548599243, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.21278280019760132, "step": 349 }, { "epoch": 0.005038327275344585, "grad_norm": 0.1259765625, "grad_norm_var": 0.0005188534657160441, "learning_rate": 0.0001, "loss": 0.2382, "loss/crossentropy": 2.5282589197158813, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.23819837719202042, "step": 350 }, { "epoch": 0.0050527224961312844, "grad_norm": 0.10498046875, "grad_norm_var": 0.0005096713701883952, "learning_rate": 0.0001, "loss": 0.22, "loss/crossentropy": 2.448602795600891, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.21995113044977188, "step": 351 }, { "epoch": 0.005067117716917984, "grad_norm": 0.10205078125, "grad_norm_var": 8.844435214996337e-05, "learning_rate": 0.0001, "loss": 0.1952, "loss/crossentropy": 2.4668463468551636, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.19519731402397156, "step": 352 }, { "epoch": 0.005081512937704682, "grad_norm": 0.1181640625, "grad_norm_var": 8.725225925445556e-05, "learning_rate": 0.0001, "loss": 0.2368, "loss/crossentropy": 2.315679907798767, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.23678645491600037, "step": 353 }, { "epoch": 0.005095908158491381, "grad_norm": 0.1025390625, "grad_norm_var": 8.98192326227824e-05, "learning_rate": 0.0001, "loss": 0.2111, "loss/crossentropy": 2.417713761329651, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.21109846234321594, "step": 354 }, { "epoch": 0.0051103033792780795, "grad_norm": 0.10546875, "grad_norm_var": 9.192526340484619e-05, "learning_rate": 0.0001, "loss": 0.1963, "loss/crossentropy": 2.3545119762420654, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.19629193097352982, "step": 355 }, { "epoch": 0.005124698600064779, "grad_norm": 0.11181640625, "grad_norm_var": 6.585121154785156e-05, "learning_rate": 0.0001, "loss": 0.1816, "loss/crossentropy": 2.1606619358062744, "loss/fcd": 0.3876953125, "loss/idx": 18.0, "loss/logits": 0.18158919364213943, "step": 356 }, { "epoch": 0.005139093820851477, "grad_norm": 0.1171875, "grad_norm_var": 6.665786107381184e-05, "learning_rate": 0.0001, "loss": 0.2114, "loss/crossentropy": 2.429716110229492, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.21144652366638184, "step": 357 }, { "epoch": 0.005153489041638176, "grad_norm": 0.099609375, "grad_norm_var": 7.386902968088785e-05, "learning_rate": 0.0001, "loss": 0.2011, "loss/crossentropy": 2.511311650276184, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.20114467293024063, "step": 358 }, { "epoch": 0.0051678842624248746, "grad_norm": 0.10205078125, "grad_norm_var": 7.736583550771078e-05, "learning_rate": 0.0001, "loss": 0.1846, "loss/crossentropy": 2.1977522373199463, "loss/fcd": 0.373046875, "loss/idx": 18.0, "loss/logits": 0.184633307158947, "step": 359 }, { "epoch": 0.005182279483211574, "grad_norm": 0.11572265625, "grad_norm_var": 6.67800505956014e-05, "learning_rate": 0.0001, "loss": 0.2461, "loss/crossentropy": 2.605985164642334, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.24613827466964722, "step": 360 }, { "epoch": 0.005196674703998273, "grad_norm": 0.12353515625, "grad_norm_var": 7.838805516560873e-05, "learning_rate": 0.0001, "loss": 0.231, "loss/crossentropy": 2.4244812726974487, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.23104986548423767, "step": 361 }, { "epoch": 0.005211069924784971, "grad_norm": 0.11279296875, "grad_norm_var": 7.502933343251546e-05, "learning_rate": 0.0001, "loss": 0.2269, "loss/crossentropy": 2.4840404987335205, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.22690805047750473, "step": 362 }, { "epoch": 0.0052254651455716705, "grad_norm": 0.126953125, "grad_norm_var": 9.071032206217447e-05, "learning_rate": 0.0001, "loss": 0.2776, "loss/crossentropy": 2.631165862083435, "loss/fcd": 0.5244140625, "loss/idx": 18.0, "loss/logits": 0.277616910636425, "step": 363 }, { "epoch": 0.005239860366358369, "grad_norm": 0.12353515625, "grad_norm_var": 9.673039118448893e-05, "learning_rate": 0.0001, "loss": 0.2285, "loss/crossentropy": 2.316849708557129, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.228460393846035, "step": 364 }, { "epoch": 0.005254255587145068, "grad_norm": 0.1025390625, "grad_norm_var": 9.242693583170573e-05, "learning_rate": 0.0001, "loss": 0.1957, "loss/crossentropy": 2.315016031265259, "loss/fcd": 0.38671875, "loss/idx": 18.0, "loss/logits": 0.1956682875752449, "step": 365 }, { "epoch": 0.005268650807931766, "grad_norm": 0.1044921875, "grad_norm_var": 8.176167805989583e-05, "learning_rate": 0.0001, "loss": 0.2118, "loss/crossentropy": 2.3352142572402954, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.21177390962839127, "step": 366 }, { "epoch": 0.0052830460287184655, "grad_norm": 0.1123046875, "grad_norm_var": 7.939239343007406e-05, "learning_rate": 0.0001, "loss": 0.2289, "loss/crossentropy": 2.511680841445923, "loss/fcd": 0.466796875, "loss/idx": 18.0, "loss/logits": 0.22891707718372345, "step": 367 }, { "epoch": 0.005297441249505164, "grad_norm": 0.109375, "grad_norm_var": 7.37150510152181e-05, "learning_rate": 0.0001, "loss": 0.2201, "loss/crossentropy": 2.2285088300704956, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.22013359516859055, "step": 368 }, { "epoch": 0.005311836470291863, "grad_norm": 0.10400390625, "grad_norm_var": 7.414718468983968e-05, "learning_rate": 0.0001, "loss": 0.1957, "loss/crossentropy": 2.389556884765625, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.19569466263055801, "step": 369 }, { "epoch": 0.005326231691078562, "grad_norm": 0.10693359375, "grad_norm_var": 7.047255833943684e-05, "learning_rate": 0.0001, "loss": 0.2015, "loss/crossentropy": 2.2860642671585083, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.20150135457515717, "step": 370 }, { "epoch": 0.005340626911865261, "grad_norm": 0.103515625, "grad_norm_var": 7.21891721089681e-05, "learning_rate": 0.0001, "loss": 0.1878, "loss/crossentropy": 2.1553120017051697, "loss/fcd": 0.3740234375, "loss/idx": 18.0, "loss/logits": 0.18780279159545898, "step": 371 }, { "epoch": 0.00535502213265196, "grad_norm": 0.126953125, "grad_norm_var": 8.811056613922119e-05, "learning_rate": 0.0001, "loss": 0.2079, "loss/crossentropy": 2.614238739013672, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.20787174999713898, "step": 372 }, { "epoch": 0.005369417353438658, "grad_norm": 0.12255859375, "grad_norm_var": 9.365081787109375e-05, "learning_rate": 0.0001, "loss": 0.2249, "loss/crossentropy": 2.365216612815857, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.2249324843287468, "step": 373 }, { "epoch": 0.005383812574225357, "grad_norm": 0.11962890625, "grad_norm_var": 8.481244246164958e-05, "learning_rate": 0.0001, "loss": 0.2203, "loss/crossentropy": 2.6173166036605835, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.2202518805861473, "step": 374 }, { "epoch": 0.005398207795012056, "grad_norm": 0.11279296875, "grad_norm_var": 7.554590702056885e-05, "learning_rate": 0.0001, "loss": 0.205, "loss/crossentropy": 2.2174978256225586, "loss/fcd": 0.3916015625, "loss/idx": 18.0, "loss/logits": 0.20496949553489685, "step": 375 }, { "epoch": 0.005412603015798755, "grad_norm": 0.10888671875, "grad_norm_var": 7.710357507069905e-05, "learning_rate": 0.0001, "loss": 0.1912, "loss/crossentropy": 2.2786842584609985, "loss/fcd": 0.3974609375, "loss/idx": 18.0, "loss/logits": 0.19121932238340378, "step": 376 }, { "epoch": 0.005426998236585453, "grad_norm": 0.10595703125, "grad_norm_var": 7.359882195790609e-05, "learning_rate": 0.0001, "loss": 0.2442, "loss/crossentropy": 2.5521395206451416, "loss/fcd": 0.458984375, "loss/idx": 18.0, "loss/logits": 0.24418669939041138, "step": 377 }, { "epoch": 0.005441393457372152, "grad_norm": 0.12060546875, "grad_norm_var": 7.750888665517172e-05, "learning_rate": 0.0001, "loss": 0.2245, "loss/crossentropy": 2.9219515323638916, "loss/fcd": 0.482421875, "loss/idx": 18.0, "loss/logits": 0.22447162866592407, "step": 378 }, { "epoch": 0.005455788678158852, "grad_norm": 0.11376953125, "grad_norm_var": 6.41783078511556e-05, "learning_rate": 0.0001, "loss": 0.2119, "loss/crossentropy": 2.393683671951294, "loss/fcd": 0.408203125, "loss/idx": 18.0, "loss/logits": 0.21186020970344543, "step": 379 }, { "epoch": 0.00547018389894555, "grad_norm": 0.1064453125, "grad_norm_var": 5.698104699452718e-05, "learning_rate": 0.0001, "loss": 0.2392, "loss/crossentropy": 2.7257591485977173, "loss/fcd": 0.4443359375, "loss/idx": 18.0, "loss/logits": 0.23916704207658768, "step": 380 }, { "epoch": 0.005484579119732249, "grad_norm": 0.1025390625, "grad_norm_var": 5.698104699452718e-05, "learning_rate": 0.0001, "loss": 0.2175, "loss/crossentropy": 2.604699730873108, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.21749083697795868, "step": 381 }, { "epoch": 0.0054989743405189475, "grad_norm": 0.115234375, "grad_norm_var": 5.444586277008057e-05, "learning_rate": 0.0001, "loss": 0.2128, "loss/crossentropy": 2.3415403366088867, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.21281517297029495, "step": 382 }, { "epoch": 0.005513369561305647, "grad_norm": 0.109375, "grad_norm_var": 5.485117435455322e-05, "learning_rate": 0.0001, "loss": 0.2118, "loss/crossentropy": 2.164521098136902, "loss/fcd": 0.4013671875, "loss/idx": 18.0, "loss/logits": 0.2117796689271927, "step": 383 }, { "epoch": 0.005527764782092345, "grad_norm": 0.11767578125, "grad_norm_var": 5.648930867513021e-05, "learning_rate": 0.0001, "loss": 0.235, "loss/crossentropy": 2.243640184402466, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.23500269651412964, "step": 384 }, { "epoch": 0.005542160002879044, "grad_norm": 0.10498046875, "grad_norm_var": 5.546808242797852e-05, "learning_rate": 0.0001, "loss": 0.2025, "loss/crossentropy": 2.2612792253494263, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.20246511697769165, "step": 385 }, { "epoch": 0.005556555223665743, "grad_norm": 0.10888671875, "grad_norm_var": 5.429188410441081e-05, "learning_rate": 0.0001, "loss": 0.21, "loss/crossentropy": 2.6286587715148926, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.21000967174768448, "step": 386 }, { "epoch": 0.005570950444452442, "grad_norm": 0.107421875, "grad_norm_var": 5.0572554270426434e-05, "learning_rate": 0.0001, "loss": 0.1951, "loss/crossentropy": 2.381960868835449, "loss/fcd": 0.38671875, "loss/idx": 18.0, "loss/logits": 0.19514141231775284, "step": 387 }, { "epoch": 0.005585345665239141, "grad_norm": 0.11962890625, "grad_norm_var": 4.003743330637614e-05, "learning_rate": 0.0001, "loss": 0.2116, "loss/crossentropy": 2.1127407550811768, "loss/fcd": 0.4326171875, "loss/idx": 18.0, "loss/logits": 0.21157918125391006, "step": 388 }, { "epoch": 0.005599740886025839, "grad_norm": 0.11669921875, "grad_norm_var": 3.414849440256754e-05, "learning_rate": 0.0001, "loss": 0.1979, "loss/crossentropy": 2.2836742401123047, "loss/fcd": 0.396484375, "loss/idx": 18.0, "loss/logits": 0.1978917270898819, "step": 389 }, { "epoch": 0.0056141361068125385, "grad_norm": 0.109375, "grad_norm_var": 3.0163923899332682e-05, "learning_rate": 0.0001, "loss": 0.1913, "loss/crossentropy": 2.141560196876526, "loss/fcd": 0.3857421875, "loss/idx": 18.0, "loss/logits": 0.19127248972654343, "step": 390 }, { "epoch": 0.005628531327599237, "grad_norm": 0.1279296875, "grad_norm_var": 4.756351312001546e-05, "learning_rate": 0.0001, "loss": 0.2001, "loss/crossentropy": 2.0556570291519165, "loss/fcd": 0.4052734375, "loss/idx": 18.0, "loss/logits": 0.20014575868844986, "step": 391 }, { "epoch": 0.005642926548385936, "grad_norm": 0.11865234375, "grad_norm_var": 4.919270674387614e-05, "learning_rate": 0.0001, "loss": 0.2564, "loss/crossentropy": 2.434049367904663, "loss/fcd": 0.478515625, "loss/idx": 18.0, "loss/logits": 0.2563505992293358, "step": 392 }, { "epoch": 0.005657321769172634, "grad_norm": 0.11328125, "grad_norm_var": 4.583994547526042e-05, "learning_rate": 0.0001, "loss": 0.2032, "loss/crossentropy": 2.31030809879303, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.20315195620059967, "step": 393 }, { "epoch": 0.0056717169899593335, "grad_norm": 0.10888671875, "grad_norm_var": 4.297892252604167e-05, "learning_rate": 0.0001, "loss": 0.2028, "loss/crossentropy": 2.201537609100342, "loss/fcd": 0.423828125, "loss/idx": 18.0, "loss/logits": 0.20282629132270813, "step": 394 }, { "epoch": 0.005686112210746033, "grad_norm": 0.1103515625, "grad_norm_var": 4.315276940663656e-05, "learning_rate": 0.0001, "loss": 0.1968, "loss/crossentropy": 2.2024729251861572, "loss/fcd": 0.4013671875, "loss/idx": 18.0, "loss/logits": 0.1968480423092842, "step": 395 }, { "epoch": 0.005700507431532731, "grad_norm": 0.1064453125, "grad_norm_var": 4.315276940663656e-05, "learning_rate": 0.0001, "loss": 0.2005, "loss/crossentropy": 2.3388434648513794, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.20053986459970474, "step": 396 }, { "epoch": 0.00571490265231943, "grad_norm": 0.1220703125, "grad_norm_var": 4.148383935292562e-05, "learning_rate": 0.0001, "loss": 0.241, "loss/crossentropy": 2.9380890130996704, "loss/fcd": 0.470703125, "loss/idx": 18.0, "loss/logits": 0.24104679375886917, "step": 397 }, { "epoch": 0.005729297873106129, "grad_norm": 0.1181640625, "grad_norm_var": 4.267593224843343e-05, "learning_rate": 0.0001, "loss": 0.1981, "loss/crossentropy": 2.0873407125473022, "loss/fcd": 0.40234375, "loss/idx": 18.0, "loss/logits": 0.1980888992547989, "step": 398 }, { "epoch": 0.005743693093892828, "grad_norm": 0.1103515625, "grad_norm_var": 4.21673059463501e-05, "learning_rate": 0.0001, "loss": 0.2054, "loss/crossentropy": 2.398405909538269, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.20542413741350174, "step": 399 }, { "epoch": 0.005758088314679526, "grad_norm": 0.11181640625, "grad_norm_var": 4.1285157203674315e-05, "learning_rate": 0.0001, "loss": 0.2162, "loss/crossentropy": 2.4124823808670044, "loss/fcd": 0.4130859375, "loss/idx": 18.0, "loss/logits": 0.21624472737312317, "step": 400 }, { "epoch": 0.005772483535466225, "grad_norm": 0.11962890625, "grad_norm_var": 3.8185715675354e-05, "learning_rate": 0.0001, "loss": 0.2039, "loss/crossentropy": 2.495412826538086, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.20386559516191483, "step": 401 }, { "epoch": 0.005786878756252924, "grad_norm": 0.11474609375, "grad_norm_var": 3.6063790321350095e-05, "learning_rate": 0.0001, "loss": 0.2092, "loss/crossentropy": 2.320030093193054, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.20922860503196716, "step": 402 }, { "epoch": 0.005801273977039623, "grad_norm": 0.11474609375, "grad_norm_var": 3.229379653930664e-05, "learning_rate": 0.0001, "loss": 0.2263, "loss/crossentropy": 2.5104581117630005, "loss/fcd": 0.46484375, "loss/idx": 18.0, "loss/logits": 0.22626767307519913, "step": 403 }, { "epoch": 0.005815669197826322, "grad_norm": 0.1064453125, "grad_norm_var": 3.532469272613525e-05, "learning_rate": 0.0001, "loss": 0.2136, "loss/crossentropy": 2.5909669399261475, "loss/fcd": 0.4365234375, "loss/idx": 18.0, "loss/logits": 0.213609017431736, "step": 404 }, { "epoch": 0.00583006441861302, "grad_norm": 0.10400390625, "grad_norm_var": 4.14202610651652e-05, "learning_rate": 0.0001, "loss": 0.1848, "loss/crossentropy": 2.099331498146057, "loss/fcd": 0.3916015625, "loss/idx": 18.0, "loss/logits": 0.18482983112335205, "step": 405 }, { "epoch": 0.00584445963939972, "grad_norm": 0.1005859375, "grad_norm_var": 5.114773909250895e-05, "learning_rate": 0.0001, "loss": 0.1927, "loss/crossentropy": 2.245216131210327, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.1927170231938362, "step": 406 }, { "epoch": 0.005858854860186418, "grad_norm": 0.1103515625, "grad_norm_var": 3.54836384455363e-05, "learning_rate": 0.0001, "loss": 0.2359, "loss/crossentropy": 2.3967188596725464, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.2359299436211586, "step": 407 }, { "epoch": 0.005873250080973117, "grad_norm": 0.10791015625, "grad_norm_var": 3.3035874366760254e-05, "learning_rate": 0.0001, "loss": 0.2072, "loss/crossentropy": 2.4940836429595947, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.20715947449207306, "step": 408 }, { "epoch": 0.005887645301759815, "grad_norm": 0.1044921875, "grad_norm_var": 3.546774387359619e-05, "learning_rate": 0.0001, "loss": 0.2031, "loss/crossentropy": 2.4436358213424683, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.20313256978988647, "step": 409 }, { "epoch": 0.005902040522546515, "grad_norm": 0.1083984375, "grad_norm_var": 3.5599867502848306e-05, "learning_rate": 0.0001, "loss": 0.2205, "loss/crossentropy": 2.3267935514450073, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.22049501538276672, "step": 410 }, { "epoch": 0.005916435743333213, "grad_norm": 0.1083984375, "grad_norm_var": 3.591775894165039e-05, "learning_rate": 0.0001, "loss": 0.2032, "loss/crossentropy": 2.262888252735138, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.20318371057510376, "step": 411 }, { "epoch": 0.005930830964119912, "grad_norm": 0.09619140625, "grad_norm_var": 4.8080086708068846e-05, "learning_rate": 0.0001, "loss": 0.2031, "loss/crossentropy": 2.5199403762817383, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.20311684161424637, "step": 412 }, { "epoch": 0.005945226184906611, "grad_norm": 0.10009765625, "grad_norm_var": 4.258155822753906e-05, "learning_rate": 0.0001, "loss": 0.1989, "loss/crossentropy": 2.3285170793533325, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.19889184832572937, "step": 413 }, { "epoch": 0.00595962140569331, "grad_norm": 0.11474609375, "grad_norm_var": 3.891686598459879e-05, "learning_rate": 0.0001, "loss": 0.2254, "loss/crossentropy": 2.5566580295562744, "loss/fcd": 0.466796875, "loss/idx": 18.0, "loss/logits": 0.22541005164384842, "step": 414 }, { "epoch": 0.005974016626480009, "grad_norm": 0.12255859375, "grad_norm_var": 5.155801773071289e-05, "learning_rate": 0.0001, "loss": 0.2264, "loss/crossentropy": 2.537785768508911, "loss/fcd": 0.4892578125, "loss/idx": 18.0, "loss/logits": 0.22644919157028198, "step": 415 }, { "epoch": 0.005988411847266707, "grad_norm": 0.10205078125, "grad_norm_var": 5.3942203521728516e-05, "learning_rate": 0.0001, "loss": 0.2184, "loss/crossentropy": 2.6664167642593384, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.2184242233633995, "step": 416 }, { "epoch": 0.006002807068053406, "grad_norm": 0.1044921875, "grad_norm_var": 4.5719742774963376e-05, "learning_rate": 0.0001, "loss": 0.1944, "loss/crossentropy": 2.231179356575012, "loss/fcd": 0.40234375, "loss/idx": 18.0, "loss/logits": 0.19440477341413498, "step": 417 }, { "epoch": 0.006017202288840105, "grad_norm": 0.11572265625, "grad_norm_var": 4.672110080718994e-05, "learning_rate": 0.0001, "loss": 0.2664, "loss/crossentropy": 2.798780918121338, "loss/fcd": 0.513671875, "loss/idx": 18.0, "loss/logits": 0.26635295152664185, "step": 418 }, { "epoch": 0.006031597509626804, "grad_norm": 0.1357421875, "grad_norm_var": 9.435017903645834e-05, "learning_rate": 0.0001, "loss": 0.227, "loss/crossentropy": 2.5461186170578003, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.22702706605196, "step": 419 }, { "epoch": 0.006045992730413502, "grad_norm": 0.1083984375, "grad_norm_var": 9.395281473795573e-05, "learning_rate": 0.0001, "loss": 0.2178, "loss/crossentropy": 2.566969871520996, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.21776312589645386, "step": 420 }, { "epoch": 0.0060603879512002015, "grad_norm": 0.1005859375, "grad_norm_var": 9.696384270985921e-05, "learning_rate": 0.0001, "loss": 0.1927, "loss/crossentropy": 2.41417920589447, "loss/fcd": 0.4130859375, "loss/idx": 18.0, "loss/logits": 0.19272838532924652, "step": 421 }, { "epoch": 0.006074783171986901, "grad_norm": 0.11181640625, "grad_norm_var": 9.255409240722656e-05, "learning_rate": 0.0001, "loss": 0.2341, "loss/crossentropy": 2.432627320289612, "loss/fcd": 0.4423828125, "loss/idx": 18.0, "loss/logits": 0.2340926229953766, "step": 422 }, { "epoch": 0.006089178392773599, "grad_norm": 0.1123046875, "grad_norm_var": 9.301503499348958e-05, "learning_rate": 0.0001, "loss": 0.2064, "loss/crossentropy": 2.22554087638855, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.20639413595199585, "step": 423 }, { "epoch": 0.006103573613560298, "grad_norm": 0.10791015625, "grad_norm_var": 9.301503499348958e-05, "learning_rate": 0.0001, "loss": 0.2352, "loss/crossentropy": 2.7009902000427246, "loss/fcd": 0.4658203125, "loss/idx": 18.0, "loss/logits": 0.23516181111335754, "step": 424 }, { "epoch": 0.0061179688343469965, "grad_norm": 0.11962890625, "grad_norm_var": 9.698768456776937e-05, "learning_rate": 0.0001, "loss": 0.2242, "loss/crossentropy": 2.5257065296173096, "loss/fcd": 0.50390625, "loss/idx": 18.0, "loss/logits": 0.22422834485769272, "step": 425 }, { "epoch": 0.006132364055133696, "grad_norm": 0.10205078125, "grad_norm_var": 0.00010133981704711914, "learning_rate": 0.0001, "loss": 0.1991, "loss/crossentropy": 2.39576256275177, "loss/fcd": 0.3935546875, "loss/idx": 18.0, "loss/logits": 0.19912777841091156, "step": 426 }, { "epoch": 0.006146759275920394, "grad_norm": 0.1064453125, "grad_norm_var": 0.00010203917821248373, "learning_rate": 0.0001, "loss": 0.194, "loss/crossentropy": 2.337485671043396, "loss/fcd": 0.4072265625, "loss/idx": 18.0, "loss/logits": 0.19400090724229813, "step": 427 }, { "epoch": 0.006161154496707093, "grad_norm": 0.11767578125, "grad_norm_var": 9.119908014933268e-05, "learning_rate": 0.0001, "loss": 0.2313, "loss/crossentropy": 2.271997570991516, "loss/fcd": 0.4619140625, "loss/idx": 18.0, "loss/logits": 0.23129994422197342, "step": 428 }, { "epoch": 0.0061755497174937925, "grad_norm": 0.14453125, "grad_norm_var": 0.0001476993163426717, "learning_rate": 0.0001, "loss": 0.2351, "loss/crossentropy": 2.2284241318702698, "loss/fcd": 0.4794921875, "loss/idx": 18.0, "loss/logits": 0.23510510474443436, "step": 429 }, { "epoch": 0.006189944938280491, "grad_norm": 0.11279296875, "grad_norm_var": 0.00014778673648834227, "learning_rate": 0.0001, "loss": 0.2179, "loss/crossentropy": 2.487444758415222, "loss/fcd": 0.482421875, "loss/idx": 18.0, "loss/logits": 0.21794230490922928, "step": 430 }, { "epoch": 0.00620434015906719, "grad_norm": 0.11181640625, "grad_norm_var": 0.00014280378818511962, "learning_rate": 0.0001, "loss": 0.2053, "loss/crossentropy": 2.277848958969116, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.20534329116344452, "step": 431 }, { "epoch": 0.006218735379853888, "grad_norm": 0.1005859375, "grad_norm_var": 0.00014514923095703124, "learning_rate": 0.0001, "loss": 0.2112, "loss/crossentropy": 2.4542627334594727, "loss/fcd": 0.4052734375, "loss/idx": 18.0, "loss/logits": 0.21123766899108887, "step": 432 }, { "epoch": 0.0062331306006405875, "grad_norm": 0.1162109375, "grad_norm_var": 0.0001399993896484375, "learning_rate": 0.0001, "loss": 0.2215, "loss/crossentropy": 2.337994694709778, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.22145532071590424, "step": 433 }, { "epoch": 0.006247525821427286, "grad_norm": 0.10205078125, "grad_norm_var": 0.00014856656392415364, "learning_rate": 0.0001, "loss": 0.1865, "loss/crossentropy": 2.2629653215408325, "loss/fcd": 0.3896484375, "loss/idx": 18.0, "loss/logits": 0.18648526072502136, "step": 434 }, { "epoch": 0.006261921042213985, "grad_norm": 0.11328125, "grad_norm_var": 0.00011246601740519205, "learning_rate": 0.0001, "loss": 0.2497, "loss/crossentropy": 2.5573008060455322, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.24965552240610123, "step": 435 }, { "epoch": 0.006276316263000683, "grad_norm": 0.1123046875, "grad_norm_var": 0.00011167128880818685, "learning_rate": 0.0001, "loss": 0.2199, "loss/crossentropy": 2.3618111610412598, "loss/fcd": 0.4365234375, "loss/idx": 18.0, "loss/logits": 0.21985996514558792, "step": 436 }, { "epoch": 0.006290711483787383, "grad_norm": 0.12060546875, "grad_norm_var": 0.0001062542200088501, "learning_rate": 0.0001, "loss": 0.1995, "loss/crossentropy": 1.9345441460609436, "loss/fcd": 0.40234375, "loss/idx": 18.0, "loss/logits": 0.19948522001504898, "step": 437 }, { "epoch": 0.006305106704574082, "grad_norm": 0.09765625, "grad_norm_var": 0.00012149413426717122, "learning_rate": 0.0001, "loss": 0.1829, "loss/crossentropy": 2.2750844955444336, "loss/fcd": 0.400390625, "loss/idx": 18.0, "loss/logits": 0.18287546932697296, "step": 438 }, { "epoch": 0.00631950192536078, "grad_norm": 0.10693359375, "grad_norm_var": 0.00012334088484446207, "learning_rate": 0.0001, "loss": 0.206, "loss/crossentropy": 2.178094267845154, "loss/fcd": 0.400390625, "loss/idx": 18.0, "loss/logits": 0.20597843825817108, "step": 439 }, { "epoch": 0.006333897146147479, "grad_norm": 0.1162109375, "grad_norm_var": 0.00012308756510416666, "learning_rate": 0.0001, "loss": 0.2026, "loss/crossentropy": 2.2465450763702393, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.20255093276500702, "step": 440 }, { "epoch": 0.006348292366934178, "grad_norm": 0.11767578125, "grad_norm_var": 0.00012148221333821615, "learning_rate": 0.0001, "loss": 0.2194, "loss/crossentropy": 2.3598278760910034, "loss/fcd": 0.4599609375, "loss/idx": 18.0, "loss/logits": 0.21944674849510193, "step": 441 }, { "epoch": 0.006362687587720877, "grad_norm": 0.11181640625, "grad_norm_var": 0.00011393229166666667, "learning_rate": 0.0001, "loss": 0.2421, "loss/crossentropy": 2.523189663887024, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.2420613244175911, "step": 442 }, { "epoch": 0.006377082808507575, "grad_norm": 0.10986328125, "grad_norm_var": 0.00011165837446848551, "learning_rate": 0.0001, "loss": 0.2116, "loss/crossentropy": 2.4686609506607056, "loss/fcd": 0.40625, "loss/idx": 18.0, "loss/logits": 0.2116122990846634, "step": 443 }, { "epoch": 0.006391478029294274, "grad_norm": 0.1181640625, "grad_norm_var": 0.00011196136474609376, "learning_rate": 0.0001, "loss": 0.2574, "loss/crossentropy": 2.7661678791046143, "loss/fcd": 0.5078125, "loss/idx": 18.0, "loss/logits": 0.2574233114719391, "step": 444 }, { "epoch": 0.006405873250080973, "grad_norm": 0.1435546875, "grad_norm_var": 0.00010795195897420248, "learning_rate": 0.0001, "loss": 0.2239, "loss/crossentropy": 1.9732567071914673, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.2238999307155609, "step": 445 }, { "epoch": 0.006420268470867672, "grad_norm": 0.10986328125, "grad_norm_var": 0.00010865529378255209, "learning_rate": 0.0001, "loss": 0.2209, "loss/crossentropy": 2.3994816541671753, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.22091981023550034, "step": 446 }, { "epoch": 0.006434663691654371, "grad_norm": 0.109375, "grad_norm_var": 0.00010942518711090087, "learning_rate": 0.0001, "loss": 0.2094, "loss/crossentropy": 2.385701537132263, "loss/fcd": 0.4072265625, "loss/idx": 18.0, "loss/logits": 0.209386445581913, "step": 447 }, { "epoch": 0.0064490589124410694, "grad_norm": 0.1103515625, "grad_norm_var": 9.9371870358785e-05, "learning_rate": 0.0001, "loss": 0.212, "loss/crossentropy": 2.204231023788452, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.2120284140110016, "step": 448 }, { "epoch": 0.006463454133227769, "grad_norm": 0.10693359375, "grad_norm_var": 0.00010139147440592448, "learning_rate": 0.0001, "loss": 0.2106, "loss/crossentropy": 2.4561452865600586, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.21060140430927277, "step": 449 }, { "epoch": 0.006477849354014467, "grad_norm": 0.1337890625, "grad_norm_var": 0.00011837383111317953, "learning_rate": 0.0001, "loss": 0.2272, "loss/crossentropy": 2.2843399047851562, "loss/fcd": 0.5146484375, "loss/idx": 18.0, "loss/logits": 0.2271936535835266, "step": 450 }, { "epoch": 0.006492244574801166, "grad_norm": 0.1064453125, "grad_norm_var": 0.00012276868025461833, "learning_rate": 0.0001, "loss": 0.2133, "loss/crossentropy": 2.564459443092346, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.21329496800899506, "step": 451 }, { "epoch": 0.0065066397955878645, "grad_norm": 0.10888671875, "grad_norm_var": 0.00012448628743489583, "learning_rate": 0.0001, "loss": 0.2099, "loss/crossentropy": 2.270860195159912, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.20991922914981842, "step": 452 }, { "epoch": 0.006521035016374564, "grad_norm": 0.10205078125, "grad_norm_var": 0.00013029972712198893, "learning_rate": 0.0001, "loss": 0.2251, "loss/crossentropy": 2.482293486595154, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.22511228173971176, "step": 453 }, { "epoch": 0.006535430237161262, "grad_norm": 0.154296875, "grad_norm_var": 0.00021419127782185872, "learning_rate": 0.0001, "loss": 0.2411, "loss/crossentropy": 2.320971131324768, "loss/fcd": 0.4951171875, "loss/idx": 18.0, "loss/logits": 0.24106843769550323, "step": 454 }, { "epoch": 0.006549825457947961, "grad_norm": 0.1044921875, "grad_norm_var": 0.00021772285302480062, "learning_rate": 0.0001, "loss": 0.2047, "loss/crossentropy": 2.3206406831741333, "loss/fcd": 0.40234375, "loss/idx": 18.0, "loss/logits": 0.20467744767665863, "step": 455 }, { "epoch": 0.0065642206787346604, "grad_norm": 0.1083984375, "grad_norm_var": 0.0002218236525853475, "learning_rate": 0.0001, "loss": 0.218, "loss/crossentropy": 2.492337226867676, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.2180488407611847, "step": 456 }, { "epoch": 0.006578615899521359, "grad_norm": 0.10888671875, "grad_norm_var": 0.00022468467553456625, "learning_rate": 0.0001, "loss": 0.2109, "loss/crossentropy": 2.3202375173568726, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.21094900369644165, "step": 457 }, { "epoch": 0.006593011120308058, "grad_norm": 0.10400390625, "grad_norm_var": 0.00023228228092193605, "learning_rate": 0.0001, "loss": 0.2025, "loss/crossentropy": 2.452348470687866, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.20247067511081696, "step": 458 }, { "epoch": 0.006607406341094756, "grad_norm": 0.138671875, "grad_norm_var": 0.00026457707087198894, "learning_rate": 0.0001, "loss": 0.2292, "loss/crossentropy": 2.8116979598999023, "loss/fcd": 0.466796875, "loss/idx": 18.0, "loss/logits": 0.2292005866765976, "step": 459 }, { "epoch": 0.0066218015618814555, "grad_norm": 0.10791015625, "grad_norm_var": 0.0002692292133967082, "learning_rate": 0.0001, "loss": 0.2044, "loss/crossentropy": 2.337909698486328, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.20442651212215424, "step": 460 }, { "epoch": 0.006636196782668154, "grad_norm": 0.1044921875, "grad_norm_var": 0.00022170444329579672, "learning_rate": 0.0001, "loss": 0.2165, "loss/crossentropy": 2.452089309692383, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.21646161377429962, "step": 461 }, { "epoch": 0.006650592003454853, "grad_norm": 0.11669921875, "grad_norm_var": 0.00022114813327789307, "learning_rate": 0.0001, "loss": 0.2284, "loss/crossentropy": 2.596395969390869, "loss/fcd": 0.470703125, "loss/idx": 18.0, "loss/logits": 0.22836245596408844, "step": 462 }, { "epoch": 0.006664987224241552, "grad_norm": 0.10888671875, "grad_norm_var": 0.00022147099177042642, "learning_rate": 0.0001, "loss": 0.2309, "loss/crossentropy": 2.553429961204529, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.23092983663082123, "step": 463 }, { "epoch": 0.0066793824450282506, "grad_norm": 0.09912109375, "grad_norm_var": 0.00023492872714996337, "learning_rate": 0.0001, "loss": 0.1827, "loss/crossentropy": 2.3164178133010864, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.18271666765213013, "step": 464 }, { "epoch": 0.00669377766581495, "grad_norm": 0.1123046875, "grad_norm_var": 0.00023212035497029623, "learning_rate": 0.0001, "loss": 0.2093, "loss/crossentropy": 2.3045096397399902, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.20930374413728714, "step": 465 }, { "epoch": 0.006708172886601648, "grad_norm": 0.10986328125, "grad_norm_var": 0.00020383894443511962, "learning_rate": 0.0001, "loss": 0.2332, "loss/crossentropy": 2.386527895927429, "loss/fcd": 0.4970703125, "loss/idx": 18.0, "loss/logits": 0.23323698341846466, "step": 466 }, { "epoch": 0.006722568107388347, "grad_norm": 0.11083984375, "grad_norm_var": 0.00020166635513305665, "learning_rate": 0.0001, "loss": 0.1988, "loss/crossentropy": 2.151167392730713, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.19876766949892044, "step": 467 }, { "epoch": 0.006736963328175046, "grad_norm": 0.1259765625, "grad_norm_var": 0.00021171470483144123, "learning_rate": 0.0001, "loss": 0.2219, "loss/crossentropy": 2.512352228164673, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.22188346087932587, "step": 468 }, { "epoch": 0.006751358548961745, "grad_norm": 0.10400390625, "grad_norm_var": 0.00020895699659983317, "learning_rate": 0.0001, "loss": 0.202, "loss/crossentropy": 2.4446065425872803, "loss/fcd": 0.4326171875, "loss/idx": 18.0, "loss/logits": 0.2020409256219864, "step": 469 }, { "epoch": 0.006765753769748443, "grad_norm": 0.1162109375, "grad_norm_var": 9.334782759348551e-05, "learning_rate": 0.0001, "loss": 0.2175, "loss/crossentropy": 2.4017263650894165, "loss/fcd": 0.4365234375, "loss/idx": 18.0, "loss/logits": 0.21754636615514755, "step": 470 }, { "epoch": 0.006780148990535142, "grad_norm": 0.134765625, "grad_norm_var": 0.00012315809726715088, "learning_rate": 0.0001, "loss": 0.2326, "loss/crossentropy": 2.364670991897583, "loss/fcd": 0.4658203125, "loss/idx": 18.0, "loss/logits": 0.232588529586792, "step": 471 }, { "epoch": 0.0067945442113218416, "grad_norm": 0.10302734375, "grad_norm_var": 0.00012839237848917643, "learning_rate": 0.0001, "loss": 0.2316, "loss/crossentropy": 2.7174742221832275, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.23159676045179367, "step": 472 }, { "epoch": 0.00680893943210854, "grad_norm": 0.1044921875, "grad_norm_var": 0.00013192395369211832, "learning_rate": 0.0001, "loss": 0.2198, "loss/crossentropy": 2.472269654273987, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.21980835497379303, "step": 473 }, { "epoch": 0.006823334652895239, "grad_norm": 0.11572265625, "grad_norm_var": 0.00012710789839426678, "learning_rate": 0.0001, "loss": 0.2208, "loss/crossentropy": 2.5979279279708862, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.22081798315048218, "step": 474 }, { "epoch": 0.006837729873681937, "grad_norm": 0.10791015625, "grad_norm_var": 8.223454157511394e-05, "learning_rate": 0.0001, "loss": 0.2017, "loss/crossentropy": 2.337291121482849, "loss/fcd": 0.5244140625, "loss/idx": 18.0, "loss/logits": 0.20170452445745468, "step": 475 }, { "epoch": 0.006852125094468637, "grad_norm": 0.12353515625, "grad_norm_var": 9.024540583292643e-05, "learning_rate": 0.0001, "loss": 0.2224, "loss/crossentropy": 2.2137837409973145, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.22239823639392853, "step": 476 }, { "epoch": 0.006866520315255335, "grad_norm": 0.1376953125, "grad_norm_var": 0.00012429157892862957, "learning_rate": 0.0001, "loss": 0.2253, "loss/crossentropy": 2.044808030128479, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.22525641322135925, "step": 477 }, { "epoch": 0.006880915536042034, "grad_norm": 0.11083984375, "grad_norm_var": 0.00012467304865519205, "learning_rate": 0.0001, "loss": 0.2055, "loss/crossentropy": 2.176842510700226, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.20548538118600845, "step": 478 }, { "epoch": 0.0068953107568287325, "grad_norm": 0.11328125, "grad_norm_var": 0.0001228402058283488, "learning_rate": 0.0001, "loss": 0.2052, "loss/crossentropy": 2.43264901638031, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.2051537036895752, "step": 479 }, { "epoch": 0.006909705977615432, "grad_norm": 0.1162109375, "grad_norm_var": 0.00010639429092407227, "learning_rate": 0.0001, "loss": 0.21, "loss/crossentropy": 2.24726939201355, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.210049070417881, "step": 480 }, { "epoch": 0.006924101198402131, "grad_norm": 0.1181640625, "grad_norm_var": 0.0001061081886291504, "learning_rate": 0.0001, "loss": 0.204, "loss/crossentropy": 2.71012020111084, "loss/fcd": 0.4716796875, "loss/idx": 18.0, "loss/logits": 0.20403584837913513, "step": 481 }, { "epoch": 0.006938496419188829, "grad_norm": 0.1181640625, "grad_norm_var": 0.00010386208693186442, "learning_rate": 0.0001, "loss": 0.2221, "loss/crossentropy": 2.3664560317993164, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.22213804721832275, "step": 482 }, { "epoch": 0.006952891639975528, "grad_norm": 0.10791015625, "grad_norm_var": 0.00010653237501780192, "learning_rate": 0.0001, "loss": 0.2118, "loss/crossentropy": 2.541406989097595, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.21182993054389954, "step": 483 }, { "epoch": 0.006967286860762227, "grad_norm": 0.1171875, "grad_norm_var": 9.980897108713786e-05, "learning_rate": 0.0001, "loss": 0.2098, "loss/crossentropy": 2.0675625801086426, "loss/fcd": 0.3955078125, "loss/idx": 18.0, "loss/logits": 0.20982014387845993, "step": 484 }, { "epoch": 0.006981682081548926, "grad_norm": 0.11181640625, "grad_norm_var": 9.15755828221639e-05, "learning_rate": 0.0001, "loss": 0.2191, "loss/crossentropy": 2.1868069767951965, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.21909870952367783, "step": 485 }, { "epoch": 0.006996077302335624, "grad_norm": 0.10546875, "grad_norm_var": 9.856919447580973e-05, "learning_rate": 0.0001, "loss": 0.2026, "loss/crossentropy": 2.26907217502594, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.20264852046966553, "step": 486 }, { "epoch": 0.0070104725231223235, "grad_norm": 0.11572265625, "grad_norm_var": 7.203022638956706e-05, "learning_rate": 0.0001, "loss": 0.2109, "loss/crossentropy": 2.4840633869171143, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.21092981100082397, "step": 487 }, { "epoch": 0.007024867743909022, "grad_norm": 0.115234375, "grad_norm_var": 6.31640354792277e-05, "learning_rate": 0.0001, "loss": 0.2276, "loss/crossentropy": 2.5656063556671143, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.22764952480793, "step": 488 }, { "epoch": 0.007039262964695721, "grad_norm": 0.1455078125, "grad_norm_var": 0.0001110623280207316, "learning_rate": 0.0001, "loss": 0.2581, "loss/crossentropy": 2.414512276649475, "loss/fcd": 0.5146484375, "loss/idx": 18.0, "loss/logits": 0.258076474070549, "step": 489 }, { "epoch": 0.00705365818548242, "grad_norm": 0.1123046875, "grad_norm_var": 0.00011261304219563802, "learning_rate": 0.0001, "loss": 0.1995, "loss/crossentropy": 2.2527265548706055, "loss/fcd": 0.3984375, "loss/idx": 18.0, "loss/logits": 0.19945065677165985, "step": 490 }, { "epoch": 0.0070680534062691185, "grad_norm": 0.1064453125, "grad_norm_var": 0.00011458297570546469, "learning_rate": 0.0001, "loss": 0.214, "loss/crossentropy": 2.3830225467681885, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.21395207196474075, "step": 491 }, { "epoch": 0.007082448627055818, "grad_norm": 0.10400390625, "grad_norm_var": 0.00012197395165761312, "learning_rate": 0.0001, "loss": 0.2245, "loss/crossentropy": 2.578980803489685, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.2245059311389923, "step": 492 }, { "epoch": 0.007096843847842516, "grad_norm": 0.11328125, "grad_norm_var": 8.859535058339437e-05, "learning_rate": 0.0001, "loss": 0.2001, "loss/crossentropy": 2.0505954027175903, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.2001277357339859, "step": 493 }, { "epoch": 0.007111239068629215, "grad_norm": 0.111328125, "grad_norm_var": 8.837381998697917e-05, "learning_rate": 0.0001, "loss": 0.2057, "loss/crossentropy": 2.2900065183639526, "loss/fcd": 0.4326171875, "loss/idx": 18.0, "loss/logits": 0.2057407721877098, "step": 494 }, { "epoch": 0.007125634289415914, "grad_norm": 0.09765625, "grad_norm_var": 0.00010617574055989583, "learning_rate": 0.0001, "loss": 0.2047, "loss/crossentropy": 2.6084879636764526, "loss/fcd": 0.4052734375, "loss/idx": 18.0, "loss/logits": 0.2047056257724762, "step": 495 }, { "epoch": 0.007140029510202613, "grad_norm": 0.10986328125, "grad_norm_var": 0.0001064211130142212, "learning_rate": 0.0001, "loss": 0.1886, "loss/crossentropy": 2.2180538177490234, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.18857233971357346, "step": 496 }, { "epoch": 0.007154424730989311, "grad_norm": 0.10205078125, "grad_norm_var": 0.0001118302345275879, "learning_rate": 0.0001, "loss": 0.2011, "loss/crossentropy": 2.3378570079803467, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.20111830532550812, "step": 497 }, { "epoch": 0.00716881995177601, "grad_norm": 0.099609375, "grad_norm_var": 0.00011839866638183594, "learning_rate": 0.0001, "loss": 0.2105, "loss/crossentropy": 2.4460572004318237, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.21054691076278687, "step": 498 }, { "epoch": 0.0071832151725627095, "grad_norm": 0.1025390625, "grad_norm_var": 0.00012238721052805582, "learning_rate": 0.0001, "loss": 0.189, "loss/crossentropy": 2.358466863632202, "loss/fcd": 0.3779296875, "loss/idx": 18.0, "loss/logits": 0.18903843313455582, "step": 499 }, { "epoch": 0.007197610393349408, "grad_norm": 0.0986328125, "grad_norm_var": 0.00012767215569814045, "learning_rate": 0.0001, "loss": 0.23, "loss/crossentropy": 2.604634642601013, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.23002738505601883, "step": 500 }, { "epoch": 0.007212005614136107, "grad_norm": 0.10546875, "grad_norm_var": 0.0001282016436258952, "learning_rate": 0.0001, "loss": 0.1989, "loss/crossentropy": 2.3237578868865967, "loss/fcd": 0.408203125, "loss/idx": 18.0, "loss/logits": 0.19885492324829102, "step": 501 }, { "epoch": 0.007226400834922805, "grad_norm": 0.1015625, "grad_norm_var": 0.00013103087743123373, "learning_rate": 0.0001, "loss": 0.2136, "loss/crossentropy": 2.434670090675354, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.21359950304031372, "step": 502 }, { "epoch": 0.007240796055709505, "grad_norm": 0.111328125, "grad_norm_var": 0.0001281966765721639, "learning_rate": 0.0001, "loss": 0.2027, "loss/crossentropy": 2.3419206142425537, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.20269384235143661, "step": 503 }, { "epoch": 0.007255191276496203, "grad_norm": 0.1142578125, "grad_norm_var": 0.0001273860534032186, "learning_rate": 0.0001, "loss": 0.2168, "loss/crossentropy": 2.418344020843506, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.21677344292402267, "step": 504 }, { "epoch": 0.007269586497282902, "grad_norm": 0.1181640625, "grad_norm_var": 3.9155284563700356e-05, "learning_rate": 0.0001, "loss": 0.2208, "loss/crossentropy": 2.5992894172668457, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.22083494067192078, "step": 505 }, { "epoch": 0.007283981718069601, "grad_norm": 0.10302734375, "grad_norm_var": 3.770192464192708e-05, "learning_rate": 0.0001, "loss": 0.2111, "loss/crossentropy": 2.3879592418670654, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.21107713878154755, "step": 506 }, { "epoch": 0.0072983769388563, "grad_norm": 0.10546875, "grad_norm_var": 3.7729740142822266e-05, "learning_rate": 0.0001, "loss": 0.2234, "loss/crossentropy": 2.7517272233963013, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.22336142510175705, "step": 507 }, { "epoch": 0.007312772159642999, "grad_norm": 0.10400390625, "grad_norm_var": 3.7729740142822266e-05, "learning_rate": 0.0001, "loss": 0.1838, "loss/crossentropy": 2.1463602781295776, "loss/fcd": 0.4033203125, "loss/idx": 18.0, "loss/logits": 0.1838330551981926, "step": 508 }, { "epoch": 0.007327167380429697, "grad_norm": 0.10107421875, "grad_norm_var": 3.542006015777588e-05, "learning_rate": 0.0001, "loss": 0.2068, "loss/crossentropy": 2.4218236207962036, "loss/fcd": 0.4130859375, "loss/idx": 18.0, "loss/logits": 0.2068256437778473, "step": 509 }, { "epoch": 0.007341562601216396, "grad_norm": 0.1201171875, "grad_norm_var": 4.722177982330322e-05, "learning_rate": 0.0001, "loss": 0.2242, "loss/crossentropy": 2.253819227218628, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.22418855130672455, "step": 510 }, { "epoch": 0.007355957822003095, "grad_norm": 0.10302734375, "grad_norm_var": 4.3102105458577474e-05, "learning_rate": 0.0001, "loss": 0.2045, "loss/crossentropy": 2.1473891735076904, "loss/fcd": 0.40234375, "loss/idx": 18.0, "loss/logits": 0.2044747918844223, "step": 511 }, { "epoch": 0.007370353042789794, "grad_norm": 0.1220703125, "grad_norm_var": 5.827645460764567e-05, "learning_rate": 0.0001, "loss": 0.2399, "loss/crossentropy": 2.4559924602508545, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.23986588418483734, "step": 512 }, { "epoch": 0.007384748263576492, "grad_norm": 0.1240234375, "grad_norm_var": 7.387797037760417e-05, "learning_rate": 0.0001, "loss": 0.2335, "loss/crossentropy": 2.3832513093948364, "loss/fcd": 0.482421875, "loss/idx": 18.0, "loss/logits": 0.2334604561328888, "step": 513 }, { "epoch": 0.0073991434843631914, "grad_norm": 0.11083984375, "grad_norm_var": 6.859997908274333e-05, "learning_rate": 0.0001, "loss": 0.2134, "loss/crossentropy": 2.4991053342819214, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.21339743584394455, "step": 514 }, { "epoch": 0.007413538705149891, "grad_norm": 0.1376953125, "grad_norm_var": 0.00011509160200754802, "learning_rate": 0.0001, "loss": 0.2837, "loss/crossentropy": 2.707633852958679, "loss/fcd": 0.525390625, "loss/idx": 18.0, "loss/logits": 0.28367944806814194, "step": 515 }, { "epoch": 0.007427933925936589, "grad_norm": 0.10791015625, "grad_norm_var": 0.00010480483373006184, "learning_rate": 0.0001, "loss": 0.2266, "loss/crossentropy": 2.4334722757339478, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.22656814754009247, "step": 516 }, { "epoch": 0.007442329146723288, "grad_norm": 0.107421875, "grad_norm_var": 0.00010337432225545247, "learning_rate": 0.0001, "loss": 0.2487, "loss/crossentropy": 2.794032335281372, "loss/fcd": 0.4853515625, "loss/idx": 18.0, "loss/logits": 0.2486870214343071, "step": 517 }, { "epoch": 0.0074567243675099865, "grad_norm": 0.09912109375, "grad_norm_var": 0.00010714431603749593, "learning_rate": 0.0001, "loss": 0.1797, "loss/crossentropy": 2.4037466049194336, "loss/fcd": 0.408203125, "loss/idx": 18.0, "loss/logits": 0.1796710044145584, "step": 518 }, { "epoch": 0.007471119588296686, "grad_norm": 0.1181640625, "grad_norm_var": 0.00010959208011627198, "learning_rate": 0.0001, "loss": 0.2219, "loss/crossentropy": 2.3638296127319336, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.22193115949630737, "step": 519 }, { "epoch": 0.007485514809083384, "grad_norm": 0.109375, "grad_norm_var": 0.00010979076226552328, "learning_rate": 0.0001, "loss": 0.2223, "loss/crossentropy": 2.555932879447937, "loss/fcd": 0.458984375, "loss/idx": 18.0, "loss/logits": 0.22230461984872818, "step": 520 }, { "epoch": 0.007499910029870083, "grad_norm": 0.1083984375, "grad_norm_var": 0.00010768473148345948, "learning_rate": 0.0001, "loss": 0.2183, "loss/crossentropy": 2.5372231006622314, "loss/fcd": 0.4541015625, "loss/idx": 18.0, "loss/logits": 0.21830307692289352, "step": 521 }, { "epoch": 0.0075143052506567816, "grad_norm": 0.11279296875, "grad_norm_var": 0.00010279715061187745, "learning_rate": 0.0001, "loss": 0.2068, "loss/crossentropy": 2.566136121749878, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.20676826685667038, "step": 522 }, { "epoch": 0.007528700471443481, "grad_norm": 0.119140625, "grad_norm_var": 0.00010263025760650634, "learning_rate": 0.0001, "loss": 0.2323, "loss/crossentropy": 2.4165902137756348, "loss/fcd": 0.5078125, "loss/idx": 18.0, "loss/logits": 0.23230554163455963, "step": 523 }, { "epoch": 0.00754309569223018, "grad_norm": 0.11474609375, "grad_norm_var": 9.721020857493083e-05, "learning_rate": 0.0001, "loss": 0.2198, "loss/crossentropy": 2.5744664669036865, "loss/fcd": 0.4638671875, "loss/idx": 18.0, "loss/logits": 0.21984682232141495, "step": 524 }, { "epoch": 0.007557490913016878, "grad_norm": 0.1103515625, "grad_norm_var": 8.722543716430665e-05, "learning_rate": 0.0001, "loss": 0.2083, "loss/crossentropy": 2.0694758892059326, "loss/fcd": 0.4228515625, "loss/idx": 18.0, "loss/logits": 0.20834185183048248, "step": 525 }, { "epoch": 0.0075718861338035775, "grad_norm": 0.107421875, "grad_norm_var": 8.707046508789062e-05, "learning_rate": 0.0001, "loss": 0.2306, "loss/crossentropy": 2.5832005739212036, "loss/fcd": 0.474609375, "loss/idx": 18.0, "loss/logits": 0.23062562197446823, "step": 526 }, { "epoch": 0.007586281354590276, "grad_norm": 0.11572265625, "grad_norm_var": 7.978677749633789e-05, "learning_rate": 0.0001, "loss": 0.2216, "loss/crossentropy": 2.38311767578125, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.2215922325849533, "step": 527 }, { "epoch": 0.007600676575376975, "grad_norm": 0.1279296875, "grad_norm_var": 8.81791114807129e-05, "learning_rate": 0.0001, "loss": 0.1902, "loss/crossentropy": 1.8877107501029968, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.19022603332996368, "step": 528 }, { "epoch": 0.007615071796163673, "grad_norm": 0.12060546875, "grad_norm_var": 8.454223473866781e-05, "learning_rate": 0.0001, "loss": 0.2042, "loss/crossentropy": 2.158120632171631, "loss/fcd": 0.40234375, "loss/idx": 18.0, "loss/logits": 0.20415493100881577, "step": 529 }, { "epoch": 0.0076294670169503725, "grad_norm": 0.10498046875, "grad_norm_var": 8.933444817860921e-05, "learning_rate": 0.0001, "loss": 0.2037, "loss/crossentropy": 2.460996627807617, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.20372942835092545, "step": 530 }, { "epoch": 0.007643862237737071, "grad_norm": 0.11181640625, "grad_norm_var": 4.8951307932535806e-05, "learning_rate": 0.0001, "loss": 0.2132, "loss/crossentropy": 2.226336717605591, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.21315942704677582, "step": 531 }, { "epoch": 0.00765825745852377, "grad_norm": 0.11181640625, "grad_norm_var": 4.7647953033447264e-05, "learning_rate": 0.0001, "loss": 0.2321, "loss/crossentropy": 2.446126341819763, "loss/fcd": 0.462890625, "loss/idx": 18.0, "loss/logits": 0.2321249470114708, "step": 532 }, { "epoch": 0.007672652679310469, "grad_norm": 0.1044921875, "grad_norm_var": 5.016326904296875e-05, "learning_rate": 0.0001, "loss": 0.2082, "loss/crossentropy": 2.3785619735717773, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.208193838596344, "step": 533 }, { "epoch": 0.007687047900097168, "grad_norm": 0.107421875, "grad_norm_var": 3.9878487586975095e-05, "learning_rate": 0.0001, "loss": 0.2113, "loss/crossentropy": 2.2807793617248535, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.21130456030368805, "step": 534 }, { "epoch": 0.007701443120883867, "grad_norm": 0.10888671875, "grad_norm_var": 3.865162531534831e-05, "learning_rate": 0.0001, "loss": 0.231, "loss/crossentropy": 2.65705668926239, "loss/fcd": 0.4619140625, "loss/idx": 18.0, "loss/logits": 0.23100796341896057, "step": 535 }, { "epoch": 0.007715838341670565, "grad_norm": 0.11669921875, "grad_norm_var": 3.920296827952067e-05, "learning_rate": 0.0001, "loss": 0.2427, "loss/crossentropy": 2.493618130683899, "loss/fcd": 0.474609375, "loss/idx": 18.0, "loss/logits": 0.24267538636922836, "step": 536 }, { "epoch": 0.007730233562457264, "grad_norm": 0.1025390625, "grad_norm_var": 4.4710437456766765e-05, "learning_rate": 0.0001, "loss": 0.2032, "loss/crossentropy": 2.3469570875167847, "loss/fcd": 0.3955078125, "loss/idx": 18.0, "loss/logits": 0.2031807154417038, "step": 537 }, { "epoch": 0.007744628783243963, "grad_norm": 0.1201171875, "grad_norm_var": 4.851023356119792e-05, "learning_rate": 0.0001, "loss": 0.2267, "loss/crossentropy": 2.262601613998413, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.22673919051885605, "step": 538 }, { "epoch": 0.007759024004030662, "grad_norm": 0.10693359375, "grad_norm_var": 4.749198754628499e-05, "learning_rate": 0.0001, "loss": 0.2243, "loss/crossentropy": 2.8090314865112305, "loss/fcd": 0.4482421875, "loss/idx": 18.0, "loss/logits": 0.22425533086061478, "step": 539 }, { "epoch": 0.00777341922481736, "grad_norm": 0.10791015625, "grad_norm_var": 4.793703556060791e-05, "learning_rate": 0.0001, "loss": 0.2073, "loss/crossentropy": 2.41420841217041, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.20730414986610413, "step": 540 }, { "epoch": 0.007787814445604059, "grad_norm": 0.10546875, "grad_norm_var": 5.024174849192301e-05, "learning_rate": 0.0001, "loss": 0.1982, "loss/crossentropy": 2.338608145713806, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.19816286861896515, "step": 541 }, { "epoch": 0.007802209666390759, "grad_norm": 0.10498046875, "grad_norm_var": 5.1875909169514976e-05, "learning_rate": 0.0001, "loss": 0.1913, "loss/crossentropy": 2.199298143386841, "loss/fcd": 0.3994140625, "loss/idx": 18.0, "loss/logits": 0.19127565622329712, "step": 542 }, { "epoch": 0.007816604887177458, "grad_norm": 0.1064453125, "grad_norm_var": 5.159278710683187e-05, "learning_rate": 0.0001, "loss": 0.2308, "loss/crossentropy": 2.403664708137512, "loss/fcd": 0.4345703125, "loss/idx": 18.0, "loss/logits": 0.23076358437538147, "step": 543 }, { "epoch": 0.007831000107964156, "grad_norm": 0.10693359375, "grad_norm_var": 3.05334726969401e-05, "learning_rate": 0.0001, "loss": 0.2145, "loss/crossentropy": 2.511462450027466, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.21448855847120285, "step": 544 }, { "epoch": 0.007845395328750854, "grad_norm": 0.10205078125, "grad_norm_var": 2.3965040842692057e-05, "learning_rate": 0.0001, "loss": 0.2069, "loss/crossentropy": 2.543255090713501, "loss/fcd": 0.4072265625, "loss/idx": 18.0, "loss/logits": 0.20689202100038528, "step": 545 }, { "epoch": 0.007859790549537553, "grad_norm": 0.09765625, "grad_norm_var": 3.03576389948527e-05, "learning_rate": 0.0001, "loss": 0.2131, "loss/crossentropy": 2.618165135383606, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.2130560278892517, "step": 546 }, { "epoch": 0.007874185770324253, "grad_norm": 0.10888671875, "grad_norm_var": 2.9260913530985515e-05, "learning_rate": 0.0001, "loss": 0.205, "loss/crossentropy": 2.3171777725219727, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.20504355430603027, "step": 547 }, { "epoch": 0.007888580991110951, "grad_norm": 0.10888671875, "grad_norm_var": 2.8092662493387858e-05, "learning_rate": 0.0001, "loss": 0.2055, "loss/crossentropy": 2.3242313861846924, "loss/fcd": 0.4365234375, "loss/idx": 18.0, "loss/logits": 0.20549335330724716, "step": 548 }, { "epoch": 0.00790297621189765, "grad_norm": 0.287109375, "grad_norm_var": 0.0020447880029678344, "learning_rate": 0.0001, "loss": 0.2562, "loss/crossentropy": 2.264755129814148, "loss/fcd": 0.548828125, "loss/idx": 18.0, "loss/logits": 0.2562015801668167, "step": 549 }, { "epoch": 0.00791737143268435, "grad_norm": 0.109375, "grad_norm_var": 0.002042093873023987, "learning_rate": 0.0001, "loss": 0.1887, "loss/crossentropy": 2.053212523460388, "loss/fcd": 0.3916015625, "loss/idx": 18.0, "loss/logits": 0.18873201310634613, "step": 550 }, { "epoch": 0.007931766653471048, "grad_norm": 0.109375, "grad_norm_var": 0.002041463057200114, "learning_rate": 0.0001, "loss": 0.1847, "loss/crossentropy": 2.1234816908836365, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.18469391763210297, "step": 551 }, { "epoch": 0.007946161874257746, "grad_norm": 0.1044921875, "grad_norm_var": 0.0020542532205581666, "learning_rate": 0.0001, "loss": 0.2206, "loss/crossentropy": 2.5538755655288696, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.22058459371328354, "step": 552 }, { "epoch": 0.007960557095044445, "grad_norm": 0.10205078125, "grad_norm_var": 0.0020552794138590496, "learning_rate": 0.0001, "loss": 0.2014, "loss/crossentropy": 2.2996666431427, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.20140548795461655, "step": 553 }, { "epoch": 0.007974952315831145, "grad_norm": 0.1162109375, "grad_norm_var": 0.0020551522572835284, "learning_rate": 0.0001, "loss": 0.1931, "loss/crossentropy": 2.076995849609375, "loss/fcd": 0.3876953125, "loss/idx": 18.0, "loss/logits": 0.19310477375984192, "step": 554 }, { "epoch": 0.007989347536617843, "grad_norm": 0.10107421875, "grad_norm_var": 0.0020657857259114582, "learning_rate": 0.0001, "loss": 0.2072, "loss/crossentropy": 2.5844489336013794, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.20723149180412292, "step": 555 }, { "epoch": 0.008003742757404541, "grad_norm": 0.1181640625, "grad_norm_var": 0.0020593394835789996, "learning_rate": 0.0001, "loss": 0.2353, "loss/crossentropy": 2.580026626586914, "loss/fcd": 0.4541015625, "loss/idx": 18.0, "loss/logits": 0.23526855558156967, "step": 556 }, { "epoch": 0.00801813797819124, "grad_norm": 0.1005859375, "grad_norm_var": 0.0020690351724624635, "learning_rate": 0.0001, "loss": 0.2148, "loss/crossentropy": 2.526800036430359, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.21484342962503433, "step": 557 }, { "epoch": 0.00803253319897794, "grad_norm": 0.10400390625, "grad_norm_var": 0.002070759733517965, "learning_rate": 0.0001, "loss": 0.203, "loss/crossentropy": 2.459173560142517, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.2029871866106987, "step": 558 }, { "epoch": 0.008046928419764638, "grad_norm": 0.11865234375, "grad_norm_var": 0.002061744530995687, "learning_rate": 0.0001, "loss": 0.238, "loss/crossentropy": 2.560517430305481, "loss/fcd": 0.4775390625, "loss/idx": 18.0, "loss/logits": 0.23796609044075012, "step": 559 }, { "epoch": 0.008061323640551336, "grad_norm": 0.1044921875, "grad_norm_var": 0.002065872152646383, "learning_rate": 0.0001, "loss": 0.203, "loss/crossentropy": 2.476174235343933, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.20299049466848373, "step": 560 }, { "epoch": 0.008075718861338036, "grad_norm": 0.10986328125, "grad_norm_var": 0.0020527432362238566, "learning_rate": 0.0001, "loss": 0.1991, "loss/crossentropy": 2.5808521509170532, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.19911371916532516, "step": 561 }, { "epoch": 0.008090114082124735, "grad_norm": 0.1025390625, "grad_norm_var": 0.002040464679400126, "learning_rate": 0.0001, "loss": 0.1829, "loss/crossentropy": 2.2305572628974915, "loss/fcd": 0.3935546875, "loss/idx": 18.0, "loss/logits": 0.182855024933815, "step": 562 }, { "epoch": 0.008104509302911433, "grad_norm": 0.10302734375, "grad_norm_var": 0.0020505974690119425, "learning_rate": 0.0001, "loss": 0.197, "loss/crossentropy": 2.2332805395126343, "loss/fcd": 0.40625, "loss/idx": 18.0, "loss/logits": 0.19702833145856857, "step": 563 }, { "epoch": 0.008118904523698131, "grad_norm": 0.11279296875, "grad_norm_var": 0.002046417196591695, "learning_rate": 0.0001, "loss": 0.2281, "loss/crossentropy": 2.634607434272766, "loss/fcd": 0.4599609375, "loss/idx": 18.0, "loss/logits": 0.2280866503715515, "step": 564 }, { "epoch": 0.008133299744484832, "grad_norm": 0.11865234375, "grad_norm_var": 4.386504491170247e-05, "learning_rate": 0.0001, "loss": 0.2263, "loss/crossentropy": 2.566808342933655, "loss/fcd": 0.458984375, "loss/idx": 18.0, "loss/logits": 0.22630243003368378, "step": 565 }, { "epoch": 0.00814769496527153, "grad_norm": 0.1064453125, "grad_norm_var": 4.404385884602864e-05, "learning_rate": 0.0001, "loss": 0.2021, "loss/crossentropy": 2.404551863670349, "loss/fcd": 0.4326171875, "loss/idx": 18.0, "loss/logits": 0.20205579698085785, "step": 566 }, { "epoch": 0.008162090186058228, "grad_norm": 0.10791015625, "grad_norm_var": 4.396339257558187e-05, "learning_rate": 0.0001, "loss": 0.1902, "loss/crossentropy": 2.052983283996582, "loss/fcd": 0.4033203125, "loss/idx": 18.0, "loss/logits": 0.19023562967777252, "step": 567 }, { "epoch": 0.008176485406844928, "grad_norm": 0.1015625, "grad_norm_var": 4.5942266782124835e-05, "learning_rate": 0.0001, "loss": 0.1904, "loss/crossentropy": 2.1084887981414795, "loss/fcd": 0.3916015625, "loss/idx": 18.0, "loss/logits": 0.1903728023171425, "step": 568 }, { "epoch": 0.008190880627631627, "grad_norm": 0.107421875, "grad_norm_var": 4.348357518513997e-05, "learning_rate": 0.0001, "loss": 0.2044, "loss/crossentropy": 2.528154492378235, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.20440030097961426, "step": 569 }, { "epoch": 0.008205275848418325, "grad_norm": 0.10400390625, "grad_norm_var": 3.998180230458577e-05, "learning_rate": 0.0001, "loss": 0.2031, "loss/crossentropy": 2.2640358209609985, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.20313097536563873, "step": 570 }, { "epoch": 0.008219671069205023, "grad_norm": 0.125, "grad_norm_var": 5.50230344136556e-05, "learning_rate": 0.0001, "loss": 0.2647, "loss/crossentropy": 2.533176898956299, "loss/fcd": 0.5107421875, "loss/idx": 18.0, "loss/logits": 0.2647128999233246, "step": 571 }, { "epoch": 0.008234066289991723, "grad_norm": 0.10693359375, "grad_norm_var": 4.928807417551676e-05, "learning_rate": 0.0001, "loss": 0.2343, "loss/crossentropy": 2.5634536743164062, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.23427864164113998, "step": 572 }, { "epoch": 0.008248461510778422, "grad_norm": 0.10595703125, "grad_norm_var": 4.5518080393473305e-05, "learning_rate": 0.0001, "loss": 0.2025, "loss/crossentropy": 2.4560309648513794, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.20253371447324753, "step": 573 }, { "epoch": 0.00826285673156512, "grad_norm": 0.1015625, "grad_norm_var": 4.742046197255453e-05, "learning_rate": 0.0001, "loss": 0.1913, "loss/crossentropy": 2.2056825160980225, "loss/fcd": 0.392578125, "loss/idx": 18.0, "loss/logits": 0.1913457065820694, "step": 574 }, { "epoch": 0.00827725195235182, "grad_norm": 0.10205078125, "grad_norm_var": 4.228651523590088e-05, "learning_rate": 0.0001, "loss": 0.2063, "loss/crossentropy": 2.330709218978882, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.20634697377681732, "step": 575 }, { "epoch": 0.008291647173138518, "grad_norm": 0.11083984375, "grad_norm_var": 4.224777221679687e-05, "learning_rate": 0.0001, "loss": 0.1815, "loss/crossentropy": 1.9496164321899414, "loss/fcd": 0.3935546875, "loss/idx": 18.0, "loss/logits": 0.18145756423473358, "step": 576 }, { "epoch": 0.008306042393925217, "grad_norm": 0.1083984375, "grad_norm_var": 4.2000412940979e-05, "learning_rate": 0.0001, "loss": 0.2194, "loss/crossentropy": 2.6926685571670532, "loss/fcd": 0.4365234375, "loss/idx": 18.0, "loss/logits": 0.21936995536088943, "step": 577 }, { "epoch": 0.008320437614711915, "grad_norm": 0.1103515625, "grad_norm_var": 4.031558831532796e-05, "learning_rate": 0.0001, "loss": 0.1976, "loss/crossentropy": 2.299630641937256, "loss/fcd": 0.4072265625, "loss/idx": 18.0, "loss/logits": 0.1975780501961708, "step": 578 }, { "epoch": 0.008334832835498615, "grad_norm": 0.1044921875, "grad_norm_var": 3.9418538411458336e-05, "learning_rate": 0.0001, "loss": 0.1928, "loss/crossentropy": 2.3579829931259155, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.19275517761707306, "step": 579 }, { "epoch": 0.008349228056285313, "grad_norm": 0.1025390625, "grad_norm_var": 3.998180230458577e-05, "learning_rate": 0.0001, "loss": 0.1892, "loss/crossentropy": 2.1995487213134766, "loss/fcd": 0.404296875, "loss/idx": 18.0, "loss/logits": 0.18920866400003433, "step": 580 }, { "epoch": 0.008363623277072012, "grad_norm": 0.10595703125, "grad_norm_var": 3.161331017812093e-05, "learning_rate": 0.0001, "loss": 0.2302, "loss/crossentropy": 2.54054057598114, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.23019887506961823, "step": 581 }, { "epoch": 0.00837801849785871, "grad_norm": 0.11279296875, "grad_norm_var": 3.3692518870035806e-05, "learning_rate": 0.0001, "loss": 0.2467, "loss/crossentropy": 2.5756444931030273, "loss/fcd": 0.490234375, "loss/idx": 18.0, "loss/logits": 0.2466834932565689, "step": 582 }, { "epoch": 0.00839241371864541, "grad_norm": 0.1259765625, "grad_norm_var": 5.541543165842692e-05, "learning_rate": 0.0001, "loss": 0.217, "loss/crossentropy": 2.225999653339386, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.21695519983768463, "step": 583 }, { "epoch": 0.008406808939432108, "grad_norm": 0.1083984375, "grad_norm_var": 5.202194054921468e-05, "learning_rate": 0.0001, "loss": 0.2196, "loss/crossentropy": 2.315016746520996, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.21962474286556244, "step": 584 }, { "epoch": 0.008421204160218807, "grad_norm": 0.10595703125, "grad_norm_var": 5.244811375935872e-05, "learning_rate": 0.0001, "loss": 0.2172, "loss/crossentropy": 2.390581250190735, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.21722210943698883, "step": 585 }, { "epoch": 0.008435599381005507, "grad_norm": 0.10498046875, "grad_norm_var": 5.18798828125e-05, "learning_rate": 0.0001, "loss": 0.202, "loss/crossentropy": 2.451754093170166, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.20203383266925812, "step": 586 }, { "epoch": 0.008449994601792205, "grad_norm": 0.1279296875, "grad_norm_var": 5.8710575103759766e-05, "learning_rate": 0.0001, "loss": 0.2503, "loss/crossentropy": 2.1609503030776978, "loss/fcd": 0.4873046875, "loss/idx": 18.0, "loss/logits": 0.2503489702939987, "step": 587 }, { "epoch": 0.008464389822578904, "grad_norm": 0.10888671875, "grad_norm_var": 5.839268366495768e-05, "learning_rate": 0.0001, "loss": 0.1994, "loss/crossentropy": 2.347867727279663, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.1994006633758545, "step": 588 }, { "epoch": 0.008478785043365602, "grad_norm": 0.10400390625, "grad_norm_var": 5.947351455688476e-05, "learning_rate": 0.0001, "loss": 0.2185, "loss/crossentropy": 2.502691388130188, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.2184857726097107, "step": 589 }, { "epoch": 0.008493180264152302, "grad_norm": 0.1083984375, "grad_norm_var": 5.555152893066406e-05, "learning_rate": 0.0001, "loss": 0.2103, "loss/crossentropy": 2.1813002228736877, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.21025604009628296, "step": 590 }, { "epoch": 0.008507575484939, "grad_norm": 0.10595703125, "grad_norm_var": 5.262692769368489e-05, "learning_rate": 0.0001, "loss": 0.2165, "loss/crossentropy": 2.544050931930542, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.21648868918418884, "step": 591 }, { "epoch": 0.008521970705725699, "grad_norm": 0.126953125, "grad_norm_var": 7.121463616689046e-05, "learning_rate": 0.0001, "loss": 0.2075, "loss/crossentropy": 2.072811484336853, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.20745252817869186, "step": 592 }, { "epoch": 0.008536365926512399, "grad_norm": 0.09375, "grad_norm_var": 8.921523888905843e-05, "learning_rate": 0.0001, "loss": 0.1907, "loss/crossentropy": 2.657747268676758, "loss/fcd": 0.423828125, "loss/idx": 18.0, "loss/logits": 0.1907452642917633, "step": 593 }, { "epoch": 0.008550761147299097, "grad_norm": 0.18359375, "grad_norm_var": 0.0004295577605565389, "learning_rate": 0.0001, "loss": 0.2969, "loss/crossentropy": 2.365026593208313, "loss/fcd": 0.5234375, "loss/idx": 18.0, "loss/logits": 0.29685717821121216, "step": 594 }, { "epoch": 0.008565156368085795, "grad_norm": 0.10888671875, "grad_norm_var": 0.00042495330174764, "learning_rate": 0.0001, "loss": 0.2275, "loss/crossentropy": 2.617425799369812, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.2274792492389679, "step": 595 }, { "epoch": 0.008579551588872494, "grad_norm": 0.099609375, "grad_norm_var": 0.0004302342732747396, "learning_rate": 0.0001, "loss": 0.2138, "loss/crossentropy": 2.589759111404419, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.2137622982263565, "step": 596 }, { "epoch": 0.008593946809659194, "grad_norm": 0.123046875, "grad_norm_var": 0.00042901734511057533, "learning_rate": 0.0001, "loss": 0.2186, "loss/crossentropy": 2.165451228618622, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.21863602101802826, "step": 597 }, { "epoch": 0.008608342030445892, "grad_norm": 0.111328125, "grad_norm_var": 0.00042969385782877604, "learning_rate": 0.0001, "loss": 0.2004, "loss/crossentropy": 2.421903610229492, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.20041261613368988, "step": 598 }, { "epoch": 0.00862273725123259, "grad_norm": 0.11474609375, "grad_norm_var": 0.0004218568404515584, "learning_rate": 0.0001, "loss": 0.249, "loss/crossentropy": 2.555266857147217, "loss/fcd": 0.478515625, "loss/idx": 18.0, "loss/logits": 0.2490156590938568, "step": 599 }, { "epoch": 0.008637132472019289, "grad_norm": 0.111328125, "grad_norm_var": 0.00041990180810292564, "learning_rate": 0.0001, "loss": 0.2241, "loss/crossentropy": 2.4431397914886475, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.22414565831422806, "step": 600 }, { "epoch": 0.008651527692805989, "grad_norm": 0.12158203125, "grad_norm_var": 0.0004164050022761027, "learning_rate": 0.0001, "loss": 0.1972, "loss/crossentropy": 2.086324453353882, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.19717370718717575, "step": 601 }, { "epoch": 0.008665922913592687, "grad_norm": 0.10302734375, "grad_norm_var": 0.000419496496518453, "learning_rate": 0.0001, "loss": 0.1806, "loss/crossentropy": 2.2410671710968018, "loss/fcd": 0.3984375, "loss/idx": 18.0, "loss/logits": 0.18064773827791214, "step": 602 }, { "epoch": 0.008680318134379385, "grad_norm": 0.10205078125, "grad_norm_var": 0.00041954914728800456, "learning_rate": 0.0001, "loss": 0.214, "loss/crossentropy": 2.3243794441223145, "loss/fcd": 0.4345703125, "loss/idx": 18.0, "loss/logits": 0.2140304446220398, "step": 603 }, { "epoch": 0.008694713355166086, "grad_norm": 0.12353515625, "grad_norm_var": 0.0004225889841715495, "learning_rate": 0.0001, "loss": 0.2533, "loss/crossentropy": 2.4268319606781006, "loss/fcd": 0.48828125, "loss/idx": 18.0, "loss/logits": 0.25334879010915756, "step": 604 }, { "epoch": 0.008709108575952784, "grad_norm": 0.10009765625, "grad_norm_var": 0.00042932828267415365, "learning_rate": 0.0001, "loss": 0.1851, "loss/crossentropy": 2.3854864835739136, "loss/fcd": 0.408203125, "loss/idx": 18.0, "loss/logits": 0.18514161556959152, "step": 605 }, { "epoch": 0.008723503796739482, "grad_norm": 0.10400390625, "grad_norm_var": 0.0004343261321385702, "learning_rate": 0.0001, "loss": 0.1993, "loss/crossentropy": 2.385258913040161, "loss/fcd": 0.4482421875, "loss/idx": 18.0, "loss/logits": 0.19932958483695984, "step": 606 }, { "epoch": 0.00873789901752618, "grad_norm": 0.1064453125, "grad_norm_var": 0.00043377876281738283, "learning_rate": 0.0001, "loss": 0.2085, "loss/crossentropy": 2.4880837202072144, "loss/fcd": 0.4326171875, "loss/idx": 18.0, "loss/logits": 0.20852985978126526, "step": 607 }, { "epoch": 0.00875229423831288, "grad_norm": 0.111328125, "grad_norm_var": 0.0004233519236246745, "learning_rate": 0.0001, "loss": 0.2413, "loss/crossentropy": 2.486106753349304, "loss/fcd": 0.4619140625, "loss/idx": 18.0, "loss/logits": 0.2413138523697853, "step": 608 }, { "epoch": 0.008766689459099579, "grad_norm": 0.09912109375, "grad_norm_var": 0.0004109054803848267, "learning_rate": 0.0001, "loss": 0.2237, "loss/crossentropy": 2.713275671005249, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.22369590401649475, "step": 609 }, { "epoch": 0.008781084679886277, "grad_norm": 0.10205078125, "grad_norm_var": 6.965001424153646e-05, "learning_rate": 0.0001, "loss": 0.2017, "loss/crossentropy": 2.4143831729888916, "loss/fcd": 0.4013671875, "loss/idx": 18.0, "loss/logits": 0.2017301544547081, "step": 610 }, { "epoch": 0.008795479900672977, "grad_norm": 0.10302734375, "grad_norm_var": 7.179578145345052e-05, "learning_rate": 0.0001, "loss": 0.2116, "loss/crossentropy": 2.3723723888397217, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.21161457151174545, "step": 611 }, { "epoch": 0.008809875121459676, "grad_norm": 0.095703125, "grad_norm_var": 7.739067077636719e-05, "learning_rate": 0.0001, "loss": 0.1875, "loss/crossentropy": 2.3493517637252808, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.18747683614492416, "step": 612 }, { "epoch": 0.008824270342246374, "grad_norm": 0.0986328125, "grad_norm_var": 6.656249364217122e-05, "learning_rate": 0.0001, "loss": 0.1934, "loss/crossentropy": 2.484821081161499, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.1934322491288185, "step": 613 }, { "epoch": 0.008838665563033072, "grad_norm": 0.2138671875, "grad_norm_var": 0.000786288579305013, "learning_rate": 0.0001, "loss": 0.2388, "loss/crossentropy": 2.2311092615127563, "loss/fcd": 0.521484375, "loss/idx": 18.0, "loss/logits": 0.23880772292613983, "step": 614 }, { "epoch": 0.008853060783819772, "grad_norm": 0.1142578125, "grad_norm_var": 0.0007862001657485962, "learning_rate": 0.0001, "loss": 0.209, "loss/crossentropy": 2.1388099193573, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.20903942734003067, "step": 615 }, { "epoch": 0.00886745600460647, "grad_norm": 0.1396484375, "grad_norm_var": 0.0008295287688573201, "learning_rate": 0.0001, "loss": 0.2045, "loss/crossentropy": 2.118674635887146, "loss/fcd": 0.501953125, "loss/idx": 18.0, "loss/logits": 0.20450318604707718, "step": 616 }, { "epoch": 0.008881851225393169, "grad_norm": 0.1025390625, "grad_norm_var": 0.0008352239926656087, "learning_rate": 0.0001, "loss": 0.1834, "loss/crossentropy": 2.149811267852783, "loss/fcd": 0.3935546875, "loss/idx": 18.0, "loss/logits": 0.18339695036411285, "step": 617 }, { "epoch": 0.008896246446179869, "grad_norm": 0.12109375, "grad_norm_var": 0.0008298943440119426, "learning_rate": 0.0001, "loss": 0.2338, "loss/crossentropy": 2.324687123298645, "loss/fcd": 0.4658203125, "loss/idx": 18.0, "loss/logits": 0.23375140875577927, "step": 618 }, { "epoch": 0.008910641666966567, "grad_norm": 0.1162109375, "grad_norm_var": 0.0008182843526204427, "learning_rate": 0.0001, "loss": 0.2211, "loss/crossentropy": 2.2215802669525146, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.22114143520593643, "step": 619 }, { "epoch": 0.008925036887753266, "grad_norm": 0.0986328125, "grad_norm_var": 0.0008311023314793905, "learning_rate": 0.0001, "loss": 0.2057, "loss/crossentropy": 2.406686782836914, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.20571539551019669, "step": 620 }, { "epoch": 0.008939432108539964, "grad_norm": 0.115234375, "grad_norm_var": 0.0008170286814371745, "learning_rate": 0.0001, "loss": 0.2058, "loss/crossentropy": 2.327828884124756, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.2057729959487915, "step": 621 }, { "epoch": 0.008953827329326664, "grad_norm": 0.1044921875, "grad_norm_var": 0.0008163203795750936, "learning_rate": 0.0001, "loss": 0.2041, "loss/crossentropy": 2.394818425178528, "loss/fcd": 0.4228515625, "loss/idx": 18.0, "loss/logits": 0.2041458711028099, "step": 622 }, { "epoch": 0.008968222550113363, "grad_norm": 0.09765625, "grad_norm_var": 0.0008313407500584921, "learning_rate": 0.0001, "loss": 0.1949, "loss/crossentropy": 2.384241223335266, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.19486035406589508, "step": 623 }, { "epoch": 0.00898261777090006, "grad_norm": 0.1025390625, "grad_norm_var": 0.0008399953444798787, "learning_rate": 0.0001, "loss": 0.2079, "loss/crossentropy": 2.4613648653030396, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.20785125344991684, "step": 624 }, { "epoch": 0.00899701299168676, "grad_norm": 0.123046875, "grad_norm_var": 0.0008281668027242025, "learning_rate": 0.0001, "loss": 0.2737, "loss/crossentropy": 2.572801351547241, "loss/fcd": 0.54296875, "loss/idx": 18.0, "loss/logits": 0.27374986559152603, "step": 625 }, { "epoch": 0.00901140821247346, "grad_norm": 0.123046875, "grad_norm_var": 0.0008179575204849243, "learning_rate": 0.0001, "loss": 0.2015, "loss/crossentropy": 1.862765610218048, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.2015407457947731, "step": 626 }, { "epoch": 0.009025803433260158, "grad_norm": 0.107421875, "grad_norm_var": 0.0008110642433166504, "learning_rate": 0.0001, "loss": 0.2186, "loss/crossentropy": 2.5048106908798218, "loss/fcd": 0.4541015625, "loss/idx": 18.0, "loss/logits": 0.21860718727111816, "step": 627 }, { "epoch": 0.009040198654046856, "grad_norm": 0.1337890625, "grad_norm_var": 0.0007929325103759766, "learning_rate": 0.0001, "loss": 0.22, "loss/crossentropy": 2.3713510036468506, "loss/fcd": 0.4541015625, "loss/idx": 18.0, "loss/logits": 0.21999357640743256, "step": 628 }, { "epoch": 0.009054593874833556, "grad_norm": 0.1123046875, "grad_norm_var": 0.0007665634155273437, "learning_rate": 0.0001, "loss": 0.2194, "loss/crossentropy": 2.5511568784713745, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.2194477617740631, "step": 629 }, { "epoch": 0.009068989095620254, "grad_norm": 0.1142578125, "grad_norm_var": 0.00014481544494628906, "learning_rate": 0.0001, "loss": 0.2324, "loss/crossentropy": 2.371564745903015, "loss/fcd": 0.4443359375, "loss/idx": 18.0, "loss/logits": 0.2323940396308899, "step": 630 }, { "epoch": 0.009083384316406953, "grad_norm": 0.10595703125, "grad_norm_var": 0.00014898677666982016, "learning_rate": 0.0001, "loss": 0.2114, "loss/crossentropy": 2.4888617992401123, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.21141232550144196, "step": 631 }, { "epoch": 0.009097779537193651, "grad_norm": 0.107421875, "grad_norm_var": 0.0001020421584447225, "learning_rate": 0.0001, "loss": 0.2337, "loss/crossentropy": 2.7666863203048706, "loss/fcd": 0.484375, "loss/idx": 18.0, "loss/logits": 0.23369022458791733, "step": 632 }, { "epoch": 0.009112174757980351, "grad_norm": 0.1025390625, "grad_norm_var": 0.0001020421584447225, "learning_rate": 0.0001, "loss": 0.2183, "loss/crossentropy": 2.369840621948242, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.21829679608345032, "step": 633 }, { "epoch": 0.00912656997876705, "grad_norm": 0.115234375, "grad_norm_var": 9.677310784657796e-05, "learning_rate": 0.0001, "loss": 0.2211, "loss/crossentropy": 2.469444990158081, "loss/fcd": 0.4775390625, "loss/idx": 18.0, "loss/logits": 0.22109205275774002, "step": 634 }, { "epoch": 0.009140965199553748, "grad_norm": 0.111328125, "grad_norm_var": 9.50247049331665e-05, "learning_rate": 0.0001, "loss": 0.2079, "loss/crossentropy": 2.3658159971237183, "loss/fcd": 0.4619140625, "loss/idx": 18.0, "loss/logits": 0.20790337026119232, "step": 635 }, { "epoch": 0.009155360420340448, "grad_norm": 0.09423828125, "grad_norm_var": 0.0001034379005432129, "learning_rate": 0.0001, "loss": 0.1845, "loss/crossentropy": 2.618008255958557, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.18447843939065933, "step": 636 }, { "epoch": 0.009169755641127146, "grad_norm": 0.10986328125, "grad_norm_var": 0.00010196268558502198, "learning_rate": 0.0001, "loss": 0.2325, "loss/crossentropy": 2.4641441106796265, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.23245185613632202, "step": 637 }, { "epoch": 0.009184150861913844, "grad_norm": 0.11865234375, "grad_norm_var": 0.00010348955790201823, "learning_rate": 0.0001, "loss": 0.2099, "loss/crossentropy": 2.5920947790145874, "loss/fcd": 0.5068359375, "loss/idx": 18.0, "loss/logits": 0.20993127673864365, "step": 638 }, { "epoch": 0.009198546082700543, "grad_norm": 0.10400390625, "grad_norm_var": 9.453992048899332e-05, "learning_rate": 0.0001, "loss": 0.2037, "loss/crossentropy": 2.207823634147644, "loss/fcd": 0.4130859375, "loss/idx": 18.0, "loss/logits": 0.20368105918169022, "step": 639 }, { "epoch": 0.009212941303487243, "grad_norm": 0.1103515625, "grad_norm_var": 8.891324202219645e-05, "learning_rate": 0.0001, "loss": 0.1993, "loss/crossentropy": 2.3137396574020386, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.1993313431739807, "step": 640 }, { "epoch": 0.009227336524273941, "grad_norm": 0.10009765625, "grad_norm_var": 8.830626805623372e-05, "learning_rate": 0.0001, "loss": 0.2031, "loss/crossentropy": 2.4254961013793945, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.20313136279582977, "step": 641 }, { "epoch": 0.00924173174506064, "grad_norm": 0.107421875, "grad_norm_var": 7.775227228800456e-05, "learning_rate": 0.0001, "loss": 0.1889, "loss/crossentropy": 1.978569746017456, "loss/fcd": 0.400390625, "loss/idx": 18.0, "loss/logits": 0.18890459090471268, "step": 642 }, { "epoch": 0.00925612696584734, "grad_norm": 0.109375, "grad_norm_var": 7.740259170532226e-05, "learning_rate": 0.0001, "loss": 0.2315, "loss/crossentropy": 2.575870633125305, "loss/fcd": 0.484375, "loss/idx": 18.0, "loss/logits": 0.2314896583557129, "step": 643 }, { "epoch": 0.009270522186634038, "grad_norm": 0.095703125, "grad_norm_var": 4.6253204345703125e-05, "learning_rate": 0.0001, "loss": 0.1942, "loss/crossentropy": 2.4864895343780518, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.19422397762537003, "step": 644 }, { "epoch": 0.009284917407420736, "grad_norm": 0.11083984375, "grad_norm_var": 4.5433640480041504e-05, "learning_rate": 0.0001, "loss": 0.2053, "loss/crossentropy": 2.3608009815216064, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.20532061159610748, "step": 645 }, { "epoch": 0.009299312628207435, "grad_norm": 0.12109375, "grad_norm_var": 5.466838677724202e-05, "learning_rate": 0.0001, "loss": 0.2398, "loss/crossentropy": 2.3986343145370483, "loss/fcd": 0.5078125, "loss/idx": 18.0, "loss/logits": 0.23981131613254547, "step": 646 }, { "epoch": 0.009313707848994135, "grad_norm": 0.10498046875, "grad_norm_var": 5.496243635813395e-05, "learning_rate": 0.0001, "loss": 0.2281, "loss/crossentropy": 2.7443615198135376, "loss/fcd": 0.4541015625, "loss/idx": 18.0, "loss/logits": 0.2280602902173996, "step": 647 }, { "epoch": 0.009328103069780833, "grad_norm": 0.10107421875, "grad_norm_var": 5.771319071451823e-05, "learning_rate": 0.0001, "loss": 0.2143, "loss/crossentropy": 2.785035014152527, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.2142793908715248, "step": 648 }, { "epoch": 0.009342498290567531, "grad_norm": 0.111328125, "grad_norm_var": 5.6962172190348305e-05, "learning_rate": 0.0001, "loss": 0.2117, "loss/crossentropy": 2.3756792545318604, "loss/fcd": 0.462890625, "loss/idx": 18.0, "loss/logits": 0.21174004673957825, "step": 649 }, { "epoch": 0.00935689351135423, "grad_norm": 0.0986328125, "grad_norm_var": 5.784034729003906e-05, "learning_rate": 0.0001, "loss": 0.1971, "loss/crossentropy": 2.4738396406173706, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.19714603573083878, "step": 650 }, { "epoch": 0.00937128873214093, "grad_norm": 0.09619140625, "grad_norm_var": 6.304482618967692e-05, "learning_rate": 0.0001, "loss": 0.1734, "loss/crossentropy": 2.1993648409843445, "loss/fcd": 0.376953125, "loss/idx": 18.0, "loss/logits": 0.1733626276254654, "step": 651 }, { "epoch": 0.009385683952927628, "grad_norm": 0.1279296875, "grad_norm_var": 8.175770441691081e-05, "learning_rate": 0.0001, "loss": 0.2205, "loss/crossentropy": 2.3313381671905518, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.22045698016881943, "step": 652 }, { "epoch": 0.009400079173714326, "grad_norm": 0.115234375, "grad_norm_var": 8.491575717926026e-05, "learning_rate": 0.0001, "loss": 0.2458, "loss/crossentropy": 2.5517263412475586, "loss/fcd": 0.47265625, "loss/idx": 18.0, "loss/logits": 0.2457558810710907, "step": 653 }, { "epoch": 0.009414474394501026, "grad_norm": 0.10498046875, "grad_norm_var": 7.773935794830322e-05, "learning_rate": 0.0001, "loss": 0.2048, "loss/crossentropy": 2.1996500492095947, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.20480218529701233, "step": 654 }, { "epoch": 0.009428869615287725, "grad_norm": 0.1396484375, "grad_norm_var": 0.00014075835545857747, "learning_rate": 0.0001, "loss": 0.2842, "loss/crossentropy": 2.34015429019928, "loss/fcd": 0.544921875, "loss/idx": 18.0, "loss/logits": 0.2842213958501816, "step": 655 }, { "epoch": 0.009443264836074423, "grad_norm": 0.1455078125, "grad_norm_var": 0.00022115310033162436, "learning_rate": 0.0001, "loss": 0.2686, "loss/crossentropy": 2.409281849861145, "loss/fcd": 0.5390625, "loss/idx": 18.0, "loss/logits": 0.26864343136548996, "step": 656 }, { "epoch": 0.009457660056861121, "grad_norm": 0.10986328125, "grad_norm_var": 0.0002117753028869629, "learning_rate": 0.0001, "loss": 0.2035, "loss/crossentropy": 2.250716805458069, "loss/fcd": 0.400390625, "loss/idx": 18.0, "loss/logits": 0.20348752290010452, "step": 657 }, { "epoch": 0.009472055277647821, "grad_norm": 0.109375, "grad_norm_var": 0.00021069447199503581, "learning_rate": 0.0001, "loss": 0.1903, "loss/crossentropy": 2.2704538106918335, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.19032004475593567, "step": 658 }, { "epoch": 0.00948645049843452, "grad_norm": 0.1064453125, "grad_norm_var": 0.0002124945322672526, "learning_rate": 0.0001, "loss": 0.2032, "loss/crossentropy": 2.31364369392395, "loss/fcd": 0.4052734375, "loss/idx": 18.0, "loss/logits": 0.20321927964687347, "step": 659 }, { "epoch": 0.009500845719221218, "grad_norm": 0.11181640625, "grad_norm_var": 0.00019279221693674725, "learning_rate": 0.0001, "loss": 0.1964, "loss/crossentropy": 1.959843933582306, "loss/fcd": 0.3955078125, "loss/idx": 18.0, "loss/logits": 0.19641809910535812, "step": 660 }, { "epoch": 0.009515240940007918, "grad_norm": 0.11279296875, "grad_norm_var": 0.00019235511620839437, "learning_rate": 0.0001, "loss": 0.2212, "loss/crossentropy": 2.4466131925582886, "loss/fcd": 0.4765625, "loss/idx": 18.0, "loss/logits": 0.22118167579174042, "step": 661 }, { "epoch": 0.009529636160794617, "grad_norm": 0.10498046875, "grad_norm_var": 0.00019238789876302082, "learning_rate": 0.0001, "loss": 0.1816, "loss/crossentropy": 2.186416506767273, "loss/fcd": 0.4052734375, "loss/idx": 18.0, "loss/logits": 0.18155072629451752, "step": 662 }, { "epoch": 0.009544031381581315, "grad_norm": 0.1142578125, "grad_norm_var": 0.00018840531508127847, "learning_rate": 0.0001, "loss": 0.2391, "loss/crossentropy": 2.504140853881836, "loss/fcd": 0.490234375, "loss/idx": 18.0, "loss/logits": 0.2391308844089508, "step": 663 }, { "epoch": 0.009558426602368013, "grad_norm": 0.11328125, "grad_norm_var": 0.0001780986785888672, "learning_rate": 0.0001, "loss": 0.2317, "loss/crossentropy": 2.4283803701400757, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.23169831186532974, "step": 664 }, { "epoch": 0.009572821823154713, "grad_norm": 0.1005859375, "grad_norm_var": 0.0001889824867248535, "learning_rate": 0.0001, "loss": 0.2064, "loss/crossentropy": 2.438134789466858, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.20637594163417816, "step": 665 }, { "epoch": 0.009587217043941412, "grad_norm": 0.1103515625, "grad_norm_var": 0.00017477273941040038, "learning_rate": 0.0001, "loss": 0.1955, "loss/crossentropy": 2.309617757797241, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.1954583376646042, "step": 666 }, { "epoch": 0.00960161226472811, "grad_norm": 0.1064453125, "grad_norm_var": 0.00015706121921539308, "learning_rate": 0.0001, "loss": 0.2246, "loss/crossentropy": 2.53112256526947, "loss/fcd": 0.4482421875, "loss/idx": 18.0, "loss/logits": 0.22456367313861847, "step": 667 }, { "epoch": 0.009616007485514808, "grad_norm": 0.10302734375, "grad_norm_var": 0.00015153884887695313, "learning_rate": 0.0001, "loss": 0.2352, "loss/crossentropy": 2.456951379776001, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.23520664870738983, "step": 668 }, { "epoch": 0.009630402706301508, "grad_norm": 0.1220703125, "grad_norm_var": 0.0001564621925354004, "learning_rate": 0.0001, "loss": 0.2193, "loss/crossentropy": 2.065362870693207, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.2193107306957245, "step": 669 }, { "epoch": 0.009644797927088207, "grad_norm": 0.115234375, "grad_norm_var": 0.0001514345407485962, "learning_rate": 0.0001, "loss": 0.2085, "loss/crossentropy": 2.2472126483917236, "loss/fcd": 0.47265625, "loss/idx": 18.0, "loss/logits": 0.20847148448228836, "step": 670 }, { "epoch": 0.009659193147874905, "grad_norm": 0.10400390625, "grad_norm_var": 0.00010944604873657227, "learning_rate": 0.0001, "loss": 0.2067, "loss/crossentropy": 2.3741711378097534, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.20672930777072906, "step": 671 }, { "epoch": 0.009673588368661605, "grad_norm": 0.1044921875, "grad_norm_var": 3.067255020141602e-05, "learning_rate": 0.0001, "loss": 0.2208, "loss/crossentropy": 2.336190938949585, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.22081031650304794, "step": 672 }, { "epoch": 0.009687983589448303, "grad_norm": 0.10400390625, "grad_norm_var": 3.2389163970947264e-05, "learning_rate": 0.0001, "loss": 0.2063, "loss/crossentropy": 2.3808083534240723, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.20631127804517746, "step": 673 }, { "epoch": 0.009702378810235002, "grad_norm": 0.09716796875, "grad_norm_var": 4.1007002194722494e-05, "learning_rate": 0.0001, "loss": 0.1958, "loss/crossentropy": 2.3818055391311646, "loss/fcd": 0.4130859375, "loss/idx": 18.0, "loss/logits": 0.19577700644731522, "step": 674 }, { "epoch": 0.0097167740310217, "grad_norm": 0.10888671875, "grad_norm_var": 4.0813287099202475e-05, "learning_rate": 0.0001, "loss": 0.2047, "loss/crossentropy": 2.5091700553894043, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.2047055885195732, "step": 675 }, { "epoch": 0.0097311692518084, "grad_norm": 0.10986328125, "grad_norm_var": 4.01457150777181e-05, "learning_rate": 0.0001, "loss": 0.2221, "loss/crossentropy": 2.404844641685486, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.2221018671989441, "step": 676 }, { "epoch": 0.009745564472595098, "grad_norm": 0.10302734375, "grad_norm_var": 4.01457150777181e-05, "learning_rate": 0.0001, "loss": 0.2118, "loss/crossentropy": 2.279817581176758, "loss/fcd": 0.4052734375, "loss/idx": 18.0, "loss/logits": 0.2117534652352333, "step": 677 }, { "epoch": 0.009759959693381797, "grad_norm": 0.1064453125, "grad_norm_var": 3.9767225583394365e-05, "learning_rate": 0.0001, "loss": 0.2293, "loss/crossentropy": 2.5266642570495605, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.22927331924438477, "step": 678 }, { "epoch": 0.009774354914168497, "grad_norm": 0.11474609375, "grad_norm_var": 4.020929336547852e-05, "learning_rate": 0.0001, "loss": 0.2077, "loss/crossentropy": 2.128249764442444, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.20772311836481094, "step": 679 }, { "epoch": 0.009788750134955195, "grad_norm": 0.09814453125, "grad_norm_var": 4.331966241200765e-05, "learning_rate": 0.0001, "loss": 0.2052, "loss/crossentropy": 2.6459118127822876, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.20518244057893753, "step": 680 }, { "epoch": 0.009803145355741893, "grad_norm": 0.11669921875, "grad_norm_var": 4.623730977376302e-05, "learning_rate": 0.0001, "loss": 0.2262, "loss/crossentropy": 2.3065195083618164, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.2262207344174385, "step": 681 }, { "epoch": 0.009817540576528592, "grad_norm": 0.1015625, "grad_norm_var": 4.8061211903889976e-05, "learning_rate": 0.0001, "loss": 0.2083, "loss/crossentropy": 2.4284178018569946, "loss/fcd": 0.4228515625, "loss/idx": 18.0, "loss/logits": 0.20825288444757462, "step": 682 }, { "epoch": 0.009831935797315292, "grad_norm": 0.1025390625, "grad_norm_var": 4.942814509073893e-05, "learning_rate": 0.0001, "loss": 0.1944, "loss/crossentropy": 2.3323564529418945, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.1943565011024475, "step": 683 }, { "epoch": 0.00984633101810199, "grad_norm": 0.10595703125, "grad_norm_var": 4.841486612955729e-05, "learning_rate": 0.0001, "loss": 0.2041, "loss/crossentropy": 2.2371606826782227, "loss/fcd": 0.4052734375, "loss/idx": 18.0, "loss/logits": 0.2040523663163185, "step": 684 }, { "epoch": 0.009860726238888689, "grad_norm": 0.11083984375, "grad_norm_var": 3.3997495969136556e-05, "learning_rate": 0.0001, "loss": 0.2427, "loss/crossentropy": 2.6680363416671753, "loss/fcd": 0.4697265625, "loss/idx": 18.0, "loss/logits": 0.24272434413433075, "step": 685 }, { "epoch": 0.009875121459675389, "grad_norm": 0.111328125, "grad_norm_var": 3.038942813873291e-05, "learning_rate": 0.0001, "loss": 0.2114, "loss/crossentropy": 2.392301321029663, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.2113867551088333, "step": 686 }, { "epoch": 0.009889516680462087, "grad_norm": 0.10693359375, "grad_norm_var": 3.0055642127990723e-05, "learning_rate": 0.0001, "loss": 0.238, "loss/crossentropy": 2.646833062171936, "loss/fcd": 0.458984375, "loss/idx": 18.0, "loss/logits": 0.23803511261940002, "step": 687 }, { "epoch": 0.009903911901248785, "grad_norm": 0.1015625, "grad_norm_var": 3.134310245513916e-05, "learning_rate": 0.0001, "loss": 0.1935, "loss/crossentropy": 2.256480574607849, "loss/fcd": 0.4013671875, "loss/idx": 18.0, "loss/logits": 0.19345563650131226, "step": 688 }, { "epoch": 0.009918307122035484, "grad_norm": 0.109375, "grad_norm_var": 3.155072530110677e-05, "learning_rate": 0.0001, "loss": 0.2321, "loss/crossentropy": 2.425878643989563, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.23210398107767105, "step": 689 }, { "epoch": 0.009932702342822184, "grad_norm": 0.11083984375, "grad_norm_var": 2.6098887125651042e-05, "learning_rate": 0.0001, "loss": 0.2257, "loss/crossentropy": 2.565882086753845, "loss/fcd": 0.478515625, "loss/idx": 18.0, "loss/logits": 0.2256726175546646, "step": 690 }, { "epoch": 0.009947097563608882, "grad_norm": 0.09912109375, "grad_norm_var": 3.0152002970377605e-05, "learning_rate": 0.0001, "loss": 0.1932, "loss/crossentropy": 2.3051689863204956, "loss/fcd": 0.39453125, "loss/idx": 18.0, "loss/logits": 0.1932462379336357, "step": 691 }, { "epoch": 0.00996149278439558, "grad_norm": 0.10888671875, "grad_norm_var": 2.981424331665039e-05, "learning_rate": 0.0001, "loss": 0.2124, "loss/crossentropy": 2.275663137435913, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.21243004500865936, "step": 692 }, { "epoch": 0.009975888005182279, "grad_norm": 0.1044921875, "grad_norm_var": 2.9221177101135254e-05, "learning_rate": 0.0001, "loss": 0.2011, "loss/crossentropy": 2.344420909881592, "loss/fcd": 0.408203125, "loss/idx": 18.0, "loss/logits": 0.20112024247646332, "step": 693 }, { "epoch": 0.009990283225968979, "grad_norm": 0.1455078125, "grad_norm_var": 0.00012252231438954672, "learning_rate": 0.0001, "loss": 0.2729, "loss/crossentropy": 2.1788020730018616, "loss/fcd": 0.521484375, "loss/idx": 18.0, "loss/logits": 0.27289582788944244, "step": 694 }, { "epoch": 0.010004678446755677, "grad_norm": 0.111328125, "grad_norm_var": 0.00012076298395792643, "learning_rate": 0.0001, "loss": 0.23, "loss/crossentropy": 2.405027389526367, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.22998760640621185, "step": 695 }, { "epoch": 0.010019073667542375, "grad_norm": 0.11767578125, "grad_norm_var": 0.00011615355809529622, "learning_rate": 0.0001, "loss": 0.2227, "loss/crossentropy": 2.9466445446014404, "loss/fcd": 0.4755859375, "loss/idx": 18.0, "loss/logits": 0.22269698232412338, "step": 696 }, { "epoch": 0.010033468888329075, "grad_norm": 0.107421875, "grad_norm_var": 0.00011360545953114828, "learning_rate": 0.0001, "loss": 0.2136, "loss/crossentropy": 3.066506266593933, "loss/fcd": 0.46875, "loss/idx": 18.0, "loss/logits": 0.21359677612781525, "step": 697 }, { "epoch": 0.010047864109115774, "grad_norm": 0.09130859375, "grad_norm_var": 0.0001313169797261556, "learning_rate": 0.0001, "loss": 0.1792, "loss/crossentropy": 2.3103922605514526, "loss/fcd": 0.388671875, "loss/idx": 18.0, "loss/logits": 0.17915956676006317, "step": 698 }, { "epoch": 0.010062259329902472, "grad_norm": 0.1044921875, "grad_norm_var": 0.000129854679107666, "learning_rate": 0.0001, "loss": 0.2095, "loss/crossentropy": 2.201840400695801, "loss/fcd": 0.4013671875, "loss/idx": 18.0, "loss/logits": 0.2094813957810402, "step": 699 }, { "epoch": 0.01007665455068917, "grad_norm": 0.09619140625, "grad_norm_var": 0.0001400272051493327, "learning_rate": 0.0001, "loss": 0.2055, "loss/crossentropy": 2.5452860593795776, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.20547957718372345, "step": 700 }, { "epoch": 0.01009104977147587, "grad_norm": 0.1103515625, "grad_norm_var": 0.00013989508152008058, "learning_rate": 0.0001, "loss": 0.1918, "loss/crossentropy": 2.007373094558716, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.19176460802555084, "step": 701 }, { "epoch": 0.010105444992262569, "grad_norm": 0.10498046875, "grad_norm_var": 0.00014006296793619792, "learning_rate": 0.0001, "loss": 0.2082, "loss/crossentropy": 2.3383631706237793, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.20822366327047348, "step": 702 }, { "epoch": 0.010119840213049267, "grad_norm": 0.1083984375, "grad_norm_var": 0.000139958659807841, "learning_rate": 0.0001, "loss": 0.1983, "loss/crossentropy": 1.9882320761680603, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.19827204197645187, "step": 703 }, { "epoch": 0.010134235433835967, "grad_norm": 0.11669921875, "grad_norm_var": 0.00014079014460245768, "learning_rate": 0.0001, "loss": 0.238, "loss/crossentropy": 2.5094656944274902, "loss/fcd": 0.46875, "loss/idx": 18.0, "loss/logits": 0.23796136677265167, "step": 704 }, { "epoch": 0.010148630654622666, "grad_norm": 0.10595703125, "grad_norm_var": 0.00014143685499827066, "learning_rate": 0.0001, "loss": 0.206, "loss/crossentropy": 2.1021994948387146, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.2059553563594818, "step": 705 }, { "epoch": 0.010163025875409364, "grad_norm": 0.09814453125, "grad_norm_var": 0.00014835894107818605, "learning_rate": 0.0001, "loss": 0.2037, "loss/crossentropy": 2.3918451070785522, "loss/fcd": 0.4248046875, "loss/idx": 18.0, "loss/logits": 0.20372479408979416, "step": 706 }, { "epoch": 0.010177421096196062, "grad_norm": 0.1123046875, "grad_norm_var": 0.00014328956604003906, "learning_rate": 0.0001, "loss": 0.2379, "loss/crossentropy": 2.4593441486358643, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.23789776116609573, "step": 707 }, { "epoch": 0.010191816316982762, "grad_norm": 0.10400390625, "grad_norm_var": 0.00014485915501912436, "learning_rate": 0.0001, "loss": 0.1983, "loss/crossentropy": 2.2852306365966797, "loss/fcd": 0.4013671875, "loss/idx": 18.0, "loss/logits": 0.1982945054769516, "step": 708 }, { "epoch": 0.01020621153776946, "grad_norm": 0.1103515625, "grad_norm_var": 0.00014371474583943684, "learning_rate": 0.0001, "loss": 0.2481, "loss/crossentropy": 2.5582568645477295, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.24808169901371002, "step": 709 }, { "epoch": 0.010220606758556159, "grad_norm": 0.11083984375, "grad_norm_var": 5.040069421132406e-05, "learning_rate": 0.0001, "loss": 0.241, "loss/crossentropy": 2.4824811220169067, "loss/fcd": 0.51171875, "loss/idx": 18.0, "loss/logits": 0.2409602850675583, "step": 710 }, { "epoch": 0.010235001979342857, "grad_norm": 0.12158203125, "grad_norm_var": 6.302197774251302e-05, "learning_rate": 0.0001, "loss": 0.2292, "loss/crossentropy": 2.237234354019165, "loss/fcd": 0.4619140625, "loss/idx": 18.0, "loss/logits": 0.22918210923671722, "step": 711 }, { "epoch": 0.010249397200129557, "grad_norm": 0.10595703125, "grad_norm_var": 5.577405293782552e-05, "learning_rate": 0.0001, "loss": 0.207, "loss/crossentropy": 2.31795597076416, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.20698396116495132, "step": 712 }, { "epoch": 0.010263792420916256, "grad_norm": 0.103515625, "grad_norm_var": 5.6409835815429686e-05, "learning_rate": 0.0001, "loss": 0.2065, "loss/crossentropy": 2.5415326356887817, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.20651167631149292, "step": 713 }, { "epoch": 0.010278187641702954, "grad_norm": 0.10791015625, "grad_norm_var": 3.9859612782796225e-05, "learning_rate": 0.0001, "loss": 0.2139, "loss/crossentropy": 2.2211133241653442, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.21389107406139374, "step": 714 }, { "epoch": 0.010292582862489654, "grad_norm": 0.1015625, "grad_norm_var": 4.161198933919271e-05, "learning_rate": 0.0001, "loss": 0.212, "loss/crossentropy": 2.3691943883895874, "loss/fcd": 0.4521484375, "loss/idx": 18.0, "loss/logits": 0.2119893953204155, "step": 715 }, { "epoch": 0.010306978083276352, "grad_norm": 0.10400390625, "grad_norm_var": 3.372828165690104e-05, "learning_rate": 0.0001, "loss": 0.2145, "loss/crossentropy": 2.390496850013733, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.2145363911986351, "step": 716 }, { "epoch": 0.01032137330406305, "grad_norm": 0.1005859375, "grad_norm_var": 3.650983174641927e-05, "learning_rate": 0.0001, "loss": 0.1619, "loss/crossentropy": 1.7626497149467468, "loss/fcd": 0.513671875, "loss/idx": 18.0, "loss/logits": 0.16188892722129822, "step": 717 }, { "epoch": 0.010335768524849749, "grad_norm": 0.11279296875, "grad_norm_var": 3.790855407714844e-05, "learning_rate": 0.0001, "loss": 0.2436, "loss/crossentropy": 2.6944552659988403, "loss/fcd": 0.4609375, "loss/idx": 18.0, "loss/logits": 0.2435958907008171, "step": 718 }, { "epoch": 0.01035016374563645, "grad_norm": 0.10107421875, "grad_norm_var": 4.0665268898010254e-05, "learning_rate": 0.0001, "loss": 0.2072, "loss/crossentropy": 2.383134961128235, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.20715758204460144, "step": 719 }, { "epoch": 0.010364558966423148, "grad_norm": 0.1005859375, "grad_norm_var": 3.6764144897460935e-05, "learning_rate": 0.0001, "loss": 0.2088, "loss/crossentropy": 2.5325286388397217, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.20877134799957275, "step": 720 }, { "epoch": 0.010378954187209846, "grad_norm": 0.1005859375, "grad_norm_var": 3.8829445838928225e-05, "learning_rate": 0.0001, "loss": 0.2236, "loss/crossentropy": 2.6585100889205933, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.22355867177248, "step": 721 }, { "epoch": 0.010393349407996546, "grad_norm": 0.1025390625, "grad_norm_var": 3.544092178344727e-05, "learning_rate": 0.0001, "loss": 0.2016, "loss/crossentropy": 2.3599932193756104, "loss/fcd": 0.423828125, "loss/idx": 18.0, "loss/logits": 0.2016456127166748, "step": 722 }, { "epoch": 0.010407744628783244, "grad_norm": 0.1220703125, "grad_norm_var": 4.926919937133789e-05, "learning_rate": 0.0001, "loss": 0.2227, "loss/crossentropy": 2.1093697547912598, "loss/fcd": 0.4326171875, "loss/idx": 18.0, "loss/logits": 0.22268912196159363, "step": 723 }, { "epoch": 0.010422139849569943, "grad_norm": 0.1044921875, "grad_norm_var": 4.9097339312235516e-05, "learning_rate": 0.0001, "loss": 0.2061, "loss/crossentropy": 2.120736837387085, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.20611396431922913, "step": 724 }, { "epoch": 0.010436535070356641, "grad_norm": 0.1123046875, "grad_norm_var": 5.023380120595296e-05, "learning_rate": 0.0001, "loss": 0.2238, "loss/crossentropy": 2.3321027755737305, "loss/fcd": 0.4560546875, "loss/idx": 18.0, "loss/logits": 0.22382070124149323, "step": 725 }, { "epoch": 0.010450930291143341, "grad_norm": 0.1083984375, "grad_norm_var": 4.936456680297852e-05, "learning_rate": 0.0001, "loss": 0.206, "loss/crossentropy": 2.2690643668174744, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.20598538219928741, "step": 726 }, { "epoch": 0.01046532551193004, "grad_norm": 0.10888671875, "grad_norm_var": 3.4538904825846356e-05, "learning_rate": 0.0001, "loss": 0.2204, "loss/crossentropy": 2.39444100856781, "loss/fcd": 0.4404296875, "loss/idx": 18.0, "loss/logits": 0.22039655596017838, "step": 727 }, { "epoch": 0.010479720732716738, "grad_norm": 0.10986328125, "grad_norm_var": 3.5429000854492186e-05, "learning_rate": 0.0001, "loss": 0.2212, "loss/crossentropy": 2.4713072776794434, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.2212340533733368, "step": 728 }, { "epoch": 0.010494115953503438, "grad_norm": 0.138671875, "grad_norm_var": 9.951591491699218e-05, "learning_rate": 0.0001, "loss": 0.2904, "loss/crossentropy": 2.2529489994049072, "loss/fcd": 0.5947265625, "loss/idx": 18.0, "loss/logits": 0.29038895666599274, "step": 729 }, { "epoch": 0.010508511174290136, "grad_norm": 0.142578125, "grad_norm_var": 0.00017181138197580975, "learning_rate": 0.0001, "loss": 0.2931, "loss/crossentropy": 2.3451786041259766, "loss/fcd": 0.5263671875, "loss/idx": 18.0, "loss/logits": 0.29310375452041626, "step": 730 }, { "epoch": 0.010522906395076834, "grad_norm": 0.111328125, "grad_norm_var": 0.00016589065392812093, "learning_rate": 0.0001, "loss": 0.2063, "loss/crossentropy": 2.3045698404312134, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.2062971591949463, "step": 731 }, { "epoch": 0.010537301615863533, "grad_norm": 0.10205078125, "grad_norm_var": 0.00016802847385406495, "learning_rate": 0.0001, "loss": 0.2105, "loss/crossentropy": 2.5085275173187256, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.21049045026302338, "step": 732 }, { "epoch": 0.010551696836650233, "grad_norm": 0.10107421875, "grad_norm_var": 0.00016735394795735676, "learning_rate": 0.0001, "loss": 0.2196, "loss/crossentropy": 2.644802451133728, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.2196320742368698, "step": 733 }, { "epoch": 0.010566092057436931, "grad_norm": 0.10791015625, "grad_norm_var": 0.00016781091690063477, "learning_rate": 0.0001, "loss": 0.1974, "loss/crossentropy": 2.2515525817871094, "loss/fcd": 0.3974609375, "loss/idx": 18.0, "loss/logits": 0.19744951277971268, "step": 734 }, { "epoch": 0.01058048727822363, "grad_norm": 0.1015625, "grad_norm_var": 0.0001671860615412394, "learning_rate": 0.0001, "loss": 0.1965, "loss/crossentropy": 2.3382036685943604, "loss/fcd": 0.3974609375, "loss/idx": 18.0, "loss/logits": 0.1964586153626442, "step": 735 }, { "epoch": 0.010594882499010328, "grad_norm": 0.095703125, "grad_norm_var": 0.00017541150252024332, "learning_rate": 0.0001, "loss": 0.2012, "loss/crossentropy": 2.638480305671692, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.20124144107103348, "step": 736 }, { "epoch": 0.010609277719797028, "grad_norm": 0.10546875, "grad_norm_var": 0.00017036497592926025, "learning_rate": 0.0001, "loss": 0.222, "loss/crossentropy": 2.498441696166992, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.22197365015745163, "step": 737 }, { "epoch": 0.010623672940583726, "grad_norm": 0.09814453125, "grad_norm_var": 0.00017648935317993164, "learning_rate": 0.0001, "loss": 0.1943, "loss/crossentropy": 2.2127552032470703, "loss/fcd": 0.3896484375, "loss/idx": 18.0, "loss/logits": 0.19434216618537903, "step": 738 }, { "epoch": 0.010638068161370424, "grad_norm": 0.1005859375, "grad_norm_var": 0.00017264286677042643, "learning_rate": 0.0001, "loss": 0.2065, "loss/crossentropy": 2.4787211418151855, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.20646335184574127, "step": 739 }, { "epoch": 0.010652463382157125, "grad_norm": 0.1015625, "grad_norm_var": 0.0001750628153483073, "learning_rate": 0.0001, "loss": 0.1945, "loss/crossentropy": 2.167446494102478, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.19451382011175156, "step": 740 }, { "epoch": 0.010666858602943823, "grad_norm": 0.10205078125, "grad_norm_var": 0.00017729500929514568, "learning_rate": 0.0001, "loss": 0.2069, "loss/crossentropy": 2.3936961889266968, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.20688295364379883, "step": 741 }, { "epoch": 0.010681253823730521, "grad_norm": 0.10595703125, "grad_norm_var": 0.00017769734064737957, "learning_rate": 0.0001, "loss": 0.2344, "loss/crossentropy": 2.502206325531006, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.23436500132083893, "step": 742 }, { "epoch": 0.01069564904451722, "grad_norm": 0.12451171875, "grad_norm_var": 0.00019410053888956706, "learning_rate": 0.0001, "loss": 0.2446, "loss/crossentropy": 2.7519075870513916, "loss/fcd": 0.4853515625, "loss/idx": 18.0, "loss/logits": 0.2446460798382759, "step": 743 }, { "epoch": 0.01071004426530392, "grad_norm": 0.109375, "grad_norm_var": 0.00019407967726389568, "learning_rate": 0.0001, "loss": 0.2061, "loss/crossentropy": 2.3958401679992676, "loss/fcd": 0.4345703125, "loss/idx": 18.0, "loss/logits": 0.20611582696437836, "step": 744 }, { "epoch": 0.010724439486090618, "grad_norm": 0.09716796875, "grad_norm_var": 0.00013910929361979167, "learning_rate": 0.0001, "loss": 0.2229, "loss/crossentropy": 2.6051418781280518, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.22285999357700348, "step": 745 }, { "epoch": 0.010738834706877316, "grad_norm": 0.09814453125, "grad_norm_var": 4.988412062327067e-05, "learning_rate": 0.0001, "loss": 0.2096, "loss/crossentropy": 2.5375572443008423, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.20957274734973907, "step": 746 }, { "epoch": 0.010753229927664016, "grad_norm": 0.123046875, "grad_norm_var": 7.005433241526285e-05, "learning_rate": 0.0001, "loss": 0.2273, "loss/crossentropy": 2.2432570457458496, "loss/fcd": 0.46484375, "loss/idx": 18.0, "loss/logits": 0.22729168832302094, "step": 747 }, { "epoch": 0.010767625148450715, "grad_norm": 0.09912109375, "grad_norm_var": 7.160405317942301e-05, "learning_rate": 0.0001, "loss": 0.1981, "loss/crossentropy": 2.451253056526184, "loss/fcd": 0.404296875, "loss/idx": 18.0, "loss/logits": 0.1981128826737404, "step": 748 }, { "epoch": 0.010782020369237413, "grad_norm": 0.1083984375, "grad_norm_var": 7.164875666300455e-05, "learning_rate": 0.0001, "loss": 0.2238, "loss/crossentropy": 2.6088002920150757, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.22383547574281693, "step": 749 }, { "epoch": 0.010796415590024111, "grad_norm": 0.11328125, "grad_norm_var": 7.559359073638916e-05, "learning_rate": 0.0001, "loss": 0.2204, "loss/crossentropy": 2.3209699392318726, "loss/fcd": 0.4443359375, "loss/idx": 18.0, "loss/logits": 0.2203991711139679, "step": 750 }, { "epoch": 0.010810810810810811, "grad_norm": 0.1240234375, "grad_norm_var": 9.606579939524333e-05, "learning_rate": 0.0001, "loss": 0.2522, "loss/crossentropy": 2.3715856075286865, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.2521570920944214, "step": 751 }, { "epoch": 0.01082520603159751, "grad_norm": 0.115234375, "grad_norm_var": 9.13769006729126e-05, "learning_rate": 0.0001, "loss": 0.217, "loss/crossentropy": 2.3642451763153076, "loss/fcd": 0.4931640625, "loss/idx": 18.0, "loss/logits": 0.21697237342596054, "step": 752 }, { "epoch": 0.010839601252384208, "grad_norm": 0.10693359375, "grad_norm_var": 9.104013442993165e-05, "learning_rate": 0.0001, "loss": 0.2155, "loss/crossentropy": 2.4382712841033936, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.21553221344947815, "step": 753 }, { "epoch": 0.010853996473170906, "grad_norm": 0.1015625, "grad_norm_var": 8.729199568430583e-05, "learning_rate": 0.0001, "loss": 0.1936, "loss/crossentropy": 2.401493191719055, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.19359815120697021, "step": 754 }, { "epoch": 0.010868391693957606, "grad_norm": 0.11279296875, "grad_norm_var": 8.423725763956706e-05, "learning_rate": 0.0001, "loss": 0.1962, "loss/crossentropy": 2.1797173619270325, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.19620782881975174, "step": 755 }, { "epoch": 0.010882786914744305, "grad_norm": 0.11767578125, "grad_norm_var": 8.459786574045817e-05, "learning_rate": 0.0001, "loss": 0.2162, "loss/crossentropy": 2.2014777660369873, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.21618105471134186, "step": 756 }, { "epoch": 0.010897182135531003, "grad_norm": 0.10498046875, "grad_norm_var": 8.204678694407145e-05, "learning_rate": 0.0001, "loss": 0.2144, "loss/crossentropy": 2.5520023107528687, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.2143661305308342, "step": 757 }, { "epoch": 0.010911577356317703, "grad_norm": 0.1357421875, "grad_norm_var": 0.00012089014053344727, "learning_rate": 0.0001, "loss": 0.2172, "loss/crossentropy": 2.605940818786621, "loss/fcd": 0.470703125, "loss/idx": 18.0, "loss/logits": 0.2171928584575653, "step": 758 }, { "epoch": 0.010925972577104402, "grad_norm": 0.1015625, "grad_norm_var": 0.00011552075544993082, "learning_rate": 0.0001, "loss": 0.1896, "loss/crossentropy": 2.260614037513733, "loss/fcd": 0.3916015625, "loss/idx": 18.0, "loss/logits": 0.1895817369222641, "step": 759 }, { "epoch": 0.0109403677978911, "grad_norm": 0.130859375, "grad_norm_var": 0.00014096001784006754, "learning_rate": 0.0001, "loss": 0.2287, "loss/crossentropy": 2.3699567317962646, "loss/fcd": 0.5078125, "loss/idx": 18.0, "loss/logits": 0.2286616861820221, "step": 760 }, { "epoch": 0.010954763018677798, "grad_norm": 0.1123046875, "grad_norm_var": 0.00012553135553995768, "learning_rate": 0.0001, "loss": 0.1979, "loss/crossentropy": 2.0666418075561523, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.19788716733455658, "step": 761 }, { "epoch": 0.010969158239464498, "grad_norm": 0.11669921875, "grad_norm_var": 0.0001106580098470052, "learning_rate": 0.0001, "loss": 0.2149, "loss/crossentropy": 2.25100314617157, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.21493691205978394, "step": 762 }, { "epoch": 0.010983553460251197, "grad_norm": 0.11669921875, "grad_norm_var": 0.00010553101698557536, "learning_rate": 0.0001, "loss": 0.2, "loss/crossentropy": 2.3312637209892273, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.19997263699769974, "step": 763 }, { "epoch": 0.010997948681037895, "grad_norm": 0.10595703125, "grad_norm_var": 9.52392816543579e-05, "learning_rate": 0.0001, "loss": 0.2213, "loss/crossentropy": 2.4567571878433228, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.2212778776884079, "step": 764 }, { "epoch": 0.011012343901824595, "grad_norm": 0.1279296875, "grad_norm_var": 0.00010437866051991781, "learning_rate": 0.0001, "loss": 0.2505, "loss/crossentropy": 2.3997398614883423, "loss/fcd": 0.48046875, "loss/idx": 18.0, "loss/logits": 0.25046147406101227, "step": 765 }, { "epoch": 0.011026739122611293, "grad_norm": 0.09716796875, "grad_norm_var": 0.00012486775716145834, "learning_rate": 0.0001, "loss": 0.1976, "loss/crossentropy": 2.327947497367859, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.1975831389427185, "step": 766 }, { "epoch": 0.011041134343397992, "grad_norm": 0.125, "grad_norm_var": 0.00012619892756144207, "learning_rate": 0.0001, "loss": 0.2108, "loss/crossentropy": 2.3216136693954468, "loss/fcd": 0.4521484375, "loss/idx": 18.0, "loss/logits": 0.21076547354459763, "step": 767 }, { "epoch": 0.01105552956418469, "grad_norm": 0.1298828125, "grad_norm_var": 0.00014139811197916668, "learning_rate": 0.0001, "loss": 0.2461, "loss/crossentropy": 2.2610775232315063, "loss/fcd": 0.5029296875, "loss/idx": 18.0, "loss/logits": 0.2461041733622551, "step": 768 }, { "epoch": 0.01106992478497139, "grad_norm": 0.109375, "grad_norm_var": 0.00013906856377919515, "learning_rate": 0.0001, "loss": 0.199, "loss/crossentropy": 2.2911869883537292, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.19901156425476074, "step": 769 }, { "epoch": 0.011084320005758088, "grad_norm": 0.10009765625, "grad_norm_var": 0.00014190276463826496, "learning_rate": 0.0001, "loss": 0.21, "loss/crossentropy": 2.432590365409851, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.2100282460451126, "step": 770 }, { "epoch": 0.011098715226544787, "grad_norm": 0.10498046875, "grad_norm_var": 0.00014832417170206706, "learning_rate": 0.0001, "loss": 0.179, "loss/crossentropy": 2.154644250869751, "loss/fcd": 0.4853515625, "loss/idx": 18.0, "loss/logits": 0.17896521091461182, "step": 771 }, { "epoch": 0.011113110447331487, "grad_norm": 0.11572265625, "grad_norm_var": 0.00014781554539998373, "learning_rate": 0.0001, "loss": 0.2293, "loss/crossentropy": 2.5124725103378296, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.22927424311637878, "step": 772 }, { "epoch": 0.011127505668118185, "grad_norm": 0.111328125, "grad_norm_var": 0.00014212032159169514, "learning_rate": 0.0001, "loss": 0.2246, "loss/crossentropy": 2.411632537841797, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.22455725073814392, "step": 773 }, { "epoch": 0.011141900888904883, "grad_norm": 0.11279296875, "grad_norm_var": 0.00011181831359863281, "learning_rate": 0.0001, "loss": 0.2336, "loss/crossentropy": 2.4840848445892334, "loss/fcd": 0.462890625, "loss/idx": 18.0, "loss/logits": 0.23362614214420319, "step": 774 }, { "epoch": 0.011156296109691582, "grad_norm": 0.1142578125, "grad_norm_var": 0.00010143518447875976, "learning_rate": 0.0001, "loss": 0.2162, "loss/crossentropy": 2.2171601057052612, "loss/fcd": 0.4228515625, "loss/idx": 18.0, "loss/logits": 0.2162095457315445, "step": 775 }, { "epoch": 0.011170691330478282, "grad_norm": 0.10400390625, "grad_norm_var": 8.772114912668864e-05, "learning_rate": 0.0001, "loss": 0.2101, "loss/crossentropy": 2.466732382774353, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.21009384095668793, "step": 776 }, { "epoch": 0.01118508655126498, "grad_norm": 0.11572265625, "grad_norm_var": 8.824268976847331e-05, "learning_rate": 0.0001, "loss": 0.2331, "loss/crossentropy": 2.463024854660034, "loss/fcd": 0.4765625, "loss/idx": 18.0, "loss/logits": 0.2330816239118576, "step": 777 }, { "epoch": 0.011199481772051679, "grad_norm": 0.11865234375, "grad_norm_var": 8.945067723592122e-05, "learning_rate": 0.0001, "loss": 0.2142, "loss/crossentropy": 2.1225094199180603, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.2141725867986679, "step": 778 }, { "epoch": 0.011213876992838377, "grad_norm": 0.11279296875, "grad_norm_var": 8.852879206339518e-05, "learning_rate": 0.0001, "loss": 0.2065, "loss/crossentropy": 2.0846282243728638, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.20651061832904816, "step": 779 }, { "epoch": 0.011228272213625077, "grad_norm": 0.11376953125, "grad_norm_var": 8.51591428120931e-05, "learning_rate": 0.0001, "loss": 0.2156, "loss/crossentropy": 2.2128478288650513, "loss/fcd": 0.3984375, "loss/idx": 18.0, "loss/logits": 0.21557357162237167, "step": 780 }, { "epoch": 0.011242667434411775, "grad_norm": 0.10498046875, "grad_norm_var": 7.343987623850504e-05, "learning_rate": 0.0001, "loss": 0.2015, "loss/crossentropy": 2.3130797147750854, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.20153620839118958, "step": 781 }, { "epoch": 0.011257062655198474, "grad_norm": 0.1025390625, "grad_norm_var": 6.468693415323893e-05, "learning_rate": 0.0001, "loss": 0.2148, "loss/crossentropy": 2.5943338871002197, "loss/fcd": 0.458984375, "loss/idx": 18.0, "loss/logits": 0.2147517278790474, "step": 782 }, { "epoch": 0.011271457875985174, "grad_norm": 0.107421875, "grad_norm_var": 5.4101149241129555e-05, "learning_rate": 0.0001, "loss": 0.229, "loss/crossentropy": 2.7100160121917725, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.22897624969482422, "step": 783 }, { "epoch": 0.011285853096771872, "grad_norm": 0.11474609375, "grad_norm_var": 3.060400485992432e-05, "learning_rate": 0.0001, "loss": 0.2299, "loss/crossentropy": 2.500633478164673, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.22991500794887543, "step": 784 }, { "epoch": 0.01130024831755857, "grad_norm": 0.10888671875, "grad_norm_var": 3.067255020141602e-05, "learning_rate": 0.0001, "loss": 0.226, "loss/crossentropy": 2.4316182136535645, "loss/fcd": 0.423828125, "loss/idx": 18.0, "loss/logits": 0.22597461938858032, "step": 785 }, { "epoch": 0.011314643538345269, "grad_norm": 0.1015625, "grad_norm_var": 2.8839707374572755e-05, "learning_rate": 0.0001, "loss": 0.217, "loss/crossentropy": 2.592137098312378, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.2169811800122261, "step": 786 }, { "epoch": 0.011329038759131969, "grad_norm": 0.099609375, "grad_norm_var": 3.44236691792806e-05, "learning_rate": 0.0001, "loss": 0.1974, "loss/crossentropy": 2.287144422531128, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.1974037140607834, "step": 787 }, { "epoch": 0.011343433979918667, "grad_norm": 0.09521484375, "grad_norm_var": 4.4854482014973957e-05, "learning_rate": 0.0001, "loss": 0.2052, "loss/crossentropy": 2.4738489389419556, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.20523115992546082, "step": 788 }, { "epoch": 0.011357829200705365, "grad_norm": 0.1015625, "grad_norm_var": 4.7318140665690105e-05, "learning_rate": 0.0001, "loss": 0.215, "loss/crossentropy": 2.4524784088134766, "loss/fcd": 0.4560546875, "loss/idx": 18.0, "loss/logits": 0.21502291411161423, "step": 789 }, { "epoch": 0.011372224421492065, "grad_norm": 0.1005859375, "grad_norm_var": 4.888276259104411e-05, "learning_rate": 0.0001, "loss": 0.1999, "loss/crossentropy": 2.2310436964035034, "loss/fcd": 0.3916015625, "loss/idx": 18.0, "loss/logits": 0.19993127137422562, "step": 790 }, { "epoch": 0.011386619642278764, "grad_norm": 0.09765625, "grad_norm_var": 5.063911279042562e-05, "learning_rate": 0.0001, "loss": 0.2285, "loss/crossentropy": 2.613986611366272, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.22845745831727982, "step": 791 }, { "epoch": 0.011401014863065462, "grad_norm": 0.1005859375, "grad_norm_var": 5.238453547159831e-05, "learning_rate": 0.0001, "loss": 0.2083, "loss/crossentropy": 2.5012824535369873, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.20829569548368454, "step": 792 }, { "epoch": 0.01141541008385216, "grad_norm": 0.11572265625, "grad_norm_var": 5.238453547159831e-05, "learning_rate": 0.0001, "loss": 0.2444, "loss/crossentropy": 2.225709557533264, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.2444288209080696, "step": 793 }, { "epoch": 0.01142980530463886, "grad_norm": 0.10009765625, "grad_norm_var": 4.264513651529948e-05, "learning_rate": 0.0001, "loss": 0.2043, "loss/crossentropy": 2.2551809549331665, "loss/fcd": 0.3994140625, "loss/idx": 18.0, "loss/logits": 0.20429246127605438, "step": 794 }, { "epoch": 0.011444200525425559, "grad_norm": 0.1025390625, "grad_norm_var": 3.8368503252665204e-05, "learning_rate": 0.0001, "loss": 0.2353, "loss/crossentropy": 2.4520708322525024, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.2352810874581337, "step": 795 }, { "epoch": 0.011458595746212257, "grad_norm": 0.09423828125, "grad_norm_var": 3.733535607655843e-05, "learning_rate": 0.0001, "loss": 0.2003, "loss/crossentropy": 2.3560184240341187, "loss/fcd": 0.4130859375, "loss/idx": 18.0, "loss/logits": 0.2003132924437523, "step": 796 }, { "epoch": 0.011472990966998955, "grad_norm": 0.10009765625, "grad_norm_var": 3.7534038225809735e-05, "learning_rate": 0.0001, "loss": 0.2268, "loss/crossentropy": 2.6456328630447388, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.22680091857910156, "step": 797 }, { "epoch": 0.011487386187785656, "grad_norm": 0.1064453125, "grad_norm_var": 3.840823968251546e-05, "learning_rate": 0.0001, "loss": 0.2003, "loss/crossentropy": 2.5123294591903687, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.20031608641147614, "step": 798 }, { "epoch": 0.011501781408572354, "grad_norm": 0.130859375, "grad_norm_var": 8.675952752431233e-05, "learning_rate": 0.0001, "loss": 0.2347, "loss/crossentropy": 2.2425618171691895, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.23472215235233307, "step": 799 }, { "epoch": 0.011516176629359052, "grad_norm": 0.11767578125, "grad_norm_var": 9.133716424306234e-05, "learning_rate": 0.0001, "loss": 0.2115, "loss/crossentropy": 2.1281662583351135, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.21148262917995453, "step": 800 }, { "epoch": 0.011530571850145752, "grad_norm": 0.1083984375, "grad_norm_var": 9.107192357381185e-05, "learning_rate": 0.0001, "loss": 0.2171, "loss/crossentropy": 2.4536547660827637, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.2171497568488121, "step": 801 }, { "epoch": 0.01154496707093245, "grad_norm": 0.1201171875, "grad_norm_var": 0.00010519027709960937, "learning_rate": 0.0001, "loss": 0.2157, "loss/crossentropy": 2.3697547912597656, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.21570491790771484, "step": 802 }, { "epoch": 0.011559362291719149, "grad_norm": 0.11083984375, "grad_norm_var": 0.00010393361250559489, "learning_rate": 0.0001, "loss": 0.2189, "loss/crossentropy": 2.4509881734848022, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.21891363710165024, "step": 803 }, { "epoch": 0.011573757512505847, "grad_norm": 0.1103515625, "grad_norm_var": 9.564956029256185e-05, "learning_rate": 0.0001, "loss": 0.2175, "loss/crossentropy": 2.1731194853782654, "loss/fcd": 0.474609375, "loss/idx": 18.0, "loss/logits": 0.21748338639736176, "step": 804 }, { "epoch": 0.011588152733292547, "grad_norm": 0.10791015625, "grad_norm_var": 9.326040744781495e-05, "learning_rate": 0.0001, "loss": 0.2308, "loss/crossentropy": 2.4915411472320557, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.2308463379740715, "step": 805 }, { "epoch": 0.011602547954079246, "grad_norm": 0.10107421875, "grad_norm_var": 9.280840555826823e-05, "learning_rate": 0.0001, "loss": 0.2106, "loss/crossentropy": 2.4593664407730103, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.21063391864299774, "step": 806 }, { "epoch": 0.011616943174865944, "grad_norm": 0.09912109375, "grad_norm_var": 9.096364180246988e-05, "learning_rate": 0.0001, "loss": 0.2093, "loss/crossentropy": 2.420872926712036, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.20925325900316238, "step": 807 }, { "epoch": 0.011631338395652644, "grad_norm": 0.10009765625, "grad_norm_var": 9.145339330037434e-05, "learning_rate": 0.0001, "loss": 0.2035, "loss/crossentropy": 2.4789732694625854, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.2034958302974701, "step": 808 }, { "epoch": 0.011645733616439342, "grad_norm": 0.0986328125, "grad_norm_var": 9.176631768544515e-05, "learning_rate": 0.0001, "loss": 0.2055, "loss/crossentropy": 2.356053352355957, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.20546124875545502, "step": 809 }, { "epoch": 0.01166012883722604, "grad_norm": 0.1005859375, "grad_norm_var": 9.134610493977864e-05, "learning_rate": 0.0001, "loss": 0.1982, "loss/crossentropy": 2.286035180091858, "loss/fcd": 0.40625, "loss/idx": 18.0, "loss/logits": 0.19822601974010468, "step": 810 }, { "epoch": 0.011674524058012739, "grad_norm": 0.107421875, "grad_norm_var": 9.005467096964518e-05, "learning_rate": 0.0001, "loss": 0.2307, "loss/crossentropy": 2.546161413192749, "loss/fcd": 0.458984375, "loss/idx": 18.0, "loss/logits": 0.23073262721300125, "step": 811 }, { "epoch": 0.01168891927879944, "grad_norm": 0.10693359375, "grad_norm_var": 7.832845052083334e-05, "learning_rate": 0.0001, "loss": 0.214, "loss/crossentropy": 2.4045225381851196, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.21400006115436554, "step": 812 }, { "epoch": 0.011703314499586137, "grad_norm": 0.10693359375, "grad_norm_var": 7.412830988566081e-05, "learning_rate": 0.0001, "loss": 0.2324, "loss/crossentropy": 2.3815245628356934, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.23237691074609756, "step": 813 }, { "epoch": 0.011717709720372836, "grad_norm": 0.1064453125, "grad_norm_var": 7.412830988566081e-05, "learning_rate": 0.0001, "loss": 0.197, "loss/crossentropy": 2.2638756036758423, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.19701003283262253, "step": 814 }, { "epoch": 0.011732104941159536, "grad_norm": 0.095703125, "grad_norm_var": 4.5804182688395184e-05, "learning_rate": 0.0001, "loss": 0.1993, "loss/crossentropy": 2.310957193374634, "loss/fcd": 0.3994140625, "loss/idx": 18.0, "loss/logits": 0.19931814819574356, "step": 815 }, { "epoch": 0.011746500161946234, "grad_norm": 0.13671875, "grad_norm_var": 9.775857130686441e-05, "learning_rate": 0.0001, "loss": 0.2326, "loss/crossentropy": 2.3524898290634155, "loss/fcd": 0.490234375, "loss/idx": 18.0, "loss/logits": 0.23264919221401215, "step": 816 }, { "epoch": 0.011760895382732933, "grad_norm": 0.109375, "grad_norm_var": 9.795725345611573e-05, "learning_rate": 0.0001, "loss": 0.2404, "loss/crossentropy": 2.542204737663269, "loss/fcd": 0.45703125, "loss/idx": 18.0, "loss/logits": 0.2403649091720581, "step": 817 }, { "epoch": 0.01177529060351963, "grad_norm": 0.1298828125, "grad_norm_var": 0.00012048780918121338, "learning_rate": 0.0001, "loss": 0.2259, "loss/crossentropy": 2.300834894180298, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.2259274125099182, "step": 818 }, { "epoch": 0.011789685824306331, "grad_norm": 0.1044921875, "grad_norm_var": 0.0001206040382385254, "learning_rate": 0.0001, "loss": 0.2003, "loss/crossentropy": 2.309138298034668, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.2003060281276703, "step": 819 }, { "epoch": 0.01180408104509303, "grad_norm": 0.1083984375, "grad_norm_var": 0.00012012720108032227, "learning_rate": 0.0001, "loss": 0.2192, "loss/crossentropy": 2.516822099685669, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.2192147672176361, "step": 820 }, { "epoch": 0.011818476265879728, "grad_norm": 0.11279296875, "grad_norm_var": 0.00012189547220865885, "learning_rate": 0.0001, "loss": 0.222, "loss/crossentropy": 2.5142600536346436, "loss/fcd": 0.478515625, "loss/idx": 18.0, "loss/logits": 0.22199787199497223, "step": 821 }, { "epoch": 0.011832871486666426, "grad_norm": 0.10693359375, "grad_norm_var": 0.00011879603068033854, "learning_rate": 0.0001, "loss": 0.2006, "loss/crossentropy": 2.166727066040039, "loss/fcd": 0.3984375, "loss/idx": 18.0, "loss/logits": 0.20062025636434555, "step": 822 }, { "epoch": 0.011847266707453126, "grad_norm": 0.11572265625, "grad_norm_var": 0.00011602640151977539, "learning_rate": 0.0001, "loss": 0.2171, "loss/crossentropy": 2.2036046981811523, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.21710190176963806, "step": 823 }, { "epoch": 0.011861661928239824, "grad_norm": 0.10791015625, "grad_norm_var": 0.00011036793390909831, "learning_rate": 0.0001, "loss": 0.2275, "loss/crossentropy": 2.2625406980514526, "loss/fcd": 0.4423828125, "loss/idx": 18.0, "loss/logits": 0.22749044001102448, "step": 824 }, { "epoch": 0.011876057149026523, "grad_norm": 0.7890625, "grad_norm_var": 0.028886699676513673, "learning_rate": 0.0001, "loss": 0.2046, "loss/crossentropy": 1.833857238292694, "loss/fcd": 0.5595703125, "loss/idx": 18.0, "loss/logits": 0.20455920696258545, "step": 825 }, { "epoch": 0.011890452369813223, "grad_norm": 0.1044921875, "grad_norm_var": 0.028860441843668618, "learning_rate": 0.0001, "loss": 0.2183, "loss/crossentropy": 2.433130979537964, "loss/fcd": 0.423828125, "loss/idx": 18.0, "loss/logits": 0.21825896203517914, "step": 826 }, { "epoch": 0.011904847590599921, "grad_norm": 0.1103515625, "grad_norm_var": 0.028843144575754803, "learning_rate": 0.0001, "loss": 0.2252, "loss/crossentropy": 2.3956053256988525, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.22523467242717743, "step": 827 }, { "epoch": 0.01191924281138662, "grad_norm": 0.10888671875, "grad_norm_var": 0.028831319014231364, "learning_rate": 0.0001, "loss": 0.2197, "loss/crossentropy": 2.4500895738601685, "loss/fcd": 0.458984375, "loss/idx": 18.0, "loss/logits": 0.2196703627705574, "step": 828 }, { "epoch": 0.011933638032173318, "grad_norm": 0.11181640625, "grad_norm_var": 0.0288025697072347, "learning_rate": 0.0001, "loss": 0.2463, "loss/crossentropy": 2.4316296577453613, "loss/fcd": 0.470703125, "loss/idx": 18.0, "loss/logits": 0.24632105976343155, "step": 829 }, { "epoch": 0.011948033252960018, "grad_norm": 0.11669921875, "grad_norm_var": 0.028744553526242573, "learning_rate": 0.0001, "loss": 0.2168, "loss/crossentropy": 2.432488799095154, "loss/fcd": 0.462890625, "loss/idx": 18.0, "loss/logits": 0.21679828315973282, "step": 830 }, { "epoch": 0.011962428473746716, "grad_norm": 0.09375, "grad_norm_var": 0.028760058681170146, "learning_rate": 0.0001, "loss": 0.1881, "loss/crossentropy": 2.3647295236587524, "loss/fcd": 0.3896484375, "loss/idx": 18.0, "loss/logits": 0.18810325115919113, "step": 831 }, { "epoch": 0.011976823694533414, "grad_norm": 0.11083984375, "grad_norm_var": 0.02886225382486979, "learning_rate": 0.0001, "loss": 0.2179, "loss/crossentropy": 2.4084588289260864, "loss/fcd": 0.4345703125, "loss/idx": 18.0, "loss/logits": 0.2179015353322029, "step": 832 }, { "epoch": 0.011991218915320115, "grad_norm": 0.10400390625, "grad_norm_var": 0.02889500359694163, "learning_rate": 0.0001, "loss": 0.2007, "loss/crossentropy": 2.3215763568878174, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.20068107545375824, "step": 833 }, { "epoch": 0.012005614136106813, "grad_norm": 0.10107421875, "grad_norm_var": 0.029032798608144124, "learning_rate": 0.0001, "loss": 0.1899, "loss/crossentropy": 2.1791869401931763, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.18989010155200958, "step": 834 }, { "epoch": 0.012020009356893511, "grad_norm": 0.11279296875, "grad_norm_var": 0.02898623843987783, "learning_rate": 0.0001, "loss": 0.2187, "loss/crossentropy": 2.292221188545227, "loss/fcd": 0.4365234375, "loss/idx": 18.0, "loss/logits": 0.2187333106994629, "step": 835 }, { "epoch": 0.01203440457768021, "grad_norm": 0.1064453125, "grad_norm_var": 0.028997563322385154, "learning_rate": 0.0001, "loss": 0.216, "loss/crossentropy": 2.191875457763672, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.21604549139738083, "step": 836 }, { "epoch": 0.01204879979846691, "grad_norm": 0.1298828125, "grad_norm_var": 0.028929102420806884, "learning_rate": 0.0001, "loss": 0.2124, "loss/crossentropy": 2.0077582597732544, "loss/fcd": 0.4228515625, "loss/idx": 18.0, "loss/logits": 0.21244481950998306, "step": 837 }, { "epoch": 0.012063195019253608, "grad_norm": 0.10693359375, "grad_norm_var": 0.028929102420806884, "learning_rate": 0.0001, "loss": 0.2144, "loss/crossentropy": 2.4339540004730225, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.21442482620477676, "step": 838 }, { "epoch": 0.012077590240040306, "grad_norm": 0.220703125, "grad_norm_var": 0.0291112889846166, "learning_rate": 0.0001, "loss": 0.2018, "loss/crossentropy": 1.9582195281982422, "loss/fcd": 0.5849609375, "loss/idx": 18.0, "loss/logits": 0.20181410014629364, "step": 839 }, { "epoch": 0.012091985460827005, "grad_norm": 0.14453125, "grad_norm_var": 0.028948195775349937, "learning_rate": 0.0001, "loss": 0.2261, "loss/crossentropy": 1.961089551448822, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.22611552476882935, "step": 840 }, { "epoch": 0.012106380681613705, "grad_norm": 0.10205078125, "grad_norm_var": 0.0008943786223729451, "learning_rate": 0.0001, "loss": 0.208, "loss/crossentropy": 2.2961446046829224, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.20795201510190964, "step": 841 }, { "epoch": 0.012120775902400403, "grad_norm": 0.10791015625, "grad_norm_var": 0.0008890310923258464, "learning_rate": 0.0001, "loss": 0.2208, "loss/crossentropy": 2.3987420797348022, "loss/fcd": 0.4345703125, "loss/idx": 18.0, "loss/logits": 0.2207762897014618, "step": 842 }, { "epoch": 0.012135171123187101, "grad_norm": 0.09912109375, "grad_norm_var": 0.0009084294239679972, "learning_rate": 0.0001, "loss": 0.1981, "loss/crossentropy": 2.284560799598694, "loss/fcd": 0.4033203125, "loss/idx": 18.0, "loss/logits": 0.19807633757591248, "step": 843 }, { "epoch": 0.012149566343973801, "grad_norm": 0.10595703125, "grad_norm_var": 0.0009122679630915324, "learning_rate": 0.0001, "loss": 0.2087, "loss/crossentropy": 2.5325080156326294, "loss/fcd": 0.4560546875, "loss/idx": 18.0, "loss/logits": 0.20872415602207184, "step": 844 }, { "epoch": 0.0121639615647605, "grad_norm": 0.1044921875, "grad_norm_var": 0.0009208361307779948, "learning_rate": 0.0001, "loss": 0.2326, "loss/crossentropy": 2.4612646102905273, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.23255135864019394, "step": 845 }, { "epoch": 0.012178356785547198, "grad_norm": 0.1259765625, "grad_norm_var": 0.0009262154499689738, "learning_rate": 0.0001, "loss": 0.2151, "loss/crossentropy": 2.463419198989868, "loss/fcd": 0.4560546875, "loss/idx": 18.0, "loss/logits": 0.2151269093155861, "step": 846 }, { "epoch": 0.012192752006333896, "grad_norm": 0.11474609375, "grad_norm_var": 0.000887898604075114, "learning_rate": 0.0001, "loss": 0.2149, "loss/crossentropy": 2.0733948945999146, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.21493042260408401, "step": 847 }, { "epoch": 0.012207147227120596, "grad_norm": 0.09375, "grad_norm_var": 0.000923815369606018, "learning_rate": 0.0001, "loss": 0.1887, "loss/crossentropy": 2.6692737340927124, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.18867085129022598, "step": 848 }, { "epoch": 0.012221542447907295, "grad_norm": 0.111328125, "grad_norm_var": 0.0009139657020568847, "learning_rate": 0.0001, "loss": 0.2055, "loss/crossentropy": 2.349723696708679, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.2054726406931877, "step": 849 }, { "epoch": 0.012235937668693993, "grad_norm": 0.103515625, "grad_norm_var": 0.0009088347355524699, "learning_rate": 0.0001, "loss": 0.2146, "loss/crossentropy": 2.4036346673965454, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.21456415951251984, "step": 850 }, { "epoch": 0.012250332889480693, "grad_norm": 0.12890625, "grad_norm_var": 0.0009135882059733073, "learning_rate": 0.0001, "loss": 0.2798, "loss/crossentropy": 2.601546049118042, "loss/fcd": 0.515625, "loss/idx": 18.0, "loss/logits": 0.27978505194187164, "step": 851 }, { "epoch": 0.012264728110267391, "grad_norm": 0.12109375, "grad_norm_var": 0.0009022037188212077, "learning_rate": 0.0001, "loss": 0.2426, "loss/crossentropy": 2.657235622406006, "loss/fcd": 0.48828125, "loss/idx": 18.0, "loss/logits": 0.24257582426071167, "step": 852 }, { "epoch": 0.01227912333105409, "grad_norm": 0.1044921875, "grad_norm_var": 0.0009092291196187338, "learning_rate": 0.0001, "loss": 0.2069, "loss/crossentropy": 2.515699028968811, "loss/fcd": 0.4189453125, "loss/idx": 18.0, "loss/logits": 0.20689593255519867, "step": 853 }, { "epoch": 0.012293518551840788, "grad_norm": 0.1298828125, "grad_norm_var": 0.0009068479140599569, "learning_rate": 0.0001, "loss": 0.2659, "loss/crossentropy": 2.6577337980270386, "loss/fcd": 0.4970703125, "loss/idx": 18.0, "loss/logits": 0.26587389409542084, "step": 854 }, { "epoch": 0.012307913772627488, "grad_norm": 0.10205078125, "grad_norm_var": 0.00019206603368123373, "learning_rate": 0.0001, "loss": 0.2053, "loss/crossentropy": 2.189553380012512, "loss/fcd": 0.4228515625, "loss/idx": 18.0, "loss/logits": 0.2052648663520813, "step": 855 }, { "epoch": 0.012322308993414187, "grad_norm": 0.111328125, "grad_norm_var": 0.0001191099484761556, "learning_rate": 0.0001, "loss": 0.2515, "loss/crossentropy": 2.7488744258880615, "loss/fcd": 0.47265625, "loss/idx": 18.0, "loss/logits": 0.2514711171388626, "step": 856 }, { "epoch": 0.012336704214200885, "grad_norm": 0.1025390625, "grad_norm_var": 0.00011858046054840088, "learning_rate": 0.0001, "loss": 0.2299, "loss/crossentropy": 2.5891239643096924, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.2299317717552185, "step": 857 }, { "epoch": 0.012351099434987585, "grad_norm": 0.09912109375, "grad_norm_var": 0.00012637674808502197, "learning_rate": 0.0001, "loss": 0.196, "loss/crossentropy": 2.2487235069274902, "loss/fcd": 0.4013671875, "loss/idx": 18.0, "loss/logits": 0.19604943692684174, "step": 858 }, { "epoch": 0.012365494655774283, "grad_norm": 0.10986328125, "grad_norm_var": 0.00011815925439198812, "learning_rate": 0.0001, "loss": 0.2355, "loss/crossentropy": 2.448120951652527, "loss/fcd": 0.4658203125, "loss/idx": 18.0, "loss/logits": 0.23546921461820602, "step": 859 }, { "epoch": 0.012379889876560982, "grad_norm": 0.1123046875, "grad_norm_var": 0.00011677742004394532, "learning_rate": 0.0001, "loss": 0.2299, "loss/crossentropy": 2.6376739740371704, "loss/fcd": 0.4765625, "loss/idx": 18.0, "loss/logits": 0.22993575036525726, "step": 860 }, { "epoch": 0.01239428509734768, "grad_norm": 0.11962890625, "grad_norm_var": 0.00011804004510243734, "learning_rate": 0.0001, "loss": 0.2362, "loss/crossentropy": 2.4012396335601807, "loss/fcd": 0.4541015625, "loss/idx": 18.0, "loss/logits": 0.2361709326505661, "step": 861 }, { "epoch": 0.01240868031813438, "grad_norm": 0.10546875, "grad_norm_var": 0.0001058568557103475, "learning_rate": 0.0001, "loss": 0.2031, "loss/crossentropy": 2.3792039155960083, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.2031245082616806, "step": 862 }, { "epoch": 0.012423075538921078, "grad_norm": 0.1435546875, "grad_norm_var": 0.00017355283101399738, "learning_rate": 0.0001, "loss": 0.1986, "loss/crossentropy": 2.1046639680862427, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.19857460260391235, "step": 863 }, { "epoch": 0.012437470759707777, "grad_norm": 0.10009765625, "grad_norm_var": 0.00016026397546132405, "learning_rate": 0.0001, "loss": 0.2247, "loss/crossentropy": 2.6387428045272827, "loss/fcd": 0.4599609375, "loss/idx": 18.0, "loss/logits": 0.22474492341279984, "step": 864 }, { "epoch": 0.012451865980494475, "grad_norm": 0.10498046875, "grad_norm_var": 0.00016404787699381512, "learning_rate": 0.0001, "loss": 0.2216, "loss/crossentropy": 2.420724868774414, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.22158697247505188, "step": 865 }, { "epoch": 0.012466261201281175, "grad_norm": 0.11279296875, "grad_norm_var": 0.00015840431054433188, "learning_rate": 0.0001, "loss": 0.2081, "loss/crossentropy": 2.1139690279960632, "loss/fcd": 0.4599609375, "loss/idx": 18.0, "loss/logits": 0.20810745656490326, "step": 866 }, { "epoch": 0.012480656422067873, "grad_norm": 0.099609375, "grad_norm_var": 0.0001499404509862264, "learning_rate": 0.0001, "loss": 0.208, "loss/crossentropy": 2.540156126022339, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.2079625502228737, "step": 867 }, { "epoch": 0.012495051642854572, "grad_norm": 0.10107421875, "grad_norm_var": 0.0001485149065653483, "learning_rate": 0.0001, "loss": 0.2183, "loss/crossentropy": 2.39568293094635, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.21834557503461838, "step": 868 }, { "epoch": 0.012509446863641272, "grad_norm": 0.10546875, "grad_norm_var": 0.00014786720275878907, "learning_rate": 0.0001, "loss": 0.2185, "loss/crossentropy": 2.3164258003234863, "loss/fcd": 0.4443359375, "loss/idx": 18.0, "loss/logits": 0.2184668406844139, "step": 869 }, { "epoch": 0.01252384208442797, "grad_norm": 0.109375, "grad_norm_var": 0.00011974573135375977, "learning_rate": 0.0001, "loss": 0.2123, "loss/crossentropy": 2.3733065128326416, "loss/fcd": 0.4384765625, "loss/idx": 18.0, "loss/logits": 0.21230120956897736, "step": 870 }, { "epoch": 0.012538237305214668, "grad_norm": 0.10302734375, "grad_norm_var": 0.00011893908182779948, "learning_rate": 0.0001, "loss": 0.217, "loss/crossentropy": 2.5337724685668945, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.21697236597537994, "step": 871 }, { "epoch": 0.012552632526001367, "grad_norm": 0.1328125, "grad_norm_var": 0.00015513102213541666, "learning_rate": 0.0001, "loss": 0.2258, "loss/crossentropy": 2.4623916149139404, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.2257620170712471, "step": 872 }, { "epoch": 0.012567027746788067, "grad_norm": 0.11767578125, "grad_norm_var": 0.00015417635440826417, "learning_rate": 0.0001, "loss": 0.1991, "loss/crossentropy": 2.1633788347244263, "loss/fcd": 0.41015625, "loss/idx": 18.0, "loss/logits": 0.1990898996591568, "step": 873 }, { "epoch": 0.012581422967574765, "grad_norm": 0.10009765625, "grad_norm_var": 0.00015268226464589438, "learning_rate": 0.0001, "loss": 0.2118, "loss/crossentropy": 2.4863176345825195, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.21175408363342285, "step": 874 }, { "epoch": 0.012595818188361464, "grad_norm": 0.10888671875, "grad_norm_var": 0.00015290478865305582, "learning_rate": 0.0001, "loss": 0.2287, "loss/crossentropy": 2.6219388246536255, "loss/fcd": 0.4541015625, "loss/idx": 18.0, "loss/logits": 0.22867251932621002, "step": 875 }, { "epoch": 0.012610213409148164, "grad_norm": 0.10546875, "grad_norm_var": 0.0001546849807103475, "learning_rate": 0.0001, "loss": 0.2051, "loss/crossentropy": 2.1874676942825317, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.205148346722126, "step": 876 }, { "epoch": 0.012624608629934862, "grad_norm": 0.1083984375, "grad_norm_var": 0.00014908711115519207, "learning_rate": 0.0001, "loss": 0.2029, "loss/crossentropy": 2.129917621612549, "loss/fcd": 0.412109375, "loss/idx": 18.0, "loss/logits": 0.20289119333028793, "step": 877 }, { "epoch": 0.01263900385072156, "grad_norm": 0.11376953125, "grad_norm_var": 0.0001484622557957967, "learning_rate": 0.0001, "loss": 0.2193, "loss/crossentropy": 2.0245165824890137, "loss/fcd": 0.4482421875, "loss/idx": 18.0, "loss/logits": 0.2193296253681183, "step": 878 }, { "epoch": 0.012653399071508259, "grad_norm": 0.10986328125, "grad_norm_var": 7.06632932027181e-05, "learning_rate": 0.0001, "loss": 0.197, "loss/crossentropy": 2.1348493099212646, "loss/fcd": 0.4072265625, "loss/idx": 18.0, "loss/logits": 0.19701501727104187, "step": 879 }, { "epoch": 0.012667794292294959, "grad_norm": 0.1171875, "grad_norm_var": 7.014175256093343e-05, "learning_rate": 0.0001, "loss": 0.2465, "loss/crossentropy": 2.4276719093322754, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.24645158648490906, "step": 880 }, { "epoch": 0.012682189513081657, "grad_norm": 0.1103515625, "grad_norm_var": 6.877581278483072e-05, "learning_rate": 0.0001, "loss": 0.2378, "loss/crossentropy": 2.301609516143799, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.23781420290470123, "step": 881 }, { "epoch": 0.012696584733868355, "grad_norm": 0.11083984375, "grad_norm_var": 6.821950276692708e-05, "learning_rate": 0.0001, "loss": 0.2172, "loss/crossentropy": 2.4009501934051514, "loss/fcd": 0.4345703125, "loss/idx": 18.0, "loss/logits": 0.21715039014816284, "step": 882 }, { "epoch": 0.012710979954655055, "grad_norm": 0.09521484375, "grad_norm_var": 7.529159386952718e-05, "learning_rate": 0.0001, "loss": 0.1797, "loss/crossentropy": 2.402838706970215, "loss/fcd": 0.3974609375, "loss/idx": 18.0, "loss/logits": 0.1796911582350731, "step": 883 }, { "epoch": 0.012725375175441754, "grad_norm": 0.1083984375, "grad_norm_var": 7.056792577107748e-05, "learning_rate": 0.0001, "loss": 0.2222, "loss/crossentropy": 2.573052167892456, "loss/fcd": 0.46875, "loss/idx": 18.0, "loss/logits": 0.22222469747066498, "step": 884 }, { "epoch": 0.012739770396228452, "grad_norm": 0.10302734375, "grad_norm_var": 7.23510980606079e-05, "learning_rate": 0.0001, "loss": 0.2008, "loss/crossentropy": 2.232303559780121, "loss/fcd": 0.40234375, "loss/idx": 18.0, "loss/logits": 0.20078317821025848, "step": 885 }, { "epoch": 0.01275416561701515, "grad_norm": 0.1044921875, "grad_norm_var": 7.402002811431884e-05, "learning_rate": 0.0001, "loss": 0.2282, "loss/crossentropy": 2.5020205974578857, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.22823868691921234, "step": 886 }, { "epoch": 0.01276856083780185, "grad_norm": 0.1025390625, "grad_norm_var": 7.444620132446289e-05, "learning_rate": 0.0001, "loss": 0.212, "loss/crossentropy": 2.333465099334717, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.212021104991436, "step": 887 }, { "epoch": 0.012782956058588549, "grad_norm": 0.10498046875, "grad_norm_var": 3.565847873687744e-05, "learning_rate": 0.0001, "loss": 0.2017, "loss/crossentropy": 2.190958023071289, "loss/fcd": 0.404296875, "loss/idx": 18.0, "loss/logits": 0.20165172219276428, "step": 888 }, { "epoch": 0.012797351279375247, "grad_norm": 0.10107421875, "grad_norm_var": 3.0524532000223796e-05, "learning_rate": 0.0001, "loss": 0.2102, "loss/crossentropy": 2.4441522359848022, "loss/fcd": 0.4169921875, "loss/idx": 18.0, "loss/logits": 0.21022119373083115, "step": 889 }, { "epoch": 0.012811746500161945, "grad_norm": 0.1044921875, "grad_norm_var": 2.795855204264323e-05, "learning_rate": 0.0001, "loss": 0.2209, "loss/crossentropy": 2.504876732826233, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.22085034102201462, "step": 890 }, { "epoch": 0.012826141720948646, "grad_norm": 0.11279296875, "grad_norm_var": 2.9993057250976562e-05, "learning_rate": 0.0001, "loss": 0.2144, "loss/crossentropy": 2.03822124004364, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.2144273966550827, "step": 891 }, { "epoch": 0.012840536941735344, "grad_norm": 0.09521484375, "grad_norm_var": 3.87340784072876e-05, "learning_rate": 0.0001, "loss": 0.1949, "loss/crossentropy": 2.364703059196472, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.19488562643527985, "step": 892 }, { "epoch": 0.012854932162522042, "grad_norm": 0.09716796875, "grad_norm_var": 4.364649454752604e-05, "learning_rate": 0.0001, "loss": 0.2072, "loss/crossentropy": 2.37164306640625, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.2072392851114273, "step": 893 }, { "epoch": 0.012869327383308742, "grad_norm": 0.1083984375, "grad_norm_var": 3.96798054377238e-05, "learning_rate": 0.0001, "loss": 0.2372, "loss/crossentropy": 2.513867974281311, "loss/fcd": 0.474609375, "loss/idx": 18.0, "loss/logits": 0.2371513769030571, "step": 894 }, { "epoch": 0.01288372260409544, "grad_norm": 0.11376953125, "grad_norm_var": 4.2969981829325356e-05, "learning_rate": 0.0001, "loss": 0.2024, "loss/crossentropy": 2.266432523727417, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.20237434655427933, "step": 895 }, { "epoch": 0.012898117824882139, "grad_norm": 0.1083984375, "grad_norm_var": 3.424386183420817e-05, "learning_rate": 0.0001, "loss": 0.2159, "loss/crossentropy": 2.191688299179077, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.21588657796382904, "step": 896 }, { "epoch": 0.012912513045668837, "grad_norm": 0.1064453125, "grad_norm_var": 3.24477752049764e-05, "learning_rate": 0.0001, "loss": 0.2044, "loss/crossentropy": 2.1034794449806213, "loss/fcd": 0.421875, "loss/idx": 18.0, "loss/logits": 0.2044026404619217, "step": 897 }, { "epoch": 0.012926908266455537, "grad_norm": 0.10498046875, "grad_norm_var": 2.989669640858968e-05, "learning_rate": 0.0001, "loss": 0.1955, "loss/crossentropy": 2.1368765830993652, "loss/fcd": 0.4033203125, "loss/idx": 18.0, "loss/logits": 0.19553573429584503, "step": 898 }, { "epoch": 0.012941303487242236, "grad_norm": 0.1044921875, "grad_norm_var": 2.3837884267171224e-05, "learning_rate": 0.0001, "loss": 0.2218, "loss/crossentropy": 2.5139960050582886, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.2217550054192543, "step": 899 }, { "epoch": 0.012955698708028934, "grad_norm": 0.115234375, "grad_norm_var": 2.9818216959635416e-05, "learning_rate": 0.0001, "loss": 0.224, "loss/crossentropy": 2.2468815445899963, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.22403018921613693, "step": 900 }, { "epoch": 0.012970093928815634, "grad_norm": 0.1220703125, "grad_norm_var": 4.6284000078837076e-05, "learning_rate": 0.0001, "loss": 0.2005, "loss/crossentropy": 1.8389571905136108, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.20048467069864273, "step": 901 }, { "epoch": 0.012984489149602332, "grad_norm": 0.1103515625, "grad_norm_var": 4.6736995379130046e-05, "learning_rate": 0.0001, "loss": 0.2181, "loss/crossentropy": 2.47100293636322, "loss/fcd": 0.4404296875, "loss/idx": 18.0, "loss/logits": 0.21807243674993515, "step": 902 }, { "epoch": 0.01299888437038903, "grad_norm": 0.142578125, "grad_norm_var": 0.00012298325697580973, "learning_rate": 0.0001, "loss": 0.2141, "loss/crossentropy": 2.011506676673889, "loss/fcd": 0.4443359375, "loss/idx": 18.0, "loss/logits": 0.21408168226480484, "step": 903 }, { "epoch": 0.013013279591175729, "grad_norm": 0.1064453125, "grad_norm_var": 0.00012222925821940103, "learning_rate": 0.0001, "loss": 0.2293, "loss/crossentropy": 2.7555429935455322, "loss/fcd": 0.4814453125, "loss/idx": 18.0, "loss/logits": 0.22929980605840683, "step": 904 }, { "epoch": 0.013027674811962429, "grad_norm": 0.10791015625, "grad_norm_var": 0.00011736154556274414, "learning_rate": 0.0001, "loss": 0.2118, "loss/crossentropy": 2.3159666061401367, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.21182847768068314, "step": 905 }, { "epoch": 0.013042070032749127, "grad_norm": 0.09814453125, "grad_norm_var": 0.0001245806614557902, "learning_rate": 0.0001, "loss": 0.2026, "loss/crossentropy": 2.5774402618408203, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.20259930193424225, "step": 906 }, { "epoch": 0.013056465253535826, "grad_norm": 0.1044921875, "grad_norm_var": 0.00012540817260742188, "learning_rate": 0.0001, "loss": 0.2085, "loss/crossentropy": 2.3706518411636353, "loss/fcd": 0.4306640625, "loss/idx": 18.0, "loss/logits": 0.2085341438651085, "step": 907 }, { "epoch": 0.013070860474322524, "grad_norm": 0.103515625, "grad_norm_var": 0.0001143127679824829, "learning_rate": 0.0001, "loss": 0.2094, "loss/crossentropy": 2.5629345178604126, "loss/fcd": 0.458984375, "loss/idx": 18.0, "loss/logits": 0.2093740627169609, "step": 908 }, { "epoch": 0.013085255695109224, "grad_norm": 0.10498046875, "grad_norm_var": 0.0001051257054011027, "learning_rate": 0.0001, "loss": 0.1934, "loss/crossentropy": 2.204862952232361, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.1933920904994011, "step": 909 }, { "epoch": 0.013099650915895922, "grad_norm": 0.08935546875, "grad_norm_var": 0.00013220707575480143, "learning_rate": 0.0001, "loss": 0.1996, "loss/crossentropy": 2.6174755096435547, "loss/fcd": 0.4140625, "loss/idx": 18.0, "loss/logits": 0.19955138117074966, "step": 910 }, { "epoch": 0.01311404613668262, "grad_norm": 0.10205078125, "grad_norm_var": 0.0001332561175028483, "learning_rate": 0.0001, "loss": 0.2163, "loss/crossentropy": 2.3615927696228027, "loss/fcd": 0.4580078125, "loss/idx": 18.0, "loss/logits": 0.21632999181747437, "step": 911 }, { "epoch": 0.013128441357469321, "grad_norm": 0.10302734375, "grad_norm_var": 0.00013492802778879802, "learning_rate": 0.0001, "loss": 0.2018, "loss/crossentropy": 2.3113813400268555, "loss/fcd": 0.435546875, "loss/idx": 18.0, "loss/logits": 0.20180628448724747, "step": 912 }, { "epoch": 0.01314283657825602, "grad_norm": 0.10498046875, "grad_norm_var": 0.00013534228006998697, "learning_rate": 0.0001, "loss": 0.2166, "loss/crossentropy": 2.575196623802185, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.2166244387626648, "step": 913 }, { "epoch": 0.013157231799042718, "grad_norm": 0.107421875, "grad_norm_var": 0.0001348008712132772, "learning_rate": 0.0001, "loss": 0.2273, "loss/crossentropy": 2.4861690998077393, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.22732173651456833, "step": 914 }, { "epoch": 0.013171627019829416, "grad_norm": 0.119140625, "grad_norm_var": 0.00014147659142812094, "learning_rate": 0.0001, "loss": 0.2495, "loss/crossentropy": 2.6098480224609375, "loss/fcd": 0.5009765625, "loss/idx": 18.0, "loss/logits": 0.24945074319839478, "step": 915 }, { "epoch": 0.013186022240616116, "grad_norm": 0.10791015625, "grad_norm_var": 0.00013860066731770834, "learning_rate": 0.0001, "loss": 0.2244, "loss/crossentropy": 2.3869473934173584, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.2244347631931305, "step": 916 }, { "epoch": 0.013200417461402814, "grad_norm": 0.1044921875, "grad_norm_var": 0.00012586911519368488, "learning_rate": 0.0001, "loss": 0.2015, "loss/crossentropy": 2.152829647064209, "loss/fcd": 0.3984375, "loss/idx": 18.0, "loss/logits": 0.20145908743143082, "step": 917 }, { "epoch": 0.013214812682189513, "grad_norm": 0.1015625, "grad_norm_var": 0.00012712081273396809, "learning_rate": 0.0001, "loss": 0.2334, "loss/crossentropy": 2.607330799102783, "loss/fcd": 0.4599609375, "loss/idx": 18.0, "loss/logits": 0.23341640084981918, "step": 918 }, { "epoch": 0.013229207902976213, "grad_norm": 0.09619140625, "grad_norm_var": 4.0013591448465986e-05, "learning_rate": 0.0001, "loss": 0.2237, "loss/crossentropy": 2.6008039712905884, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.22374649345874786, "step": 919 }, { "epoch": 0.013243603123762911, "grad_norm": 0.1259765625, "grad_norm_var": 7.061064243316651e-05, "learning_rate": 0.0001, "loss": 0.2637, "loss/crossentropy": 2.669323205947876, "loss/fcd": 0.509765625, "loss/idx": 18.0, "loss/logits": 0.26368650794029236, "step": 920 }, { "epoch": 0.01325799834454961, "grad_norm": 0.107421875, "grad_norm_var": 7.044076919555664e-05, "learning_rate": 0.0001, "loss": 0.2218, "loss/crossentropy": 2.663564920425415, "loss/fcd": 0.4560546875, "loss/idx": 18.0, "loss/logits": 0.22182673960924149, "step": 921 }, { "epoch": 0.013272393565336308, "grad_norm": 0.1015625, "grad_norm_var": 6.802777449289957e-05, "learning_rate": 0.0001, "loss": 0.2233, "loss/crossentropy": 2.503962516784668, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.22326037287712097, "step": 922 }, { "epoch": 0.013286788786123008, "grad_norm": 0.10009765625, "grad_norm_var": 6.968180338541666e-05, "learning_rate": 0.0001, "loss": 0.2051, "loss/crossentropy": 2.488289475440979, "loss/fcd": 0.4482421875, "loss/idx": 18.0, "loss/logits": 0.20506569743156433, "step": 923 }, { "epoch": 0.013301184006909706, "grad_norm": 0.1240234375, "grad_norm_var": 9.196201960245768e-05, "learning_rate": 0.0001, "loss": 0.2772, "loss/crossentropy": 2.7090861797332764, "loss/fcd": 0.537109375, "loss/idx": 18.0, "loss/logits": 0.2771788090467453, "step": 924 }, { "epoch": 0.013315579227696404, "grad_norm": 0.09765625, "grad_norm_var": 9.65664784113566e-05, "learning_rate": 0.0001, "loss": 0.2027, "loss/crossentropy": 2.4453471899032593, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.202697291970253, "step": 925 }, { "epoch": 0.013329974448483104, "grad_norm": 0.10205078125, "grad_norm_var": 7.879634698232015e-05, "learning_rate": 0.0001, "loss": 0.2021, "loss/crossentropy": 2.4429107904434204, "loss/fcd": 0.4248046875, "loss/idx": 18.0, "loss/logits": 0.20214182883501053, "step": 926 }, { "epoch": 0.013344369669269803, "grad_norm": 0.11181640625, "grad_norm_var": 7.883608341217042e-05, "learning_rate": 0.0001, "loss": 0.2276, "loss/crossentropy": 2.4106050729751587, "loss/fcd": 0.509765625, "loss/idx": 18.0, "loss/logits": 0.22757098823785782, "step": 927 }, { "epoch": 0.013358764890056501, "grad_norm": 0.1171875, "grad_norm_var": 8.347431818644206e-05, "learning_rate": 0.0001, "loss": 0.1978, "loss/crossentropy": 2.1090660095214844, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.197757326066494, "step": 928 }, { "epoch": 0.0133731601108432, "grad_norm": 0.1044921875, "grad_norm_var": 8.369187513987223e-05, "learning_rate": 0.0001, "loss": 0.2262, "loss/crossentropy": 2.443942070007324, "loss/fcd": 0.4521484375, "loss/idx": 18.0, "loss/logits": 0.22621066123247147, "step": 929 }, { "epoch": 0.0133875553316299, "grad_norm": 0.1005859375, "grad_norm_var": 8.71966282526652e-05, "learning_rate": 0.0001, "loss": 0.1729, "loss/crossentropy": 2.2424585819244385, "loss/fcd": 0.4091796875, "loss/idx": 18.0, "loss/logits": 0.17293807864189148, "step": 930 }, { "epoch": 0.013401950552416598, "grad_norm": 0.115234375, "grad_norm_var": 8.215804894765219e-05, "learning_rate": 0.0001, "loss": 0.1967, "loss/crossentropy": 2.0759899616241455, "loss/fcd": 0.41796875, "loss/idx": 18.0, "loss/logits": 0.19673413038253784, "step": 931 }, { "epoch": 0.013416345773203296, "grad_norm": 0.12060546875, "grad_norm_var": 9.310940901438395e-05, "learning_rate": 0.0001, "loss": 0.2183, "loss/crossentropy": 2.1836588382720947, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.21834751218557358, "step": 932 }, { "epoch": 0.013430740993989995, "grad_norm": 0.0966796875, "grad_norm_var": 0.00010077059268951417, "learning_rate": 0.0001, "loss": 0.2045, "loss/crossentropy": 2.431378960609436, "loss/fcd": 0.408203125, "loss/idx": 18.0, "loss/logits": 0.2045089453458786, "step": 933 }, { "epoch": 0.013445136214776695, "grad_norm": 0.1005859375, "grad_norm_var": 0.00010162889957427978, "learning_rate": 0.0001, "loss": 0.2197, "loss/crossentropy": 2.4605276584625244, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.2197011262178421, "step": 934 }, { "epoch": 0.013459531435563393, "grad_norm": 0.1123046875, "grad_norm_var": 9.326934814453125e-05, "learning_rate": 0.0001, "loss": 0.2397, "loss/crossentropy": 2.551363468170166, "loss/fcd": 0.470703125, "loss/idx": 18.0, "loss/logits": 0.23971816152334213, "step": 935 }, { "epoch": 0.013473926656350091, "grad_norm": 0.099609375, "grad_norm_var": 7.578134536743165e-05, "learning_rate": 0.0001, "loss": 0.2057, "loss/crossentropy": 2.3025097846984863, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.20573781430721283, "step": 936 }, { "epoch": 0.013488321877136791, "grad_norm": 0.10205078125, "grad_norm_var": 7.72784153620402e-05, "learning_rate": 0.0001, "loss": 0.2001, "loss/crossentropy": 2.358902096748352, "loss/fcd": 0.4208984375, "loss/idx": 18.0, "loss/logits": 0.20014435052871704, "step": 937 }, { "epoch": 0.01350271709792349, "grad_norm": 0.12890625, "grad_norm_var": 0.00010542770226796468, "learning_rate": 0.0001, "loss": 0.2339, "loss/crossentropy": 2.2731123566627502, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.23394957929849625, "step": 938 }, { "epoch": 0.013517112318710188, "grad_norm": 0.10498046875, "grad_norm_var": 0.00010153353214263916, "learning_rate": 0.0001, "loss": 0.1838, "loss/crossentropy": 2.0903587341308594, "loss/fcd": 0.3876953125, "loss/idx": 18.0, "loss/logits": 0.1838395819067955, "step": 939 }, { "epoch": 0.013531507539496886, "grad_norm": 0.11083984375, "grad_norm_var": 8.541345596313477e-05, "learning_rate": 0.0001, "loss": 0.205, "loss/crossentropy": 2.3965718746185303, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.2049787938594818, "step": 940 }, { "epoch": 0.013545902760283586, "grad_norm": 0.11083984375, "grad_norm_var": 7.835924625396729e-05, "learning_rate": 0.0001, "loss": 0.1831, "loss/crossentropy": 2.0947141647338867, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.1831398606300354, "step": 941 }, { "epoch": 0.013560297981070285, "grad_norm": 0.1044921875, "grad_norm_var": 7.657607396443685e-05, "learning_rate": 0.0001, "loss": 0.223, "loss/crossentropy": 2.4057345390319824, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.22295735031366348, "step": 942 }, { "epoch": 0.013574693201856983, "grad_norm": 0.1103515625, "grad_norm_var": 7.612605889638265e-05, "learning_rate": 0.0001, "loss": 0.2116, "loss/crossentropy": 2.507383942604065, "loss/fcd": 0.451171875, "loss/idx": 18.0, "loss/logits": 0.21163207292556763, "step": 943 }, { "epoch": 0.013589088422643683, "grad_norm": 0.0966796875, "grad_norm_var": 7.929702599843344e-05, "learning_rate": 0.0001, "loss": 0.199, "loss/crossentropy": 2.5167927742004395, "loss/fcd": 0.4296875, "loss/idx": 18.0, "loss/logits": 0.19900661706924438, "step": 944 }, { "epoch": 0.013603483643430381, "grad_norm": 0.099609375, "grad_norm_var": 8.271435896555583e-05, "learning_rate": 0.0001, "loss": 0.223, "loss/crossentropy": 2.495205879211426, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.2230018451809883, "step": 945 }, { "epoch": 0.01361787886421708, "grad_norm": 0.1015625, "grad_norm_var": 8.191963036855062e-05, "learning_rate": 0.0001, "loss": 0.1923, "loss/crossentropy": 2.3246638774871826, "loss/fcd": 0.4072265625, "loss/idx": 18.0, "loss/logits": 0.19232943654060364, "step": 946 }, { "epoch": 0.013632274085003778, "grad_norm": 0.10791015625, "grad_norm_var": 7.743438084920248e-05, "learning_rate": 0.0001, "loss": 0.2183, "loss/crossentropy": 2.342094659805298, "loss/fcd": 0.44140625, "loss/idx": 18.0, "loss/logits": 0.21831049770116806, "step": 947 }, { "epoch": 0.013646669305790478, "grad_norm": 0.09765625, "grad_norm_var": 6.79562489191691e-05, "learning_rate": 0.0001, "loss": 0.2115, "loss/crossentropy": 2.395784616470337, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.2115183100104332, "step": 948 }, { "epoch": 0.013661064526577177, "grad_norm": 0.09326171875, "grad_norm_var": 7.262229919433594e-05, "learning_rate": 0.0001, "loss": 0.1866, "loss/crossentropy": 2.3914581537246704, "loss/fcd": 0.408203125, "loss/idx": 18.0, "loss/logits": 0.1866021454334259, "step": 949 }, { "epoch": 0.013675459747363875, "grad_norm": 0.10009765625, "grad_norm_var": 7.293124993642171e-05, "learning_rate": 0.0001, "loss": 0.2197, "loss/crossentropy": 2.7165035009384155, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.21966220438480377, "step": 950 }, { "epoch": 0.013689854968150573, "grad_norm": 0.0947265625, "grad_norm_var": 7.529159386952718e-05, "learning_rate": 0.0001, "loss": 0.2021, "loss/crossentropy": 2.4402034282684326, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.20212795585393906, "step": 951 }, { "epoch": 0.013704250188937273, "grad_norm": 0.1201171875, "grad_norm_var": 8.964439233144124e-05, "learning_rate": 0.0001, "loss": 0.2361, "loss/crossentropy": 2.3232113122940063, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.2360788732767105, "step": 952 }, { "epoch": 0.013718645409723972, "grad_norm": 0.09619140625, "grad_norm_var": 9.429355462392171e-05, "learning_rate": 0.0001, "loss": 0.2187, "loss/crossentropy": 2.51291286945343, "loss/fcd": 0.4423828125, "loss/idx": 18.0, "loss/logits": 0.21865685284137726, "step": 953 }, { "epoch": 0.01373304063051067, "grad_norm": 0.099609375, "grad_norm_var": 5.412002404530843e-05, "learning_rate": 0.0001, "loss": 0.2028, "loss/crossentropy": 2.311350464820862, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.20284704118967056, "step": 954 }, { "epoch": 0.01374743585129737, "grad_norm": 0.10302734375, "grad_norm_var": 5.3857763608296714e-05, "learning_rate": 0.0001, "loss": 0.2074, "loss/crossentropy": 2.3779343366622925, "loss/fcd": 0.42578125, "loss/idx": 18.0, "loss/logits": 0.20741773396730423, "step": 955 }, { "epoch": 0.013761831072084068, "grad_norm": 0.10009765625, "grad_norm_var": 4.9749016761779784e-05, "learning_rate": 0.0001, "loss": 0.1919, "loss/crossentropy": 2.285262107849121, "loss/fcd": 0.3984375, "loss/idx": 18.0, "loss/logits": 0.19189584255218506, "step": 956 }, { "epoch": 0.013776226292870767, "grad_norm": 0.10498046875, "grad_norm_var": 4.519522190093994e-05, "learning_rate": 0.0001, "loss": 0.2057, "loss/crossentropy": 2.509137988090515, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.20570625364780426, "step": 957 }, { "epoch": 0.013790621513657465, "grad_norm": 0.09375, "grad_norm_var": 4.8692027727762856e-05, "learning_rate": 0.0001, "loss": 0.1856, "loss/crossentropy": 2.298775553703308, "loss/fcd": 0.400390625, "loss/idx": 18.0, "loss/logits": 0.18564346432685852, "step": 958 }, { "epoch": 0.013805016734444165, "grad_norm": 0.1396484375, "grad_norm_var": 0.000137979785601298, "learning_rate": 0.0001, "loss": 0.255, "loss/crossentropy": 2.497642993927002, "loss/fcd": 0.4765625, "loss/idx": 18.0, "loss/logits": 0.2549555003643036, "step": 959 }, { "epoch": 0.013819411955230863, "grad_norm": 0.1025390625, "grad_norm_var": 0.00013514260450998942, "learning_rate": 0.0001, "loss": 0.2086, "loss/crossentropy": 2.250615358352661, "loss/fcd": 0.427734375, "loss/idx": 18.0, "loss/logits": 0.20857169479131699, "step": 960 }, { "epoch": 0.013833807176017562, "grad_norm": 0.1162109375, "grad_norm_var": 0.00014392435550689698, "learning_rate": 0.0001, "loss": 0.2192, "loss/crossentropy": 2.349924087524414, "loss/fcd": 0.4716796875, "loss/idx": 18.0, "loss/logits": 0.2192462459206581, "step": 961 }, { "epoch": 0.013848202396804262, "grad_norm": 0.11962890625, "grad_norm_var": 0.00015734036763509113, "learning_rate": 0.0001, "loss": 0.2206, "loss/crossentropy": 2.6663416624069214, "loss/fcd": 0.46875, "loss/idx": 18.0, "loss/logits": 0.22060546278953552, "step": 962 }, { "epoch": 0.01386259761759096, "grad_norm": 0.119140625, "grad_norm_var": 0.00016869604587554932, "learning_rate": 0.0001, "loss": 0.2306, "loss/crossentropy": 2.5041009187698364, "loss/fcd": 0.4501953125, "loss/idx": 18.0, "loss/logits": 0.23058706521987915, "step": 963 }, { "epoch": 0.013876992838377658, "grad_norm": 0.10888671875, "grad_norm_var": 0.00016364653905232747, "learning_rate": 0.0001, "loss": 0.2344, "loss/crossentropy": 2.6687543392181396, "loss/fcd": 0.4609375, "loss/idx": 18.0, "loss/logits": 0.2344193086028099, "step": 964 }, { "epoch": 0.013891388059164357, "grad_norm": 0.091796875, "grad_norm_var": 0.00016646285851796468, "learning_rate": 0.0001, "loss": 0.2073, "loss/crossentropy": 2.6841739416122437, "loss/fcd": 0.4150390625, "loss/idx": 18.0, "loss/logits": 0.2073235660791397, "step": 965 }, { "epoch": 0.013905783279951057, "grad_norm": 0.10205078125, "grad_norm_var": 0.00016492903232574464, "learning_rate": 0.0001, "loss": 0.23, "loss/crossentropy": 2.511627197265625, "loss/fcd": 0.47265625, "loss/idx": 18.0, "loss/logits": 0.23003337532281876, "step": 966 }, { "epoch": 0.013920178500737755, "grad_norm": 0.111328125, "grad_norm_var": 0.0001549313465754191, "learning_rate": 0.0001, "loss": 0.2113, "loss/crossentropy": 2.2891138792037964, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.21126385778188705, "step": 967 }, { "epoch": 0.013934573721524453, "grad_norm": 0.134765625, "grad_norm_var": 0.0001918862263361613, "learning_rate": 0.0001, "loss": 0.2406, "loss/crossentropy": 2.610072374343872, "loss/fcd": 0.4755859375, "loss/idx": 18.0, "loss/logits": 0.24063490331172943, "step": 968 }, { "epoch": 0.013948968942311154, "grad_norm": 0.1181640625, "grad_norm_var": 0.0001845995585123698, "learning_rate": 0.0001, "loss": 0.2615, "loss/crossentropy": 2.6200684309005737, "loss/fcd": 0.484375, "loss/idx": 18.0, "loss/logits": 0.2615353539586067, "step": 969 }, { "epoch": 0.013963364163097852, "grad_norm": 0.115234375, "grad_norm_var": 0.00017747879028320312, "learning_rate": 0.0001, "loss": 0.2364, "loss/crossentropy": 2.344622015953064, "loss/fcd": 0.4375, "loss/idx": 18.0, "loss/logits": 0.23641209304332733, "step": 970 }, { "epoch": 0.01397775938388455, "grad_norm": 0.11328125, "grad_norm_var": 0.00017270147800445556, "learning_rate": 0.0001, "loss": 0.2414, "loss/crossentropy": 2.6739622354507446, "loss/fcd": 0.50390625, "loss/idx": 18.0, "loss/logits": 0.24144794046878815, "step": 971 }, { "epoch": 0.013992154604671249, "grad_norm": 0.13671875, "grad_norm_var": 0.00019855499267578124, "learning_rate": 0.0001, "loss": 0.1994, "loss/crossentropy": 2.1970399618148804, "loss/fcd": 0.544921875, "loss/idx": 18.0, "loss/logits": 0.19944548606872559, "step": 972 }, { "epoch": 0.014006549825457949, "grad_norm": 0.1279296875, "grad_norm_var": 0.00020308395226796468, "learning_rate": 0.0001, "loss": 0.2182, "loss/crossentropy": 2.0679745078086853, "loss/fcd": 0.4267578125, "loss/idx": 18.0, "loss/logits": 0.21820590645074844, "step": 973 }, { "epoch": 0.014020945046244647, "grad_norm": 0.11572265625, "grad_norm_var": 0.0001689751942952474, "learning_rate": 0.0001, "loss": 0.2122, "loss/crossentropy": 2.2601789236068726, "loss/fcd": 0.43359375, "loss/idx": 18.0, "loss/logits": 0.21224602311849594, "step": 974 }, { "epoch": 0.014035340267031345, "grad_norm": 0.115234375, "grad_norm_var": 0.00013271570205688476, "learning_rate": 0.0001, "loss": 0.2131, "loss/crossentropy": 2.215391755104065, "loss/fcd": 0.439453125, "loss/idx": 18.0, "loss/logits": 0.21311646699905396, "step": 975 }, { "epoch": 0.014049735487818044, "grad_norm": 0.0908203125, "grad_norm_var": 0.00016161203384399415, "learning_rate": 0.0001, "loss": 0.2055, "loss/crossentropy": 2.593106508255005, "loss/fcd": 0.4111328125, "loss/idx": 18.0, "loss/logits": 0.20546036958694458, "step": 976 }, { "epoch": 0.014064130708604744, "grad_norm": 0.1044921875, "grad_norm_var": 0.000168001651763916, "learning_rate": 0.0001, "loss": 0.2212, "loss/crossentropy": 2.4123164415359497, "loss/fcd": 0.4482421875, "loss/idx": 18.0, "loss/logits": 0.22117872536182404, "step": 977 }, { "epoch": 0.014078525929391442, "grad_norm": 0.1357421875, "grad_norm_var": 0.00019616186618804933, "learning_rate": 0.0001, "loss": 0.2037, "loss/crossentropy": 2.111898362636566, "loss/fcd": 0.47265625, "loss/idx": 18.0, "loss/logits": 0.20368139445781708, "step": 978 }, { "epoch": 0.01409292115017814, "grad_norm": 0.10302734375, "grad_norm_var": 0.00020366907119750977, "learning_rate": 0.0001, "loss": 0.2233, "loss/crossentropy": 2.484625220298767, "loss/fcd": 0.466796875, "loss/idx": 18.0, "loss/logits": 0.22334980964660645, "step": 979 }, { "epoch": 0.01410731637096484, "grad_norm": 0.103515625, "grad_norm_var": 0.0002091874678929647, "learning_rate": 0.0001, "loss": 0.2286, "loss/crossentropy": 2.5562527179718018, "loss/fcd": 0.455078125, "loss/idx": 18.0, "loss/logits": 0.22860489040613174, "step": 980 }, { "epoch": 0.014121711591751539, "grad_norm": 0.109375, "grad_norm_var": 0.0001770724852879842, "learning_rate": 0.0001, "loss": 0.2195, "loss/crossentropy": 2.372304320335388, "loss/fcd": 0.4423828125, "loss/idx": 18.0, "loss/logits": 0.21949142217636108, "step": 981 }, { "epoch": 0.014136106812538237, "grad_norm": 0.10888671875, "grad_norm_var": 0.00016833841800689697, "learning_rate": 0.0001, "loss": 0.2166, "loss/crossentropy": 2.5525119304656982, "loss/fcd": 0.453125, "loss/idx": 18.0, "loss/logits": 0.21661998331546783, "step": 982 }, { "epoch": 0.014150502033324935, "grad_norm": 0.099609375, "grad_norm_var": 0.00018307268619537352, "learning_rate": 0.0001, "loss": 0.205, "loss/crossentropy": 2.346623182296753, "loss/fcd": 0.416015625, "loss/idx": 18.0, "loss/logits": 0.205020934343338, "step": 983 }, { "epoch": 0.014164897254111635, "grad_norm": 0.09814453125, "grad_norm_var": 0.00016809701919555663, "learning_rate": 0.0001, "loss": 0.1916, "loss/crossentropy": 2.372196674346924, "loss/fcd": 0.408203125, "loss/idx": 18.0, "loss/logits": 0.19163141399621964, "step": 984 }, { "epoch": 0.014179292474898334, "grad_norm": 0.11962890625, "grad_norm_var": 0.00016938745975494384, "learning_rate": 0.0001, "loss": 0.2246, "loss/crossentropy": 2.2609957456588745, "loss/fcd": 0.447265625, "loss/idx": 18.0, "loss/logits": 0.2246478945016861, "step": 985 }, { "epoch": 0.014193687695685032, "grad_norm": 0.1201171875, "grad_norm_var": 0.000172765056292216, "learning_rate": 0.0001, "loss": 0.2191, "loss/crossentropy": 2.2087113857269287, "loss/fcd": 0.4287109375, "loss/idx": 18.0, "loss/logits": 0.21905823051929474, "step": 986 }, { "epoch": 0.014208082916471732, "grad_norm": 0.1171875, "grad_norm_var": 0.00017405251661936443, "learning_rate": 0.0001, "loss": 0.22, "loss/crossentropy": 2.257576823234558, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.2199638932943344, "step": 987 }, { "epoch": 0.01422247813725843, "grad_norm": 0.10693359375, "grad_norm_var": 0.00013484557469685873, "learning_rate": 0.0001, "loss": 0.2398, "loss/crossentropy": 2.626092791557312, "loss/fcd": 0.462890625, "loss/idx": 18.0, "loss/logits": 0.23977234959602356, "step": 988 }, { "epoch": 0.014236873358045129, "grad_norm": 0.107421875, "grad_norm_var": 0.00011490186055501302, "learning_rate": 0.0001, "loss": 0.2098, "loss/crossentropy": 2.4047662019729614, "loss/fcd": 0.4453125, "loss/idx": 18.0, "loss/logits": 0.20984865725040436, "step": 989 }, { "epoch": 0.014251268578831827, "grad_norm": 0.11328125, "grad_norm_var": 0.00011332730452219645, "learning_rate": 0.0001, "loss": 0.2132, "loss/crossentropy": 2.4295172691345215, "loss/fcd": 0.4462890625, "loss/idx": 18.0, "loss/logits": 0.21317294985055923, "step": 990 }, { "epoch": 0.014265663799618527, "grad_norm": 0.12451171875, "grad_norm_var": 0.0001256903012593587, "learning_rate": 0.0001, "loss": 0.2325, "loss/crossentropy": 2.5081902742385864, "loss/fcd": 0.47265625, "loss/idx": 18.0, "loss/logits": 0.23250433802604675, "step": 991 }, { "epoch": 0.014280059020405226, "grad_norm": 0.1376953125, "grad_norm_var": 0.00014209349950154623, "learning_rate": 0.0001, "loss": 0.2266, "loss/crossentropy": 2.0926729440689087, "loss/fcd": 0.4951171875, "loss/idx": 18.0, "loss/logits": 0.226626954972744, "step": 992 }, { "epoch": 0.014294454241191924, "grad_norm": 0.10595703125, "grad_norm_var": 0.00014054675896962482, "learning_rate": 0.0001, "loss": 0.2136, "loss/crossentropy": 2.383934497833252, "loss/fcd": 0.4609375, "loss/idx": 18.0, "loss/logits": 0.21359023451805115, "step": 993 }, { "epoch": 0.014308849461978622, "grad_norm": 0.11865234375, "grad_norm_var": 0.00010741154352823893, "learning_rate": 0.0001, "loss": 0.2225, "loss/crossentropy": 2.4633511304855347, "loss/fcd": 0.443359375, "loss/idx": 18.0, "loss/logits": 0.2224937155842781, "step": 994 }, { "epoch": 0.014323244682765322, "grad_norm": 0.10107421875, "grad_norm_var": 0.00011001825332641601, "learning_rate": 0.0001, "loss": 0.1897, "loss/crossentropy": 2.1795610189437866, "loss/fcd": 0.3994140625, "loss/idx": 18.0, "loss/logits": 0.18965643644332886, "step": 995 }, { "epoch": 0.01433763990355202, "grad_norm": 0.1015625, "grad_norm_var": 0.00011246601740519205, "learning_rate": 0.0001, "loss": 0.1976, "loss/crossentropy": 2.336984634399414, "loss/fcd": 0.4033203125, "loss/idx": 18.0, "loss/logits": 0.1975797638297081, "step": 996 }, { "epoch": 0.014352035124338719, "grad_norm": 0.099609375, "grad_norm_var": 0.00012168486913045247, "learning_rate": 0.0001, "loss": 0.2147, "loss/crossentropy": 2.664496660232544, "loss/fcd": 0.44921875, "loss/idx": 18.0, "loss/logits": 0.2147291675209999, "step": 997 }, { "epoch": 0.014366430345125419, "grad_norm": 0.09912109375, "grad_norm_var": 0.00013074477513631185, "learning_rate": 0.0001, "loss": 0.2094, "loss/crossentropy": 2.33840548992157, "loss/fcd": 0.419921875, "loss/idx": 18.0, "loss/logits": 0.209433451294899, "step": 998 }, { "epoch": 0.014380825565912117, "grad_norm": 0.10107421875, "grad_norm_var": 0.00012872119744618735, "learning_rate": 0.0001, "loss": 0.2101, "loss/crossentropy": 2.5498578548431396, "loss/fcd": 0.431640625, "loss/idx": 18.0, "loss/logits": 0.210076242685318, "step": 999 }, { "epoch": 0.014395220786698816, "grad_norm": 0.103515625, "grad_norm_var": 0.00012149810791015626, "learning_rate": 0.0001, "loss": 0.2192, "loss/crossentropy": 2.547055721282959, "loss/fcd": 0.4560546875, "loss/idx": 18.0, "loss/logits": 0.21922268718481064, "step": 1000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.51753290940416e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }