qwen18404 / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
cd4ca81 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.014395220786698816,
"eval_steps": 1000,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.4395220786698816e-05,
"grad_norm": 0.18359375,
"learning_rate": 0.0001,
"loss": 0.2384,
"loss/crossentropy": 2.463143229484558,
"loss/fcd": 0.4892578125,
"loss/idx": 18.0,
"loss/logits": 0.23836339265108109,
"step": 1
},
{
"epoch": 2.8790441573397632e-05,
"grad_norm": 0.1328125,
"learning_rate": 0.0001,
"loss": 0.2453,
"loss/crossentropy": 2.74690580368042,
"loss/fcd": 0.462890625,
"loss/idx": 18.0,
"loss/logits": 0.2453368902206421,
"step": 2
},
{
"epoch": 4.3185662360096445e-05,
"grad_norm": 0.15625,
"learning_rate": 0.0001,
"loss": 0.2292,
"loss/crossentropy": 2.3877265453338623,
"loss/fcd": 0.4130859375,
"loss/idx": 18.0,
"loss/logits": 0.22919423878192902,
"step": 3
},
{
"epoch": 5.7580883146795265e-05,
"grad_norm": 0.130859375,
"learning_rate": 0.0001,
"loss": 0.2284,
"loss/crossentropy": 2.392206907272339,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.22838981449604034,
"step": 4
},
{
"epoch": 7.197610393349408e-05,
"grad_norm": 0.138671875,
"learning_rate": 0.0001,
"loss": 0.2237,
"loss/crossentropy": 2.1798477172851562,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.22366443276405334,
"step": 5
},
{
"epoch": 8.637132472019289e-05,
"grad_norm": 0.1357421875,
"learning_rate": 0.0001,
"loss": 0.2644,
"loss/crossentropy": 2.492342710494995,
"loss/fcd": 0.462890625,
"loss/idx": 18.0,
"loss/logits": 0.2643834352493286,
"step": 6
},
{
"epoch": 0.00010076654550689171,
"grad_norm": 0.150390625,
"learning_rate": 0.0001,
"loss": 0.211,
"loss/crossentropy": 2.035392999649048,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.21097075939178467,
"step": 7
},
{
"epoch": 0.00011516176629359053,
"grad_norm": 0.13671875,
"learning_rate": 0.0001,
"loss": 0.2388,
"loss/crossentropy": 2.3071805238723755,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.23878887295722961,
"step": 8
},
{
"epoch": 0.00012955698708028935,
"grad_norm": 0.126953125,
"learning_rate": 0.0001,
"loss": 0.2061,
"loss/crossentropy": 2.1987677812576294,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.20613879710435867,
"step": 9
},
{
"epoch": 0.00014395220786698817,
"grad_norm": 0.1240234375,
"learning_rate": 0.0001,
"loss": 0.2075,
"loss/crossentropy": 1.9901325702667236,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.20753345638513565,
"step": 10
},
{
"epoch": 0.000158347428653687,
"grad_norm": 0.1298828125,
"learning_rate": 0.0001,
"loss": 0.2213,
"loss/crossentropy": 2.3090018033981323,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.2212577611207962,
"step": 11
},
{
"epoch": 0.00017274264944038578,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001,
"loss": 0.2689,
"loss/crossentropy": 2.2487552165985107,
"loss/fcd": 0.5,
"loss/idx": 18.0,
"loss/logits": 0.26888714730739594,
"step": 12
},
{
"epoch": 0.0001871378702270846,
"grad_norm": 0.12109375,
"learning_rate": 0.0001,
"loss": 0.2335,
"loss/crossentropy": 2.3826037645339966,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.23347856849431992,
"step": 13
},
{
"epoch": 0.00020153309101378342,
"grad_norm": 0.115234375,
"learning_rate": 0.0001,
"loss": 0.2299,
"loss/crossentropy": 2.524248242378235,
"loss/fcd": 0.4560546875,
"loss/idx": 18.0,
"loss/logits": 0.22988282144069672,
"step": 14
},
{
"epoch": 0.00021592831180048224,
"grad_norm": 0.1279296875,
"learning_rate": 0.0001,
"loss": 0.2354,
"loss/crossentropy": 2.33734929561615,
"loss/fcd": 0.4482421875,
"loss/idx": 18.0,
"loss/logits": 0.23541489243507385,
"step": 15
},
{
"epoch": 0.00023032353258718106,
"grad_norm": 0.12060546875,
"grad_norm_var": 0.0002919107675552368,
"learning_rate": 0.0001,
"loss": 0.2428,
"loss/crossentropy": 2.3426687717437744,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.24281759560108185,
"step": 16
},
{
"epoch": 0.0002447187533738799,
"grad_norm": 0.1171875,
"grad_norm_var": 0.00014951129754384358,
"learning_rate": 0.0001,
"loss": 0.2399,
"loss/crossentropy": 2.634019374847412,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.23987850546836853,
"step": 17
},
{
"epoch": 0.0002591139741605787,
"grad_norm": 0.1279296875,
"grad_norm_var": 0.0001506239175796509,
"learning_rate": 0.0001,
"loss": 0.2267,
"loss/crossentropy": 2.2048473358154297,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.2267211154103279,
"step": 18
},
{
"epoch": 0.0002735091949472775,
"grad_norm": 0.1201171875,
"grad_norm_var": 0.0001150439182917277,
"learning_rate": 0.0001,
"loss": 0.2111,
"loss/crossentropy": 2.421955704689026,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.21106208860874176,
"step": 19
},
{
"epoch": 0.00028790441573397634,
"grad_norm": 0.125,
"grad_norm_var": 0.00011625985304514567,
"learning_rate": 0.0001,
"loss": 0.2474,
"loss/crossentropy": 2.4863855838775635,
"loss/fcd": 0.4990234375,
"loss/idx": 18.0,
"loss/logits": 0.24741190671920776,
"step": 20
},
{
"epoch": 0.00030229963652067516,
"grad_norm": 0.11962890625,
"grad_norm_var": 0.00011513630549112956,
"learning_rate": 0.0001,
"loss": 0.2185,
"loss/crossentropy": 2.2641090154647827,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.21849986910820007,
"step": 21
},
{
"epoch": 0.000316694857307374,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.0001184294621149699,
"learning_rate": 0.0001,
"loss": 0.2309,
"loss/crossentropy": 2.614189624786377,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.23093532770872116,
"step": 22
},
{
"epoch": 0.00033109007809407274,
"grad_norm": 0.1123046875,
"grad_norm_var": 8.991460005442301e-05,
"learning_rate": 0.0001,
"loss": 0.2174,
"loss/crossentropy": 2.258315682411194,
"loss/fcd": 0.408203125,
"loss/idx": 18.0,
"loss/logits": 0.21735627949237823,
"step": 23
},
{
"epoch": 0.00034548529888077156,
"grad_norm": 0.1279296875,
"grad_norm_var": 8.047322432200113e-05,
"learning_rate": 0.0001,
"loss": 0.2413,
"loss/crossentropy": 2.355400562286377,
"loss/fcd": 0.466796875,
"loss/idx": 18.0,
"loss/logits": 0.2412610948085785,
"step": 24
},
{
"epoch": 0.0003598805196674704,
"grad_norm": 0.130859375,
"grad_norm_var": 8.29686721165975e-05,
"learning_rate": 0.0001,
"loss": 0.2334,
"loss/crossentropy": 2.4980456829071045,
"loss/fcd": 0.4599609375,
"loss/idx": 18.0,
"loss/logits": 0.23341741412878036,
"step": 25
},
{
"epoch": 0.0003742757404541692,
"grad_norm": 0.11279296875,
"grad_norm_var": 9.11712646484375e-05,
"learning_rate": 0.0001,
"loss": 0.2017,
"loss/crossentropy": 2.1927164793014526,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.20174731314182281,
"step": 26
},
{
"epoch": 0.000388670961240868,
"grad_norm": 0.173828125,
"grad_norm_var": 0.0002490639686584473,
"learning_rate": 0.0001,
"loss": 0.253,
"loss/crossentropy": 2.5806944370269775,
"loss/fcd": 0.4658203125,
"loss/idx": 18.0,
"loss/logits": 0.2529568448662758,
"step": 27
},
{
"epoch": 0.00040306618202756684,
"grad_norm": 0.11572265625,
"grad_norm_var": 0.0002092510461807251,
"learning_rate": 0.0001,
"loss": 0.2384,
"loss/crossentropy": 2.292937397956848,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.2384110689163208,
"step": 28
},
{
"epoch": 0.00041746140281426566,
"grad_norm": 0.12451171875,
"grad_norm_var": 0.0002086321512858073,
"learning_rate": 0.0001,
"loss": 0.257,
"loss/crossentropy": 2.4048542976379395,
"loss/fcd": 0.482421875,
"loss/idx": 18.0,
"loss/logits": 0.25698406249284744,
"step": 29
},
{
"epoch": 0.0004318566236009645,
"grad_norm": 0.107421875,
"grad_norm_var": 0.00022185643513997395,
"learning_rate": 0.0001,
"loss": 0.2111,
"loss/crossentropy": 2.4948848485946655,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.21111004799604416,
"step": 30
},
{
"epoch": 0.0004462518443876633,
"grad_norm": 0.1142578125,
"grad_norm_var": 0.00022597312927246093,
"learning_rate": 0.0001,
"loss": 0.2299,
"loss/crossentropy": 2.233025908470154,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.22986605763435364,
"step": 31
},
{
"epoch": 0.0004606470651743621,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.0002281347910563151,
"learning_rate": 0.0001,
"loss": 0.2272,
"loss/crossentropy": 2.448768973350525,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.22723641991615295,
"step": 32
},
{
"epoch": 0.00047504228596106094,
"grad_norm": 0.111328125,
"grad_norm_var": 0.0002345720926920573,
"learning_rate": 0.0001,
"loss": 0.1984,
"loss/crossentropy": 2.116120755672455,
"loss/fcd": 0.3935546875,
"loss/idx": 18.0,
"loss/logits": 0.19836096465587616,
"step": 33
},
{
"epoch": 0.0004894375067477598,
"grad_norm": 0.1259765625,
"grad_norm_var": 0.00023334821065266927,
"learning_rate": 0.0001,
"loss": 0.2416,
"loss/crossentropy": 2.3083192110061646,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.2415921539068222,
"step": 34
},
{
"epoch": 0.0005038327275344585,
"grad_norm": 0.1337890625,
"grad_norm_var": 0.00024124781290690104,
"learning_rate": 0.0001,
"loss": 0.2936,
"loss/crossentropy": 2.6550590991973877,
"loss/fcd": 0.529296875,
"loss/idx": 18.0,
"loss/logits": 0.29357363283634186,
"step": 35
},
{
"epoch": 0.0005182279483211574,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.00026692946751912433,
"learning_rate": 0.0001,
"loss": 0.2041,
"loss/crossentropy": 2.341429352760315,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.20406678318977356,
"step": 36
},
{
"epoch": 0.0005326231691078562,
"grad_norm": 0.11376953125,
"grad_norm_var": 0.00027064879735310874,
"learning_rate": 0.0001,
"loss": 0.225,
"loss/crossentropy": 2.350203037261963,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.22499807178974152,
"step": 37
},
{
"epoch": 0.000547018389894555,
"grad_norm": 0.11083984375,
"grad_norm_var": 0.00027637084325154624,
"learning_rate": 0.0001,
"loss": 0.222,
"loss/crossentropy": 2.209356427192688,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.22195565700531006,
"step": 38
},
{
"epoch": 0.0005614136106812538,
"grad_norm": 0.11376953125,
"grad_norm_var": 0.00027482410271962486,
"learning_rate": 0.0001,
"loss": 0.2432,
"loss/crossentropy": 2.6039966344833374,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.24324779212474823,
"step": 39
},
{
"epoch": 0.0005758088314679527,
"grad_norm": 0.11865234375,
"grad_norm_var": 0.00027163426081339517,
"learning_rate": 0.0001,
"loss": 0.2133,
"loss/crossentropy": 2.3391385078430176,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.21334318816661835,
"step": 40
},
{
"epoch": 0.0005902040522546514,
"grad_norm": 0.1015625,
"grad_norm_var": 0.00028450886408487954,
"learning_rate": 0.0001,
"loss": 0.2074,
"loss/crossentropy": 2.5192357301712036,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.2073945701122284,
"step": 41
},
{
"epoch": 0.0006045992730413503,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.0002897739410400391,
"learning_rate": 0.0001,
"loss": 0.1894,
"loss/crossentropy": 2.35784912109375,
"loss/fcd": 0.4248046875,
"loss/idx": 18.0,
"loss/logits": 0.18937092274427414,
"step": 42
},
{
"epoch": 0.0006189944938280491,
"grad_norm": 0.11669921875,
"grad_norm_var": 7.068216800689697e-05,
"learning_rate": 0.0001,
"loss": 0.1925,
"loss/crossentropy": 2.0304250717163086,
"loss/fcd": 0.40625,
"loss/idx": 18.0,
"loss/logits": 0.19247674196958542,
"step": 43
},
{
"epoch": 0.000633389714614748,
"grad_norm": 0.111328125,
"grad_norm_var": 7.129907608032227e-05,
"learning_rate": 0.0001,
"loss": 0.2238,
"loss/crossentropy": 2.257385492324829,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.22384560853242874,
"step": 44
},
{
"epoch": 0.0006477849354014467,
"grad_norm": 0.10986328125,
"grad_norm_var": 6.504058837890625e-05,
"learning_rate": 0.0001,
"loss": 0.2174,
"loss/crossentropy": 2.47000515460968,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.2174428552389145,
"step": 45
},
{
"epoch": 0.0006621801561881455,
"grad_norm": 0.1318359375,
"grad_norm_var": 8.242527643839518e-05,
"learning_rate": 0.0001,
"loss": 0.2356,
"loss/crossentropy": 2.77071475982666,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.23558437824249268,
"step": 46
},
{
"epoch": 0.0006765753769748444,
"grad_norm": 0.11669921875,
"grad_norm_var": 8.253951867421468e-05,
"learning_rate": 0.0001,
"loss": 0.2314,
"loss/crossentropy": 2.3579763174057007,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.23137739300727844,
"step": 47
},
{
"epoch": 0.0006909705977615431,
"grad_norm": 0.11962890625,
"grad_norm_var": 8.366008599599202e-05,
"learning_rate": 0.0001,
"loss": 0.2147,
"loss/crossentropy": 2.4674328565597534,
"loss/fcd": 0.48046875,
"loss/idx": 18.0,
"loss/logits": 0.2146531641483307,
"step": 48
},
{
"epoch": 0.000705365818548242,
"grad_norm": 0.13671875,
"grad_norm_var": 0.00011021196842193603,
"learning_rate": 0.0001,
"loss": 0.2773,
"loss/crossentropy": 2.5875381231307983,
"loss/fcd": 0.501953125,
"loss/idx": 18.0,
"loss/logits": 0.27730000019073486,
"step": 49
},
{
"epoch": 0.0007197610393349408,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.00011401176452636718,
"learning_rate": 0.0001,
"loss": 0.1992,
"loss/crossentropy": 2.3770352602005005,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.1992211416363716,
"step": 50
},
{
"epoch": 0.0007341562601216396,
"grad_norm": 0.115234375,
"grad_norm_var": 9.05315081278483e-05,
"learning_rate": 0.0001,
"loss": 0.2208,
"loss/crossentropy": 2.503299593925476,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.22077593207359314,
"step": 51
},
{
"epoch": 0.0007485514809083384,
"grad_norm": 0.1181640625,
"grad_norm_var": 8.09947649637858e-05,
"learning_rate": 0.0001,
"loss": 0.2448,
"loss/crossentropy": 2.5992391109466553,
"loss/fcd": 0.478515625,
"loss/idx": 18.0,
"loss/logits": 0.2447950839996338,
"step": 52
},
{
"epoch": 0.0007629467016950373,
"grad_norm": 0.11083984375,
"grad_norm_var": 8.217493693033855e-05,
"learning_rate": 0.0001,
"loss": 0.2302,
"loss/crossentropy": 2.5341001749038696,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.23020881414413452,
"step": 53
},
{
"epoch": 0.000777341922481736,
"grad_norm": 0.11328125,
"grad_norm_var": 8.111695448557536e-05,
"learning_rate": 0.0001,
"loss": 0.242,
"loss/crossentropy": 2.594543933868408,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.24200908839702606,
"step": 54
},
{
"epoch": 0.0007917371432684349,
"grad_norm": 0.1142578125,
"grad_norm_var": 8.102655410766602e-05,
"learning_rate": 0.0001,
"loss": 0.2157,
"loss/crossentropy": 2.35564386844635,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.21571539342403412,
"step": 55
},
{
"epoch": 0.0008061323640551337,
"grad_norm": 0.140625,
"grad_norm_var": 0.00012067854404449463,
"learning_rate": 0.0001,
"loss": 0.2262,
"loss/crossentropy": 2.5845850706100464,
"loss/fcd": 0.505859375,
"loss/idx": 18.0,
"loss/logits": 0.22615493834018707,
"step": 56
},
{
"epoch": 0.0008205275848418326,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.00011030832926432292,
"learning_rate": 0.0001,
"loss": 0.2264,
"loss/crossentropy": 2.6225829124450684,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.22639667242765427,
"step": 57
},
{
"epoch": 0.0008349228056285313,
"grad_norm": 0.12158203125,
"grad_norm_var": 0.00010507901509602865,
"learning_rate": 0.0001,
"loss": 0.2332,
"loss/crossentropy": 2.49368155002594,
"loss/fcd": 0.4404296875,
"loss/idx": 18.0,
"loss/logits": 0.23318731039762497,
"step": 58
},
{
"epoch": 0.0008493180264152302,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.00010922352472941081,
"learning_rate": 0.0001,
"loss": 0.2161,
"loss/crossentropy": 2.365482449531555,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.21612977981567383,
"step": 59
},
{
"epoch": 0.000863713247201929,
"grad_norm": 0.12451171875,
"grad_norm_var": 0.00010903577009836832,
"learning_rate": 0.0001,
"loss": 0.2292,
"loss/crossentropy": 2.450873017311096,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.22924820333719254,
"step": 60
},
{
"epoch": 0.0008781084679886277,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.0001178810993830363,
"learning_rate": 0.0001,
"loss": 0.2071,
"loss/crossentropy": 2.364640951156616,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.20711997151374817,
"step": 61
},
{
"epoch": 0.0008925036887753266,
"grad_norm": 0.10009765625,
"grad_norm_var": 0.00012259483337402345,
"learning_rate": 0.0001,
"loss": 0.1989,
"loss/crossentropy": 2.430219888687134,
"loss/fcd": 0.4072265625,
"loss/idx": 18.0,
"loss/logits": 0.19892004877328873,
"step": 62
},
{
"epoch": 0.0009068989095620254,
"grad_norm": 0.1083984375,
"grad_norm_var": 0.00012622574965159098,
"learning_rate": 0.0001,
"loss": 0.2166,
"loss/crossentropy": 2.412087559700012,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.21661554276943207,
"step": 63
},
{
"epoch": 0.0009212941303487242,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.00012544691562652588,
"learning_rate": 0.0001,
"loss": 0.2152,
"loss/crossentropy": 2.369842290878296,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.21520362049341202,
"step": 64
},
{
"epoch": 0.000935689351135423,
"grad_norm": 0.1279296875,
"grad_norm_var": 0.00010499060153961182,
"learning_rate": 0.0001,
"loss": 0.2505,
"loss/crossentropy": 2.5731316804885864,
"loss/fcd": 0.484375,
"loss/idx": 18.0,
"loss/logits": 0.2505309656262398,
"step": 65
},
{
"epoch": 0.0009500845719221219,
"grad_norm": 0.11572265625,
"grad_norm_var": 9.702742099761962e-05,
"learning_rate": 0.0001,
"loss": 0.2162,
"loss/crossentropy": 2.4219590425491333,
"loss/fcd": 0.4228515625,
"loss/idx": 18.0,
"loss/logits": 0.2161625698208809,
"step": 66
},
{
"epoch": 0.0009644797927088206,
"grad_norm": 0.109375,
"grad_norm_var": 9.924471378326416e-05,
"learning_rate": 0.0001,
"loss": 0.1955,
"loss/crossentropy": 2.072207987308502,
"loss/fcd": 0.4052734375,
"loss/idx": 18.0,
"loss/logits": 0.19551369547843933,
"step": 67
},
{
"epoch": 0.0009788750134955195,
"grad_norm": 0.10302734375,
"grad_norm_var": 0.00010709762573242187,
"learning_rate": 0.0001,
"loss": 0.2152,
"loss/crossentropy": 2.4199079275131226,
"loss/fcd": 0.404296875,
"loss/idx": 18.0,
"loss/logits": 0.2152082547545433,
"step": 68
},
{
"epoch": 0.0009932702342822183,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.00010735094547271728,
"learning_rate": 0.0001,
"loss": 0.1939,
"loss/crossentropy": 2.073515832424164,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.19392766803503036,
"step": 69
},
{
"epoch": 0.001007665455068917,
"grad_norm": 0.12109375,
"grad_norm_var": 0.00010992586612701416,
"learning_rate": 0.0001,
"loss": 0.244,
"loss/crossentropy": 2.376970887184143,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.2439984604716301,
"step": 70
},
{
"epoch": 0.001022060675855616,
"grad_norm": 0.1201171875,
"grad_norm_var": 0.00011152327060699463,
"learning_rate": 0.0001,
"loss": 0.2341,
"loss/crossentropy": 2.329576015472412,
"loss/fcd": 0.4716796875,
"loss/idx": 18.0,
"loss/logits": 0.23405101150274277,
"step": 71
},
{
"epoch": 0.0010364558966423148,
"grad_norm": 0.11083984375,
"grad_norm_var": 6.649891535441081e-05,
"learning_rate": 0.0001,
"loss": 0.2061,
"loss/crossentropy": 2.240494966506958,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.20607301592826843,
"step": 72
},
{
"epoch": 0.0010508511174290136,
"grad_norm": 0.115234375,
"grad_norm_var": 6.442765394846599e-05,
"learning_rate": 0.0001,
"loss": 0.2261,
"loss/crossentropy": 2.58719003200531,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.22607439756393433,
"step": 73
},
{
"epoch": 0.0010652463382157123,
"grad_norm": 0.10693359375,
"grad_norm_var": 6.287793318430583e-05,
"learning_rate": 0.0001,
"loss": 0.2209,
"loss/crossentropy": 2.458608031272888,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.22093002498149872,
"step": 74
},
{
"epoch": 0.001079641559002411,
"grad_norm": 0.1015625,
"grad_norm_var": 7.06632932027181e-05,
"learning_rate": 0.0001,
"loss": 0.189,
"loss/crossentropy": 2.4321776628494263,
"loss/fcd": 0.40625,
"loss/idx": 18.0,
"loss/logits": 0.18898583948612213,
"step": 75
},
{
"epoch": 0.00109403677978911,
"grad_norm": 0.11279296875,
"grad_norm_var": 6.0458978017171226e-05,
"learning_rate": 0.0001,
"loss": 0.241,
"loss/crossentropy": 2.5812788009643555,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.24104547500610352,
"step": 76
},
{
"epoch": 0.0011084320005758088,
"grad_norm": 0.10986328125,
"grad_norm_var": 5.65489133199056e-05,
"learning_rate": 0.0001,
"loss": 0.203,
"loss/crossentropy": 2.408494293689728,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.20304062217473984,
"step": 77
},
{
"epoch": 0.0011228272213625076,
"grad_norm": 0.1337890625,
"grad_norm_var": 7.34796126683553e-05,
"learning_rate": 0.0001,
"loss": 0.2287,
"loss/crossentropy": 2.239099144935608,
"loss/fcd": 0.46484375,
"loss/idx": 18.0,
"loss/logits": 0.2286640703678131,
"step": 78
},
{
"epoch": 0.0011372224421492064,
"grad_norm": 0.12353515625,
"grad_norm_var": 7.603565851847331e-05,
"learning_rate": 0.0001,
"loss": 0.2298,
"loss/crossentropy": 2.357472777366638,
"loss/fcd": 0.48046875,
"loss/idx": 18.0,
"loss/logits": 0.22975638508796692,
"step": 79
},
{
"epoch": 0.0011516176629359054,
"grad_norm": 0.12158203125,
"grad_norm_var": 7.807413736979166e-05,
"learning_rate": 0.0001,
"loss": 0.2185,
"loss/crossentropy": 2.4130557775497437,
"loss/fcd": 0.4765625,
"loss/idx": 18.0,
"loss/logits": 0.21845312416553497,
"step": 80
},
{
"epoch": 0.0011660128837226041,
"grad_norm": 0.11962890625,
"grad_norm_var": 6.88701868057251e-05,
"learning_rate": 0.0001,
"loss": 0.2304,
"loss/crossentropy": 2.293164014816284,
"loss/fcd": 0.466796875,
"loss/idx": 18.0,
"loss/logits": 0.2303522452712059,
"step": 81
},
{
"epoch": 0.0011804081045093029,
"grad_norm": 0.10107421875,
"grad_norm_var": 8.126795291900635e-05,
"learning_rate": 0.0001,
"loss": 0.1964,
"loss/crossentropy": 2.2688822746276855,
"loss/fcd": 0.3935546875,
"loss/idx": 18.0,
"loss/logits": 0.19636806100606918,
"step": 82
},
{
"epoch": 0.0011948033252960016,
"grad_norm": 0.11376953125,
"grad_norm_var": 7.959604263305665e-05,
"learning_rate": 0.0001,
"loss": 0.2397,
"loss/crossentropy": 2.398077368736267,
"loss/fcd": 0.4404296875,
"loss/idx": 18.0,
"loss/logits": 0.23967822641134262,
"step": 83
},
{
"epoch": 0.0012091985460827006,
"grad_norm": 0.099609375,
"grad_norm_var": 8.558332920074462e-05,
"learning_rate": 0.0001,
"loss": 0.2079,
"loss/crossentropy": 2.524065375328064,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.20788107812404633,
"step": 84
},
{
"epoch": 0.0012235937668693994,
"grad_norm": 0.1103515625,
"grad_norm_var": 8.542438348134359e-05,
"learning_rate": 0.0001,
"loss": 0.2091,
"loss/crossentropy": 2.398527979850769,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.20912020653486252,
"step": 85
},
{
"epoch": 0.0012379889876560982,
"grad_norm": 0.12109375,
"grad_norm_var": 8.542438348134359e-05,
"learning_rate": 0.0001,
"loss": 0.2435,
"loss/crossentropy": 2.4105933904647827,
"loss/fcd": 0.48828125,
"loss/idx": 18.0,
"loss/logits": 0.24351391196250916,
"step": 86
},
{
"epoch": 0.001252384208442797,
"grad_norm": 0.12158203125,
"grad_norm_var": 8.678038914998373e-05,
"learning_rate": 0.0001,
"loss": 0.2189,
"loss/crossentropy": 2.26534903049469,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.21887247264385223,
"step": 87
},
{
"epoch": 0.001266779429229496,
"grad_norm": 0.1162109375,
"grad_norm_var": 8.635421593983968e-05,
"learning_rate": 0.0001,
"loss": 0.1992,
"loss/crossentropy": 2.1426846981048584,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.1991657018661499,
"step": 88
},
{
"epoch": 0.0012811746500161947,
"grad_norm": 0.11279296875,
"grad_norm_var": 8.641878763834635e-05,
"learning_rate": 0.0001,
"loss": 0.2104,
"loss/crossentropy": 2.2193171977996826,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.21043668687343597,
"step": 89
},
{
"epoch": 0.0012955698708028934,
"grad_norm": 0.14453125,
"grad_norm_var": 0.00013866325219472249,
"learning_rate": 0.0001,
"loss": 0.2259,
"loss/crossentropy": 2.4619998931884766,
"loss/fcd": 0.4755859375,
"loss/idx": 18.0,
"loss/logits": 0.22587314993143082,
"step": 90
},
{
"epoch": 0.0013099650915895922,
"grad_norm": 0.1162109375,
"grad_norm_var": 0.00012292762597401936,
"learning_rate": 0.0001,
"loss": 0.2502,
"loss/crossentropy": 2.5881928205490112,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.2501572445034981,
"step": 91
},
{
"epoch": 0.001324360312376291,
"grad_norm": 0.123046875,
"grad_norm_var": 0.0001231988271077474,
"learning_rate": 0.0001,
"loss": 0.2096,
"loss/crossentropy": 2.28423535823822,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.20956922322511673,
"step": 92
},
{
"epoch": 0.00133875553316299,
"grad_norm": 0.119140625,
"grad_norm_var": 0.0001184612512588501,
"learning_rate": 0.0001,
"loss": 0.2081,
"loss/crossentropy": 2.0630246996879578,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.208104208111763,
"step": 93
},
{
"epoch": 0.0013531507539496887,
"grad_norm": 0.1142578125,
"grad_norm_var": 0.00010280509789784749,
"learning_rate": 0.0001,
"loss": 0.2285,
"loss/crossentropy": 2.4672012329101562,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.22853697836399078,
"step": 94
},
{
"epoch": 0.0013675459747363875,
"grad_norm": 0.109375,
"grad_norm_var": 0.00010375579198201497,
"learning_rate": 0.0001,
"loss": 0.2217,
"loss/crossentropy": 2.432914614677429,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.22167058289051056,
"step": 95
},
{
"epoch": 0.0013819411955230862,
"grad_norm": 0.103515625,
"grad_norm_var": 0.00011195242404937744,
"learning_rate": 0.0001,
"loss": 0.198,
"loss/crossentropy": 2.522903800010681,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.19795683026313782,
"step": 96
},
{
"epoch": 0.0013963364163097852,
"grad_norm": 0.109375,
"grad_norm_var": 0.00011272430419921875,
"learning_rate": 0.0001,
"loss": 0.2049,
"loss/crossentropy": 2.149677038192749,
"loss/fcd": 0.3984375,
"loss/idx": 18.0,
"loss/logits": 0.20487764477729797,
"step": 97
},
{
"epoch": 0.001410731637096484,
"grad_norm": 0.10546875,
"grad_norm_var": 0.00010592043399810791,
"learning_rate": 0.0001,
"loss": 0.2057,
"loss/crossentropy": 2.467462182044983,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.20570345222949982,
"step": 98
},
{
"epoch": 0.0014251268578831828,
"grad_norm": 0.12890625,
"grad_norm_var": 0.00011771519978841146,
"learning_rate": 0.0001,
"loss": 0.2105,
"loss/crossentropy": 2.353211760520935,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.21045749634504318,
"step": 99
},
{
"epoch": 0.0014395220786698815,
"grad_norm": 0.1279296875,
"grad_norm_var": 0.00010607639948527019,
"learning_rate": 0.0001,
"loss": 0.2903,
"loss/crossentropy": 2.590612769126892,
"loss/fcd": 0.47265625,
"loss/idx": 18.0,
"loss/logits": 0.290309339761734,
"step": 100
},
{
"epoch": 0.0014539172994565805,
"grad_norm": 0.1259765625,
"grad_norm_var": 0.00010594924290974935,
"learning_rate": 0.0001,
"loss": 0.2467,
"loss/crossentropy": 2.3608927726745605,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.24666306376457214,
"step": 101
},
{
"epoch": 0.0014683125202432793,
"grad_norm": 0.1298828125,
"grad_norm_var": 0.00011356671651204427,
"learning_rate": 0.0001,
"loss": 0.2413,
"loss/crossentropy": 2.1008136868476868,
"loss/fcd": 0.4521484375,
"loss/idx": 18.0,
"loss/logits": 0.24129530787467957,
"step": 102
},
{
"epoch": 0.001482707741029978,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.0001156767209370931,
"learning_rate": 0.0001,
"loss": 0.2219,
"loss/crossentropy": 2.36824232339859,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.2218664586544037,
"step": 103
},
{
"epoch": 0.0014971029618166768,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.00011526346206665039,
"learning_rate": 0.0001,
"loss": 0.2459,
"loss/crossentropy": 2.3991124629974365,
"loss/fcd": 0.4423828125,
"loss/idx": 18.0,
"loss/logits": 0.24585890769958496,
"step": 104
},
{
"epoch": 0.0015114981826033756,
"grad_norm": 0.1005859375,
"grad_norm_var": 0.00013441145420074464,
"learning_rate": 0.0001,
"loss": 0.2028,
"loss/crossentropy": 2.5206661224365234,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.20282022655010223,
"step": 105
},
{
"epoch": 0.0015258934033900746,
"grad_norm": 0.123046875,
"grad_norm_var": 8.746683597564698e-05,
"learning_rate": 0.0001,
"loss": 0.2339,
"loss/crossentropy": 2.294739842414856,
"loss/fcd": 0.4619140625,
"loss/idx": 18.0,
"loss/logits": 0.2339302897453308,
"step": 106
},
{
"epoch": 0.0015402886241767733,
"grad_norm": 0.11181640625,
"grad_norm_var": 8.897781372070312e-05,
"learning_rate": 0.0001,
"loss": 0.2028,
"loss/crossentropy": 2.430526852607727,
"loss/fcd": 0.404296875,
"loss/idx": 18.0,
"loss/logits": 0.20277925580739975,
"step": 107
},
{
"epoch": 0.001554683844963472,
"grad_norm": 0.10400390625,
"grad_norm_var": 9.490549564361572e-05,
"learning_rate": 0.0001,
"loss": 0.1685,
"loss/crossentropy": 1.9886462688446045,
"loss/fcd": 0.4814453125,
"loss/idx": 18.0,
"loss/logits": 0.16851283982396126,
"step": 108
},
{
"epoch": 0.0015690790657501708,
"grad_norm": 0.11376953125,
"grad_norm_var": 9.39329465230306e-05,
"learning_rate": 0.0001,
"loss": 0.2232,
"loss/crossentropy": 2.3031085729599,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.2231953889131546,
"step": 109
},
{
"epoch": 0.0015834742865368698,
"grad_norm": 0.1064453125,
"grad_norm_var": 9.844700495402019e-05,
"learning_rate": 0.0001,
"loss": 0.2145,
"loss/crossentropy": 2.4420076608657837,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.2144630402326584,
"step": 110
},
{
"epoch": 0.0015978695073235686,
"grad_norm": 0.10693359375,
"grad_norm_var": 0.00010046859582265218,
"learning_rate": 0.0001,
"loss": 0.1999,
"loss/crossentropy": 2.265585422515869,
"loss/fcd": 0.3876953125,
"loss/idx": 18.0,
"loss/logits": 0.19986777007579803,
"step": 111
},
{
"epoch": 0.0016122647281102674,
"grad_norm": 0.11572265625,
"grad_norm_var": 9.224812189737956e-05,
"learning_rate": 0.0001,
"loss": 0.2391,
"loss/crossentropy": 2.5880415439605713,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.23910276591777802,
"step": 112
},
{
"epoch": 0.0016266599488969661,
"grad_norm": 0.1181640625,
"grad_norm_var": 9.04242197672526e-05,
"learning_rate": 0.0001,
"loss": 0.2466,
"loss/crossentropy": 2.6048234701156616,
"loss/fcd": 0.47265625,
"loss/idx": 18.0,
"loss/logits": 0.24656572192907333,
"step": 113
},
{
"epoch": 0.0016410551696836651,
"grad_norm": 0.134765625,
"grad_norm_var": 0.00010449091593424479,
"learning_rate": 0.0001,
"loss": 0.2386,
"loss/crossentropy": 2.1900378465652466,
"loss/fcd": 0.484375,
"loss/idx": 18.0,
"loss/logits": 0.23863784968852997,
"step": 114
},
{
"epoch": 0.0016554503904703639,
"grad_norm": 0.10986328125,
"grad_norm_var": 9.802083174387614e-05,
"learning_rate": 0.0001,
"loss": 0.2123,
"loss/crossentropy": 2.547809600830078,
"loss/fcd": 0.4423828125,
"loss/idx": 18.0,
"loss/logits": 0.21230217069387436,
"step": 115
},
{
"epoch": 0.0016698456112570626,
"grad_norm": 0.255859375,
"grad_norm_var": 0.0013202657302220663,
"learning_rate": 0.0001,
"loss": 0.2161,
"loss/crossentropy": 2.586913585662842,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.2160758599638939,
"step": 116
},
{
"epoch": 0.0016842408320437614,
"grad_norm": 0.1171875,
"grad_norm_var": 0.0013230552275975546,
"learning_rate": 0.0001,
"loss": 0.2282,
"loss/crossentropy": 2.3776031732559204,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.2281685397028923,
"step": 117
},
{
"epoch": 0.0016986360528304604,
"grad_norm": 0.11572265625,
"grad_norm_var": 0.0013238906860351563,
"learning_rate": 0.0001,
"loss": 0.2269,
"loss/crossentropy": 2.5417513847351074,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.226931631565094,
"step": 118
},
{
"epoch": 0.0017130312736171592,
"grad_norm": 0.109375,
"grad_norm_var": 0.0013291825850804647,
"learning_rate": 0.0001,
"loss": 0.2172,
"loss/crossentropy": 2.416541814804077,
"loss/fcd": 0.40625,
"loss/idx": 18.0,
"loss/logits": 0.2172057330608368,
"step": 119
},
{
"epoch": 0.001727426494403858,
"grad_norm": 0.1103515625,
"grad_norm_var": 0.0013376067082087198,
"learning_rate": 0.0001,
"loss": 0.2088,
"loss/crossentropy": 2.3803776502609253,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.20880089700222015,
"step": 120
},
{
"epoch": 0.0017418217151905567,
"grad_norm": 0.11962890625,
"grad_norm_var": 0.0013056437174479166,
"learning_rate": 0.0001,
"loss": 0.2246,
"loss/crossentropy": 2.4869107007980347,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.22459682077169418,
"step": 121
},
{
"epoch": 0.0017562169359772554,
"grad_norm": 0.10595703125,
"grad_norm_var": 0.0013244539499282838,
"learning_rate": 0.0001,
"loss": 0.2141,
"loss/crossentropy": 2.2889301776885986,
"loss/fcd": 0.4130859375,
"loss/idx": 18.0,
"loss/logits": 0.21405386179685593,
"step": 122
},
{
"epoch": 0.0017706121567639544,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.0013290554285049438,
"learning_rate": 0.0001,
"loss": 0.183,
"loss/crossentropy": 2.2636550664901733,
"loss/fcd": 0.3916015625,
"loss/idx": 18.0,
"loss/logits": 0.1829545795917511,
"step": 123
},
{
"epoch": 0.0017850073775506532,
"grad_norm": 0.123046875,
"grad_norm_var": 0.0013059258460998535,
"learning_rate": 0.0001,
"loss": 0.2259,
"loss/crossentropy": 2.3760812282562256,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.2258809506893158,
"step": 124
},
{
"epoch": 0.001799402598337352,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.001315462589263916,
"learning_rate": 0.0001,
"loss": 0.2041,
"loss/crossentropy": 2.595892906188965,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.20406261831521988,
"step": 125
},
{
"epoch": 0.0018137978191240507,
"grad_norm": 0.126953125,
"grad_norm_var": 0.001296854019165039,
"learning_rate": 0.0001,
"loss": 0.2131,
"loss/crossentropy": 2.3521647453308105,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.21306610107421875,
"step": 126
},
{
"epoch": 0.0018281930399107497,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.0013095498085021972,
"learning_rate": 0.0001,
"loss": 0.2135,
"loss/crossentropy": 2.5395818948745728,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.21354226768016815,
"step": 127
},
{
"epoch": 0.0018425882606974485,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.0013218204180399577,
"learning_rate": 0.0001,
"loss": 0.1906,
"loss/crossentropy": 2.154847741127014,
"loss/fcd": 0.3994140625,
"loss/idx": 18.0,
"loss/logits": 0.19056915491819382,
"step": 128
},
{
"epoch": 0.0018569834814841472,
"grad_norm": 0.11962890625,
"grad_norm_var": 0.0013209412495295207,
"learning_rate": 0.0001,
"loss": 0.2376,
"loss/crossentropy": 2.3668060302734375,
"loss/fcd": 0.4482421875,
"loss/idx": 18.0,
"loss/logits": 0.23755235970020294,
"step": 129
},
{
"epoch": 0.001871378702270846,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.0013325204451878866,
"learning_rate": 0.0001,
"loss": 0.2072,
"loss/crossentropy": 2.4412275552749634,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.20715615153312683,
"step": 130
},
{
"epoch": 0.001885773923057545,
"grad_norm": 0.11865234375,
"grad_norm_var": 0.0013236512740453085,
"learning_rate": 0.0001,
"loss": 0.2363,
"loss/crossentropy": 2.589287519454956,
"loss/fcd": 0.4853515625,
"loss/idx": 18.0,
"loss/logits": 0.2362738400697708,
"step": 131
},
{
"epoch": 0.0019001691438442438,
"grad_norm": 0.1181640625,
"grad_norm_var": 5.292793114980062e-05,
"learning_rate": 0.0001,
"loss": 0.1757,
"loss/crossentropy": 2.1394956707954407,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.17568951100111008,
"step": 132
},
{
"epoch": 0.0019145643646309425,
"grad_norm": 0.1044921875,
"grad_norm_var": 5.675057570139567e-05,
"learning_rate": 0.0001,
"loss": 0.2141,
"loss/crossentropy": 2.5705530643463135,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.21412815153598785,
"step": 133
},
{
"epoch": 0.0019289595854176413,
"grad_norm": 0.1171875,
"grad_norm_var": 5.7474772135416666e-05,
"learning_rate": 0.0001,
"loss": 0.2091,
"loss/crossentropy": 2.2588201761245728,
"loss/fcd": 0.400390625,
"loss/idx": 18.0,
"loss/logits": 0.20908734947443008,
"step": 134
},
{
"epoch": 0.00194335480620434,
"grad_norm": 0.10595703125,
"grad_norm_var": 5.976259708404541e-05,
"learning_rate": 0.0001,
"loss": 0.2167,
"loss/crossentropy": 2.432557463645935,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.2166854664683342,
"step": 135
},
{
"epoch": 0.001957750026991039,
"grad_norm": 0.11328125,
"grad_norm_var": 5.942881107330322e-05,
"learning_rate": 0.0001,
"loss": 0.2177,
"loss/crossentropy": 2.4058191776275635,
"loss/fcd": 0.4443359375,
"loss/idx": 18.0,
"loss/logits": 0.21774785220623016,
"step": 136
},
{
"epoch": 0.001972145247777738,
"grad_norm": 0.1044921875,
"grad_norm_var": 5.98907470703125e-05,
"learning_rate": 0.0001,
"loss": 0.1946,
"loss/crossentropy": 2.441463589668274,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.19459661096334457,
"step": 137
},
{
"epoch": 0.0019865404685644366,
"grad_norm": 0.12353515625,
"grad_norm_var": 6.546974182128907e-05,
"learning_rate": 0.0001,
"loss": 0.2507,
"loss/crossentropy": 2.5539783239364624,
"loss/fcd": 0.484375,
"loss/idx": 18.0,
"loss/logits": 0.25072282552719116,
"step": 138
},
{
"epoch": 0.0020009356893511353,
"grad_norm": 0.0986328125,
"grad_norm_var": 7.754862308502197e-05,
"learning_rate": 0.0001,
"loss": 0.2023,
"loss/crossentropy": 2.5158984661102295,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.2023158147931099,
"step": 139
},
{
"epoch": 0.002015330910137834,
"grad_norm": 0.109375,
"grad_norm_var": 6.959338982899983e-05,
"learning_rate": 0.0001,
"loss": 0.1937,
"loss/crossentropy": 2.2275065183639526,
"loss/fcd": 0.4248046875,
"loss/idx": 18.0,
"loss/logits": 0.19366320967674255,
"step": 140
},
{
"epoch": 0.002029726130924533,
"grad_norm": 0.1171875,
"grad_norm_var": 7.063150405883789e-05,
"learning_rate": 0.0001,
"loss": 0.1863,
"loss/crossentropy": 2.422375202178955,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.18625369668006897,
"step": 141
},
{
"epoch": 0.002044121351711232,
"grad_norm": 0.11474609375,
"grad_norm_var": 5.560616652170817e-05,
"learning_rate": 0.0001,
"loss": 0.207,
"loss/crossentropy": 2.209444999694824,
"loss/fcd": 0.4248046875,
"loss/idx": 18.0,
"loss/logits": 0.2070077657699585,
"step": 142
},
{
"epoch": 0.002058516572497931,
"grad_norm": 0.11083984375,
"grad_norm_var": 4.966954390207927e-05,
"learning_rate": 0.0001,
"loss": 0.2254,
"loss/crossentropy": 2.641687750816345,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.2254098877310753,
"step": 143
},
{
"epoch": 0.0020729117932846296,
"grad_norm": 0.1083984375,
"grad_norm_var": 4.943211873372396e-05,
"learning_rate": 0.0001,
"loss": 0.2174,
"loss/crossentropy": 2.4751927852630615,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.2174309641122818,
"step": 144
},
{
"epoch": 0.0020873070140713284,
"grad_norm": 0.10986328125,
"grad_norm_var": 4.522005716959635e-05,
"learning_rate": 0.0001,
"loss": 0.2059,
"loss/crossentropy": 2.703999638557434,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.20589765906333923,
"step": 145
},
{
"epoch": 0.002101702234858027,
"grad_norm": 0.11474609375,
"grad_norm_var": 4.261235396067301e-05,
"learning_rate": 0.0001,
"loss": 0.2243,
"loss/crossentropy": 2.3885515928268433,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.22432449460029602,
"step": 146
},
{
"epoch": 0.002116097455644726,
"grad_norm": 0.1142578125,
"grad_norm_var": 3.983179728190104e-05,
"learning_rate": 0.0001,
"loss": 0.2524,
"loss/crossentropy": 2.471445918083191,
"loss/fcd": 0.470703125,
"loss/idx": 18.0,
"loss/logits": 0.25243769586086273,
"step": 147
},
{
"epoch": 0.0021304926764314247,
"grad_norm": 0.10693359375,
"grad_norm_var": 3.784398237864176e-05,
"learning_rate": 0.0001,
"loss": 0.2059,
"loss/crossentropy": 2.4856609106063843,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.20586465299129486,
"step": 148
},
{
"epoch": 0.0021448878972181234,
"grad_norm": 0.1240234375,
"grad_norm_var": 4.507601261138916e-05,
"learning_rate": 0.0001,
"loss": 0.238,
"loss/crossentropy": 2.4825209379196167,
"loss/fcd": 0.4912109375,
"loss/idx": 18.0,
"loss/logits": 0.23801030218601227,
"step": 149
},
{
"epoch": 0.002159283118004822,
"grad_norm": 0.11279296875,
"grad_norm_var": 4.329681396484375e-05,
"learning_rate": 0.0001,
"loss": 0.218,
"loss/crossentropy": 2.373395562171936,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.21804769337177277,
"step": 150
},
{
"epoch": 0.0021736783387915214,
"grad_norm": 0.1162109375,
"grad_norm_var": 4.1857361793518066e-05,
"learning_rate": 0.0001,
"loss": 0.2016,
"loss/crossentropy": 2.242987275123596,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.2015869840979576,
"step": 151
},
{
"epoch": 0.00218807355957822,
"grad_norm": 0.1103515625,
"grad_norm_var": 4.2071938514709474e-05,
"learning_rate": 0.0001,
"loss": 0.2289,
"loss/crossentropy": 2.6060279607772827,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.2288510948419571,
"step": 152
},
{
"epoch": 0.002202468780364919,
"grad_norm": 0.10595703125,
"grad_norm_var": 4.068613052368164e-05,
"learning_rate": 0.0001,
"loss": 0.2122,
"loss/crossentropy": 2.4911882877349854,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.2122008204460144,
"step": 153
},
{
"epoch": 0.0022168640011516177,
"grad_norm": 0.11279296875,
"grad_norm_var": 3.1900405883789065e-05,
"learning_rate": 0.0001,
"loss": 0.1964,
"loss/crossentropy": 2.2283207178115845,
"loss/fcd": 0.388671875,
"loss/idx": 18.0,
"loss/logits": 0.19640249013900757,
"step": 154
},
{
"epoch": 0.0022312592219383164,
"grad_norm": 0.1240234375,
"grad_norm_var": 2.7974446614583332e-05,
"learning_rate": 0.0001,
"loss": 0.2518,
"loss/crossentropy": 2.6885886192321777,
"loss/fcd": 0.498046875,
"loss/idx": 18.0,
"loss/logits": 0.251840204000473,
"step": 155
},
{
"epoch": 0.002245654442725015,
"grad_norm": 0.1123046875,
"grad_norm_var": 2.698500951131185e-05,
"learning_rate": 0.0001,
"loss": 0.2217,
"loss/crossentropy": 2.3278268575668335,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.22167562693357468,
"step": 156
},
{
"epoch": 0.002260049663511714,
"grad_norm": 0.1171875,
"grad_norm_var": 2.698500951131185e-05,
"learning_rate": 0.0001,
"loss": 0.2134,
"loss/crossentropy": 2.2359228134155273,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.21338575333356857,
"step": 157
},
{
"epoch": 0.0022744448842984127,
"grad_norm": 0.1220703125,
"grad_norm_var": 3.1589468320210776e-05,
"learning_rate": 0.0001,
"loss": 0.1983,
"loss/crossentropy": 2.1452057361602783,
"loss/fcd": 0.40625,
"loss/idx": 18.0,
"loss/logits": 0.19825652241706848,
"step": 158
},
{
"epoch": 0.002288840105085112,
"grad_norm": 0.11328125,
"grad_norm_var": 3.095865249633789e-05,
"learning_rate": 0.0001,
"loss": 0.2048,
"loss/crossentropy": 2.075889527797699,
"loss/fcd": 0.400390625,
"loss/idx": 18.0,
"loss/logits": 0.20479386299848557,
"step": 159
},
{
"epoch": 0.0023032353258718107,
"grad_norm": 0.123046875,
"grad_norm_var": 3.3283233642578124e-05,
"learning_rate": 0.0001,
"loss": 0.206,
"loss/crossentropy": 2.2650269269943237,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.20604287087917328,
"step": 160
},
{
"epoch": 0.0023176305466585095,
"grad_norm": 0.11328125,
"grad_norm_var": 3.167688846588135e-05,
"learning_rate": 0.0001,
"loss": 0.2225,
"loss/crossentropy": 2.385145902633667,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.22251462936401367,
"step": 161
},
{
"epoch": 0.0023320257674452082,
"grad_norm": 0.11669921875,
"grad_norm_var": 3.179609775543213e-05,
"learning_rate": 0.0001,
"loss": 0.2162,
"loss/crossentropy": 2.3363062143325806,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.21623297035694122,
"step": 162
},
{
"epoch": 0.002346420988231907,
"grad_norm": 0.11328125,
"grad_norm_var": 3.199477990468343e-05,
"learning_rate": 0.0001,
"loss": 0.2196,
"loss/crossentropy": 2.258102059364319,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.219633050262928,
"step": 163
},
{
"epoch": 0.0023608162090186058,
"grad_norm": 0.11328125,
"grad_norm_var": 2.7461846669514975e-05,
"learning_rate": 0.0001,
"loss": 0.2229,
"loss/crossentropy": 2.477385640144348,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.22293243557214737,
"step": 164
},
{
"epoch": 0.0023752114298053045,
"grad_norm": 0.11376953125,
"grad_norm_var": 2.260108788808187e-05,
"learning_rate": 0.0001,
"loss": 0.206,
"loss/crossentropy": 2.5965325832366943,
"loss/fcd": 0.4345703125,
"loss/idx": 18.0,
"loss/logits": 0.20604980736970901,
"step": 165
},
{
"epoch": 0.0023896066505920033,
"grad_norm": 0.12158203125,
"grad_norm_var": 2.48183806737264e-05,
"learning_rate": 0.0001,
"loss": 0.2456,
"loss/crossentropy": 2.391031265258789,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.24561651051044464,
"step": 166
},
{
"epoch": 0.002404001871378702,
"grad_norm": 0.11083984375,
"grad_norm_var": 2.616246541341146e-05,
"learning_rate": 0.0001,
"loss": 0.2163,
"loss/crossentropy": 2.534990668296814,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.21634604781866074,
"step": 167
},
{
"epoch": 0.0024183970921654013,
"grad_norm": 0.130859375,
"grad_norm_var": 3.909667332967122e-05,
"learning_rate": 0.0001,
"loss": 0.2477,
"loss/crossentropy": 2.354380965232849,
"loss/fcd": 0.482421875,
"loss/idx": 18.0,
"loss/logits": 0.24768973886966705,
"step": 168
},
{
"epoch": 0.0024327923129521,
"grad_norm": 0.1142578125,
"grad_norm_var": 3.171662489573161e-05,
"learning_rate": 0.0001,
"loss": 0.2411,
"loss/crossentropy": 2.430347204208374,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.2411317229270935,
"step": 169
},
{
"epoch": 0.002447187533738799,
"grad_norm": 0.11328125,
"grad_norm_var": 3.145535786946615e-05,
"learning_rate": 0.0001,
"loss": 0.2188,
"loss/crossentropy": 2.312503755092621,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.21878328174352646,
"step": 170
},
{
"epoch": 0.0024615827545254976,
"grad_norm": 0.109375,
"grad_norm_var": 3.1276543935139975e-05,
"learning_rate": 0.0001,
"loss": 0.2309,
"loss/crossentropy": 2.5175788402557373,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.23088021576404572,
"step": 171
},
{
"epoch": 0.0024759779753121963,
"grad_norm": 0.1123046875,
"grad_norm_var": 3.1276543935139975e-05,
"learning_rate": 0.0001,
"loss": 0.2165,
"loss/crossentropy": 2.484018087387085,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.2165074348449707,
"step": 172
},
{
"epoch": 0.002490373196098895,
"grad_norm": 0.1318359375,
"grad_norm_var": 4.671414693196615e-05,
"learning_rate": 0.0001,
"loss": 0.2481,
"loss/crossentropy": 2.2699760794639587,
"loss/fcd": 0.486328125,
"loss/idx": 18.0,
"loss/logits": 0.24809680879116058,
"step": 173
},
{
"epoch": 0.002504768416885594,
"grad_norm": 0.10693359375,
"grad_norm_var": 5.0933162371317545e-05,
"learning_rate": 0.0001,
"loss": 0.1955,
"loss/crossentropy": 2.2288765907287598,
"loss/fcd": 0.3857421875,
"loss/idx": 18.0,
"loss/logits": 0.19553960859775543,
"step": 174
},
{
"epoch": 0.0025191636376722926,
"grad_norm": 0.1220703125,
"grad_norm_var": 5.243519941965739e-05,
"learning_rate": 0.0001,
"loss": 0.213,
"loss/crossentropy": 2.4654963612556458,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.2130081057548523,
"step": 175
},
{
"epoch": 0.002533558858458992,
"grad_norm": 0.1162109375,
"grad_norm_var": 4.954238732655843e-05,
"learning_rate": 0.0001,
"loss": 0.21,
"loss/crossentropy": 2.2151373624801636,
"loss/fcd": 0.4072265625,
"loss/idx": 18.0,
"loss/logits": 0.20995519310235977,
"step": 176
},
{
"epoch": 0.0025479540792456906,
"grad_norm": 0.11669921875,
"grad_norm_var": 4.892349243164062e-05,
"learning_rate": 0.0001,
"loss": 0.2092,
"loss/crossentropy": 2.4239630699157715,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.20919281244277954,
"step": 177
},
{
"epoch": 0.0025623493000323893,
"grad_norm": 0.11572265625,
"grad_norm_var": 4.8951307932535806e-05,
"learning_rate": 0.0001,
"loss": 0.2638,
"loss/crossentropy": 2.718831419944763,
"loss/fcd": 0.4755859375,
"loss/idx": 18.0,
"loss/logits": 0.2638430893421173,
"step": 178
},
{
"epoch": 0.002576744520819088,
"grad_norm": 0.10302734375,
"grad_norm_var": 5.977849165598551e-05,
"learning_rate": 0.0001,
"loss": 0.1942,
"loss/crossentropy": 2.4341124296188354,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.1942092925310135,
"step": 179
},
{
"epoch": 0.002591139741605787,
"grad_norm": 0.1142578125,
"grad_norm_var": 5.9516231218973795e-05,
"learning_rate": 0.0001,
"loss": 0.223,
"loss/crossentropy": 2.3784589767456055,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.2230425328016281,
"step": 180
},
{
"epoch": 0.0026055349623924856,
"grad_norm": 0.11083984375,
"grad_norm_var": 6.085137526194254e-05,
"learning_rate": 0.0001,
"loss": 0.1995,
"loss/crossentropy": 2.1103312969207764,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.1994745284318924,
"step": 181
},
{
"epoch": 0.0026199301831791844,
"grad_norm": 0.1123046875,
"grad_norm_var": 5.8869520823160805e-05,
"learning_rate": 0.0001,
"loss": 0.2228,
"loss/crossentropy": 2.173603892326355,
"loss/fcd": 0.4228515625,
"loss/idx": 18.0,
"loss/logits": 0.22283250093460083,
"step": 182
},
{
"epoch": 0.002634325403965883,
"grad_norm": 0.12353515625,
"grad_norm_var": 6.18139902750651e-05,
"learning_rate": 0.0001,
"loss": 0.2399,
"loss/crossentropy": 2.3933345079421997,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.23994869738817215,
"step": 183
},
{
"epoch": 0.002648720624752582,
"grad_norm": 0.12890625,
"grad_norm_var": 5.8142344156901043e-05,
"learning_rate": 0.0001,
"loss": 0.2447,
"loss/crossentropy": 2.5679067373275757,
"loss/fcd": 0.474609375,
"loss/idx": 18.0,
"loss/logits": 0.24468251317739487,
"step": 184
},
{
"epoch": 0.002663115845539281,
"grad_norm": 0.11279296875,
"grad_norm_var": 5.8562556902567545e-05,
"learning_rate": 0.0001,
"loss": 0.1861,
"loss/crossentropy": 1.966173768043518,
"loss/fcd": 0.3759765625,
"loss/idx": 18.0,
"loss/logits": 0.18611325323581696,
"step": 185
},
{
"epoch": 0.00267751106632598,
"grad_norm": 0.1865234375,
"grad_norm_var": 0.0003708908955256144,
"learning_rate": 0.0001,
"loss": 0.3475,
"loss/crossentropy": 2.386851668357849,
"loss/fcd": 0.560546875,
"loss/idx": 18.0,
"loss/logits": 0.3474508970975876,
"step": 186
},
{
"epoch": 0.0026919062871126787,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.0003666838010152181,
"learning_rate": 0.0001,
"loss": 0.2127,
"loss/crossentropy": 2.4003021717071533,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.21274058520793915,
"step": 187
},
{
"epoch": 0.0027063015078993774,
"grad_norm": 0.15234375,
"grad_norm_var": 0.00042354265848795574,
"learning_rate": 0.0001,
"loss": 0.2758,
"loss/crossentropy": 2.218628406524658,
"loss/fcd": 0.4873046875,
"loss/idx": 18.0,
"loss/logits": 0.2757628411054611,
"step": 188
},
{
"epoch": 0.002720696728686076,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.00042761067549387615,
"learning_rate": 0.0001,
"loss": 0.1907,
"loss/crossentropy": 2.1557281017303467,
"loss/fcd": 0.388671875,
"loss/idx": 18.0,
"loss/logits": 0.1906721442937851,
"step": 189
},
{
"epoch": 0.002735091949472775,
"grad_norm": 0.10595703125,
"grad_norm_var": 0.000429573655128479,
"learning_rate": 0.0001,
"loss": 0.1882,
"loss/crossentropy": 2.047899842262268,
"loss/fcd": 0.3857421875,
"loss/idx": 18.0,
"loss/logits": 0.1881674826145172,
"step": 190
},
{
"epoch": 0.0027494871702594737,
"grad_norm": 0.10302734375,
"grad_norm_var": 0.00045076608657836916,
"learning_rate": 0.0001,
"loss": 0.1987,
"loss/crossentropy": 2.2902016639709473,
"loss/fcd": 0.3974609375,
"loss/idx": 18.0,
"loss/logits": 0.1987495943903923,
"step": 191
},
{
"epoch": 0.0027638823910461725,
"grad_norm": 0.107421875,
"grad_norm_var": 0.0004603862762451172,
"learning_rate": 0.0001,
"loss": 0.1967,
"loss/crossentropy": 2.296987771987915,
"loss/fcd": 0.3837890625,
"loss/idx": 18.0,
"loss/logits": 0.1967175006866455,
"step": 192
},
{
"epoch": 0.0027782776118328717,
"grad_norm": 0.119140625,
"grad_norm_var": 0.00045976539452870685,
"learning_rate": 0.0001,
"loss": 0.2354,
"loss/crossentropy": 2.2293859124183655,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.23544982075691223,
"step": 193
},
{
"epoch": 0.0027926728326195705,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.0004739085833231608,
"learning_rate": 0.0001,
"loss": 0.2093,
"loss/crossentropy": 2.3077027797698975,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.20933127403259277,
"step": 194
},
{
"epoch": 0.0028070680534062692,
"grad_norm": 0.11767578125,
"grad_norm_var": 0.0004557291666666667,
"learning_rate": 0.0001,
"loss": 0.2349,
"loss/crossentropy": 2.5241353511810303,
"loss/fcd": 0.423828125,
"loss/idx": 18.0,
"loss/logits": 0.23492421209812164,
"step": 195
},
{
"epoch": 0.002821463274192968,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.0004617283741633097,
"learning_rate": 0.0001,
"loss": 0.2089,
"loss/crossentropy": 2.2112027406692505,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.20893365144729614,
"step": 196
},
{
"epoch": 0.0028358584949796668,
"grad_norm": 0.095703125,
"grad_norm_var": 0.0004940946896870931,
"learning_rate": 0.0001,
"loss": 0.1738,
"loss/crossentropy": 2.3283063173294067,
"loss/fcd": 0.392578125,
"loss/idx": 18.0,
"loss/logits": 0.1738404482603073,
"step": 197
},
{
"epoch": 0.0028502537157663655,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.0004936844110488891,
"learning_rate": 0.0001,
"loss": 0.2259,
"loss/crossentropy": 2.4649304151535034,
"loss/fcd": 0.4697265625,
"loss/idx": 18.0,
"loss/logits": 0.22589464485645294,
"step": 198
},
{
"epoch": 0.0028646489365530643,
"grad_norm": 0.142578125,
"grad_norm_var": 0.0005282044410705566,
"learning_rate": 0.0001,
"loss": 0.2334,
"loss/crossentropy": 2.4893065690994263,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.23335154354572296,
"step": 199
},
{
"epoch": 0.002879044157339763,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.000536648432413737,
"learning_rate": 0.0001,
"loss": 0.2107,
"loss/crossentropy": 2.5291190147399902,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.2106790393590927,
"step": 200
},
{
"epoch": 0.002893439378126462,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.000536648432413737,
"learning_rate": 0.0001,
"loss": 0.2286,
"loss/crossentropy": 2.5203051567077637,
"loss/fcd": 0.4521484375,
"loss/idx": 18.0,
"loss/logits": 0.2286214381456375,
"step": 201
},
{
"epoch": 0.002907834598913161,
"grad_norm": 0.11181640625,
"grad_norm_var": 0.00020819405714670816,
"learning_rate": 0.0001,
"loss": 0.2162,
"loss/crossentropy": 2.1828808784484863,
"loss/fcd": 0.40234375,
"loss/idx": 18.0,
"loss/logits": 0.21623709797859192,
"step": 202
},
{
"epoch": 0.00292222981969986,
"grad_norm": 0.109375,
"grad_norm_var": 0.0002094109853108724,
"learning_rate": 0.0001,
"loss": 0.1716,
"loss/crossentropy": 1.858969271183014,
"loss/fcd": 0.5029296875,
"loss/idx": 18.0,
"loss/logits": 0.17157060280442238,
"step": 203
},
{
"epoch": 0.0029366250404865585,
"grad_norm": 0.11376953125,
"grad_norm_var": 0.0001033852497736613,
"learning_rate": 0.0001,
"loss": 0.2325,
"loss/crossentropy": 2.4954288005828857,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.23250436782836914,
"step": 204
},
{
"epoch": 0.0029510202612732573,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.00010505417982737223,
"learning_rate": 0.0001,
"loss": 0.2277,
"loss/crossentropy": 2.389811635017395,
"loss/fcd": 0.4482421875,
"loss/idx": 18.0,
"loss/logits": 0.22774703800678253,
"step": 205
},
{
"epoch": 0.002965415482059956,
"grad_norm": 0.1328125,
"grad_norm_var": 0.00012969573338826498,
"learning_rate": 0.0001,
"loss": 0.1985,
"loss/crossentropy": 2.144119679927826,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.19851599633693695,
"step": 206
},
{
"epoch": 0.002979810702846655,
"grad_norm": 0.1220703125,
"grad_norm_var": 0.00012617011864980062,
"learning_rate": 0.0001,
"loss": 0.2454,
"loss/crossentropy": 2.509921073913574,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.24539965391159058,
"step": 207
},
{
"epoch": 0.0029942059236333536,
"grad_norm": 0.1142578125,
"grad_norm_var": 0.0001226097345352173,
"learning_rate": 0.0001,
"loss": 0.238,
"loss/crossentropy": 2.336063265800476,
"loss/fcd": 0.478515625,
"loss/idx": 18.0,
"loss/logits": 0.237995944917202,
"step": 208
},
{
"epoch": 0.0030086011444200524,
"grad_norm": 0.1435546875,
"grad_norm_var": 0.0001734723647435506,
"learning_rate": 0.0001,
"loss": 0.2493,
"loss/crossentropy": 2.3922590017318726,
"loss/fcd": 0.484375,
"loss/idx": 18.0,
"loss/logits": 0.24932140111923218,
"step": 209
},
{
"epoch": 0.003022996365206751,
"grad_norm": 0.115234375,
"grad_norm_var": 0.0001635064681371053,
"learning_rate": 0.0001,
"loss": 0.2434,
"loss/crossentropy": 2.597308397293091,
"loss/fcd": 0.470703125,
"loss/idx": 18.0,
"loss/logits": 0.24338021874427795,
"step": 210
},
{
"epoch": 0.0030373915859934503,
"grad_norm": 0.1123046875,
"grad_norm_var": 0.00016493797302246093,
"learning_rate": 0.0001,
"loss": 0.2082,
"loss/crossentropy": 2.3584909439086914,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.20816650241613388,
"step": 211
},
{
"epoch": 0.003051786806780149,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.0001660307248433431,
"learning_rate": 0.0001,
"loss": 0.2122,
"loss/crossentropy": 2.3587781190872192,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.21221671998500824,
"step": 212
},
{
"epoch": 0.003066182027566848,
"grad_norm": 0.1298828125,
"grad_norm_var": 0.0001430829366048177,
"learning_rate": 0.0001,
"loss": 0.2491,
"loss/crossentropy": 2.4296464920043945,
"loss/fcd": 0.46875,
"loss/idx": 18.0,
"loss/logits": 0.24906984716653824,
"step": 213
},
{
"epoch": 0.0030805772483535466,
"grad_norm": 0.123046875,
"grad_norm_var": 0.00014130969842274984,
"learning_rate": 0.0001,
"loss": 0.2168,
"loss/crossentropy": 2.1808066368103027,
"loss/fcd": 0.4248046875,
"loss/idx": 18.0,
"loss/logits": 0.21684125810861588,
"step": 214
},
{
"epoch": 0.0030949724691402454,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.00011850098768870035,
"learning_rate": 0.0001,
"loss": 0.2052,
"loss/crossentropy": 2.3064663410186768,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.20520812273025513,
"step": 215
},
{
"epoch": 0.003109367689926944,
"grad_norm": 0.1162109375,
"grad_norm_var": 0.00010748604933420817,
"learning_rate": 0.0001,
"loss": 0.2414,
"loss/crossentropy": 2.4660093784332275,
"loss/fcd": 0.4609375,
"loss/idx": 18.0,
"loss/logits": 0.241433747112751,
"step": 216
},
{
"epoch": 0.003123762910713643,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.00010584890842437744,
"learning_rate": 0.0001,
"loss": 0.2161,
"loss/crossentropy": 2.2378053665161133,
"loss/fcd": 0.4130859375,
"loss/idx": 18.0,
"loss/logits": 0.21608934551477432,
"step": 217
},
{
"epoch": 0.0031381581315003417,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.00010330577691396078,
"learning_rate": 0.0001,
"loss": 0.2132,
"loss/crossentropy": 2.312962532043457,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.21319198608398438,
"step": 218
},
{
"epoch": 0.003152553352287041,
"grad_norm": 0.11474609375,
"grad_norm_var": 9.870529174804688e-05,
"learning_rate": 0.0001,
"loss": 0.2193,
"loss/crossentropy": 2.3573015928268433,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.21930547058582306,
"step": 219
},
{
"epoch": 0.0031669485730737397,
"grad_norm": 0.119140625,
"grad_norm_var": 9.701152642567952e-05,
"learning_rate": 0.0001,
"loss": 0.2496,
"loss/crossentropy": 2.6434515714645386,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.24958771467208862,
"step": 220
},
{
"epoch": 0.0031813437938604384,
"grad_norm": 0.1396484375,
"grad_norm_var": 0.0001229246457417806,
"learning_rate": 0.0001,
"loss": 0.2262,
"loss/crossentropy": 2.2807798981666565,
"loss/fcd": 0.50390625,
"loss/idx": 18.0,
"loss/logits": 0.22618486732244492,
"step": 221
},
{
"epoch": 0.003195739014647137,
"grad_norm": 0.12060546875,
"grad_norm_var": 0.00011207163333892822,
"learning_rate": 0.0001,
"loss": 0.2143,
"loss/crossentropy": 2.350602626800537,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.2142939791083336,
"step": 222
},
{
"epoch": 0.003210134235433836,
"grad_norm": 0.193359375,
"grad_norm_var": 0.00045262078444163,
"learning_rate": 0.0001,
"loss": 0.2248,
"loss/crossentropy": 2.7532432079315186,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.22482239454984665,
"step": 223
},
{
"epoch": 0.0032245294562205347,
"grad_norm": 0.1240234375,
"grad_norm_var": 0.000445746382077535,
"learning_rate": 0.0001,
"loss": 0.2287,
"loss/crossentropy": 2.397303342819214,
"loss/fcd": 0.4541015625,
"loss/idx": 18.0,
"loss/logits": 0.22872482240200043,
"step": 224
},
{
"epoch": 0.0032389246770072335,
"grad_norm": 0.119140625,
"grad_norm_var": 0.00042170584201812745,
"learning_rate": 0.0001,
"loss": 0.1917,
"loss/crossentropy": 2.161116361618042,
"loss/fcd": 0.390625,
"loss/idx": 18.0,
"loss/logits": 0.1917443946003914,
"step": 225
},
{
"epoch": 0.0032533198977939323,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.0004292130470275879,
"learning_rate": 0.0001,
"loss": 0.2043,
"loss/crossentropy": 2.131627917289734,
"loss/fcd": 0.392578125,
"loss/idx": 18.0,
"loss/logits": 0.20434105396270752,
"step": 226
},
{
"epoch": 0.003267715118580631,
"grad_norm": 0.12060546875,
"grad_norm_var": 0.0004218329985936483,
"learning_rate": 0.0001,
"loss": 0.218,
"loss/crossentropy": 2.5683807134628296,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.21802888065576553,
"step": 227
},
{
"epoch": 0.0032821103393673302,
"grad_norm": 0.134765625,
"grad_norm_var": 0.00041150649388631185,
"learning_rate": 0.0001,
"loss": 0.2244,
"loss/crossentropy": 2.4449127912521362,
"loss/fcd": 0.4658203125,
"loss/idx": 18.0,
"loss/logits": 0.2243650108575821,
"step": 228
},
{
"epoch": 0.003296505560154029,
"grad_norm": 0.11376953125,
"grad_norm_var": 0.00041737457116444905,
"learning_rate": 0.0001,
"loss": 0.1937,
"loss/crossentropy": 2.1692421436309814,
"loss/fcd": 0.3828125,
"loss/idx": 18.0,
"loss/logits": 0.19374938309192657,
"step": 229
},
{
"epoch": 0.0033109007809407278,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.0004300077756245931,
"learning_rate": 0.0001,
"loss": 0.2099,
"loss/crossentropy": 2.1626864671707153,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.20994187891483307,
"step": 230
},
{
"epoch": 0.0033252960017274265,
"grad_norm": 0.1123046875,
"grad_norm_var": 0.0004090269406636556,
"learning_rate": 0.0001,
"loss": 0.2224,
"loss/crossentropy": 2.4669238328933716,
"loss/fcd": 0.4404296875,
"loss/idx": 18.0,
"loss/logits": 0.2224324494600296,
"step": 231
},
{
"epoch": 0.0033396912225141253,
"grad_norm": 0.11328125,
"grad_norm_var": 0.00041254361470540363,
"learning_rate": 0.0001,
"loss": 0.241,
"loss/crossentropy": 2.534782886505127,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.24097825586795807,
"step": 232
},
{
"epoch": 0.003354086443300824,
"grad_norm": 0.11572265625,
"grad_norm_var": 0.000413509209950765,
"learning_rate": 0.0001,
"loss": 0.229,
"loss/crossentropy": 2.4167356491088867,
"loss/fcd": 0.4248046875,
"loss/idx": 18.0,
"loss/logits": 0.22896190732717514,
"step": 233
},
{
"epoch": 0.003368481664087523,
"grad_norm": 0.1220703125,
"grad_norm_var": 0.0004103730122248332,
"learning_rate": 0.0001,
"loss": 0.2494,
"loss/crossentropy": 2.544241964817047,
"loss/fcd": 0.478515625,
"loss/idx": 18.0,
"loss/logits": 0.2493698000907898,
"step": 234
},
{
"epoch": 0.0033828768848742216,
"grad_norm": 0.12060546875,
"grad_norm_var": 0.0004053423802057902,
"learning_rate": 0.0001,
"loss": 0.2229,
"loss/crossentropy": 2.656595230102539,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.22288895398378372,
"step": 235
},
{
"epoch": 0.0033972721056609208,
"grad_norm": 0.109375,
"grad_norm_var": 0.0004180183013280233,
"learning_rate": 0.0001,
"loss": 0.2,
"loss/crossentropy": 2.153246819972992,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.19998866319656372,
"step": 236
},
{
"epoch": 0.0034116673264476195,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.0004011462132136027,
"learning_rate": 0.0001,
"loss": 0.2159,
"loss/crossentropy": 2.3706564903259277,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.21588444709777832,
"step": 237
},
{
"epoch": 0.0034260625472343183,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.0004108498493830363,
"learning_rate": 0.0001,
"loss": 0.2155,
"loss/crossentropy": 2.377021312713623,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.21553778648376465,
"step": 238
},
{
"epoch": 0.003440457768021017,
"grad_norm": 0.10888671875,
"grad_norm_var": 4.942814509073893e-05,
"learning_rate": 0.0001,
"loss": 0.1947,
"loss/crossentropy": 2.1807267665863037,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.19467756152153015,
"step": 239
},
{
"epoch": 0.003454852988807716,
"grad_norm": 0.107421875,
"grad_norm_var": 4.976590474446614e-05,
"learning_rate": 0.0001,
"loss": 0.2297,
"loss/crossentropy": 2.5010019540786743,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.22967635095119476,
"step": 240
},
{
"epoch": 0.0034692482095944146,
"grad_norm": 0.12109375,
"grad_norm_var": 5.098978678385417e-05,
"learning_rate": 0.0001,
"loss": 0.2257,
"loss/crossentropy": 2.1949596405029297,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.2256992757320404,
"step": 241
},
{
"epoch": 0.0034836434303811134,
"grad_norm": 0.099609375,
"grad_norm_var": 6.52382771174113e-05,
"learning_rate": 0.0001,
"loss": 0.1786,
"loss/crossentropy": 2.3066688776016235,
"loss/fcd": 0.404296875,
"loss/idx": 18.0,
"loss/logits": 0.17856091260910034,
"step": 242
},
{
"epoch": 0.003498038651167812,
"grad_norm": 0.1357421875,
"grad_norm_var": 9.119908014933268e-05,
"learning_rate": 0.0001,
"loss": 0.2979,
"loss/crossentropy": 2.833424210548401,
"loss/fcd": 0.53125,
"loss/idx": 18.0,
"loss/logits": 0.29794102907180786,
"step": 243
},
{
"epoch": 0.003512433871954511,
"grad_norm": 0.11083984375,
"grad_norm_var": 6.642242272694906e-05,
"learning_rate": 0.0001,
"loss": 0.2084,
"loss/crossentropy": 2.4168113470077515,
"loss/fcd": 0.3994140625,
"loss/idx": 18.0,
"loss/logits": 0.2084333300590515,
"step": 244
},
{
"epoch": 0.00352682909274121,
"grad_norm": 0.10546875,
"grad_norm_var": 7.130304972330729e-05,
"learning_rate": 0.0001,
"loss": 0.2081,
"loss/crossentropy": 2.4122915267944336,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.20814163982868195,
"step": 245
},
{
"epoch": 0.003541224313527909,
"grad_norm": 0.11376953125,
"grad_norm_var": 7.022221883138021e-05,
"learning_rate": 0.0001,
"loss": 0.2177,
"loss/crossentropy": 2.357482075691223,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.2177310660481453,
"step": 246
},
{
"epoch": 0.0035556195343146076,
"grad_norm": 0.1064453125,
"grad_norm_var": 7.370313008626302e-05,
"learning_rate": 0.0001,
"loss": 0.225,
"loss/crossentropy": 2.329651951789856,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.22500982880592346,
"step": 247
},
{
"epoch": 0.0035700147551013064,
"grad_norm": 0.1123046875,
"grad_norm_var": 7.381041844685872e-05,
"learning_rate": 0.0001,
"loss": 0.177,
"loss/crossentropy": 2.0500356554985046,
"loss/fcd": 0.380859375,
"loss/idx": 18.0,
"loss/logits": 0.1770332083106041,
"step": 248
},
{
"epoch": 0.003584409975888005,
"grad_norm": 0.1123046875,
"grad_norm_var": 7.356703281402588e-05,
"learning_rate": 0.0001,
"loss": 0.1987,
"loss/crossentropy": 2.2625420093536377,
"loss/fcd": 0.3935546875,
"loss/idx": 18.0,
"loss/logits": 0.19871972501277924,
"step": 249
},
{
"epoch": 0.003598805196674704,
"grad_norm": 0.10791015625,
"grad_norm_var": 6.967782974243164e-05,
"learning_rate": 0.0001,
"loss": 0.1945,
"loss/crossentropy": 2.5878301858901978,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.19449464231729507,
"step": 250
},
{
"epoch": 0.0036132004174614027,
"grad_norm": 0.11962890625,
"grad_norm_var": 6.86804453531901e-05,
"learning_rate": 0.0001,
"loss": 0.218,
"loss/crossentropy": 2.4477245807647705,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.21796388924121857,
"step": 251
},
{
"epoch": 0.0036275956382481015,
"grad_norm": 0.12158203125,
"grad_norm_var": 7.302661736806234e-05,
"learning_rate": 0.0001,
"loss": 0.2601,
"loss/crossentropy": 2.5919313430786133,
"loss/fcd": 0.466796875,
"loss/idx": 18.0,
"loss/logits": 0.2600754201412201,
"step": 252
},
{
"epoch": 0.0036419908590348007,
"grad_norm": 0.1083984375,
"grad_norm_var": 7.251004378000895e-05,
"learning_rate": 0.0001,
"loss": 0.1969,
"loss/crossentropy": 2.3274489641189575,
"loss/fcd": 0.3916015625,
"loss/idx": 18.0,
"loss/logits": 0.1969192698597908,
"step": 253
},
{
"epoch": 0.0036563860798214994,
"grad_norm": 0.115234375,
"grad_norm_var": 7.236798604329428e-05,
"learning_rate": 0.0001,
"loss": 0.2279,
"loss/crossentropy": 2.4737610816955566,
"loss/fcd": 0.4326171875,
"loss/idx": 18.0,
"loss/logits": 0.227908656001091,
"step": 254
},
{
"epoch": 0.003670781300608198,
"grad_norm": 0.11669921875,
"grad_norm_var": 7.198651631673177e-05,
"learning_rate": 0.0001,
"loss": 0.2286,
"loss/crossentropy": 2.442078709602356,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.2285866141319275,
"step": 255
},
{
"epoch": 0.003685176521394897,
"grad_norm": 0.10986328125,
"grad_norm_var": 7.04119602839152e-05,
"learning_rate": 0.0001,
"loss": 0.2116,
"loss/crossentropy": 2.2948302030563354,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.21162152290344238,
"step": 256
},
{
"epoch": 0.0036995717421815957,
"grad_norm": 0.10498046875,
"grad_norm_var": 7.044474283854166e-05,
"learning_rate": 0.0001,
"loss": 0.2117,
"loss/crossentropy": 2.3752611875534058,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.21169160306453705,
"step": 257
},
{
"epoch": 0.0037139669629682945,
"grad_norm": 0.1044921875,
"grad_norm_var": 6.351073582967122e-05,
"learning_rate": 0.0001,
"loss": 0.1965,
"loss/crossentropy": 2.3940770626068115,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.19652695208787918,
"step": 258
},
{
"epoch": 0.0037283621837549932,
"grad_norm": 0.12451171875,
"grad_norm_var": 3.712077935536702e-05,
"learning_rate": 0.0001,
"loss": 0.2326,
"loss/crossentropy": 2.329423666000366,
"loss/fcd": 0.482421875,
"loss/idx": 18.0,
"loss/logits": 0.23256323486566544,
"step": 259
},
{
"epoch": 0.003742757404541692,
"grad_norm": 0.109375,
"grad_norm_var": 3.7511189778645836e-05,
"learning_rate": 0.0001,
"loss": 0.2077,
"loss/crossentropy": 2.2093913555145264,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.20770975947380066,
"step": 260
},
{
"epoch": 0.0037571526253283908,
"grad_norm": 0.11474609375,
"grad_norm_var": 3.47365935643514e-05,
"learning_rate": 0.0001,
"loss": 0.2247,
"loss/crossentropy": 2.3547682762145996,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.2247237116098404,
"step": 261
},
{
"epoch": 0.00377154784611509,
"grad_norm": 0.10400390625,
"grad_norm_var": 3.922681013743083e-05,
"learning_rate": 0.0001,
"loss": 0.188,
"loss/crossentropy": 2.2215335369110107,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.18795417994260788,
"step": 262
},
{
"epoch": 0.0037859430669017887,
"grad_norm": 0.11865234375,
"grad_norm_var": 3.945032755533854e-05,
"learning_rate": 0.0001,
"loss": 0.2405,
"loss/crossentropy": 2.5075334310531616,
"loss/fcd": 0.462890625,
"loss/idx": 18.0,
"loss/logits": 0.24054966121912003,
"step": 263
},
{
"epoch": 0.0038003382876884875,
"grad_norm": 0.10107421875,
"grad_norm_var": 4.806419213612874e-05,
"learning_rate": 0.0001,
"loss": 0.1904,
"loss/crossentropy": 2.4045649766921997,
"loss/fcd": 0.380859375,
"loss/idx": 18.0,
"loss/logits": 0.1903528869152069,
"step": 264
},
{
"epoch": 0.0038147335084751863,
"grad_norm": 0.1171875,
"grad_norm_var": 4.969338575998942e-05,
"learning_rate": 0.0001,
"loss": 0.2226,
"loss/crossentropy": 2.2266829013824463,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.2226012423634529,
"step": 265
},
{
"epoch": 0.003829128729261885,
"grad_norm": 0.11376953125,
"grad_norm_var": 4.833439985911051e-05,
"learning_rate": 0.0001,
"loss": 0.2003,
"loss/crossentropy": 2.052451729774475,
"loss/fcd": 0.4033203125,
"loss/idx": 18.0,
"loss/logits": 0.20034398138523102,
"step": 266
},
{
"epoch": 0.003843523950048584,
"grad_norm": 0.10400390625,
"grad_norm_var": 4.928807417551676e-05,
"learning_rate": 0.0001,
"loss": 0.2289,
"loss/crossentropy": 2.7160192728042603,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.22888437658548355,
"step": 267
},
{
"epoch": 0.0038579191708352826,
"grad_norm": 0.1201171875,
"grad_norm_var": 4.750887552897136e-05,
"learning_rate": 0.0001,
"loss": 0.2423,
"loss/crossentropy": 2.2038062810897827,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.24225647747516632,
"step": 268
},
{
"epoch": 0.0038723143916219813,
"grad_norm": 0.11474609375,
"grad_norm_var": 4.7237674395243326e-05,
"learning_rate": 0.0001,
"loss": 0.2423,
"loss/crossentropy": 2.5651720762252808,
"loss/fcd": 0.4423828125,
"loss/idx": 18.0,
"loss/logits": 0.24227841198444366,
"step": 269
},
{
"epoch": 0.00388670961240868,
"grad_norm": 0.11669921875,
"grad_norm_var": 4.798571268717448e-05,
"learning_rate": 0.0001,
"loss": 0.2367,
"loss/crossentropy": 2.645506978034973,
"loss/fcd": 0.470703125,
"loss/idx": 18.0,
"loss/logits": 0.23671862483024597,
"step": 270
},
{
"epoch": 0.0039011048331953793,
"grad_norm": 0.10791015625,
"grad_norm_var": 4.752079645792643e-05,
"learning_rate": 0.0001,
"loss": 0.2177,
"loss/crossentropy": 2.5453277826309204,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.21773213893175125,
"step": 271
},
{
"epoch": 0.003915500053982078,
"grad_norm": 0.11181640625,
"grad_norm_var": 4.729827245076497e-05,
"learning_rate": 0.0001,
"loss": 0.1938,
"loss/crossentropy": 2.4203790426254272,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.19378525018692017,
"step": 272
},
{
"epoch": 0.003929895274768776,
"grad_norm": 0.12255859375,
"grad_norm_var": 5.0731499989827474e-05,
"learning_rate": 0.0001,
"loss": 0.2212,
"loss/crossentropy": 2.1389888525009155,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.22124628722667694,
"step": 273
},
{
"epoch": 0.003944290495555476,
"grad_norm": 0.10986328125,
"grad_norm_var": 4.654626051584879e-05,
"learning_rate": 0.0001,
"loss": 0.2305,
"loss/crossentropy": 2.364627480506897,
"loss/fcd": 0.4599609375,
"loss/idx": 18.0,
"loss/logits": 0.2305009961128235,
"step": 274
},
{
"epoch": 0.003958685716342175,
"grad_norm": 0.107421875,
"grad_norm_var": 3.90013058980306e-05,
"learning_rate": 0.0001,
"loss": 0.2175,
"loss/crossentropy": 2.290530562400818,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.21754977107048035,
"step": 275
},
{
"epoch": 0.003973080937128873,
"grad_norm": 0.10986328125,
"grad_norm_var": 3.883739312489828e-05,
"learning_rate": 0.0001,
"loss": 0.223,
"loss/crossentropy": 2.2974144220352173,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.22295525670051575,
"step": 276
},
{
"epoch": 0.003987476157915572,
"grad_norm": 0.10595703125,
"grad_norm_var": 4.0625532468159996e-05,
"learning_rate": 0.0001,
"loss": 0.2162,
"loss/crossentropy": 2.5710668563842773,
"loss/fcd": 0.4443359375,
"loss/idx": 18.0,
"loss/logits": 0.21617399901151657,
"step": 277
},
{
"epoch": 0.004001871378702271,
"grad_norm": 0.1123046875,
"grad_norm_var": 3.652175267537435e-05,
"learning_rate": 0.0001,
"loss": 0.2197,
"loss/crossentropy": 2.4304351806640625,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.21974685788154602,
"step": 278
},
{
"epoch": 0.00401626659948897,
"grad_norm": 0.126953125,
"grad_norm_var": 4.805624485015869e-05,
"learning_rate": 0.0001,
"loss": 0.2358,
"loss/crossentropy": 2.475973963737488,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.2358318790793419,
"step": 279
},
{
"epoch": 0.004030661820275668,
"grad_norm": 0.109375,
"grad_norm_var": 3.956158955891927e-05,
"learning_rate": 0.0001,
"loss": 0.2156,
"loss/crossentropy": 2.5783761739730835,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.21563701331615448,
"step": 280
},
{
"epoch": 0.004045057041062367,
"grad_norm": 0.1171875,
"grad_norm_var": 3.956158955891927e-05,
"learning_rate": 0.0001,
"loss": 0.2139,
"loss/crossentropy": 2.36005961894989,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.21386945247650146,
"step": 281
},
{
"epoch": 0.004059452261849066,
"grad_norm": 0.1142578125,
"grad_norm_var": 3.961622714996338e-05,
"learning_rate": 0.0001,
"loss": 0.2326,
"loss/crossentropy": 2.589225172996521,
"loss/fcd": 0.4345703125,
"loss/idx": 18.0,
"loss/logits": 0.23257827758789062,
"step": 282
},
{
"epoch": 0.004073847482635765,
"grad_norm": 0.10693359375,
"grad_norm_var": 3.656446933746338e-05,
"learning_rate": 0.0001,
"loss": 0.2159,
"loss/crossentropy": 2.340222954750061,
"loss/fcd": 0.46484375,
"loss/idx": 18.0,
"loss/logits": 0.21591536700725555,
"step": 283
},
{
"epoch": 0.004088242703422464,
"grad_norm": 0.11376953125,
"grad_norm_var": 3.337462743123372e-05,
"learning_rate": 0.0001,
"loss": 0.238,
"loss/crossentropy": 2.484541654586792,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.23801321536302567,
"step": 284
},
{
"epoch": 0.0041026379242091624,
"grad_norm": 0.11279296875,
"grad_norm_var": 3.315210342407227e-05,
"learning_rate": 0.0001,
"loss": 0.2338,
"loss/crossentropy": 2.4735066890716553,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.23380715399980545,
"step": 285
},
{
"epoch": 0.004117033144995862,
"grad_norm": 0.1025390625,
"grad_norm_var": 3.8424134254455565e-05,
"learning_rate": 0.0001,
"loss": 0.1958,
"loss/crossentropy": 2.296001434326172,
"loss/fcd": 0.40625,
"loss/idx": 18.0,
"loss/logits": 0.19581247121095657,
"step": 286
},
{
"epoch": 0.00413142836578256,
"grad_norm": 0.10498046875,
"grad_norm_var": 4.054605960845947e-05,
"learning_rate": 0.0001,
"loss": 0.2233,
"loss/crossentropy": 2.469460368156433,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.22334590554237366,
"step": 287
},
{
"epoch": 0.004145823586569259,
"grad_norm": 0.10986328125,
"grad_norm_var": 4.0776530901590984e-05,
"learning_rate": 0.0001,
"loss": 0.2563,
"loss/crossentropy": 2.3161216378211975,
"loss/fcd": 0.505859375,
"loss/idx": 18.0,
"loss/logits": 0.2562841549515724,
"step": 288
},
{
"epoch": 0.0041602188073559575,
"grad_norm": 0.11083984375,
"grad_norm_var": 3.233651320139567e-05,
"learning_rate": 0.0001,
"loss": 0.2132,
"loss/crossentropy": 2.571072220802307,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.2131756693124771,
"step": 289
},
{
"epoch": 0.004174614028142657,
"grad_norm": 0.11279296875,
"grad_norm_var": 3.245572249094645e-05,
"learning_rate": 0.0001,
"loss": 0.2147,
"loss/crossentropy": 2.3715583086013794,
"loss/fcd": 0.3994140625,
"loss/idx": 18.0,
"loss/logits": 0.21474920213222504,
"step": 290
},
{
"epoch": 0.004189009248929355,
"grad_norm": 0.11083984375,
"grad_norm_var": 3.1503041585286457e-05,
"learning_rate": 0.0001,
"loss": 0.2157,
"loss/crossentropy": 2.379094123840332,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.21565410494804382,
"step": 291
},
{
"epoch": 0.004203404469716054,
"grad_norm": 0.154296875,
"grad_norm_var": 0.00014622112115224203,
"learning_rate": 0.0001,
"loss": 0.2908,
"loss/crossentropy": 2.696184992790222,
"loss/fcd": 0.548828125,
"loss/idx": 18.0,
"loss/logits": 0.29075586795806885,
"step": 292
},
{
"epoch": 0.0042177996905027534,
"grad_norm": 0.10595703125,
"grad_norm_var": 0.00014622112115224203,
"learning_rate": 0.0001,
"loss": 0.2109,
"loss/crossentropy": 2.4592641592025757,
"loss/fcd": 0.4365234375,
"loss/idx": 18.0,
"loss/logits": 0.2109208032488823,
"step": 293
},
{
"epoch": 0.004232194911289452,
"grad_norm": 0.11083984375,
"grad_norm_var": 0.00014670689900716147,
"learning_rate": 0.0001,
"loss": 0.219,
"loss/crossentropy": 2.6254968643188477,
"loss/fcd": 0.4599609375,
"loss/idx": 18.0,
"loss/logits": 0.21897459030151367,
"step": 294
},
{
"epoch": 0.004246590132076151,
"grad_norm": 0.12451171875,
"grad_norm_var": 0.00014286736647288004,
"learning_rate": 0.0001,
"loss": 0.2263,
"loss/crossentropy": 2.7246745824813843,
"loss/fcd": 0.4658203125,
"loss/idx": 18.0,
"loss/logits": 0.2262566015124321,
"step": 295
},
{
"epoch": 0.004260985352862849,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.00014515618483225504,
"learning_rate": 0.0001,
"loss": 0.2029,
"loss/crossentropy": 2.3958386182785034,
"loss/fcd": 0.40625,
"loss/idx": 18.0,
"loss/logits": 0.20287074148654938,
"step": 296
},
{
"epoch": 0.0042753805736495485,
"grad_norm": 0.11083984375,
"grad_norm_var": 0.00014470418294270832,
"learning_rate": 0.0001,
"loss": 0.2364,
"loss/crossentropy": 2.457562804222107,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.2363838478922844,
"step": 297
},
{
"epoch": 0.004289775794436247,
"grad_norm": 0.11474609375,
"grad_norm_var": 0.0001447826623916626,
"learning_rate": 0.0001,
"loss": 0.244,
"loss/crossentropy": 2.29829204082489,
"loss/fcd": 0.4521484375,
"loss/idx": 18.0,
"loss/logits": 0.24399850517511368,
"step": 298
},
{
"epoch": 0.004304171015222946,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.00014972686767578125,
"learning_rate": 0.0001,
"loss": 0.2147,
"loss/crossentropy": 2.6273841857910156,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.21469515562057495,
"step": 299
},
{
"epoch": 0.004318566236009644,
"grad_norm": 0.1201171875,
"grad_norm_var": 0.00015286505222320557,
"learning_rate": 0.0001,
"loss": 0.2202,
"loss/crossentropy": 2.4213569164276123,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.22024693340063095,
"step": 300
},
{
"epoch": 0.0043329614567963436,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.00015286505222320557,
"learning_rate": 0.0001,
"loss": 0.2201,
"loss/crossentropy": 2.4482584595680237,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.2201283797621727,
"step": 301
},
{
"epoch": 0.004347356677583043,
"grad_norm": 0.1103515625,
"grad_norm_var": 0.0001453310251235962,
"learning_rate": 0.0001,
"loss": 0.1952,
"loss/crossentropy": 2.16507089138031,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.1952093541622162,
"step": 302
},
{
"epoch": 0.004361751898369741,
"grad_norm": 0.109375,
"grad_norm_var": 0.000141298770904541,
"learning_rate": 0.0001,
"loss": 0.2006,
"loss/crossentropy": 2.2546703815460205,
"loss/fcd": 0.40234375,
"loss/idx": 18.0,
"loss/logits": 0.2005770206451416,
"step": 303
},
{
"epoch": 0.00437614711915644,
"grad_norm": 0.1572265625,
"grad_norm_var": 0.00025413731733957924,
"learning_rate": 0.0001,
"loss": 0.2731,
"loss/crossentropy": 2.345265507698059,
"loss/fcd": 0.470703125,
"loss/idx": 18.0,
"loss/logits": 0.27310631424188614,
"step": 304
},
{
"epoch": 0.004390542339943139,
"grad_norm": 0.115234375,
"grad_norm_var": 0.0002516428629557292,
"learning_rate": 0.0001,
"loss": 0.2321,
"loss/crossentropy": 2.4603192806243896,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.23207177966833115,
"step": 305
},
{
"epoch": 0.004404937560729838,
"grad_norm": 0.1103515625,
"grad_norm_var": 0.00025352537631988524,
"learning_rate": 0.0001,
"loss": 0.222,
"loss/crossentropy": 2.598379373550415,
"loss/fcd": 0.423828125,
"loss/idx": 18.0,
"loss/logits": 0.22200769931077957,
"step": 306
},
{
"epoch": 0.004419332781516536,
"grad_norm": 0.1328125,
"grad_norm_var": 0.0002648353576660156,
"learning_rate": 0.0001,
"loss": 0.248,
"loss/crossentropy": 2.2982794046401978,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.2480178400874138,
"step": 307
},
{
"epoch": 0.004433728002303235,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.00017789900302886963,
"learning_rate": 0.0001,
"loss": 0.1998,
"loss/crossentropy": 2.2329931259155273,
"loss/fcd": 0.388671875,
"loss/idx": 18.0,
"loss/logits": 0.1997941955924034,
"step": 308
},
{
"epoch": 0.0044481232230899346,
"grad_norm": 0.11328125,
"grad_norm_var": 0.00017162561416625977,
"learning_rate": 0.0001,
"loss": 0.2278,
"loss/crossentropy": 2.4267385005950928,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.22776535153388977,
"step": 309
},
{
"epoch": 0.004462518443876633,
"grad_norm": 0.1201171875,
"grad_norm_var": 0.00017028550306955972,
"learning_rate": 0.0001,
"loss": 0.2176,
"loss/crossentropy": 2.1391916275024414,
"loss/fcd": 0.46484375,
"loss/idx": 18.0,
"loss/logits": 0.2176017314195633,
"step": 310
},
{
"epoch": 0.004476913664663332,
"grad_norm": 0.11474609375,
"grad_norm_var": 0.00016627212365468344,
"learning_rate": 0.0001,
"loss": 0.2485,
"loss/crossentropy": 2.617629051208496,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.24850602447986603,
"step": 311
},
{
"epoch": 0.00449130888545003,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.00017232795556386312,
"learning_rate": 0.0001,
"loss": 0.201,
"loss/crossentropy": 2.495308995246887,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.20100131630897522,
"step": 312
},
{
"epoch": 0.00450570410623673,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.00017122328281402588,
"learning_rate": 0.0001,
"loss": 0.2155,
"loss/crossentropy": 2.6817585229873657,
"loss/fcd": 0.46484375,
"loss/idx": 18.0,
"loss/logits": 0.2155066430568695,
"step": 313
},
{
"epoch": 0.004520099327023428,
"grad_norm": 0.11083984375,
"grad_norm_var": 0.00017289221286773682,
"learning_rate": 0.0001,
"loss": 0.2089,
"loss/crossentropy": 2.4506349563598633,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.20890694856643677,
"step": 314
},
{
"epoch": 0.004534494547810127,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.00016514460245768228,
"learning_rate": 0.0001,
"loss": 0.2212,
"loss/crossentropy": 2.4268819093704224,
"loss/fcd": 0.4326171875,
"loss/idx": 18.0,
"loss/logits": 0.2211536467075348,
"step": 315
},
{
"epoch": 0.0045488897685968255,
"grad_norm": 0.1083984375,
"grad_norm_var": 0.00016762415568033855,
"learning_rate": 0.0001,
"loss": 0.1915,
"loss/crossentropy": 2.082051396369934,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.19149669259786606,
"step": 316
},
{
"epoch": 0.004563284989383525,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.00016717910766601564,
"learning_rate": 0.0001,
"loss": 0.2085,
"loss/crossentropy": 2.178563714027405,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.20848772674798965,
"step": 317
},
{
"epoch": 0.004577680210170224,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.00016930003960927327,
"learning_rate": 0.0001,
"loss": 0.2261,
"loss/crossentropy": 2.4262903928756714,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.22605551034212112,
"step": 318
},
{
"epoch": 0.004592075430956922,
"grad_norm": 0.12255859375,
"grad_norm_var": 0.00016927321751912435,
"learning_rate": 0.0001,
"loss": 0.1989,
"loss/crossentropy": 2.4706810116767883,
"loss/fcd": 0.4443359375,
"loss/idx": 18.0,
"loss/logits": 0.19887082278728485,
"step": 319
},
{
"epoch": 0.004606470651743621,
"grad_norm": 0.10791015625,
"grad_norm_var": 5.278488000233968e-05,
"learning_rate": 0.0001,
"loss": 0.2139,
"loss/crossentropy": 2.34406316280365,
"loss/fcd": 0.4365234375,
"loss/idx": 18.0,
"loss/logits": 0.21389687806367874,
"step": 320
},
{
"epoch": 0.00462086587253032,
"grad_norm": 0.171875,
"grad_norm_var": 0.0002678145964940389,
"learning_rate": 0.0001,
"loss": 0.314,
"loss/crossentropy": 2.252693295478821,
"loss/fcd": 0.548828125,
"loss/idx": 18.0,
"loss/logits": 0.31398655474185944,
"step": 321
},
{
"epoch": 0.004635261093317019,
"grad_norm": 0.10498046875,
"grad_norm_var": 0.0002742727597554525,
"learning_rate": 0.0001,
"loss": 0.205,
"loss/crossentropy": 2.3450491428375244,
"loss/fcd": 0.4248046875,
"loss/idx": 18.0,
"loss/logits": 0.20497491210699081,
"step": 322
},
{
"epoch": 0.004649656314103717,
"grad_norm": 0.130859375,
"grad_norm_var": 0.0002702673276265462,
"learning_rate": 0.0001,
"loss": 0.2742,
"loss/crossentropy": 2.6299513578414917,
"loss/fcd": 0.5146484375,
"loss/idx": 18.0,
"loss/logits": 0.27415700256824493,
"step": 323
},
{
"epoch": 0.0046640515348904165,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.0002730836470921834,
"learning_rate": 0.0001,
"loss": 0.2158,
"loss/crossentropy": 2.512497067451477,
"loss/fcd": 0.4248046875,
"loss/idx": 18.0,
"loss/logits": 0.21584390848875046,
"step": 324
},
{
"epoch": 0.004678446755677115,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.0002787023782730103,
"learning_rate": 0.0001,
"loss": 0.2,
"loss/crossentropy": 2.319981098175049,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.20001471787691116,
"step": 325
},
{
"epoch": 0.004692841976463814,
"grad_norm": 0.12890625,
"grad_norm_var": 0.00028857290744781493,
"learning_rate": 0.0001,
"loss": 0.2693,
"loss/crossentropy": 2.4298349618911743,
"loss/fcd": 0.48046875,
"loss/idx": 18.0,
"loss/logits": 0.2693277597427368,
"step": 326
},
{
"epoch": 0.004707237197250513,
"grad_norm": 0.1318359375,
"grad_norm_var": 0.0003031412760416667,
"learning_rate": 0.0001,
"loss": 0.249,
"loss/crossentropy": 2.555938482284546,
"loss/fcd": 0.4873046875,
"loss/idx": 18.0,
"loss/logits": 0.2489527463912964,
"step": 327
},
{
"epoch": 0.0047216324180372115,
"grad_norm": 0.1201171875,
"grad_norm_var": 0.00028754870096842446,
"learning_rate": 0.0001,
"loss": 0.2233,
"loss/crossentropy": 2.3032290935516357,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.2232954055070877,
"step": 328
},
{
"epoch": 0.004736027638823911,
"grad_norm": 0.11181640625,
"grad_norm_var": 0.0002883553504943848,
"learning_rate": 0.0001,
"loss": 0.2243,
"loss/crossentropy": 2.3655673265457153,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.2242700606584549,
"step": 329
},
{
"epoch": 0.004750422859610609,
"grad_norm": 0.12109375,
"grad_norm_var": 0.00028449594974517823,
"learning_rate": 0.0001,
"loss": 0.2429,
"loss/crossentropy": 2.330072522163391,
"loss/fcd": 0.4365234375,
"loss/idx": 18.0,
"loss/logits": 0.24292638152837753,
"step": 330
},
{
"epoch": 0.004764818080397308,
"grad_norm": 0.111328125,
"grad_norm_var": 0.0002801219622294108,
"learning_rate": 0.0001,
"loss": 0.2179,
"loss/crossentropy": 2.2494866847991943,
"loss/fcd": 0.4326171875,
"loss/idx": 18.0,
"loss/logits": 0.21786177903413773,
"step": 331
},
{
"epoch": 0.004779213301184007,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.00027185678482055664,
"learning_rate": 0.0001,
"loss": 0.2415,
"loss/crossentropy": 2.792868733406067,
"loss/fcd": 0.4716796875,
"loss/idx": 18.0,
"loss/logits": 0.24153122305870056,
"step": 332
},
{
"epoch": 0.004793608521970706,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.0002777258555094401,
"learning_rate": 0.0001,
"loss": 0.2367,
"loss/crossentropy": 2.573932647705078,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.23671025037765503,
"step": 333
},
{
"epoch": 0.004808003742757404,
"grad_norm": 0.09912109375,
"grad_norm_var": 0.0002961436907450358,
"learning_rate": 0.0001,
"loss": 0.2002,
"loss/crossentropy": 2.5787216424942017,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.20024622231721878,
"step": 334
},
{
"epoch": 0.004822398963544103,
"grad_norm": 0.1005859375,
"grad_norm_var": 0.00031576852003733315,
"learning_rate": 0.0001,
"loss": 0.2067,
"loss/crossentropy": 2.5130008459091187,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.2067384421825409,
"step": 335
},
{
"epoch": 0.0048367941843308025,
"grad_norm": 0.1962890625,
"grad_norm_var": 0.0006899476051330566,
"learning_rate": 0.0001,
"loss": 0.2185,
"loss/crossentropy": 2.2556002140045166,
"loss/fcd": 0.494140625,
"loss/idx": 18.0,
"loss/logits": 0.2184857428073883,
"step": 336
},
{
"epoch": 0.004851189405117501,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.0005320707956949869,
"learning_rate": 0.0001,
"loss": 0.2083,
"loss/crossentropy": 2.421205759048462,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.20834489911794662,
"step": 337
},
{
"epoch": 0.0048655846259042,
"grad_norm": 0.12109375,
"grad_norm_var": 0.0005181382099787394,
"learning_rate": 0.0001,
"loss": 0.2009,
"loss/crossentropy": 2.079905390739441,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.20086795836687088,
"step": 338
},
{
"epoch": 0.004879979846690898,
"grad_norm": 0.115234375,
"grad_norm_var": 0.0005108267068862915,
"learning_rate": 0.0001,
"loss": 0.2437,
"loss/crossentropy": 2.571584105491638,
"loss/fcd": 0.4423828125,
"loss/idx": 18.0,
"loss/logits": 0.2436518296599388,
"step": 339
},
{
"epoch": 0.004894375067477598,
"grad_norm": 0.130859375,
"grad_norm_var": 0.0005070517460505167,
"learning_rate": 0.0001,
"loss": 0.252,
"loss/crossentropy": 2.3673810958862305,
"loss/fcd": 0.525390625,
"loss/idx": 18.0,
"loss/logits": 0.2520231306552887,
"step": 340
},
{
"epoch": 0.004908770288264296,
"grad_norm": 0.1162109375,
"grad_norm_var": 0.0004946142435073853,
"learning_rate": 0.0001,
"loss": 0.1946,
"loss/crossentropy": 1.9378909468650818,
"loss/fcd": 0.384765625,
"loss/idx": 18.0,
"loss/logits": 0.19459272176027298,
"step": 341
},
{
"epoch": 0.004923165509050995,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.0005005518595377604,
"learning_rate": 0.0001,
"loss": 0.2064,
"loss/crossentropy": 2.391346573829651,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.20641817897558212,
"step": 342
},
{
"epoch": 0.004937560729837694,
"grad_norm": 0.10693359375,
"grad_norm_var": 0.0004995892445246379,
"learning_rate": 0.0001,
"loss": 0.213,
"loss/crossentropy": 2.4029276371002197,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.2129564881324768,
"step": 343
},
{
"epoch": 0.004951955950624393,
"grad_norm": 0.12353515625,
"grad_norm_var": 0.0005011399586995443,
"learning_rate": 0.0001,
"loss": 0.2122,
"loss/crossentropy": 2.3750810623168945,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.21220777183771133,
"step": 344
},
{
"epoch": 0.004966351171411092,
"grad_norm": 0.1142578125,
"grad_norm_var": 0.0004993269840876262,
"learning_rate": 0.0001,
"loss": 0.2483,
"loss/crossentropy": 2.713660955429077,
"loss/fcd": 0.4853515625,
"loss/idx": 18.0,
"loss/logits": 0.24831371009349823,
"step": 345
},
{
"epoch": 0.00498074639219779,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.0005148798227310181,
"learning_rate": 0.0001,
"loss": 0.2107,
"loss/crossentropy": 2.550423502922058,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.21066032350063324,
"step": 346
},
{
"epoch": 0.004995141612984489,
"grad_norm": 0.10693359375,
"grad_norm_var": 0.000519716739654541,
"learning_rate": 0.0001,
"loss": 0.2061,
"loss/crossentropy": 2.5207024812698364,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.2061041295528412,
"step": 347
},
{
"epoch": 0.005009536833771188,
"grad_norm": 0.11865234375,
"grad_norm_var": 0.0005197912454605102,
"learning_rate": 0.0001,
"loss": 0.2402,
"loss/crossentropy": 2.3946497440338135,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.24020669609308243,
"step": 348
},
{
"epoch": 0.005023932054557887,
"grad_norm": 0.099609375,
"grad_norm_var": 0.000536501407623291,
"learning_rate": 0.0001,
"loss": 0.2128,
"loss/crossentropy": 2.516977548599243,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.21278280019760132,
"step": 349
},
{
"epoch": 0.005038327275344585,
"grad_norm": 0.1259765625,
"grad_norm_var": 0.0005188534657160441,
"learning_rate": 0.0001,
"loss": 0.2382,
"loss/crossentropy": 2.5282589197158813,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.23819837719202042,
"step": 350
},
{
"epoch": 0.0050527224961312844,
"grad_norm": 0.10498046875,
"grad_norm_var": 0.0005096713701883952,
"learning_rate": 0.0001,
"loss": 0.22,
"loss/crossentropy": 2.448602795600891,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.21995113044977188,
"step": 351
},
{
"epoch": 0.005067117716917984,
"grad_norm": 0.10205078125,
"grad_norm_var": 8.844435214996337e-05,
"learning_rate": 0.0001,
"loss": 0.1952,
"loss/crossentropy": 2.4668463468551636,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.19519731402397156,
"step": 352
},
{
"epoch": 0.005081512937704682,
"grad_norm": 0.1181640625,
"grad_norm_var": 8.725225925445556e-05,
"learning_rate": 0.0001,
"loss": 0.2368,
"loss/crossentropy": 2.315679907798767,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.23678645491600037,
"step": 353
},
{
"epoch": 0.005095908158491381,
"grad_norm": 0.1025390625,
"grad_norm_var": 8.98192326227824e-05,
"learning_rate": 0.0001,
"loss": 0.2111,
"loss/crossentropy": 2.417713761329651,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.21109846234321594,
"step": 354
},
{
"epoch": 0.0051103033792780795,
"grad_norm": 0.10546875,
"grad_norm_var": 9.192526340484619e-05,
"learning_rate": 0.0001,
"loss": 0.1963,
"loss/crossentropy": 2.3545119762420654,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.19629193097352982,
"step": 355
},
{
"epoch": 0.005124698600064779,
"grad_norm": 0.11181640625,
"grad_norm_var": 6.585121154785156e-05,
"learning_rate": 0.0001,
"loss": 0.1816,
"loss/crossentropy": 2.1606619358062744,
"loss/fcd": 0.3876953125,
"loss/idx": 18.0,
"loss/logits": 0.18158919364213943,
"step": 356
},
{
"epoch": 0.005139093820851477,
"grad_norm": 0.1171875,
"grad_norm_var": 6.665786107381184e-05,
"learning_rate": 0.0001,
"loss": 0.2114,
"loss/crossentropy": 2.429716110229492,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.21144652366638184,
"step": 357
},
{
"epoch": 0.005153489041638176,
"grad_norm": 0.099609375,
"grad_norm_var": 7.386902968088785e-05,
"learning_rate": 0.0001,
"loss": 0.2011,
"loss/crossentropy": 2.511311650276184,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.20114467293024063,
"step": 358
},
{
"epoch": 0.0051678842624248746,
"grad_norm": 0.10205078125,
"grad_norm_var": 7.736583550771078e-05,
"learning_rate": 0.0001,
"loss": 0.1846,
"loss/crossentropy": 2.1977522373199463,
"loss/fcd": 0.373046875,
"loss/idx": 18.0,
"loss/logits": 0.184633307158947,
"step": 359
},
{
"epoch": 0.005182279483211574,
"grad_norm": 0.11572265625,
"grad_norm_var": 6.67800505956014e-05,
"learning_rate": 0.0001,
"loss": 0.2461,
"loss/crossentropy": 2.605985164642334,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.24613827466964722,
"step": 360
},
{
"epoch": 0.005196674703998273,
"grad_norm": 0.12353515625,
"grad_norm_var": 7.838805516560873e-05,
"learning_rate": 0.0001,
"loss": 0.231,
"loss/crossentropy": 2.4244812726974487,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.23104986548423767,
"step": 361
},
{
"epoch": 0.005211069924784971,
"grad_norm": 0.11279296875,
"grad_norm_var": 7.502933343251546e-05,
"learning_rate": 0.0001,
"loss": 0.2269,
"loss/crossentropy": 2.4840404987335205,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.22690805047750473,
"step": 362
},
{
"epoch": 0.0052254651455716705,
"grad_norm": 0.126953125,
"grad_norm_var": 9.071032206217447e-05,
"learning_rate": 0.0001,
"loss": 0.2776,
"loss/crossentropy": 2.631165862083435,
"loss/fcd": 0.5244140625,
"loss/idx": 18.0,
"loss/logits": 0.277616910636425,
"step": 363
},
{
"epoch": 0.005239860366358369,
"grad_norm": 0.12353515625,
"grad_norm_var": 9.673039118448893e-05,
"learning_rate": 0.0001,
"loss": 0.2285,
"loss/crossentropy": 2.316849708557129,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.228460393846035,
"step": 364
},
{
"epoch": 0.005254255587145068,
"grad_norm": 0.1025390625,
"grad_norm_var": 9.242693583170573e-05,
"learning_rate": 0.0001,
"loss": 0.1957,
"loss/crossentropy": 2.315016031265259,
"loss/fcd": 0.38671875,
"loss/idx": 18.0,
"loss/logits": 0.1956682875752449,
"step": 365
},
{
"epoch": 0.005268650807931766,
"grad_norm": 0.1044921875,
"grad_norm_var": 8.176167805989583e-05,
"learning_rate": 0.0001,
"loss": 0.2118,
"loss/crossentropy": 2.3352142572402954,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.21177390962839127,
"step": 366
},
{
"epoch": 0.0052830460287184655,
"grad_norm": 0.1123046875,
"grad_norm_var": 7.939239343007406e-05,
"learning_rate": 0.0001,
"loss": 0.2289,
"loss/crossentropy": 2.511680841445923,
"loss/fcd": 0.466796875,
"loss/idx": 18.0,
"loss/logits": 0.22891707718372345,
"step": 367
},
{
"epoch": 0.005297441249505164,
"grad_norm": 0.109375,
"grad_norm_var": 7.37150510152181e-05,
"learning_rate": 0.0001,
"loss": 0.2201,
"loss/crossentropy": 2.2285088300704956,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.22013359516859055,
"step": 368
},
{
"epoch": 0.005311836470291863,
"grad_norm": 0.10400390625,
"grad_norm_var": 7.414718468983968e-05,
"learning_rate": 0.0001,
"loss": 0.1957,
"loss/crossentropy": 2.389556884765625,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.19569466263055801,
"step": 369
},
{
"epoch": 0.005326231691078562,
"grad_norm": 0.10693359375,
"grad_norm_var": 7.047255833943684e-05,
"learning_rate": 0.0001,
"loss": 0.2015,
"loss/crossentropy": 2.2860642671585083,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.20150135457515717,
"step": 370
},
{
"epoch": 0.005340626911865261,
"grad_norm": 0.103515625,
"grad_norm_var": 7.21891721089681e-05,
"learning_rate": 0.0001,
"loss": 0.1878,
"loss/crossentropy": 2.1553120017051697,
"loss/fcd": 0.3740234375,
"loss/idx": 18.0,
"loss/logits": 0.18780279159545898,
"step": 371
},
{
"epoch": 0.00535502213265196,
"grad_norm": 0.126953125,
"grad_norm_var": 8.811056613922119e-05,
"learning_rate": 0.0001,
"loss": 0.2079,
"loss/crossentropy": 2.614238739013672,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.20787174999713898,
"step": 372
},
{
"epoch": 0.005369417353438658,
"grad_norm": 0.12255859375,
"grad_norm_var": 9.365081787109375e-05,
"learning_rate": 0.0001,
"loss": 0.2249,
"loss/crossentropy": 2.365216612815857,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.2249324843287468,
"step": 373
},
{
"epoch": 0.005383812574225357,
"grad_norm": 0.11962890625,
"grad_norm_var": 8.481244246164958e-05,
"learning_rate": 0.0001,
"loss": 0.2203,
"loss/crossentropy": 2.6173166036605835,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.2202518805861473,
"step": 374
},
{
"epoch": 0.005398207795012056,
"grad_norm": 0.11279296875,
"grad_norm_var": 7.554590702056885e-05,
"learning_rate": 0.0001,
"loss": 0.205,
"loss/crossentropy": 2.2174978256225586,
"loss/fcd": 0.3916015625,
"loss/idx": 18.0,
"loss/logits": 0.20496949553489685,
"step": 375
},
{
"epoch": 0.005412603015798755,
"grad_norm": 0.10888671875,
"grad_norm_var": 7.710357507069905e-05,
"learning_rate": 0.0001,
"loss": 0.1912,
"loss/crossentropy": 2.2786842584609985,
"loss/fcd": 0.3974609375,
"loss/idx": 18.0,
"loss/logits": 0.19121932238340378,
"step": 376
},
{
"epoch": 0.005426998236585453,
"grad_norm": 0.10595703125,
"grad_norm_var": 7.359882195790609e-05,
"learning_rate": 0.0001,
"loss": 0.2442,
"loss/crossentropy": 2.5521395206451416,
"loss/fcd": 0.458984375,
"loss/idx": 18.0,
"loss/logits": 0.24418669939041138,
"step": 377
},
{
"epoch": 0.005441393457372152,
"grad_norm": 0.12060546875,
"grad_norm_var": 7.750888665517172e-05,
"learning_rate": 0.0001,
"loss": 0.2245,
"loss/crossentropy": 2.9219515323638916,
"loss/fcd": 0.482421875,
"loss/idx": 18.0,
"loss/logits": 0.22447162866592407,
"step": 378
},
{
"epoch": 0.005455788678158852,
"grad_norm": 0.11376953125,
"grad_norm_var": 6.41783078511556e-05,
"learning_rate": 0.0001,
"loss": 0.2119,
"loss/crossentropy": 2.393683671951294,
"loss/fcd": 0.408203125,
"loss/idx": 18.0,
"loss/logits": 0.21186020970344543,
"step": 379
},
{
"epoch": 0.00547018389894555,
"grad_norm": 0.1064453125,
"grad_norm_var": 5.698104699452718e-05,
"learning_rate": 0.0001,
"loss": 0.2392,
"loss/crossentropy": 2.7257591485977173,
"loss/fcd": 0.4443359375,
"loss/idx": 18.0,
"loss/logits": 0.23916704207658768,
"step": 380
},
{
"epoch": 0.005484579119732249,
"grad_norm": 0.1025390625,
"grad_norm_var": 5.698104699452718e-05,
"learning_rate": 0.0001,
"loss": 0.2175,
"loss/crossentropy": 2.604699730873108,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.21749083697795868,
"step": 381
},
{
"epoch": 0.0054989743405189475,
"grad_norm": 0.115234375,
"grad_norm_var": 5.444586277008057e-05,
"learning_rate": 0.0001,
"loss": 0.2128,
"loss/crossentropy": 2.3415403366088867,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.21281517297029495,
"step": 382
},
{
"epoch": 0.005513369561305647,
"grad_norm": 0.109375,
"grad_norm_var": 5.485117435455322e-05,
"learning_rate": 0.0001,
"loss": 0.2118,
"loss/crossentropy": 2.164521098136902,
"loss/fcd": 0.4013671875,
"loss/idx": 18.0,
"loss/logits": 0.2117796689271927,
"step": 383
},
{
"epoch": 0.005527764782092345,
"grad_norm": 0.11767578125,
"grad_norm_var": 5.648930867513021e-05,
"learning_rate": 0.0001,
"loss": 0.235,
"loss/crossentropy": 2.243640184402466,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.23500269651412964,
"step": 384
},
{
"epoch": 0.005542160002879044,
"grad_norm": 0.10498046875,
"grad_norm_var": 5.546808242797852e-05,
"learning_rate": 0.0001,
"loss": 0.2025,
"loss/crossentropy": 2.2612792253494263,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.20246511697769165,
"step": 385
},
{
"epoch": 0.005556555223665743,
"grad_norm": 0.10888671875,
"grad_norm_var": 5.429188410441081e-05,
"learning_rate": 0.0001,
"loss": 0.21,
"loss/crossentropy": 2.6286587715148926,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.21000967174768448,
"step": 386
},
{
"epoch": 0.005570950444452442,
"grad_norm": 0.107421875,
"grad_norm_var": 5.0572554270426434e-05,
"learning_rate": 0.0001,
"loss": 0.1951,
"loss/crossentropy": 2.381960868835449,
"loss/fcd": 0.38671875,
"loss/idx": 18.0,
"loss/logits": 0.19514141231775284,
"step": 387
},
{
"epoch": 0.005585345665239141,
"grad_norm": 0.11962890625,
"grad_norm_var": 4.003743330637614e-05,
"learning_rate": 0.0001,
"loss": 0.2116,
"loss/crossentropy": 2.1127407550811768,
"loss/fcd": 0.4326171875,
"loss/idx": 18.0,
"loss/logits": 0.21157918125391006,
"step": 388
},
{
"epoch": 0.005599740886025839,
"grad_norm": 0.11669921875,
"grad_norm_var": 3.414849440256754e-05,
"learning_rate": 0.0001,
"loss": 0.1979,
"loss/crossentropy": 2.2836742401123047,
"loss/fcd": 0.396484375,
"loss/idx": 18.0,
"loss/logits": 0.1978917270898819,
"step": 389
},
{
"epoch": 0.0056141361068125385,
"grad_norm": 0.109375,
"grad_norm_var": 3.0163923899332682e-05,
"learning_rate": 0.0001,
"loss": 0.1913,
"loss/crossentropy": 2.141560196876526,
"loss/fcd": 0.3857421875,
"loss/idx": 18.0,
"loss/logits": 0.19127248972654343,
"step": 390
},
{
"epoch": 0.005628531327599237,
"grad_norm": 0.1279296875,
"grad_norm_var": 4.756351312001546e-05,
"learning_rate": 0.0001,
"loss": 0.2001,
"loss/crossentropy": 2.0556570291519165,
"loss/fcd": 0.4052734375,
"loss/idx": 18.0,
"loss/logits": 0.20014575868844986,
"step": 391
},
{
"epoch": 0.005642926548385936,
"grad_norm": 0.11865234375,
"grad_norm_var": 4.919270674387614e-05,
"learning_rate": 0.0001,
"loss": 0.2564,
"loss/crossentropy": 2.434049367904663,
"loss/fcd": 0.478515625,
"loss/idx": 18.0,
"loss/logits": 0.2563505992293358,
"step": 392
},
{
"epoch": 0.005657321769172634,
"grad_norm": 0.11328125,
"grad_norm_var": 4.583994547526042e-05,
"learning_rate": 0.0001,
"loss": 0.2032,
"loss/crossentropy": 2.31030809879303,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.20315195620059967,
"step": 393
},
{
"epoch": 0.0056717169899593335,
"grad_norm": 0.10888671875,
"grad_norm_var": 4.297892252604167e-05,
"learning_rate": 0.0001,
"loss": 0.2028,
"loss/crossentropy": 2.201537609100342,
"loss/fcd": 0.423828125,
"loss/idx": 18.0,
"loss/logits": 0.20282629132270813,
"step": 394
},
{
"epoch": 0.005686112210746033,
"grad_norm": 0.1103515625,
"grad_norm_var": 4.315276940663656e-05,
"learning_rate": 0.0001,
"loss": 0.1968,
"loss/crossentropy": 2.2024729251861572,
"loss/fcd": 0.4013671875,
"loss/idx": 18.0,
"loss/logits": 0.1968480423092842,
"step": 395
},
{
"epoch": 0.005700507431532731,
"grad_norm": 0.1064453125,
"grad_norm_var": 4.315276940663656e-05,
"learning_rate": 0.0001,
"loss": 0.2005,
"loss/crossentropy": 2.3388434648513794,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.20053986459970474,
"step": 396
},
{
"epoch": 0.00571490265231943,
"grad_norm": 0.1220703125,
"grad_norm_var": 4.148383935292562e-05,
"learning_rate": 0.0001,
"loss": 0.241,
"loss/crossentropy": 2.9380890130996704,
"loss/fcd": 0.470703125,
"loss/idx": 18.0,
"loss/logits": 0.24104679375886917,
"step": 397
},
{
"epoch": 0.005729297873106129,
"grad_norm": 0.1181640625,
"grad_norm_var": 4.267593224843343e-05,
"learning_rate": 0.0001,
"loss": 0.1981,
"loss/crossentropy": 2.0873407125473022,
"loss/fcd": 0.40234375,
"loss/idx": 18.0,
"loss/logits": 0.1980888992547989,
"step": 398
},
{
"epoch": 0.005743693093892828,
"grad_norm": 0.1103515625,
"grad_norm_var": 4.21673059463501e-05,
"learning_rate": 0.0001,
"loss": 0.2054,
"loss/crossentropy": 2.398405909538269,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.20542413741350174,
"step": 399
},
{
"epoch": 0.005758088314679526,
"grad_norm": 0.11181640625,
"grad_norm_var": 4.1285157203674315e-05,
"learning_rate": 0.0001,
"loss": 0.2162,
"loss/crossentropy": 2.4124823808670044,
"loss/fcd": 0.4130859375,
"loss/idx": 18.0,
"loss/logits": 0.21624472737312317,
"step": 400
},
{
"epoch": 0.005772483535466225,
"grad_norm": 0.11962890625,
"grad_norm_var": 3.8185715675354e-05,
"learning_rate": 0.0001,
"loss": 0.2039,
"loss/crossentropy": 2.495412826538086,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.20386559516191483,
"step": 401
},
{
"epoch": 0.005786878756252924,
"grad_norm": 0.11474609375,
"grad_norm_var": 3.6063790321350095e-05,
"learning_rate": 0.0001,
"loss": 0.2092,
"loss/crossentropy": 2.320030093193054,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.20922860503196716,
"step": 402
},
{
"epoch": 0.005801273977039623,
"grad_norm": 0.11474609375,
"grad_norm_var": 3.229379653930664e-05,
"learning_rate": 0.0001,
"loss": 0.2263,
"loss/crossentropy": 2.5104581117630005,
"loss/fcd": 0.46484375,
"loss/idx": 18.0,
"loss/logits": 0.22626767307519913,
"step": 403
},
{
"epoch": 0.005815669197826322,
"grad_norm": 0.1064453125,
"grad_norm_var": 3.532469272613525e-05,
"learning_rate": 0.0001,
"loss": 0.2136,
"loss/crossentropy": 2.5909669399261475,
"loss/fcd": 0.4365234375,
"loss/idx": 18.0,
"loss/logits": 0.213609017431736,
"step": 404
},
{
"epoch": 0.00583006441861302,
"grad_norm": 0.10400390625,
"grad_norm_var": 4.14202610651652e-05,
"learning_rate": 0.0001,
"loss": 0.1848,
"loss/crossentropy": 2.099331498146057,
"loss/fcd": 0.3916015625,
"loss/idx": 18.0,
"loss/logits": 0.18482983112335205,
"step": 405
},
{
"epoch": 0.00584445963939972,
"grad_norm": 0.1005859375,
"grad_norm_var": 5.114773909250895e-05,
"learning_rate": 0.0001,
"loss": 0.1927,
"loss/crossentropy": 2.245216131210327,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.1927170231938362,
"step": 406
},
{
"epoch": 0.005858854860186418,
"grad_norm": 0.1103515625,
"grad_norm_var": 3.54836384455363e-05,
"learning_rate": 0.0001,
"loss": 0.2359,
"loss/crossentropy": 2.3967188596725464,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.2359299436211586,
"step": 407
},
{
"epoch": 0.005873250080973117,
"grad_norm": 0.10791015625,
"grad_norm_var": 3.3035874366760254e-05,
"learning_rate": 0.0001,
"loss": 0.2072,
"loss/crossentropy": 2.4940836429595947,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.20715947449207306,
"step": 408
},
{
"epoch": 0.005887645301759815,
"grad_norm": 0.1044921875,
"grad_norm_var": 3.546774387359619e-05,
"learning_rate": 0.0001,
"loss": 0.2031,
"loss/crossentropy": 2.4436358213424683,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.20313256978988647,
"step": 409
},
{
"epoch": 0.005902040522546515,
"grad_norm": 0.1083984375,
"grad_norm_var": 3.5599867502848306e-05,
"learning_rate": 0.0001,
"loss": 0.2205,
"loss/crossentropy": 2.3267935514450073,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.22049501538276672,
"step": 410
},
{
"epoch": 0.005916435743333213,
"grad_norm": 0.1083984375,
"grad_norm_var": 3.591775894165039e-05,
"learning_rate": 0.0001,
"loss": 0.2032,
"loss/crossentropy": 2.262888252735138,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.20318371057510376,
"step": 411
},
{
"epoch": 0.005930830964119912,
"grad_norm": 0.09619140625,
"grad_norm_var": 4.8080086708068846e-05,
"learning_rate": 0.0001,
"loss": 0.2031,
"loss/crossentropy": 2.5199403762817383,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.20311684161424637,
"step": 412
},
{
"epoch": 0.005945226184906611,
"grad_norm": 0.10009765625,
"grad_norm_var": 4.258155822753906e-05,
"learning_rate": 0.0001,
"loss": 0.1989,
"loss/crossentropy": 2.3285170793533325,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.19889184832572937,
"step": 413
},
{
"epoch": 0.00595962140569331,
"grad_norm": 0.11474609375,
"grad_norm_var": 3.891686598459879e-05,
"learning_rate": 0.0001,
"loss": 0.2254,
"loss/crossentropy": 2.5566580295562744,
"loss/fcd": 0.466796875,
"loss/idx": 18.0,
"loss/logits": 0.22541005164384842,
"step": 414
},
{
"epoch": 0.005974016626480009,
"grad_norm": 0.12255859375,
"grad_norm_var": 5.155801773071289e-05,
"learning_rate": 0.0001,
"loss": 0.2264,
"loss/crossentropy": 2.537785768508911,
"loss/fcd": 0.4892578125,
"loss/idx": 18.0,
"loss/logits": 0.22644919157028198,
"step": 415
},
{
"epoch": 0.005988411847266707,
"grad_norm": 0.10205078125,
"grad_norm_var": 5.3942203521728516e-05,
"learning_rate": 0.0001,
"loss": 0.2184,
"loss/crossentropy": 2.6664167642593384,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.2184242233633995,
"step": 416
},
{
"epoch": 0.006002807068053406,
"grad_norm": 0.1044921875,
"grad_norm_var": 4.5719742774963376e-05,
"learning_rate": 0.0001,
"loss": 0.1944,
"loss/crossentropy": 2.231179356575012,
"loss/fcd": 0.40234375,
"loss/idx": 18.0,
"loss/logits": 0.19440477341413498,
"step": 417
},
{
"epoch": 0.006017202288840105,
"grad_norm": 0.11572265625,
"grad_norm_var": 4.672110080718994e-05,
"learning_rate": 0.0001,
"loss": 0.2664,
"loss/crossentropy": 2.798780918121338,
"loss/fcd": 0.513671875,
"loss/idx": 18.0,
"loss/logits": 0.26635295152664185,
"step": 418
},
{
"epoch": 0.006031597509626804,
"grad_norm": 0.1357421875,
"grad_norm_var": 9.435017903645834e-05,
"learning_rate": 0.0001,
"loss": 0.227,
"loss/crossentropy": 2.5461186170578003,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.22702706605196,
"step": 419
},
{
"epoch": 0.006045992730413502,
"grad_norm": 0.1083984375,
"grad_norm_var": 9.395281473795573e-05,
"learning_rate": 0.0001,
"loss": 0.2178,
"loss/crossentropy": 2.566969871520996,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.21776312589645386,
"step": 420
},
{
"epoch": 0.0060603879512002015,
"grad_norm": 0.1005859375,
"grad_norm_var": 9.696384270985921e-05,
"learning_rate": 0.0001,
"loss": 0.1927,
"loss/crossentropy": 2.41417920589447,
"loss/fcd": 0.4130859375,
"loss/idx": 18.0,
"loss/logits": 0.19272838532924652,
"step": 421
},
{
"epoch": 0.006074783171986901,
"grad_norm": 0.11181640625,
"grad_norm_var": 9.255409240722656e-05,
"learning_rate": 0.0001,
"loss": 0.2341,
"loss/crossentropy": 2.432627320289612,
"loss/fcd": 0.4423828125,
"loss/idx": 18.0,
"loss/logits": 0.2340926229953766,
"step": 422
},
{
"epoch": 0.006089178392773599,
"grad_norm": 0.1123046875,
"grad_norm_var": 9.301503499348958e-05,
"learning_rate": 0.0001,
"loss": 0.2064,
"loss/crossentropy": 2.22554087638855,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.20639413595199585,
"step": 423
},
{
"epoch": 0.006103573613560298,
"grad_norm": 0.10791015625,
"grad_norm_var": 9.301503499348958e-05,
"learning_rate": 0.0001,
"loss": 0.2352,
"loss/crossentropy": 2.7009902000427246,
"loss/fcd": 0.4658203125,
"loss/idx": 18.0,
"loss/logits": 0.23516181111335754,
"step": 424
},
{
"epoch": 0.0061179688343469965,
"grad_norm": 0.11962890625,
"grad_norm_var": 9.698768456776937e-05,
"learning_rate": 0.0001,
"loss": 0.2242,
"loss/crossentropy": 2.5257065296173096,
"loss/fcd": 0.50390625,
"loss/idx": 18.0,
"loss/logits": 0.22422834485769272,
"step": 425
},
{
"epoch": 0.006132364055133696,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.00010133981704711914,
"learning_rate": 0.0001,
"loss": 0.1991,
"loss/crossentropy": 2.39576256275177,
"loss/fcd": 0.3935546875,
"loss/idx": 18.0,
"loss/logits": 0.19912777841091156,
"step": 426
},
{
"epoch": 0.006146759275920394,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.00010203917821248373,
"learning_rate": 0.0001,
"loss": 0.194,
"loss/crossentropy": 2.337485671043396,
"loss/fcd": 0.4072265625,
"loss/idx": 18.0,
"loss/logits": 0.19400090724229813,
"step": 427
},
{
"epoch": 0.006161154496707093,
"grad_norm": 0.11767578125,
"grad_norm_var": 9.119908014933268e-05,
"learning_rate": 0.0001,
"loss": 0.2313,
"loss/crossentropy": 2.271997570991516,
"loss/fcd": 0.4619140625,
"loss/idx": 18.0,
"loss/logits": 0.23129994422197342,
"step": 428
},
{
"epoch": 0.0061755497174937925,
"grad_norm": 0.14453125,
"grad_norm_var": 0.0001476993163426717,
"learning_rate": 0.0001,
"loss": 0.2351,
"loss/crossentropy": 2.2284241318702698,
"loss/fcd": 0.4794921875,
"loss/idx": 18.0,
"loss/logits": 0.23510510474443436,
"step": 429
},
{
"epoch": 0.006189944938280491,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.00014778673648834227,
"learning_rate": 0.0001,
"loss": 0.2179,
"loss/crossentropy": 2.487444758415222,
"loss/fcd": 0.482421875,
"loss/idx": 18.0,
"loss/logits": 0.21794230490922928,
"step": 430
},
{
"epoch": 0.00620434015906719,
"grad_norm": 0.11181640625,
"grad_norm_var": 0.00014280378818511962,
"learning_rate": 0.0001,
"loss": 0.2053,
"loss/crossentropy": 2.277848958969116,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.20534329116344452,
"step": 431
},
{
"epoch": 0.006218735379853888,
"grad_norm": 0.1005859375,
"grad_norm_var": 0.00014514923095703124,
"learning_rate": 0.0001,
"loss": 0.2112,
"loss/crossentropy": 2.4542627334594727,
"loss/fcd": 0.4052734375,
"loss/idx": 18.0,
"loss/logits": 0.21123766899108887,
"step": 432
},
{
"epoch": 0.0062331306006405875,
"grad_norm": 0.1162109375,
"grad_norm_var": 0.0001399993896484375,
"learning_rate": 0.0001,
"loss": 0.2215,
"loss/crossentropy": 2.337994694709778,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.22145532071590424,
"step": 433
},
{
"epoch": 0.006247525821427286,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.00014856656392415364,
"learning_rate": 0.0001,
"loss": 0.1865,
"loss/crossentropy": 2.2629653215408325,
"loss/fcd": 0.3896484375,
"loss/idx": 18.0,
"loss/logits": 0.18648526072502136,
"step": 434
},
{
"epoch": 0.006261921042213985,
"grad_norm": 0.11328125,
"grad_norm_var": 0.00011246601740519205,
"learning_rate": 0.0001,
"loss": 0.2497,
"loss/crossentropy": 2.5573008060455322,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.24965552240610123,
"step": 435
},
{
"epoch": 0.006276316263000683,
"grad_norm": 0.1123046875,
"grad_norm_var": 0.00011167128880818685,
"learning_rate": 0.0001,
"loss": 0.2199,
"loss/crossentropy": 2.3618111610412598,
"loss/fcd": 0.4365234375,
"loss/idx": 18.0,
"loss/logits": 0.21985996514558792,
"step": 436
},
{
"epoch": 0.006290711483787383,
"grad_norm": 0.12060546875,
"grad_norm_var": 0.0001062542200088501,
"learning_rate": 0.0001,
"loss": 0.1995,
"loss/crossentropy": 1.9345441460609436,
"loss/fcd": 0.40234375,
"loss/idx": 18.0,
"loss/logits": 0.19948522001504898,
"step": 437
},
{
"epoch": 0.006305106704574082,
"grad_norm": 0.09765625,
"grad_norm_var": 0.00012149413426717122,
"learning_rate": 0.0001,
"loss": 0.1829,
"loss/crossentropy": 2.2750844955444336,
"loss/fcd": 0.400390625,
"loss/idx": 18.0,
"loss/logits": 0.18287546932697296,
"step": 438
},
{
"epoch": 0.00631950192536078,
"grad_norm": 0.10693359375,
"grad_norm_var": 0.00012334088484446207,
"learning_rate": 0.0001,
"loss": 0.206,
"loss/crossentropy": 2.178094267845154,
"loss/fcd": 0.400390625,
"loss/idx": 18.0,
"loss/logits": 0.20597843825817108,
"step": 439
},
{
"epoch": 0.006333897146147479,
"grad_norm": 0.1162109375,
"grad_norm_var": 0.00012308756510416666,
"learning_rate": 0.0001,
"loss": 0.2026,
"loss/crossentropy": 2.2465450763702393,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.20255093276500702,
"step": 440
},
{
"epoch": 0.006348292366934178,
"grad_norm": 0.11767578125,
"grad_norm_var": 0.00012148221333821615,
"learning_rate": 0.0001,
"loss": 0.2194,
"loss/crossentropy": 2.3598278760910034,
"loss/fcd": 0.4599609375,
"loss/idx": 18.0,
"loss/logits": 0.21944674849510193,
"step": 441
},
{
"epoch": 0.006362687587720877,
"grad_norm": 0.11181640625,
"grad_norm_var": 0.00011393229166666667,
"learning_rate": 0.0001,
"loss": 0.2421,
"loss/crossentropy": 2.523189663887024,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.2420613244175911,
"step": 442
},
{
"epoch": 0.006377082808507575,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.00011165837446848551,
"learning_rate": 0.0001,
"loss": 0.2116,
"loss/crossentropy": 2.4686609506607056,
"loss/fcd": 0.40625,
"loss/idx": 18.0,
"loss/logits": 0.2116122990846634,
"step": 443
},
{
"epoch": 0.006391478029294274,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.00011196136474609376,
"learning_rate": 0.0001,
"loss": 0.2574,
"loss/crossentropy": 2.7661678791046143,
"loss/fcd": 0.5078125,
"loss/idx": 18.0,
"loss/logits": 0.2574233114719391,
"step": 444
},
{
"epoch": 0.006405873250080973,
"grad_norm": 0.1435546875,
"grad_norm_var": 0.00010795195897420248,
"learning_rate": 0.0001,
"loss": 0.2239,
"loss/crossentropy": 1.9732567071914673,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.2238999307155609,
"step": 445
},
{
"epoch": 0.006420268470867672,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.00010865529378255209,
"learning_rate": 0.0001,
"loss": 0.2209,
"loss/crossentropy": 2.3994816541671753,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.22091981023550034,
"step": 446
},
{
"epoch": 0.006434663691654371,
"grad_norm": 0.109375,
"grad_norm_var": 0.00010942518711090087,
"learning_rate": 0.0001,
"loss": 0.2094,
"loss/crossentropy": 2.385701537132263,
"loss/fcd": 0.4072265625,
"loss/idx": 18.0,
"loss/logits": 0.209386445581913,
"step": 447
},
{
"epoch": 0.0064490589124410694,
"grad_norm": 0.1103515625,
"grad_norm_var": 9.9371870358785e-05,
"learning_rate": 0.0001,
"loss": 0.212,
"loss/crossentropy": 2.204231023788452,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.2120284140110016,
"step": 448
},
{
"epoch": 0.006463454133227769,
"grad_norm": 0.10693359375,
"grad_norm_var": 0.00010139147440592448,
"learning_rate": 0.0001,
"loss": 0.2106,
"loss/crossentropy": 2.4561452865600586,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.21060140430927277,
"step": 449
},
{
"epoch": 0.006477849354014467,
"grad_norm": 0.1337890625,
"grad_norm_var": 0.00011837383111317953,
"learning_rate": 0.0001,
"loss": 0.2272,
"loss/crossentropy": 2.2843399047851562,
"loss/fcd": 0.5146484375,
"loss/idx": 18.0,
"loss/logits": 0.2271936535835266,
"step": 450
},
{
"epoch": 0.006492244574801166,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.00012276868025461833,
"learning_rate": 0.0001,
"loss": 0.2133,
"loss/crossentropy": 2.564459443092346,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.21329496800899506,
"step": 451
},
{
"epoch": 0.0065066397955878645,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.00012448628743489583,
"learning_rate": 0.0001,
"loss": 0.2099,
"loss/crossentropy": 2.270860195159912,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.20991922914981842,
"step": 452
},
{
"epoch": 0.006521035016374564,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.00013029972712198893,
"learning_rate": 0.0001,
"loss": 0.2251,
"loss/crossentropy": 2.482293486595154,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.22511228173971176,
"step": 453
},
{
"epoch": 0.006535430237161262,
"grad_norm": 0.154296875,
"grad_norm_var": 0.00021419127782185872,
"learning_rate": 0.0001,
"loss": 0.2411,
"loss/crossentropy": 2.320971131324768,
"loss/fcd": 0.4951171875,
"loss/idx": 18.0,
"loss/logits": 0.24106843769550323,
"step": 454
},
{
"epoch": 0.006549825457947961,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.00021772285302480062,
"learning_rate": 0.0001,
"loss": 0.2047,
"loss/crossentropy": 2.3206406831741333,
"loss/fcd": 0.40234375,
"loss/idx": 18.0,
"loss/logits": 0.20467744767665863,
"step": 455
},
{
"epoch": 0.0065642206787346604,
"grad_norm": 0.1083984375,
"grad_norm_var": 0.0002218236525853475,
"learning_rate": 0.0001,
"loss": 0.218,
"loss/crossentropy": 2.492337226867676,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.2180488407611847,
"step": 456
},
{
"epoch": 0.006578615899521359,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.00022468467553456625,
"learning_rate": 0.0001,
"loss": 0.2109,
"loss/crossentropy": 2.3202375173568726,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.21094900369644165,
"step": 457
},
{
"epoch": 0.006593011120308058,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.00023228228092193605,
"learning_rate": 0.0001,
"loss": 0.2025,
"loss/crossentropy": 2.452348470687866,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.20247067511081696,
"step": 458
},
{
"epoch": 0.006607406341094756,
"grad_norm": 0.138671875,
"grad_norm_var": 0.00026457707087198894,
"learning_rate": 0.0001,
"loss": 0.2292,
"loss/crossentropy": 2.8116979598999023,
"loss/fcd": 0.466796875,
"loss/idx": 18.0,
"loss/logits": 0.2292005866765976,
"step": 459
},
{
"epoch": 0.0066218015618814555,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.0002692292133967082,
"learning_rate": 0.0001,
"loss": 0.2044,
"loss/crossentropy": 2.337909698486328,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.20442651212215424,
"step": 460
},
{
"epoch": 0.006636196782668154,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.00022170444329579672,
"learning_rate": 0.0001,
"loss": 0.2165,
"loss/crossentropy": 2.452089309692383,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.21646161377429962,
"step": 461
},
{
"epoch": 0.006650592003454853,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.00022114813327789307,
"learning_rate": 0.0001,
"loss": 0.2284,
"loss/crossentropy": 2.596395969390869,
"loss/fcd": 0.470703125,
"loss/idx": 18.0,
"loss/logits": 0.22836245596408844,
"step": 462
},
{
"epoch": 0.006664987224241552,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.00022147099177042642,
"learning_rate": 0.0001,
"loss": 0.2309,
"loss/crossentropy": 2.553429961204529,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.23092983663082123,
"step": 463
},
{
"epoch": 0.0066793824450282506,
"grad_norm": 0.09912109375,
"grad_norm_var": 0.00023492872714996337,
"learning_rate": 0.0001,
"loss": 0.1827,
"loss/crossentropy": 2.3164178133010864,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.18271666765213013,
"step": 464
},
{
"epoch": 0.00669377766581495,
"grad_norm": 0.1123046875,
"grad_norm_var": 0.00023212035497029623,
"learning_rate": 0.0001,
"loss": 0.2093,
"loss/crossentropy": 2.3045096397399902,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.20930374413728714,
"step": 465
},
{
"epoch": 0.006708172886601648,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.00020383894443511962,
"learning_rate": 0.0001,
"loss": 0.2332,
"loss/crossentropy": 2.386527895927429,
"loss/fcd": 0.4970703125,
"loss/idx": 18.0,
"loss/logits": 0.23323698341846466,
"step": 466
},
{
"epoch": 0.006722568107388347,
"grad_norm": 0.11083984375,
"grad_norm_var": 0.00020166635513305665,
"learning_rate": 0.0001,
"loss": 0.1988,
"loss/crossentropy": 2.151167392730713,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.19876766949892044,
"step": 467
},
{
"epoch": 0.006736963328175046,
"grad_norm": 0.1259765625,
"grad_norm_var": 0.00021171470483144123,
"learning_rate": 0.0001,
"loss": 0.2219,
"loss/crossentropy": 2.512352228164673,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.22188346087932587,
"step": 468
},
{
"epoch": 0.006751358548961745,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.00020895699659983317,
"learning_rate": 0.0001,
"loss": 0.202,
"loss/crossentropy": 2.4446065425872803,
"loss/fcd": 0.4326171875,
"loss/idx": 18.0,
"loss/logits": 0.2020409256219864,
"step": 469
},
{
"epoch": 0.006765753769748443,
"grad_norm": 0.1162109375,
"grad_norm_var": 9.334782759348551e-05,
"learning_rate": 0.0001,
"loss": 0.2175,
"loss/crossentropy": 2.4017263650894165,
"loss/fcd": 0.4365234375,
"loss/idx": 18.0,
"loss/logits": 0.21754636615514755,
"step": 470
},
{
"epoch": 0.006780148990535142,
"grad_norm": 0.134765625,
"grad_norm_var": 0.00012315809726715088,
"learning_rate": 0.0001,
"loss": 0.2326,
"loss/crossentropy": 2.364670991897583,
"loss/fcd": 0.4658203125,
"loss/idx": 18.0,
"loss/logits": 0.232588529586792,
"step": 471
},
{
"epoch": 0.0067945442113218416,
"grad_norm": 0.10302734375,
"grad_norm_var": 0.00012839237848917643,
"learning_rate": 0.0001,
"loss": 0.2316,
"loss/crossentropy": 2.7174742221832275,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.23159676045179367,
"step": 472
},
{
"epoch": 0.00680893943210854,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.00013192395369211832,
"learning_rate": 0.0001,
"loss": 0.2198,
"loss/crossentropy": 2.472269654273987,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.21980835497379303,
"step": 473
},
{
"epoch": 0.006823334652895239,
"grad_norm": 0.11572265625,
"grad_norm_var": 0.00012710789839426678,
"learning_rate": 0.0001,
"loss": 0.2208,
"loss/crossentropy": 2.5979279279708862,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.22081798315048218,
"step": 474
},
{
"epoch": 0.006837729873681937,
"grad_norm": 0.10791015625,
"grad_norm_var": 8.223454157511394e-05,
"learning_rate": 0.0001,
"loss": 0.2017,
"loss/crossentropy": 2.337291121482849,
"loss/fcd": 0.5244140625,
"loss/idx": 18.0,
"loss/logits": 0.20170452445745468,
"step": 475
},
{
"epoch": 0.006852125094468637,
"grad_norm": 0.12353515625,
"grad_norm_var": 9.024540583292643e-05,
"learning_rate": 0.0001,
"loss": 0.2224,
"loss/crossentropy": 2.2137837409973145,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.22239823639392853,
"step": 476
},
{
"epoch": 0.006866520315255335,
"grad_norm": 0.1376953125,
"grad_norm_var": 0.00012429157892862957,
"learning_rate": 0.0001,
"loss": 0.2253,
"loss/crossentropy": 2.044808030128479,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.22525641322135925,
"step": 477
},
{
"epoch": 0.006880915536042034,
"grad_norm": 0.11083984375,
"grad_norm_var": 0.00012467304865519205,
"learning_rate": 0.0001,
"loss": 0.2055,
"loss/crossentropy": 2.176842510700226,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.20548538118600845,
"step": 478
},
{
"epoch": 0.0068953107568287325,
"grad_norm": 0.11328125,
"grad_norm_var": 0.0001228402058283488,
"learning_rate": 0.0001,
"loss": 0.2052,
"loss/crossentropy": 2.43264901638031,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.2051537036895752,
"step": 479
},
{
"epoch": 0.006909705977615432,
"grad_norm": 0.1162109375,
"grad_norm_var": 0.00010639429092407227,
"learning_rate": 0.0001,
"loss": 0.21,
"loss/crossentropy": 2.24726939201355,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.210049070417881,
"step": 480
},
{
"epoch": 0.006924101198402131,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.0001061081886291504,
"learning_rate": 0.0001,
"loss": 0.204,
"loss/crossentropy": 2.71012020111084,
"loss/fcd": 0.4716796875,
"loss/idx": 18.0,
"loss/logits": 0.20403584837913513,
"step": 481
},
{
"epoch": 0.006938496419188829,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.00010386208693186442,
"learning_rate": 0.0001,
"loss": 0.2221,
"loss/crossentropy": 2.3664560317993164,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.22213804721832275,
"step": 482
},
{
"epoch": 0.006952891639975528,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.00010653237501780192,
"learning_rate": 0.0001,
"loss": 0.2118,
"loss/crossentropy": 2.541406989097595,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.21182993054389954,
"step": 483
},
{
"epoch": 0.006967286860762227,
"grad_norm": 0.1171875,
"grad_norm_var": 9.980897108713786e-05,
"learning_rate": 0.0001,
"loss": 0.2098,
"loss/crossentropy": 2.0675625801086426,
"loss/fcd": 0.3955078125,
"loss/idx": 18.0,
"loss/logits": 0.20982014387845993,
"step": 484
},
{
"epoch": 0.006981682081548926,
"grad_norm": 0.11181640625,
"grad_norm_var": 9.15755828221639e-05,
"learning_rate": 0.0001,
"loss": 0.2191,
"loss/crossentropy": 2.1868069767951965,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.21909870952367783,
"step": 485
},
{
"epoch": 0.006996077302335624,
"grad_norm": 0.10546875,
"grad_norm_var": 9.856919447580973e-05,
"learning_rate": 0.0001,
"loss": 0.2026,
"loss/crossentropy": 2.26907217502594,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.20264852046966553,
"step": 486
},
{
"epoch": 0.0070104725231223235,
"grad_norm": 0.11572265625,
"grad_norm_var": 7.203022638956706e-05,
"learning_rate": 0.0001,
"loss": 0.2109,
"loss/crossentropy": 2.4840633869171143,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.21092981100082397,
"step": 487
},
{
"epoch": 0.007024867743909022,
"grad_norm": 0.115234375,
"grad_norm_var": 6.31640354792277e-05,
"learning_rate": 0.0001,
"loss": 0.2276,
"loss/crossentropy": 2.5656063556671143,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.22764952480793,
"step": 488
},
{
"epoch": 0.007039262964695721,
"grad_norm": 0.1455078125,
"grad_norm_var": 0.0001110623280207316,
"learning_rate": 0.0001,
"loss": 0.2581,
"loss/crossentropy": 2.414512276649475,
"loss/fcd": 0.5146484375,
"loss/idx": 18.0,
"loss/logits": 0.258076474070549,
"step": 489
},
{
"epoch": 0.00705365818548242,
"grad_norm": 0.1123046875,
"grad_norm_var": 0.00011261304219563802,
"learning_rate": 0.0001,
"loss": 0.1995,
"loss/crossentropy": 2.2527265548706055,
"loss/fcd": 0.3984375,
"loss/idx": 18.0,
"loss/logits": 0.19945065677165985,
"step": 490
},
{
"epoch": 0.0070680534062691185,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.00011458297570546469,
"learning_rate": 0.0001,
"loss": 0.214,
"loss/crossentropy": 2.3830225467681885,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.21395207196474075,
"step": 491
},
{
"epoch": 0.007082448627055818,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.00012197395165761312,
"learning_rate": 0.0001,
"loss": 0.2245,
"loss/crossentropy": 2.578980803489685,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.2245059311389923,
"step": 492
},
{
"epoch": 0.007096843847842516,
"grad_norm": 0.11328125,
"grad_norm_var": 8.859535058339437e-05,
"learning_rate": 0.0001,
"loss": 0.2001,
"loss/crossentropy": 2.0505954027175903,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.2001277357339859,
"step": 493
},
{
"epoch": 0.007111239068629215,
"grad_norm": 0.111328125,
"grad_norm_var": 8.837381998697917e-05,
"learning_rate": 0.0001,
"loss": 0.2057,
"loss/crossentropy": 2.2900065183639526,
"loss/fcd": 0.4326171875,
"loss/idx": 18.0,
"loss/logits": 0.2057407721877098,
"step": 494
},
{
"epoch": 0.007125634289415914,
"grad_norm": 0.09765625,
"grad_norm_var": 0.00010617574055989583,
"learning_rate": 0.0001,
"loss": 0.2047,
"loss/crossentropy": 2.6084879636764526,
"loss/fcd": 0.4052734375,
"loss/idx": 18.0,
"loss/logits": 0.2047056257724762,
"step": 495
},
{
"epoch": 0.007140029510202613,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.0001064211130142212,
"learning_rate": 0.0001,
"loss": 0.1886,
"loss/crossentropy": 2.2180538177490234,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.18857233971357346,
"step": 496
},
{
"epoch": 0.007154424730989311,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.0001118302345275879,
"learning_rate": 0.0001,
"loss": 0.2011,
"loss/crossentropy": 2.3378570079803467,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.20111830532550812,
"step": 497
},
{
"epoch": 0.00716881995177601,
"grad_norm": 0.099609375,
"grad_norm_var": 0.00011839866638183594,
"learning_rate": 0.0001,
"loss": 0.2105,
"loss/crossentropy": 2.4460572004318237,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.21054691076278687,
"step": 498
},
{
"epoch": 0.0071832151725627095,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.00012238721052805582,
"learning_rate": 0.0001,
"loss": 0.189,
"loss/crossentropy": 2.358466863632202,
"loss/fcd": 0.3779296875,
"loss/idx": 18.0,
"loss/logits": 0.18903843313455582,
"step": 499
},
{
"epoch": 0.007197610393349408,
"grad_norm": 0.0986328125,
"grad_norm_var": 0.00012767215569814045,
"learning_rate": 0.0001,
"loss": 0.23,
"loss/crossentropy": 2.604634642601013,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.23002738505601883,
"step": 500
},
{
"epoch": 0.007212005614136107,
"grad_norm": 0.10546875,
"grad_norm_var": 0.0001282016436258952,
"learning_rate": 0.0001,
"loss": 0.1989,
"loss/crossentropy": 2.3237578868865967,
"loss/fcd": 0.408203125,
"loss/idx": 18.0,
"loss/logits": 0.19885492324829102,
"step": 501
},
{
"epoch": 0.007226400834922805,
"grad_norm": 0.1015625,
"grad_norm_var": 0.00013103087743123373,
"learning_rate": 0.0001,
"loss": 0.2136,
"loss/crossentropy": 2.434670090675354,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.21359950304031372,
"step": 502
},
{
"epoch": 0.007240796055709505,
"grad_norm": 0.111328125,
"grad_norm_var": 0.0001281966765721639,
"learning_rate": 0.0001,
"loss": 0.2027,
"loss/crossentropy": 2.3419206142425537,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.20269384235143661,
"step": 503
},
{
"epoch": 0.007255191276496203,
"grad_norm": 0.1142578125,
"grad_norm_var": 0.0001273860534032186,
"learning_rate": 0.0001,
"loss": 0.2168,
"loss/crossentropy": 2.418344020843506,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.21677344292402267,
"step": 504
},
{
"epoch": 0.007269586497282902,
"grad_norm": 0.1181640625,
"grad_norm_var": 3.9155284563700356e-05,
"learning_rate": 0.0001,
"loss": 0.2208,
"loss/crossentropy": 2.5992894172668457,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.22083494067192078,
"step": 505
},
{
"epoch": 0.007283981718069601,
"grad_norm": 0.10302734375,
"grad_norm_var": 3.770192464192708e-05,
"learning_rate": 0.0001,
"loss": 0.2111,
"loss/crossentropy": 2.3879592418670654,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.21107713878154755,
"step": 506
},
{
"epoch": 0.0072983769388563,
"grad_norm": 0.10546875,
"grad_norm_var": 3.7729740142822266e-05,
"learning_rate": 0.0001,
"loss": 0.2234,
"loss/crossentropy": 2.7517272233963013,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.22336142510175705,
"step": 507
},
{
"epoch": 0.007312772159642999,
"grad_norm": 0.10400390625,
"grad_norm_var": 3.7729740142822266e-05,
"learning_rate": 0.0001,
"loss": 0.1838,
"loss/crossentropy": 2.1463602781295776,
"loss/fcd": 0.4033203125,
"loss/idx": 18.0,
"loss/logits": 0.1838330551981926,
"step": 508
},
{
"epoch": 0.007327167380429697,
"grad_norm": 0.10107421875,
"grad_norm_var": 3.542006015777588e-05,
"learning_rate": 0.0001,
"loss": 0.2068,
"loss/crossentropy": 2.4218236207962036,
"loss/fcd": 0.4130859375,
"loss/idx": 18.0,
"loss/logits": 0.2068256437778473,
"step": 509
},
{
"epoch": 0.007341562601216396,
"grad_norm": 0.1201171875,
"grad_norm_var": 4.722177982330322e-05,
"learning_rate": 0.0001,
"loss": 0.2242,
"loss/crossentropy": 2.253819227218628,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.22418855130672455,
"step": 510
},
{
"epoch": 0.007355957822003095,
"grad_norm": 0.10302734375,
"grad_norm_var": 4.3102105458577474e-05,
"learning_rate": 0.0001,
"loss": 0.2045,
"loss/crossentropy": 2.1473891735076904,
"loss/fcd": 0.40234375,
"loss/idx": 18.0,
"loss/logits": 0.2044747918844223,
"step": 511
},
{
"epoch": 0.007370353042789794,
"grad_norm": 0.1220703125,
"grad_norm_var": 5.827645460764567e-05,
"learning_rate": 0.0001,
"loss": 0.2399,
"loss/crossentropy": 2.4559924602508545,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.23986588418483734,
"step": 512
},
{
"epoch": 0.007384748263576492,
"grad_norm": 0.1240234375,
"grad_norm_var": 7.387797037760417e-05,
"learning_rate": 0.0001,
"loss": 0.2335,
"loss/crossentropy": 2.3832513093948364,
"loss/fcd": 0.482421875,
"loss/idx": 18.0,
"loss/logits": 0.2334604561328888,
"step": 513
},
{
"epoch": 0.0073991434843631914,
"grad_norm": 0.11083984375,
"grad_norm_var": 6.859997908274333e-05,
"learning_rate": 0.0001,
"loss": 0.2134,
"loss/crossentropy": 2.4991053342819214,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.21339743584394455,
"step": 514
},
{
"epoch": 0.007413538705149891,
"grad_norm": 0.1376953125,
"grad_norm_var": 0.00011509160200754802,
"learning_rate": 0.0001,
"loss": 0.2837,
"loss/crossentropy": 2.707633852958679,
"loss/fcd": 0.525390625,
"loss/idx": 18.0,
"loss/logits": 0.28367944806814194,
"step": 515
},
{
"epoch": 0.007427933925936589,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.00010480483373006184,
"learning_rate": 0.0001,
"loss": 0.2266,
"loss/crossentropy": 2.4334722757339478,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.22656814754009247,
"step": 516
},
{
"epoch": 0.007442329146723288,
"grad_norm": 0.107421875,
"grad_norm_var": 0.00010337432225545247,
"learning_rate": 0.0001,
"loss": 0.2487,
"loss/crossentropy": 2.794032335281372,
"loss/fcd": 0.4853515625,
"loss/idx": 18.0,
"loss/logits": 0.2486870214343071,
"step": 517
},
{
"epoch": 0.0074567243675099865,
"grad_norm": 0.09912109375,
"grad_norm_var": 0.00010714431603749593,
"learning_rate": 0.0001,
"loss": 0.1797,
"loss/crossentropy": 2.4037466049194336,
"loss/fcd": 0.408203125,
"loss/idx": 18.0,
"loss/logits": 0.1796710044145584,
"step": 518
},
{
"epoch": 0.007471119588296686,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.00010959208011627198,
"learning_rate": 0.0001,
"loss": 0.2219,
"loss/crossentropy": 2.3638296127319336,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.22193115949630737,
"step": 519
},
{
"epoch": 0.007485514809083384,
"grad_norm": 0.109375,
"grad_norm_var": 0.00010979076226552328,
"learning_rate": 0.0001,
"loss": 0.2223,
"loss/crossentropy": 2.555932879447937,
"loss/fcd": 0.458984375,
"loss/idx": 18.0,
"loss/logits": 0.22230461984872818,
"step": 520
},
{
"epoch": 0.007499910029870083,
"grad_norm": 0.1083984375,
"grad_norm_var": 0.00010768473148345948,
"learning_rate": 0.0001,
"loss": 0.2183,
"loss/crossentropy": 2.5372231006622314,
"loss/fcd": 0.4541015625,
"loss/idx": 18.0,
"loss/logits": 0.21830307692289352,
"step": 521
},
{
"epoch": 0.0075143052506567816,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.00010279715061187745,
"learning_rate": 0.0001,
"loss": 0.2068,
"loss/crossentropy": 2.566136121749878,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.20676826685667038,
"step": 522
},
{
"epoch": 0.007528700471443481,
"grad_norm": 0.119140625,
"grad_norm_var": 0.00010263025760650634,
"learning_rate": 0.0001,
"loss": 0.2323,
"loss/crossentropy": 2.4165902137756348,
"loss/fcd": 0.5078125,
"loss/idx": 18.0,
"loss/logits": 0.23230554163455963,
"step": 523
},
{
"epoch": 0.00754309569223018,
"grad_norm": 0.11474609375,
"grad_norm_var": 9.721020857493083e-05,
"learning_rate": 0.0001,
"loss": 0.2198,
"loss/crossentropy": 2.5744664669036865,
"loss/fcd": 0.4638671875,
"loss/idx": 18.0,
"loss/logits": 0.21984682232141495,
"step": 524
},
{
"epoch": 0.007557490913016878,
"grad_norm": 0.1103515625,
"grad_norm_var": 8.722543716430665e-05,
"learning_rate": 0.0001,
"loss": 0.2083,
"loss/crossentropy": 2.0694758892059326,
"loss/fcd": 0.4228515625,
"loss/idx": 18.0,
"loss/logits": 0.20834185183048248,
"step": 525
},
{
"epoch": 0.0075718861338035775,
"grad_norm": 0.107421875,
"grad_norm_var": 8.707046508789062e-05,
"learning_rate": 0.0001,
"loss": 0.2306,
"loss/crossentropy": 2.5832005739212036,
"loss/fcd": 0.474609375,
"loss/idx": 18.0,
"loss/logits": 0.23062562197446823,
"step": 526
},
{
"epoch": 0.007586281354590276,
"grad_norm": 0.11572265625,
"grad_norm_var": 7.978677749633789e-05,
"learning_rate": 0.0001,
"loss": 0.2216,
"loss/crossentropy": 2.38311767578125,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.2215922325849533,
"step": 527
},
{
"epoch": 0.007600676575376975,
"grad_norm": 0.1279296875,
"grad_norm_var": 8.81791114807129e-05,
"learning_rate": 0.0001,
"loss": 0.1902,
"loss/crossentropy": 1.8877107501029968,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.19022603332996368,
"step": 528
},
{
"epoch": 0.007615071796163673,
"grad_norm": 0.12060546875,
"grad_norm_var": 8.454223473866781e-05,
"learning_rate": 0.0001,
"loss": 0.2042,
"loss/crossentropy": 2.158120632171631,
"loss/fcd": 0.40234375,
"loss/idx": 18.0,
"loss/logits": 0.20415493100881577,
"step": 529
},
{
"epoch": 0.0076294670169503725,
"grad_norm": 0.10498046875,
"grad_norm_var": 8.933444817860921e-05,
"learning_rate": 0.0001,
"loss": 0.2037,
"loss/crossentropy": 2.460996627807617,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.20372942835092545,
"step": 530
},
{
"epoch": 0.007643862237737071,
"grad_norm": 0.11181640625,
"grad_norm_var": 4.8951307932535806e-05,
"learning_rate": 0.0001,
"loss": 0.2132,
"loss/crossentropy": 2.226336717605591,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.21315942704677582,
"step": 531
},
{
"epoch": 0.00765825745852377,
"grad_norm": 0.11181640625,
"grad_norm_var": 4.7647953033447264e-05,
"learning_rate": 0.0001,
"loss": 0.2321,
"loss/crossentropy": 2.446126341819763,
"loss/fcd": 0.462890625,
"loss/idx": 18.0,
"loss/logits": 0.2321249470114708,
"step": 532
},
{
"epoch": 0.007672652679310469,
"grad_norm": 0.1044921875,
"grad_norm_var": 5.016326904296875e-05,
"learning_rate": 0.0001,
"loss": 0.2082,
"loss/crossentropy": 2.3785619735717773,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.208193838596344,
"step": 533
},
{
"epoch": 0.007687047900097168,
"grad_norm": 0.107421875,
"grad_norm_var": 3.9878487586975095e-05,
"learning_rate": 0.0001,
"loss": 0.2113,
"loss/crossentropy": 2.2807793617248535,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.21130456030368805,
"step": 534
},
{
"epoch": 0.007701443120883867,
"grad_norm": 0.10888671875,
"grad_norm_var": 3.865162531534831e-05,
"learning_rate": 0.0001,
"loss": 0.231,
"loss/crossentropy": 2.65705668926239,
"loss/fcd": 0.4619140625,
"loss/idx": 18.0,
"loss/logits": 0.23100796341896057,
"step": 535
},
{
"epoch": 0.007715838341670565,
"grad_norm": 0.11669921875,
"grad_norm_var": 3.920296827952067e-05,
"learning_rate": 0.0001,
"loss": 0.2427,
"loss/crossentropy": 2.493618130683899,
"loss/fcd": 0.474609375,
"loss/idx": 18.0,
"loss/logits": 0.24267538636922836,
"step": 536
},
{
"epoch": 0.007730233562457264,
"grad_norm": 0.1025390625,
"grad_norm_var": 4.4710437456766765e-05,
"learning_rate": 0.0001,
"loss": 0.2032,
"loss/crossentropy": 2.3469570875167847,
"loss/fcd": 0.3955078125,
"loss/idx": 18.0,
"loss/logits": 0.2031807154417038,
"step": 537
},
{
"epoch": 0.007744628783243963,
"grad_norm": 0.1201171875,
"grad_norm_var": 4.851023356119792e-05,
"learning_rate": 0.0001,
"loss": 0.2267,
"loss/crossentropy": 2.262601613998413,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.22673919051885605,
"step": 538
},
{
"epoch": 0.007759024004030662,
"grad_norm": 0.10693359375,
"grad_norm_var": 4.749198754628499e-05,
"learning_rate": 0.0001,
"loss": 0.2243,
"loss/crossentropy": 2.8090314865112305,
"loss/fcd": 0.4482421875,
"loss/idx": 18.0,
"loss/logits": 0.22425533086061478,
"step": 539
},
{
"epoch": 0.00777341922481736,
"grad_norm": 0.10791015625,
"grad_norm_var": 4.793703556060791e-05,
"learning_rate": 0.0001,
"loss": 0.2073,
"loss/crossentropy": 2.41420841217041,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.20730414986610413,
"step": 540
},
{
"epoch": 0.007787814445604059,
"grad_norm": 0.10546875,
"grad_norm_var": 5.024174849192301e-05,
"learning_rate": 0.0001,
"loss": 0.1982,
"loss/crossentropy": 2.338608145713806,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.19816286861896515,
"step": 541
},
{
"epoch": 0.007802209666390759,
"grad_norm": 0.10498046875,
"grad_norm_var": 5.1875909169514976e-05,
"learning_rate": 0.0001,
"loss": 0.1913,
"loss/crossentropy": 2.199298143386841,
"loss/fcd": 0.3994140625,
"loss/idx": 18.0,
"loss/logits": 0.19127565622329712,
"step": 542
},
{
"epoch": 0.007816604887177458,
"grad_norm": 0.1064453125,
"grad_norm_var": 5.159278710683187e-05,
"learning_rate": 0.0001,
"loss": 0.2308,
"loss/crossentropy": 2.403664708137512,
"loss/fcd": 0.4345703125,
"loss/idx": 18.0,
"loss/logits": 0.23076358437538147,
"step": 543
},
{
"epoch": 0.007831000107964156,
"grad_norm": 0.10693359375,
"grad_norm_var": 3.05334726969401e-05,
"learning_rate": 0.0001,
"loss": 0.2145,
"loss/crossentropy": 2.511462450027466,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.21448855847120285,
"step": 544
},
{
"epoch": 0.007845395328750854,
"grad_norm": 0.10205078125,
"grad_norm_var": 2.3965040842692057e-05,
"learning_rate": 0.0001,
"loss": 0.2069,
"loss/crossentropy": 2.543255090713501,
"loss/fcd": 0.4072265625,
"loss/idx": 18.0,
"loss/logits": 0.20689202100038528,
"step": 545
},
{
"epoch": 0.007859790549537553,
"grad_norm": 0.09765625,
"grad_norm_var": 3.03576389948527e-05,
"learning_rate": 0.0001,
"loss": 0.2131,
"loss/crossentropy": 2.618165135383606,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.2130560278892517,
"step": 546
},
{
"epoch": 0.007874185770324253,
"grad_norm": 0.10888671875,
"grad_norm_var": 2.9260913530985515e-05,
"learning_rate": 0.0001,
"loss": 0.205,
"loss/crossentropy": 2.3171777725219727,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.20504355430603027,
"step": 547
},
{
"epoch": 0.007888580991110951,
"grad_norm": 0.10888671875,
"grad_norm_var": 2.8092662493387858e-05,
"learning_rate": 0.0001,
"loss": 0.2055,
"loss/crossentropy": 2.3242313861846924,
"loss/fcd": 0.4365234375,
"loss/idx": 18.0,
"loss/logits": 0.20549335330724716,
"step": 548
},
{
"epoch": 0.00790297621189765,
"grad_norm": 0.287109375,
"grad_norm_var": 0.0020447880029678344,
"learning_rate": 0.0001,
"loss": 0.2562,
"loss/crossentropy": 2.264755129814148,
"loss/fcd": 0.548828125,
"loss/idx": 18.0,
"loss/logits": 0.2562015801668167,
"step": 549
},
{
"epoch": 0.00791737143268435,
"grad_norm": 0.109375,
"grad_norm_var": 0.002042093873023987,
"learning_rate": 0.0001,
"loss": 0.1887,
"loss/crossentropy": 2.053212523460388,
"loss/fcd": 0.3916015625,
"loss/idx": 18.0,
"loss/logits": 0.18873201310634613,
"step": 550
},
{
"epoch": 0.007931766653471048,
"grad_norm": 0.109375,
"grad_norm_var": 0.002041463057200114,
"learning_rate": 0.0001,
"loss": 0.1847,
"loss/crossentropy": 2.1234816908836365,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.18469391763210297,
"step": 551
},
{
"epoch": 0.007946161874257746,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.0020542532205581666,
"learning_rate": 0.0001,
"loss": 0.2206,
"loss/crossentropy": 2.5538755655288696,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.22058459371328354,
"step": 552
},
{
"epoch": 0.007960557095044445,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.0020552794138590496,
"learning_rate": 0.0001,
"loss": 0.2014,
"loss/crossentropy": 2.2996666431427,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.20140548795461655,
"step": 553
},
{
"epoch": 0.007974952315831145,
"grad_norm": 0.1162109375,
"grad_norm_var": 0.0020551522572835284,
"learning_rate": 0.0001,
"loss": 0.1931,
"loss/crossentropy": 2.076995849609375,
"loss/fcd": 0.3876953125,
"loss/idx": 18.0,
"loss/logits": 0.19310477375984192,
"step": 554
},
{
"epoch": 0.007989347536617843,
"grad_norm": 0.10107421875,
"grad_norm_var": 0.0020657857259114582,
"learning_rate": 0.0001,
"loss": 0.2072,
"loss/crossentropy": 2.5844489336013794,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.20723149180412292,
"step": 555
},
{
"epoch": 0.008003742757404541,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.0020593394835789996,
"learning_rate": 0.0001,
"loss": 0.2353,
"loss/crossentropy": 2.580026626586914,
"loss/fcd": 0.4541015625,
"loss/idx": 18.0,
"loss/logits": 0.23526855558156967,
"step": 556
},
{
"epoch": 0.00801813797819124,
"grad_norm": 0.1005859375,
"grad_norm_var": 0.0020690351724624635,
"learning_rate": 0.0001,
"loss": 0.2148,
"loss/crossentropy": 2.526800036430359,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.21484342962503433,
"step": 557
},
{
"epoch": 0.00803253319897794,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.002070759733517965,
"learning_rate": 0.0001,
"loss": 0.203,
"loss/crossentropy": 2.459173560142517,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.2029871866106987,
"step": 558
},
{
"epoch": 0.008046928419764638,
"grad_norm": 0.11865234375,
"grad_norm_var": 0.002061744530995687,
"learning_rate": 0.0001,
"loss": 0.238,
"loss/crossentropy": 2.560517430305481,
"loss/fcd": 0.4775390625,
"loss/idx": 18.0,
"loss/logits": 0.23796609044075012,
"step": 559
},
{
"epoch": 0.008061323640551336,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.002065872152646383,
"learning_rate": 0.0001,
"loss": 0.203,
"loss/crossentropy": 2.476174235343933,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.20299049466848373,
"step": 560
},
{
"epoch": 0.008075718861338036,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.0020527432362238566,
"learning_rate": 0.0001,
"loss": 0.1991,
"loss/crossentropy": 2.5808521509170532,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.19911371916532516,
"step": 561
},
{
"epoch": 0.008090114082124735,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.002040464679400126,
"learning_rate": 0.0001,
"loss": 0.1829,
"loss/crossentropy": 2.2305572628974915,
"loss/fcd": 0.3935546875,
"loss/idx": 18.0,
"loss/logits": 0.182855024933815,
"step": 562
},
{
"epoch": 0.008104509302911433,
"grad_norm": 0.10302734375,
"grad_norm_var": 0.0020505974690119425,
"learning_rate": 0.0001,
"loss": 0.197,
"loss/crossentropy": 2.2332805395126343,
"loss/fcd": 0.40625,
"loss/idx": 18.0,
"loss/logits": 0.19702833145856857,
"step": 563
},
{
"epoch": 0.008118904523698131,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.002046417196591695,
"learning_rate": 0.0001,
"loss": 0.2281,
"loss/crossentropy": 2.634607434272766,
"loss/fcd": 0.4599609375,
"loss/idx": 18.0,
"loss/logits": 0.2280866503715515,
"step": 564
},
{
"epoch": 0.008133299744484832,
"grad_norm": 0.11865234375,
"grad_norm_var": 4.386504491170247e-05,
"learning_rate": 0.0001,
"loss": 0.2263,
"loss/crossentropy": 2.566808342933655,
"loss/fcd": 0.458984375,
"loss/idx": 18.0,
"loss/logits": 0.22630243003368378,
"step": 565
},
{
"epoch": 0.00814769496527153,
"grad_norm": 0.1064453125,
"grad_norm_var": 4.404385884602864e-05,
"learning_rate": 0.0001,
"loss": 0.2021,
"loss/crossentropy": 2.404551863670349,
"loss/fcd": 0.4326171875,
"loss/idx": 18.0,
"loss/logits": 0.20205579698085785,
"step": 566
},
{
"epoch": 0.008162090186058228,
"grad_norm": 0.10791015625,
"grad_norm_var": 4.396339257558187e-05,
"learning_rate": 0.0001,
"loss": 0.1902,
"loss/crossentropy": 2.052983283996582,
"loss/fcd": 0.4033203125,
"loss/idx": 18.0,
"loss/logits": 0.19023562967777252,
"step": 567
},
{
"epoch": 0.008176485406844928,
"grad_norm": 0.1015625,
"grad_norm_var": 4.5942266782124835e-05,
"learning_rate": 0.0001,
"loss": 0.1904,
"loss/crossentropy": 2.1084887981414795,
"loss/fcd": 0.3916015625,
"loss/idx": 18.0,
"loss/logits": 0.1903728023171425,
"step": 568
},
{
"epoch": 0.008190880627631627,
"grad_norm": 0.107421875,
"grad_norm_var": 4.348357518513997e-05,
"learning_rate": 0.0001,
"loss": 0.2044,
"loss/crossentropy": 2.528154492378235,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.20440030097961426,
"step": 569
},
{
"epoch": 0.008205275848418325,
"grad_norm": 0.10400390625,
"grad_norm_var": 3.998180230458577e-05,
"learning_rate": 0.0001,
"loss": 0.2031,
"loss/crossentropy": 2.2640358209609985,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.20313097536563873,
"step": 570
},
{
"epoch": 0.008219671069205023,
"grad_norm": 0.125,
"grad_norm_var": 5.50230344136556e-05,
"learning_rate": 0.0001,
"loss": 0.2647,
"loss/crossentropy": 2.533176898956299,
"loss/fcd": 0.5107421875,
"loss/idx": 18.0,
"loss/logits": 0.2647128999233246,
"step": 571
},
{
"epoch": 0.008234066289991723,
"grad_norm": 0.10693359375,
"grad_norm_var": 4.928807417551676e-05,
"learning_rate": 0.0001,
"loss": 0.2343,
"loss/crossentropy": 2.5634536743164062,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.23427864164113998,
"step": 572
},
{
"epoch": 0.008248461510778422,
"grad_norm": 0.10595703125,
"grad_norm_var": 4.5518080393473305e-05,
"learning_rate": 0.0001,
"loss": 0.2025,
"loss/crossentropy": 2.4560309648513794,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.20253371447324753,
"step": 573
},
{
"epoch": 0.00826285673156512,
"grad_norm": 0.1015625,
"grad_norm_var": 4.742046197255453e-05,
"learning_rate": 0.0001,
"loss": 0.1913,
"loss/crossentropy": 2.2056825160980225,
"loss/fcd": 0.392578125,
"loss/idx": 18.0,
"loss/logits": 0.1913457065820694,
"step": 574
},
{
"epoch": 0.00827725195235182,
"grad_norm": 0.10205078125,
"grad_norm_var": 4.228651523590088e-05,
"learning_rate": 0.0001,
"loss": 0.2063,
"loss/crossentropy": 2.330709218978882,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.20634697377681732,
"step": 575
},
{
"epoch": 0.008291647173138518,
"grad_norm": 0.11083984375,
"grad_norm_var": 4.224777221679687e-05,
"learning_rate": 0.0001,
"loss": 0.1815,
"loss/crossentropy": 1.9496164321899414,
"loss/fcd": 0.3935546875,
"loss/idx": 18.0,
"loss/logits": 0.18145756423473358,
"step": 576
},
{
"epoch": 0.008306042393925217,
"grad_norm": 0.1083984375,
"grad_norm_var": 4.2000412940979e-05,
"learning_rate": 0.0001,
"loss": 0.2194,
"loss/crossentropy": 2.6926685571670532,
"loss/fcd": 0.4365234375,
"loss/idx": 18.0,
"loss/logits": 0.21936995536088943,
"step": 577
},
{
"epoch": 0.008320437614711915,
"grad_norm": 0.1103515625,
"grad_norm_var": 4.031558831532796e-05,
"learning_rate": 0.0001,
"loss": 0.1976,
"loss/crossentropy": 2.299630641937256,
"loss/fcd": 0.4072265625,
"loss/idx": 18.0,
"loss/logits": 0.1975780501961708,
"step": 578
},
{
"epoch": 0.008334832835498615,
"grad_norm": 0.1044921875,
"grad_norm_var": 3.9418538411458336e-05,
"learning_rate": 0.0001,
"loss": 0.1928,
"loss/crossentropy": 2.3579829931259155,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.19275517761707306,
"step": 579
},
{
"epoch": 0.008349228056285313,
"grad_norm": 0.1025390625,
"grad_norm_var": 3.998180230458577e-05,
"learning_rate": 0.0001,
"loss": 0.1892,
"loss/crossentropy": 2.1995487213134766,
"loss/fcd": 0.404296875,
"loss/idx": 18.0,
"loss/logits": 0.18920866400003433,
"step": 580
},
{
"epoch": 0.008363623277072012,
"grad_norm": 0.10595703125,
"grad_norm_var": 3.161331017812093e-05,
"learning_rate": 0.0001,
"loss": 0.2302,
"loss/crossentropy": 2.54054057598114,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.23019887506961823,
"step": 581
},
{
"epoch": 0.00837801849785871,
"grad_norm": 0.11279296875,
"grad_norm_var": 3.3692518870035806e-05,
"learning_rate": 0.0001,
"loss": 0.2467,
"loss/crossentropy": 2.5756444931030273,
"loss/fcd": 0.490234375,
"loss/idx": 18.0,
"loss/logits": 0.2466834932565689,
"step": 582
},
{
"epoch": 0.00839241371864541,
"grad_norm": 0.1259765625,
"grad_norm_var": 5.541543165842692e-05,
"learning_rate": 0.0001,
"loss": 0.217,
"loss/crossentropy": 2.225999653339386,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.21695519983768463,
"step": 583
},
{
"epoch": 0.008406808939432108,
"grad_norm": 0.1083984375,
"grad_norm_var": 5.202194054921468e-05,
"learning_rate": 0.0001,
"loss": 0.2196,
"loss/crossentropy": 2.315016746520996,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.21962474286556244,
"step": 584
},
{
"epoch": 0.008421204160218807,
"grad_norm": 0.10595703125,
"grad_norm_var": 5.244811375935872e-05,
"learning_rate": 0.0001,
"loss": 0.2172,
"loss/crossentropy": 2.390581250190735,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.21722210943698883,
"step": 585
},
{
"epoch": 0.008435599381005507,
"grad_norm": 0.10498046875,
"grad_norm_var": 5.18798828125e-05,
"learning_rate": 0.0001,
"loss": 0.202,
"loss/crossentropy": 2.451754093170166,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.20203383266925812,
"step": 586
},
{
"epoch": 0.008449994601792205,
"grad_norm": 0.1279296875,
"grad_norm_var": 5.8710575103759766e-05,
"learning_rate": 0.0001,
"loss": 0.2503,
"loss/crossentropy": 2.1609503030776978,
"loss/fcd": 0.4873046875,
"loss/idx": 18.0,
"loss/logits": 0.2503489702939987,
"step": 587
},
{
"epoch": 0.008464389822578904,
"grad_norm": 0.10888671875,
"grad_norm_var": 5.839268366495768e-05,
"learning_rate": 0.0001,
"loss": 0.1994,
"loss/crossentropy": 2.347867727279663,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.1994006633758545,
"step": 588
},
{
"epoch": 0.008478785043365602,
"grad_norm": 0.10400390625,
"grad_norm_var": 5.947351455688476e-05,
"learning_rate": 0.0001,
"loss": 0.2185,
"loss/crossentropy": 2.502691388130188,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.2184857726097107,
"step": 589
},
{
"epoch": 0.008493180264152302,
"grad_norm": 0.1083984375,
"grad_norm_var": 5.555152893066406e-05,
"learning_rate": 0.0001,
"loss": 0.2103,
"loss/crossentropy": 2.1813002228736877,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.21025604009628296,
"step": 590
},
{
"epoch": 0.008507575484939,
"grad_norm": 0.10595703125,
"grad_norm_var": 5.262692769368489e-05,
"learning_rate": 0.0001,
"loss": 0.2165,
"loss/crossentropy": 2.544050931930542,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.21648868918418884,
"step": 591
},
{
"epoch": 0.008521970705725699,
"grad_norm": 0.126953125,
"grad_norm_var": 7.121463616689046e-05,
"learning_rate": 0.0001,
"loss": 0.2075,
"loss/crossentropy": 2.072811484336853,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.20745252817869186,
"step": 592
},
{
"epoch": 0.008536365926512399,
"grad_norm": 0.09375,
"grad_norm_var": 8.921523888905843e-05,
"learning_rate": 0.0001,
"loss": 0.1907,
"loss/crossentropy": 2.657747268676758,
"loss/fcd": 0.423828125,
"loss/idx": 18.0,
"loss/logits": 0.1907452642917633,
"step": 593
},
{
"epoch": 0.008550761147299097,
"grad_norm": 0.18359375,
"grad_norm_var": 0.0004295577605565389,
"learning_rate": 0.0001,
"loss": 0.2969,
"loss/crossentropy": 2.365026593208313,
"loss/fcd": 0.5234375,
"loss/idx": 18.0,
"loss/logits": 0.29685717821121216,
"step": 594
},
{
"epoch": 0.008565156368085795,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.00042495330174764,
"learning_rate": 0.0001,
"loss": 0.2275,
"loss/crossentropy": 2.617425799369812,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.2274792492389679,
"step": 595
},
{
"epoch": 0.008579551588872494,
"grad_norm": 0.099609375,
"grad_norm_var": 0.0004302342732747396,
"learning_rate": 0.0001,
"loss": 0.2138,
"loss/crossentropy": 2.589759111404419,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.2137622982263565,
"step": 596
},
{
"epoch": 0.008593946809659194,
"grad_norm": 0.123046875,
"grad_norm_var": 0.00042901734511057533,
"learning_rate": 0.0001,
"loss": 0.2186,
"loss/crossentropy": 2.165451228618622,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.21863602101802826,
"step": 597
},
{
"epoch": 0.008608342030445892,
"grad_norm": 0.111328125,
"grad_norm_var": 0.00042969385782877604,
"learning_rate": 0.0001,
"loss": 0.2004,
"loss/crossentropy": 2.421903610229492,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.20041261613368988,
"step": 598
},
{
"epoch": 0.00862273725123259,
"grad_norm": 0.11474609375,
"grad_norm_var": 0.0004218568404515584,
"learning_rate": 0.0001,
"loss": 0.249,
"loss/crossentropy": 2.555266857147217,
"loss/fcd": 0.478515625,
"loss/idx": 18.0,
"loss/logits": 0.2490156590938568,
"step": 599
},
{
"epoch": 0.008637132472019289,
"grad_norm": 0.111328125,
"grad_norm_var": 0.00041990180810292564,
"learning_rate": 0.0001,
"loss": 0.2241,
"loss/crossentropy": 2.4431397914886475,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.22414565831422806,
"step": 600
},
{
"epoch": 0.008651527692805989,
"grad_norm": 0.12158203125,
"grad_norm_var": 0.0004164050022761027,
"learning_rate": 0.0001,
"loss": 0.1972,
"loss/crossentropy": 2.086324453353882,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.19717370718717575,
"step": 601
},
{
"epoch": 0.008665922913592687,
"grad_norm": 0.10302734375,
"grad_norm_var": 0.000419496496518453,
"learning_rate": 0.0001,
"loss": 0.1806,
"loss/crossentropy": 2.2410671710968018,
"loss/fcd": 0.3984375,
"loss/idx": 18.0,
"loss/logits": 0.18064773827791214,
"step": 602
},
{
"epoch": 0.008680318134379385,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.00041954914728800456,
"learning_rate": 0.0001,
"loss": 0.214,
"loss/crossentropy": 2.3243794441223145,
"loss/fcd": 0.4345703125,
"loss/idx": 18.0,
"loss/logits": 0.2140304446220398,
"step": 603
},
{
"epoch": 0.008694713355166086,
"grad_norm": 0.12353515625,
"grad_norm_var": 0.0004225889841715495,
"learning_rate": 0.0001,
"loss": 0.2533,
"loss/crossentropy": 2.4268319606781006,
"loss/fcd": 0.48828125,
"loss/idx": 18.0,
"loss/logits": 0.25334879010915756,
"step": 604
},
{
"epoch": 0.008709108575952784,
"grad_norm": 0.10009765625,
"grad_norm_var": 0.00042932828267415365,
"learning_rate": 0.0001,
"loss": 0.1851,
"loss/crossentropy": 2.3854864835739136,
"loss/fcd": 0.408203125,
"loss/idx": 18.0,
"loss/logits": 0.18514161556959152,
"step": 605
},
{
"epoch": 0.008723503796739482,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.0004343261321385702,
"learning_rate": 0.0001,
"loss": 0.1993,
"loss/crossentropy": 2.385258913040161,
"loss/fcd": 0.4482421875,
"loss/idx": 18.0,
"loss/logits": 0.19932958483695984,
"step": 606
},
{
"epoch": 0.00873789901752618,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.00043377876281738283,
"learning_rate": 0.0001,
"loss": 0.2085,
"loss/crossentropy": 2.4880837202072144,
"loss/fcd": 0.4326171875,
"loss/idx": 18.0,
"loss/logits": 0.20852985978126526,
"step": 607
},
{
"epoch": 0.00875229423831288,
"grad_norm": 0.111328125,
"grad_norm_var": 0.0004233519236246745,
"learning_rate": 0.0001,
"loss": 0.2413,
"loss/crossentropy": 2.486106753349304,
"loss/fcd": 0.4619140625,
"loss/idx": 18.0,
"loss/logits": 0.2413138523697853,
"step": 608
},
{
"epoch": 0.008766689459099579,
"grad_norm": 0.09912109375,
"grad_norm_var": 0.0004109054803848267,
"learning_rate": 0.0001,
"loss": 0.2237,
"loss/crossentropy": 2.713275671005249,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.22369590401649475,
"step": 609
},
{
"epoch": 0.008781084679886277,
"grad_norm": 0.10205078125,
"grad_norm_var": 6.965001424153646e-05,
"learning_rate": 0.0001,
"loss": 0.2017,
"loss/crossentropy": 2.4143831729888916,
"loss/fcd": 0.4013671875,
"loss/idx": 18.0,
"loss/logits": 0.2017301544547081,
"step": 610
},
{
"epoch": 0.008795479900672977,
"grad_norm": 0.10302734375,
"grad_norm_var": 7.179578145345052e-05,
"learning_rate": 0.0001,
"loss": 0.2116,
"loss/crossentropy": 2.3723723888397217,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.21161457151174545,
"step": 611
},
{
"epoch": 0.008809875121459676,
"grad_norm": 0.095703125,
"grad_norm_var": 7.739067077636719e-05,
"learning_rate": 0.0001,
"loss": 0.1875,
"loss/crossentropy": 2.3493517637252808,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.18747683614492416,
"step": 612
},
{
"epoch": 0.008824270342246374,
"grad_norm": 0.0986328125,
"grad_norm_var": 6.656249364217122e-05,
"learning_rate": 0.0001,
"loss": 0.1934,
"loss/crossentropy": 2.484821081161499,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.1934322491288185,
"step": 613
},
{
"epoch": 0.008838665563033072,
"grad_norm": 0.2138671875,
"grad_norm_var": 0.000786288579305013,
"learning_rate": 0.0001,
"loss": 0.2388,
"loss/crossentropy": 2.2311092615127563,
"loss/fcd": 0.521484375,
"loss/idx": 18.0,
"loss/logits": 0.23880772292613983,
"step": 614
},
{
"epoch": 0.008853060783819772,
"grad_norm": 0.1142578125,
"grad_norm_var": 0.0007862001657485962,
"learning_rate": 0.0001,
"loss": 0.209,
"loss/crossentropy": 2.1388099193573,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.20903942734003067,
"step": 615
},
{
"epoch": 0.00886745600460647,
"grad_norm": 0.1396484375,
"grad_norm_var": 0.0008295287688573201,
"learning_rate": 0.0001,
"loss": 0.2045,
"loss/crossentropy": 2.118674635887146,
"loss/fcd": 0.501953125,
"loss/idx": 18.0,
"loss/logits": 0.20450318604707718,
"step": 616
},
{
"epoch": 0.008881851225393169,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.0008352239926656087,
"learning_rate": 0.0001,
"loss": 0.1834,
"loss/crossentropy": 2.149811267852783,
"loss/fcd": 0.3935546875,
"loss/idx": 18.0,
"loss/logits": 0.18339695036411285,
"step": 617
},
{
"epoch": 0.008896246446179869,
"grad_norm": 0.12109375,
"grad_norm_var": 0.0008298943440119426,
"learning_rate": 0.0001,
"loss": 0.2338,
"loss/crossentropy": 2.324687123298645,
"loss/fcd": 0.4658203125,
"loss/idx": 18.0,
"loss/logits": 0.23375140875577927,
"step": 618
},
{
"epoch": 0.008910641666966567,
"grad_norm": 0.1162109375,
"grad_norm_var": 0.0008182843526204427,
"learning_rate": 0.0001,
"loss": 0.2211,
"loss/crossentropy": 2.2215802669525146,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.22114143520593643,
"step": 619
},
{
"epoch": 0.008925036887753266,
"grad_norm": 0.0986328125,
"grad_norm_var": 0.0008311023314793905,
"learning_rate": 0.0001,
"loss": 0.2057,
"loss/crossentropy": 2.406686782836914,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.20571539551019669,
"step": 620
},
{
"epoch": 0.008939432108539964,
"grad_norm": 0.115234375,
"grad_norm_var": 0.0008170286814371745,
"learning_rate": 0.0001,
"loss": 0.2058,
"loss/crossentropy": 2.327828884124756,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.2057729959487915,
"step": 621
},
{
"epoch": 0.008953827329326664,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.0008163203795750936,
"learning_rate": 0.0001,
"loss": 0.2041,
"loss/crossentropy": 2.394818425178528,
"loss/fcd": 0.4228515625,
"loss/idx": 18.0,
"loss/logits": 0.2041458711028099,
"step": 622
},
{
"epoch": 0.008968222550113363,
"grad_norm": 0.09765625,
"grad_norm_var": 0.0008313407500584921,
"learning_rate": 0.0001,
"loss": 0.1949,
"loss/crossentropy": 2.384241223335266,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.19486035406589508,
"step": 623
},
{
"epoch": 0.00898261777090006,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.0008399953444798787,
"learning_rate": 0.0001,
"loss": 0.2079,
"loss/crossentropy": 2.4613648653030396,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.20785125344991684,
"step": 624
},
{
"epoch": 0.00899701299168676,
"grad_norm": 0.123046875,
"grad_norm_var": 0.0008281668027242025,
"learning_rate": 0.0001,
"loss": 0.2737,
"loss/crossentropy": 2.572801351547241,
"loss/fcd": 0.54296875,
"loss/idx": 18.0,
"loss/logits": 0.27374986559152603,
"step": 625
},
{
"epoch": 0.00901140821247346,
"grad_norm": 0.123046875,
"grad_norm_var": 0.0008179575204849243,
"learning_rate": 0.0001,
"loss": 0.2015,
"loss/crossentropy": 1.862765610218048,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.2015407457947731,
"step": 626
},
{
"epoch": 0.009025803433260158,
"grad_norm": 0.107421875,
"grad_norm_var": 0.0008110642433166504,
"learning_rate": 0.0001,
"loss": 0.2186,
"loss/crossentropy": 2.5048106908798218,
"loss/fcd": 0.4541015625,
"loss/idx": 18.0,
"loss/logits": 0.21860718727111816,
"step": 627
},
{
"epoch": 0.009040198654046856,
"grad_norm": 0.1337890625,
"grad_norm_var": 0.0007929325103759766,
"learning_rate": 0.0001,
"loss": 0.22,
"loss/crossentropy": 2.3713510036468506,
"loss/fcd": 0.4541015625,
"loss/idx": 18.0,
"loss/logits": 0.21999357640743256,
"step": 628
},
{
"epoch": 0.009054593874833556,
"grad_norm": 0.1123046875,
"grad_norm_var": 0.0007665634155273437,
"learning_rate": 0.0001,
"loss": 0.2194,
"loss/crossentropy": 2.5511568784713745,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.2194477617740631,
"step": 629
},
{
"epoch": 0.009068989095620254,
"grad_norm": 0.1142578125,
"grad_norm_var": 0.00014481544494628906,
"learning_rate": 0.0001,
"loss": 0.2324,
"loss/crossentropy": 2.371564745903015,
"loss/fcd": 0.4443359375,
"loss/idx": 18.0,
"loss/logits": 0.2323940396308899,
"step": 630
},
{
"epoch": 0.009083384316406953,
"grad_norm": 0.10595703125,
"grad_norm_var": 0.00014898677666982016,
"learning_rate": 0.0001,
"loss": 0.2114,
"loss/crossentropy": 2.4888617992401123,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.21141232550144196,
"step": 631
},
{
"epoch": 0.009097779537193651,
"grad_norm": 0.107421875,
"grad_norm_var": 0.0001020421584447225,
"learning_rate": 0.0001,
"loss": 0.2337,
"loss/crossentropy": 2.7666863203048706,
"loss/fcd": 0.484375,
"loss/idx": 18.0,
"loss/logits": 0.23369022458791733,
"step": 632
},
{
"epoch": 0.009112174757980351,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.0001020421584447225,
"learning_rate": 0.0001,
"loss": 0.2183,
"loss/crossentropy": 2.369840621948242,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.21829679608345032,
"step": 633
},
{
"epoch": 0.00912656997876705,
"grad_norm": 0.115234375,
"grad_norm_var": 9.677310784657796e-05,
"learning_rate": 0.0001,
"loss": 0.2211,
"loss/crossentropy": 2.469444990158081,
"loss/fcd": 0.4775390625,
"loss/idx": 18.0,
"loss/logits": 0.22109205275774002,
"step": 634
},
{
"epoch": 0.009140965199553748,
"grad_norm": 0.111328125,
"grad_norm_var": 9.50247049331665e-05,
"learning_rate": 0.0001,
"loss": 0.2079,
"loss/crossentropy": 2.3658159971237183,
"loss/fcd": 0.4619140625,
"loss/idx": 18.0,
"loss/logits": 0.20790337026119232,
"step": 635
},
{
"epoch": 0.009155360420340448,
"grad_norm": 0.09423828125,
"grad_norm_var": 0.0001034379005432129,
"learning_rate": 0.0001,
"loss": 0.1845,
"loss/crossentropy": 2.618008255958557,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.18447843939065933,
"step": 636
},
{
"epoch": 0.009169755641127146,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.00010196268558502198,
"learning_rate": 0.0001,
"loss": 0.2325,
"loss/crossentropy": 2.4641441106796265,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.23245185613632202,
"step": 637
},
{
"epoch": 0.009184150861913844,
"grad_norm": 0.11865234375,
"grad_norm_var": 0.00010348955790201823,
"learning_rate": 0.0001,
"loss": 0.2099,
"loss/crossentropy": 2.5920947790145874,
"loss/fcd": 0.5068359375,
"loss/idx": 18.0,
"loss/logits": 0.20993127673864365,
"step": 638
},
{
"epoch": 0.009198546082700543,
"grad_norm": 0.10400390625,
"grad_norm_var": 9.453992048899332e-05,
"learning_rate": 0.0001,
"loss": 0.2037,
"loss/crossentropy": 2.207823634147644,
"loss/fcd": 0.4130859375,
"loss/idx": 18.0,
"loss/logits": 0.20368105918169022,
"step": 639
},
{
"epoch": 0.009212941303487243,
"grad_norm": 0.1103515625,
"grad_norm_var": 8.891324202219645e-05,
"learning_rate": 0.0001,
"loss": 0.1993,
"loss/crossentropy": 2.3137396574020386,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.1993313431739807,
"step": 640
},
{
"epoch": 0.009227336524273941,
"grad_norm": 0.10009765625,
"grad_norm_var": 8.830626805623372e-05,
"learning_rate": 0.0001,
"loss": 0.2031,
"loss/crossentropy": 2.4254961013793945,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.20313136279582977,
"step": 641
},
{
"epoch": 0.00924173174506064,
"grad_norm": 0.107421875,
"grad_norm_var": 7.775227228800456e-05,
"learning_rate": 0.0001,
"loss": 0.1889,
"loss/crossentropy": 1.978569746017456,
"loss/fcd": 0.400390625,
"loss/idx": 18.0,
"loss/logits": 0.18890459090471268,
"step": 642
},
{
"epoch": 0.00925612696584734,
"grad_norm": 0.109375,
"grad_norm_var": 7.740259170532226e-05,
"learning_rate": 0.0001,
"loss": 0.2315,
"loss/crossentropy": 2.575870633125305,
"loss/fcd": 0.484375,
"loss/idx": 18.0,
"loss/logits": 0.2314896583557129,
"step": 643
},
{
"epoch": 0.009270522186634038,
"grad_norm": 0.095703125,
"grad_norm_var": 4.6253204345703125e-05,
"learning_rate": 0.0001,
"loss": 0.1942,
"loss/crossentropy": 2.4864895343780518,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.19422397762537003,
"step": 644
},
{
"epoch": 0.009284917407420736,
"grad_norm": 0.11083984375,
"grad_norm_var": 4.5433640480041504e-05,
"learning_rate": 0.0001,
"loss": 0.2053,
"loss/crossentropy": 2.3608009815216064,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.20532061159610748,
"step": 645
},
{
"epoch": 0.009299312628207435,
"grad_norm": 0.12109375,
"grad_norm_var": 5.466838677724202e-05,
"learning_rate": 0.0001,
"loss": 0.2398,
"loss/crossentropy": 2.3986343145370483,
"loss/fcd": 0.5078125,
"loss/idx": 18.0,
"loss/logits": 0.23981131613254547,
"step": 646
},
{
"epoch": 0.009313707848994135,
"grad_norm": 0.10498046875,
"grad_norm_var": 5.496243635813395e-05,
"learning_rate": 0.0001,
"loss": 0.2281,
"loss/crossentropy": 2.7443615198135376,
"loss/fcd": 0.4541015625,
"loss/idx": 18.0,
"loss/logits": 0.2280602902173996,
"step": 647
},
{
"epoch": 0.009328103069780833,
"grad_norm": 0.10107421875,
"grad_norm_var": 5.771319071451823e-05,
"learning_rate": 0.0001,
"loss": 0.2143,
"loss/crossentropy": 2.785035014152527,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.2142793908715248,
"step": 648
},
{
"epoch": 0.009342498290567531,
"grad_norm": 0.111328125,
"grad_norm_var": 5.6962172190348305e-05,
"learning_rate": 0.0001,
"loss": 0.2117,
"loss/crossentropy": 2.3756792545318604,
"loss/fcd": 0.462890625,
"loss/idx": 18.0,
"loss/logits": 0.21174004673957825,
"step": 649
},
{
"epoch": 0.00935689351135423,
"grad_norm": 0.0986328125,
"grad_norm_var": 5.784034729003906e-05,
"learning_rate": 0.0001,
"loss": 0.1971,
"loss/crossentropy": 2.4738396406173706,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.19714603573083878,
"step": 650
},
{
"epoch": 0.00937128873214093,
"grad_norm": 0.09619140625,
"grad_norm_var": 6.304482618967692e-05,
"learning_rate": 0.0001,
"loss": 0.1734,
"loss/crossentropy": 2.1993648409843445,
"loss/fcd": 0.376953125,
"loss/idx": 18.0,
"loss/logits": 0.1733626276254654,
"step": 651
},
{
"epoch": 0.009385683952927628,
"grad_norm": 0.1279296875,
"grad_norm_var": 8.175770441691081e-05,
"learning_rate": 0.0001,
"loss": 0.2205,
"loss/crossentropy": 2.3313381671905518,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.22045698016881943,
"step": 652
},
{
"epoch": 0.009400079173714326,
"grad_norm": 0.115234375,
"grad_norm_var": 8.491575717926026e-05,
"learning_rate": 0.0001,
"loss": 0.2458,
"loss/crossentropy": 2.5517263412475586,
"loss/fcd": 0.47265625,
"loss/idx": 18.0,
"loss/logits": 0.2457558810710907,
"step": 653
},
{
"epoch": 0.009414474394501026,
"grad_norm": 0.10498046875,
"grad_norm_var": 7.773935794830322e-05,
"learning_rate": 0.0001,
"loss": 0.2048,
"loss/crossentropy": 2.1996500492095947,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.20480218529701233,
"step": 654
},
{
"epoch": 0.009428869615287725,
"grad_norm": 0.1396484375,
"grad_norm_var": 0.00014075835545857747,
"learning_rate": 0.0001,
"loss": 0.2842,
"loss/crossentropy": 2.34015429019928,
"loss/fcd": 0.544921875,
"loss/idx": 18.0,
"loss/logits": 0.2842213958501816,
"step": 655
},
{
"epoch": 0.009443264836074423,
"grad_norm": 0.1455078125,
"grad_norm_var": 0.00022115310033162436,
"learning_rate": 0.0001,
"loss": 0.2686,
"loss/crossentropy": 2.409281849861145,
"loss/fcd": 0.5390625,
"loss/idx": 18.0,
"loss/logits": 0.26864343136548996,
"step": 656
},
{
"epoch": 0.009457660056861121,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.0002117753028869629,
"learning_rate": 0.0001,
"loss": 0.2035,
"loss/crossentropy": 2.250716805458069,
"loss/fcd": 0.400390625,
"loss/idx": 18.0,
"loss/logits": 0.20348752290010452,
"step": 657
},
{
"epoch": 0.009472055277647821,
"grad_norm": 0.109375,
"grad_norm_var": 0.00021069447199503581,
"learning_rate": 0.0001,
"loss": 0.1903,
"loss/crossentropy": 2.2704538106918335,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.19032004475593567,
"step": 658
},
{
"epoch": 0.00948645049843452,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.0002124945322672526,
"learning_rate": 0.0001,
"loss": 0.2032,
"loss/crossentropy": 2.31364369392395,
"loss/fcd": 0.4052734375,
"loss/idx": 18.0,
"loss/logits": 0.20321927964687347,
"step": 659
},
{
"epoch": 0.009500845719221218,
"grad_norm": 0.11181640625,
"grad_norm_var": 0.00019279221693674725,
"learning_rate": 0.0001,
"loss": 0.1964,
"loss/crossentropy": 1.959843933582306,
"loss/fcd": 0.3955078125,
"loss/idx": 18.0,
"loss/logits": 0.19641809910535812,
"step": 660
},
{
"epoch": 0.009515240940007918,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.00019235511620839437,
"learning_rate": 0.0001,
"loss": 0.2212,
"loss/crossentropy": 2.4466131925582886,
"loss/fcd": 0.4765625,
"loss/idx": 18.0,
"loss/logits": 0.22118167579174042,
"step": 661
},
{
"epoch": 0.009529636160794617,
"grad_norm": 0.10498046875,
"grad_norm_var": 0.00019238789876302082,
"learning_rate": 0.0001,
"loss": 0.1816,
"loss/crossentropy": 2.186416506767273,
"loss/fcd": 0.4052734375,
"loss/idx": 18.0,
"loss/logits": 0.18155072629451752,
"step": 662
},
{
"epoch": 0.009544031381581315,
"grad_norm": 0.1142578125,
"grad_norm_var": 0.00018840531508127847,
"learning_rate": 0.0001,
"loss": 0.2391,
"loss/crossentropy": 2.504140853881836,
"loss/fcd": 0.490234375,
"loss/idx": 18.0,
"loss/logits": 0.2391308844089508,
"step": 663
},
{
"epoch": 0.009558426602368013,
"grad_norm": 0.11328125,
"grad_norm_var": 0.0001780986785888672,
"learning_rate": 0.0001,
"loss": 0.2317,
"loss/crossentropy": 2.4283803701400757,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.23169831186532974,
"step": 664
},
{
"epoch": 0.009572821823154713,
"grad_norm": 0.1005859375,
"grad_norm_var": 0.0001889824867248535,
"learning_rate": 0.0001,
"loss": 0.2064,
"loss/crossentropy": 2.438134789466858,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.20637594163417816,
"step": 665
},
{
"epoch": 0.009587217043941412,
"grad_norm": 0.1103515625,
"grad_norm_var": 0.00017477273941040038,
"learning_rate": 0.0001,
"loss": 0.1955,
"loss/crossentropy": 2.309617757797241,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.1954583376646042,
"step": 666
},
{
"epoch": 0.00960161226472811,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.00015706121921539308,
"learning_rate": 0.0001,
"loss": 0.2246,
"loss/crossentropy": 2.53112256526947,
"loss/fcd": 0.4482421875,
"loss/idx": 18.0,
"loss/logits": 0.22456367313861847,
"step": 667
},
{
"epoch": 0.009616007485514808,
"grad_norm": 0.10302734375,
"grad_norm_var": 0.00015153884887695313,
"learning_rate": 0.0001,
"loss": 0.2352,
"loss/crossentropy": 2.456951379776001,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.23520664870738983,
"step": 668
},
{
"epoch": 0.009630402706301508,
"grad_norm": 0.1220703125,
"grad_norm_var": 0.0001564621925354004,
"learning_rate": 0.0001,
"loss": 0.2193,
"loss/crossentropy": 2.065362870693207,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.2193107306957245,
"step": 669
},
{
"epoch": 0.009644797927088207,
"grad_norm": 0.115234375,
"grad_norm_var": 0.0001514345407485962,
"learning_rate": 0.0001,
"loss": 0.2085,
"loss/crossentropy": 2.2472126483917236,
"loss/fcd": 0.47265625,
"loss/idx": 18.0,
"loss/logits": 0.20847148448228836,
"step": 670
},
{
"epoch": 0.009659193147874905,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.00010944604873657227,
"learning_rate": 0.0001,
"loss": 0.2067,
"loss/crossentropy": 2.3741711378097534,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.20672930777072906,
"step": 671
},
{
"epoch": 0.009673588368661605,
"grad_norm": 0.1044921875,
"grad_norm_var": 3.067255020141602e-05,
"learning_rate": 0.0001,
"loss": 0.2208,
"loss/crossentropy": 2.336190938949585,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.22081031650304794,
"step": 672
},
{
"epoch": 0.009687983589448303,
"grad_norm": 0.10400390625,
"grad_norm_var": 3.2389163970947264e-05,
"learning_rate": 0.0001,
"loss": 0.2063,
"loss/crossentropy": 2.3808083534240723,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.20631127804517746,
"step": 673
},
{
"epoch": 0.009702378810235002,
"grad_norm": 0.09716796875,
"grad_norm_var": 4.1007002194722494e-05,
"learning_rate": 0.0001,
"loss": 0.1958,
"loss/crossentropy": 2.3818055391311646,
"loss/fcd": 0.4130859375,
"loss/idx": 18.0,
"loss/logits": 0.19577700644731522,
"step": 674
},
{
"epoch": 0.0097167740310217,
"grad_norm": 0.10888671875,
"grad_norm_var": 4.0813287099202475e-05,
"learning_rate": 0.0001,
"loss": 0.2047,
"loss/crossentropy": 2.5091700553894043,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.2047055885195732,
"step": 675
},
{
"epoch": 0.0097311692518084,
"grad_norm": 0.10986328125,
"grad_norm_var": 4.01457150777181e-05,
"learning_rate": 0.0001,
"loss": 0.2221,
"loss/crossentropy": 2.404844641685486,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.2221018671989441,
"step": 676
},
{
"epoch": 0.009745564472595098,
"grad_norm": 0.10302734375,
"grad_norm_var": 4.01457150777181e-05,
"learning_rate": 0.0001,
"loss": 0.2118,
"loss/crossentropy": 2.279817581176758,
"loss/fcd": 0.4052734375,
"loss/idx": 18.0,
"loss/logits": 0.2117534652352333,
"step": 677
},
{
"epoch": 0.009759959693381797,
"grad_norm": 0.1064453125,
"grad_norm_var": 3.9767225583394365e-05,
"learning_rate": 0.0001,
"loss": 0.2293,
"loss/crossentropy": 2.5266642570495605,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.22927331924438477,
"step": 678
},
{
"epoch": 0.009774354914168497,
"grad_norm": 0.11474609375,
"grad_norm_var": 4.020929336547852e-05,
"learning_rate": 0.0001,
"loss": 0.2077,
"loss/crossentropy": 2.128249764442444,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.20772311836481094,
"step": 679
},
{
"epoch": 0.009788750134955195,
"grad_norm": 0.09814453125,
"grad_norm_var": 4.331966241200765e-05,
"learning_rate": 0.0001,
"loss": 0.2052,
"loss/crossentropy": 2.6459118127822876,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.20518244057893753,
"step": 680
},
{
"epoch": 0.009803145355741893,
"grad_norm": 0.11669921875,
"grad_norm_var": 4.623730977376302e-05,
"learning_rate": 0.0001,
"loss": 0.2262,
"loss/crossentropy": 2.3065195083618164,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.2262207344174385,
"step": 681
},
{
"epoch": 0.009817540576528592,
"grad_norm": 0.1015625,
"grad_norm_var": 4.8061211903889976e-05,
"learning_rate": 0.0001,
"loss": 0.2083,
"loss/crossentropy": 2.4284178018569946,
"loss/fcd": 0.4228515625,
"loss/idx": 18.0,
"loss/logits": 0.20825288444757462,
"step": 682
},
{
"epoch": 0.009831935797315292,
"grad_norm": 0.1025390625,
"grad_norm_var": 4.942814509073893e-05,
"learning_rate": 0.0001,
"loss": 0.1944,
"loss/crossentropy": 2.3323564529418945,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.1943565011024475,
"step": 683
},
{
"epoch": 0.00984633101810199,
"grad_norm": 0.10595703125,
"grad_norm_var": 4.841486612955729e-05,
"learning_rate": 0.0001,
"loss": 0.2041,
"loss/crossentropy": 2.2371606826782227,
"loss/fcd": 0.4052734375,
"loss/idx": 18.0,
"loss/logits": 0.2040523663163185,
"step": 684
},
{
"epoch": 0.009860726238888689,
"grad_norm": 0.11083984375,
"grad_norm_var": 3.3997495969136556e-05,
"learning_rate": 0.0001,
"loss": 0.2427,
"loss/crossentropy": 2.6680363416671753,
"loss/fcd": 0.4697265625,
"loss/idx": 18.0,
"loss/logits": 0.24272434413433075,
"step": 685
},
{
"epoch": 0.009875121459675389,
"grad_norm": 0.111328125,
"grad_norm_var": 3.038942813873291e-05,
"learning_rate": 0.0001,
"loss": 0.2114,
"loss/crossentropy": 2.392301321029663,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.2113867551088333,
"step": 686
},
{
"epoch": 0.009889516680462087,
"grad_norm": 0.10693359375,
"grad_norm_var": 3.0055642127990723e-05,
"learning_rate": 0.0001,
"loss": 0.238,
"loss/crossentropy": 2.646833062171936,
"loss/fcd": 0.458984375,
"loss/idx": 18.0,
"loss/logits": 0.23803511261940002,
"step": 687
},
{
"epoch": 0.009903911901248785,
"grad_norm": 0.1015625,
"grad_norm_var": 3.134310245513916e-05,
"learning_rate": 0.0001,
"loss": 0.1935,
"loss/crossentropy": 2.256480574607849,
"loss/fcd": 0.4013671875,
"loss/idx": 18.0,
"loss/logits": 0.19345563650131226,
"step": 688
},
{
"epoch": 0.009918307122035484,
"grad_norm": 0.109375,
"grad_norm_var": 3.155072530110677e-05,
"learning_rate": 0.0001,
"loss": 0.2321,
"loss/crossentropy": 2.425878643989563,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.23210398107767105,
"step": 689
},
{
"epoch": 0.009932702342822184,
"grad_norm": 0.11083984375,
"grad_norm_var": 2.6098887125651042e-05,
"learning_rate": 0.0001,
"loss": 0.2257,
"loss/crossentropy": 2.565882086753845,
"loss/fcd": 0.478515625,
"loss/idx": 18.0,
"loss/logits": 0.2256726175546646,
"step": 690
},
{
"epoch": 0.009947097563608882,
"grad_norm": 0.09912109375,
"grad_norm_var": 3.0152002970377605e-05,
"learning_rate": 0.0001,
"loss": 0.1932,
"loss/crossentropy": 2.3051689863204956,
"loss/fcd": 0.39453125,
"loss/idx": 18.0,
"loss/logits": 0.1932462379336357,
"step": 691
},
{
"epoch": 0.00996149278439558,
"grad_norm": 0.10888671875,
"grad_norm_var": 2.981424331665039e-05,
"learning_rate": 0.0001,
"loss": 0.2124,
"loss/crossentropy": 2.275663137435913,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.21243004500865936,
"step": 692
},
{
"epoch": 0.009975888005182279,
"grad_norm": 0.1044921875,
"grad_norm_var": 2.9221177101135254e-05,
"learning_rate": 0.0001,
"loss": 0.2011,
"loss/crossentropy": 2.344420909881592,
"loss/fcd": 0.408203125,
"loss/idx": 18.0,
"loss/logits": 0.20112024247646332,
"step": 693
},
{
"epoch": 0.009990283225968979,
"grad_norm": 0.1455078125,
"grad_norm_var": 0.00012252231438954672,
"learning_rate": 0.0001,
"loss": 0.2729,
"loss/crossentropy": 2.1788020730018616,
"loss/fcd": 0.521484375,
"loss/idx": 18.0,
"loss/logits": 0.27289582788944244,
"step": 694
},
{
"epoch": 0.010004678446755677,
"grad_norm": 0.111328125,
"grad_norm_var": 0.00012076298395792643,
"learning_rate": 0.0001,
"loss": 0.23,
"loss/crossentropy": 2.405027389526367,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.22998760640621185,
"step": 695
},
{
"epoch": 0.010019073667542375,
"grad_norm": 0.11767578125,
"grad_norm_var": 0.00011615355809529622,
"learning_rate": 0.0001,
"loss": 0.2227,
"loss/crossentropy": 2.9466445446014404,
"loss/fcd": 0.4755859375,
"loss/idx": 18.0,
"loss/logits": 0.22269698232412338,
"step": 696
},
{
"epoch": 0.010033468888329075,
"grad_norm": 0.107421875,
"grad_norm_var": 0.00011360545953114828,
"learning_rate": 0.0001,
"loss": 0.2136,
"loss/crossentropy": 3.066506266593933,
"loss/fcd": 0.46875,
"loss/idx": 18.0,
"loss/logits": 0.21359677612781525,
"step": 697
},
{
"epoch": 0.010047864109115774,
"grad_norm": 0.09130859375,
"grad_norm_var": 0.0001313169797261556,
"learning_rate": 0.0001,
"loss": 0.1792,
"loss/crossentropy": 2.3103922605514526,
"loss/fcd": 0.388671875,
"loss/idx": 18.0,
"loss/logits": 0.17915956676006317,
"step": 698
},
{
"epoch": 0.010062259329902472,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.000129854679107666,
"learning_rate": 0.0001,
"loss": 0.2095,
"loss/crossentropy": 2.201840400695801,
"loss/fcd": 0.4013671875,
"loss/idx": 18.0,
"loss/logits": 0.2094813957810402,
"step": 699
},
{
"epoch": 0.01007665455068917,
"grad_norm": 0.09619140625,
"grad_norm_var": 0.0001400272051493327,
"learning_rate": 0.0001,
"loss": 0.2055,
"loss/crossentropy": 2.5452860593795776,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.20547957718372345,
"step": 700
},
{
"epoch": 0.01009104977147587,
"grad_norm": 0.1103515625,
"grad_norm_var": 0.00013989508152008058,
"learning_rate": 0.0001,
"loss": 0.1918,
"loss/crossentropy": 2.007373094558716,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.19176460802555084,
"step": 701
},
{
"epoch": 0.010105444992262569,
"grad_norm": 0.10498046875,
"grad_norm_var": 0.00014006296793619792,
"learning_rate": 0.0001,
"loss": 0.2082,
"loss/crossentropy": 2.3383631706237793,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.20822366327047348,
"step": 702
},
{
"epoch": 0.010119840213049267,
"grad_norm": 0.1083984375,
"grad_norm_var": 0.000139958659807841,
"learning_rate": 0.0001,
"loss": 0.1983,
"loss/crossentropy": 1.9882320761680603,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.19827204197645187,
"step": 703
},
{
"epoch": 0.010134235433835967,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.00014079014460245768,
"learning_rate": 0.0001,
"loss": 0.238,
"loss/crossentropy": 2.5094656944274902,
"loss/fcd": 0.46875,
"loss/idx": 18.0,
"loss/logits": 0.23796136677265167,
"step": 704
},
{
"epoch": 0.010148630654622666,
"grad_norm": 0.10595703125,
"grad_norm_var": 0.00014143685499827066,
"learning_rate": 0.0001,
"loss": 0.206,
"loss/crossentropy": 2.1021994948387146,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.2059553563594818,
"step": 705
},
{
"epoch": 0.010163025875409364,
"grad_norm": 0.09814453125,
"grad_norm_var": 0.00014835894107818605,
"learning_rate": 0.0001,
"loss": 0.2037,
"loss/crossentropy": 2.3918451070785522,
"loss/fcd": 0.4248046875,
"loss/idx": 18.0,
"loss/logits": 0.20372479408979416,
"step": 706
},
{
"epoch": 0.010177421096196062,
"grad_norm": 0.1123046875,
"grad_norm_var": 0.00014328956604003906,
"learning_rate": 0.0001,
"loss": 0.2379,
"loss/crossentropy": 2.4593441486358643,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.23789776116609573,
"step": 707
},
{
"epoch": 0.010191816316982762,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.00014485915501912436,
"learning_rate": 0.0001,
"loss": 0.1983,
"loss/crossentropy": 2.2852306365966797,
"loss/fcd": 0.4013671875,
"loss/idx": 18.0,
"loss/logits": 0.1982945054769516,
"step": 708
},
{
"epoch": 0.01020621153776946,
"grad_norm": 0.1103515625,
"grad_norm_var": 0.00014371474583943684,
"learning_rate": 0.0001,
"loss": 0.2481,
"loss/crossentropy": 2.5582568645477295,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.24808169901371002,
"step": 709
},
{
"epoch": 0.010220606758556159,
"grad_norm": 0.11083984375,
"grad_norm_var": 5.040069421132406e-05,
"learning_rate": 0.0001,
"loss": 0.241,
"loss/crossentropy": 2.4824811220169067,
"loss/fcd": 0.51171875,
"loss/idx": 18.0,
"loss/logits": 0.2409602850675583,
"step": 710
},
{
"epoch": 0.010235001979342857,
"grad_norm": 0.12158203125,
"grad_norm_var": 6.302197774251302e-05,
"learning_rate": 0.0001,
"loss": 0.2292,
"loss/crossentropy": 2.237234354019165,
"loss/fcd": 0.4619140625,
"loss/idx": 18.0,
"loss/logits": 0.22918210923671722,
"step": 711
},
{
"epoch": 0.010249397200129557,
"grad_norm": 0.10595703125,
"grad_norm_var": 5.577405293782552e-05,
"learning_rate": 0.0001,
"loss": 0.207,
"loss/crossentropy": 2.31795597076416,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.20698396116495132,
"step": 712
},
{
"epoch": 0.010263792420916256,
"grad_norm": 0.103515625,
"grad_norm_var": 5.6409835815429686e-05,
"learning_rate": 0.0001,
"loss": 0.2065,
"loss/crossentropy": 2.5415326356887817,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.20651167631149292,
"step": 713
},
{
"epoch": 0.010278187641702954,
"grad_norm": 0.10791015625,
"grad_norm_var": 3.9859612782796225e-05,
"learning_rate": 0.0001,
"loss": 0.2139,
"loss/crossentropy": 2.2211133241653442,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.21389107406139374,
"step": 714
},
{
"epoch": 0.010292582862489654,
"grad_norm": 0.1015625,
"grad_norm_var": 4.161198933919271e-05,
"learning_rate": 0.0001,
"loss": 0.212,
"loss/crossentropy": 2.3691943883895874,
"loss/fcd": 0.4521484375,
"loss/idx": 18.0,
"loss/logits": 0.2119893953204155,
"step": 715
},
{
"epoch": 0.010306978083276352,
"grad_norm": 0.10400390625,
"grad_norm_var": 3.372828165690104e-05,
"learning_rate": 0.0001,
"loss": 0.2145,
"loss/crossentropy": 2.390496850013733,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.2145363911986351,
"step": 716
},
{
"epoch": 0.01032137330406305,
"grad_norm": 0.1005859375,
"grad_norm_var": 3.650983174641927e-05,
"learning_rate": 0.0001,
"loss": 0.1619,
"loss/crossentropy": 1.7626497149467468,
"loss/fcd": 0.513671875,
"loss/idx": 18.0,
"loss/logits": 0.16188892722129822,
"step": 717
},
{
"epoch": 0.010335768524849749,
"grad_norm": 0.11279296875,
"grad_norm_var": 3.790855407714844e-05,
"learning_rate": 0.0001,
"loss": 0.2436,
"loss/crossentropy": 2.6944552659988403,
"loss/fcd": 0.4609375,
"loss/idx": 18.0,
"loss/logits": 0.2435958907008171,
"step": 718
},
{
"epoch": 0.01035016374563645,
"grad_norm": 0.10107421875,
"grad_norm_var": 4.0665268898010254e-05,
"learning_rate": 0.0001,
"loss": 0.2072,
"loss/crossentropy": 2.383134961128235,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.20715758204460144,
"step": 719
},
{
"epoch": 0.010364558966423148,
"grad_norm": 0.1005859375,
"grad_norm_var": 3.6764144897460935e-05,
"learning_rate": 0.0001,
"loss": 0.2088,
"loss/crossentropy": 2.5325286388397217,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.20877134799957275,
"step": 720
},
{
"epoch": 0.010378954187209846,
"grad_norm": 0.1005859375,
"grad_norm_var": 3.8829445838928225e-05,
"learning_rate": 0.0001,
"loss": 0.2236,
"loss/crossentropy": 2.6585100889205933,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.22355867177248,
"step": 721
},
{
"epoch": 0.010393349407996546,
"grad_norm": 0.1025390625,
"grad_norm_var": 3.544092178344727e-05,
"learning_rate": 0.0001,
"loss": 0.2016,
"loss/crossentropy": 2.3599932193756104,
"loss/fcd": 0.423828125,
"loss/idx": 18.0,
"loss/logits": 0.2016456127166748,
"step": 722
},
{
"epoch": 0.010407744628783244,
"grad_norm": 0.1220703125,
"grad_norm_var": 4.926919937133789e-05,
"learning_rate": 0.0001,
"loss": 0.2227,
"loss/crossentropy": 2.1093697547912598,
"loss/fcd": 0.4326171875,
"loss/idx": 18.0,
"loss/logits": 0.22268912196159363,
"step": 723
},
{
"epoch": 0.010422139849569943,
"grad_norm": 0.1044921875,
"grad_norm_var": 4.9097339312235516e-05,
"learning_rate": 0.0001,
"loss": 0.2061,
"loss/crossentropy": 2.120736837387085,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.20611396431922913,
"step": 724
},
{
"epoch": 0.010436535070356641,
"grad_norm": 0.1123046875,
"grad_norm_var": 5.023380120595296e-05,
"learning_rate": 0.0001,
"loss": 0.2238,
"loss/crossentropy": 2.3321027755737305,
"loss/fcd": 0.4560546875,
"loss/idx": 18.0,
"loss/logits": 0.22382070124149323,
"step": 725
},
{
"epoch": 0.010450930291143341,
"grad_norm": 0.1083984375,
"grad_norm_var": 4.936456680297852e-05,
"learning_rate": 0.0001,
"loss": 0.206,
"loss/crossentropy": 2.2690643668174744,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.20598538219928741,
"step": 726
},
{
"epoch": 0.01046532551193004,
"grad_norm": 0.10888671875,
"grad_norm_var": 3.4538904825846356e-05,
"learning_rate": 0.0001,
"loss": 0.2204,
"loss/crossentropy": 2.39444100856781,
"loss/fcd": 0.4404296875,
"loss/idx": 18.0,
"loss/logits": 0.22039655596017838,
"step": 727
},
{
"epoch": 0.010479720732716738,
"grad_norm": 0.10986328125,
"grad_norm_var": 3.5429000854492186e-05,
"learning_rate": 0.0001,
"loss": 0.2212,
"loss/crossentropy": 2.4713072776794434,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.2212340533733368,
"step": 728
},
{
"epoch": 0.010494115953503438,
"grad_norm": 0.138671875,
"grad_norm_var": 9.951591491699218e-05,
"learning_rate": 0.0001,
"loss": 0.2904,
"loss/crossentropy": 2.2529489994049072,
"loss/fcd": 0.5947265625,
"loss/idx": 18.0,
"loss/logits": 0.29038895666599274,
"step": 729
},
{
"epoch": 0.010508511174290136,
"grad_norm": 0.142578125,
"grad_norm_var": 0.00017181138197580975,
"learning_rate": 0.0001,
"loss": 0.2931,
"loss/crossentropy": 2.3451786041259766,
"loss/fcd": 0.5263671875,
"loss/idx": 18.0,
"loss/logits": 0.29310375452041626,
"step": 730
},
{
"epoch": 0.010522906395076834,
"grad_norm": 0.111328125,
"grad_norm_var": 0.00016589065392812093,
"learning_rate": 0.0001,
"loss": 0.2063,
"loss/crossentropy": 2.3045698404312134,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.2062971591949463,
"step": 731
},
{
"epoch": 0.010537301615863533,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.00016802847385406495,
"learning_rate": 0.0001,
"loss": 0.2105,
"loss/crossentropy": 2.5085275173187256,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.21049045026302338,
"step": 732
},
{
"epoch": 0.010551696836650233,
"grad_norm": 0.10107421875,
"grad_norm_var": 0.00016735394795735676,
"learning_rate": 0.0001,
"loss": 0.2196,
"loss/crossentropy": 2.644802451133728,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.2196320742368698,
"step": 733
},
{
"epoch": 0.010566092057436931,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.00016781091690063477,
"learning_rate": 0.0001,
"loss": 0.1974,
"loss/crossentropy": 2.2515525817871094,
"loss/fcd": 0.3974609375,
"loss/idx": 18.0,
"loss/logits": 0.19744951277971268,
"step": 734
},
{
"epoch": 0.01058048727822363,
"grad_norm": 0.1015625,
"grad_norm_var": 0.0001671860615412394,
"learning_rate": 0.0001,
"loss": 0.1965,
"loss/crossentropy": 2.3382036685943604,
"loss/fcd": 0.3974609375,
"loss/idx": 18.0,
"loss/logits": 0.1964586153626442,
"step": 735
},
{
"epoch": 0.010594882499010328,
"grad_norm": 0.095703125,
"grad_norm_var": 0.00017541150252024332,
"learning_rate": 0.0001,
"loss": 0.2012,
"loss/crossentropy": 2.638480305671692,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.20124144107103348,
"step": 736
},
{
"epoch": 0.010609277719797028,
"grad_norm": 0.10546875,
"grad_norm_var": 0.00017036497592926025,
"learning_rate": 0.0001,
"loss": 0.222,
"loss/crossentropy": 2.498441696166992,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.22197365015745163,
"step": 737
},
{
"epoch": 0.010623672940583726,
"grad_norm": 0.09814453125,
"grad_norm_var": 0.00017648935317993164,
"learning_rate": 0.0001,
"loss": 0.1943,
"loss/crossentropy": 2.2127552032470703,
"loss/fcd": 0.3896484375,
"loss/idx": 18.0,
"loss/logits": 0.19434216618537903,
"step": 738
},
{
"epoch": 0.010638068161370424,
"grad_norm": 0.1005859375,
"grad_norm_var": 0.00017264286677042643,
"learning_rate": 0.0001,
"loss": 0.2065,
"loss/crossentropy": 2.4787211418151855,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.20646335184574127,
"step": 739
},
{
"epoch": 0.010652463382157125,
"grad_norm": 0.1015625,
"grad_norm_var": 0.0001750628153483073,
"learning_rate": 0.0001,
"loss": 0.1945,
"loss/crossentropy": 2.167446494102478,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.19451382011175156,
"step": 740
},
{
"epoch": 0.010666858602943823,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.00017729500929514568,
"learning_rate": 0.0001,
"loss": 0.2069,
"loss/crossentropy": 2.3936961889266968,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.20688295364379883,
"step": 741
},
{
"epoch": 0.010681253823730521,
"grad_norm": 0.10595703125,
"grad_norm_var": 0.00017769734064737957,
"learning_rate": 0.0001,
"loss": 0.2344,
"loss/crossentropy": 2.502206325531006,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.23436500132083893,
"step": 742
},
{
"epoch": 0.01069564904451722,
"grad_norm": 0.12451171875,
"grad_norm_var": 0.00019410053888956706,
"learning_rate": 0.0001,
"loss": 0.2446,
"loss/crossentropy": 2.7519075870513916,
"loss/fcd": 0.4853515625,
"loss/idx": 18.0,
"loss/logits": 0.2446460798382759,
"step": 743
},
{
"epoch": 0.01071004426530392,
"grad_norm": 0.109375,
"grad_norm_var": 0.00019407967726389568,
"learning_rate": 0.0001,
"loss": 0.2061,
"loss/crossentropy": 2.3958401679992676,
"loss/fcd": 0.4345703125,
"loss/idx": 18.0,
"loss/logits": 0.20611582696437836,
"step": 744
},
{
"epoch": 0.010724439486090618,
"grad_norm": 0.09716796875,
"grad_norm_var": 0.00013910929361979167,
"learning_rate": 0.0001,
"loss": 0.2229,
"loss/crossentropy": 2.6051418781280518,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.22285999357700348,
"step": 745
},
{
"epoch": 0.010738834706877316,
"grad_norm": 0.09814453125,
"grad_norm_var": 4.988412062327067e-05,
"learning_rate": 0.0001,
"loss": 0.2096,
"loss/crossentropy": 2.5375572443008423,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.20957274734973907,
"step": 746
},
{
"epoch": 0.010753229927664016,
"grad_norm": 0.123046875,
"grad_norm_var": 7.005433241526285e-05,
"learning_rate": 0.0001,
"loss": 0.2273,
"loss/crossentropy": 2.2432570457458496,
"loss/fcd": 0.46484375,
"loss/idx": 18.0,
"loss/logits": 0.22729168832302094,
"step": 747
},
{
"epoch": 0.010767625148450715,
"grad_norm": 0.09912109375,
"grad_norm_var": 7.160405317942301e-05,
"learning_rate": 0.0001,
"loss": 0.1981,
"loss/crossentropy": 2.451253056526184,
"loss/fcd": 0.404296875,
"loss/idx": 18.0,
"loss/logits": 0.1981128826737404,
"step": 748
},
{
"epoch": 0.010782020369237413,
"grad_norm": 0.1083984375,
"grad_norm_var": 7.164875666300455e-05,
"learning_rate": 0.0001,
"loss": 0.2238,
"loss/crossentropy": 2.6088002920150757,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.22383547574281693,
"step": 749
},
{
"epoch": 0.010796415590024111,
"grad_norm": 0.11328125,
"grad_norm_var": 7.559359073638916e-05,
"learning_rate": 0.0001,
"loss": 0.2204,
"loss/crossentropy": 2.3209699392318726,
"loss/fcd": 0.4443359375,
"loss/idx": 18.0,
"loss/logits": 0.2203991711139679,
"step": 750
},
{
"epoch": 0.010810810810810811,
"grad_norm": 0.1240234375,
"grad_norm_var": 9.606579939524333e-05,
"learning_rate": 0.0001,
"loss": 0.2522,
"loss/crossentropy": 2.3715856075286865,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.2521570920944214,
"step": 751
},
{
"epoch": 0.01082520603159751,
"grad_norm": 0.115234375,
"grad_norm_var": 9.13769006729126e-05,
"learning_rate": 0.0001,
"loss": 0.217,
"loss/crossentropy": 2.3642451763153076,
"loss/fcd": 0.4931640625,
"loss/idx": 18.0,
"loss/logits": 0.21697237342596054,
"step": 752
},
{
"epoch": 0.010839601252384208,
"grad_norm": 0.10693359375,
"grad_norm_var": 9.104013442993165e-05,
"learning_rate": 0.0001,
"loss": 0.2155,
"loss/crossentropy": 2.4382712841033936,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.21553221344947815,
"step": 753
},
{
"epoch": 0.010853996473170906,
"grad_norm": 0.1015625,
"grad_norm_var": 8.729199568430583e-05,
"learning_rate": 0.0001,
"loss": 0.1936,
"loss/crossentropy": 2.401493191719055,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.19359815120697021,
"step": 754
},
{
"epoch": 0.010868391693957606,
"grad_norm": 0.11279296875,
"grad_norm_var": 8.423725763956706e-05,
"learning_rate": 0.0001,
"loss": 0.1962,
"loss/crossentropy": 2.1797173619270325,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.19620782881975174,
"step": 755
},
{
"epoch": 0.010882786914744305,
"grad_norm": 0.11767578125,
"grad_norm_var": 8.459786574045817e-05,
"learning_rate": 0.0001,
"loss": 0.2162,
"loss/crossentropy": 2.2014777660369873,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.21618105471134186,
"step": 756
},
{
"epoch": 0.010897182135531003,
"grad_norm": 0.10498046875,
"grad_norm_var": 8.204678694407145e-05,
"learning_rate": 0.0001,
"loss": 0.2144,
"loss/crossentropy": 2.5520023107528687,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.2143661305308342,
"step": 757
},
{
"epoch": 0.010911577356317703,
"grad_norm": 0.1357421875,
"grad_norm_var": 0.00012089014053344727,
"learning_rate": 0.0001,
"loss": 0.2172,
"loss/crossentropy": 2.605940818786621,
"loss/fcd": 0.470703125,
"loss/idx": 18.0,
"loss/logits": 0.2171928584575653,
"step": 758
},
{
"epoch": 0.010925972577104402,
"grad_norm": 0.1015625,
"grad_norm_var": 0.00011552075544993082,
"learning_rate": 0.0001,
"loss": 0.1896,
"loss/crossentropy": 2.260614037513733,
"loss/fcd": 0.3916015625,
"loss/idx": 18.0,
"loss/logits": 0.1895817369222641,
"step": 759
},
{
"epoch": 0.0109403677978911,
"grad_norm": 0.130859375,
"grad_norm_var": 0.00014096001784006754,
"learning_rate": 0.0001,
"loss": 0.2287,
"loss/crossentropy": 2.3699567317962646,
"loss/fcd": 0.5078125,
"loss/idx": 18.0,
"loss/logits": 0.2286616861820221,
"step": 760
},
{
"epoch": 0.010954763018677798,
"grad_norm": 0.1123046875,
"grad_norm_var": 0.00012553135553995768,
"learning_rate": 0.0001,
"loss": 0.1979,
"loss/crossentropy": 2.0666418075561523,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.19788716733455658,
"step": 761
},
{
"epoch": 0.010969158239464498,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.0001106580098470052,
"learning_rate": 0.0001,
"loss": 0.2149,
"loss/crossentropy": 2.25100314617157,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.21493691205978394,
"step": 762
},
{
"epoch": 0.010983553460251197,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.00010553101698557536,
"learning_rate": 0.0001,
"loss": 0.2,
"loss/crossentropy": 2.3312637209892273,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.19997263699769974,
"step": 763
},
{
"epoch": 0.010997948681037895,
"grad_norm": 0.10595703125,
"grad_norm_var": 9.52392816543579e-05,
"learning_rate": 0.0001,
"loss": 0.2213,
"loss/crossentropy": 2.4567571878433228,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.2212778776884079,
"step": 764
},
{
"epoch": 0.011012343901824595,
"grad_norm": 0.1279296875,
"grad_norm_var": 0.00010437866051991781,
"learning_rate": 0.0001,
"loss": 0.2505,
"loss/crossentropy": 2.3997398614883423,
"loss/fcd": 0.48046875,
"loss/idx": 18.0,
"loss/logits": 0.25046147406101227,
"step": 765
},
{
"epoch": 0.011026739122611293,
"grad_norm": 0.09716796875,
"grad_norm_var": 0.00012486775716145834,
"learning_rate": 0.0001,
"loss": 0.1976,
"loss/crossentropy": 2.327947497367859,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.1975831389427185,
"step": 766
},
{
"epoch": 0.011041134343397992,
"grad_norm": 0.125,
"grad_norm_var": 0.00012619892756144207,
"learning_rate": 0.0001,
"loss": 0.2108,
"loss/crossentropy": 2.3216136693954468,
"loss/fcd": 0.4521484375,
"loss/idx": 18.0,
"loss/logits": 0.21076547354459763,
"step": 767
},
{
"epoch": 0.01105552956418469,
"grad_norm": 0.1298828125,
"grad_norm_var": 0.00014139811197916668,
"learning_rate": 0.0001,
"loss": 0.2461,
"loss/crossentropy": 2.2610775232315063,
"loss/fcd": 0.5029296875,
"loss/idx": 18.0,
"loss/logits": 0.2461041733622551,
"step": 768
},
{
"epoch": 0.01106992478497139,
"grad_norm": 0.109375,
"grad_norm_var": 0.00013906856377919515,
"learning_rate": 0.0001,
"loss": 0.199,
"loss/crossentropy": 2.2911869883537292,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.19901156425476074,
"step": 769
},
{
"epoch": 0.011084320005758088,
"grad_norm": 0.10009765625,
"grad_norm_var": 0.00014190276463826496,
"learning_rate": 0.0001,
"loss": 0.21,
"loss/crossentropy": 2.432590365409851,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.2100282460451126,
"step": 770
},
{
"epoch": 0.011098715226544787,
"grad_norm": 0.10498046875,
"grad_norm_var": 0.00014832417170206706,
"learning_rate": 0.0001,
"loss": 0.179,
"loss/crossentropy": 2.154644250869751,
"loss/fcd": 0.4853515625,
"loss/idx": 18.0,
"loss/logits": 0.17896521091461182,
"step": 771
},
{
"epoch": 0.011113110447331487,
"grad_norm": 0.11572265625,
"grad_norm_var": 0.00014781554539998373,
"learning_rate": 0.0001,
"loss": 0.2293,
"loss/crossentropy": 2.5124725103378296,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.22927424311637878,
"step": 772
},
{
"epoch": 0.011127505668118185,
"grad_norm": 0.111328125,
"grad_norm_var": 0.00014212032159169514,
"learning_rate": 0.0001,
"loss": 0.2246,
"loss/crossentropy": 2.411632537841797,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.22455725073814392,
"step": 773
},
{
"epoch": 0.011141900888904883,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.00011181831359863281,
"learning_rate": 0.0001,
"loss": 0.2336,
"loss/crossentropy": 2.4840848445892334,
"loss/fcd": 0.462890625,
"loss/idx": 18.0,
"loss/logits": 0.23362614214420319,
"step": 774
},
{
"epoch": 0.011156296109691582,
"grad_norm": 0.1142578125,
"grad_norm_var": 0.00010143518447875976,
"learning_rate": 0.0001,
"loss": 0.2162,
"loss/crossentropy": 2.2171601057052612,
"loss/fcd": 0.4228515625,
"loss/idx": 18.0,
"loss/logits": 0.2162095457315445,
"step": 775
},
{
"epoch": 0.011170691330478282,
"grad_norm": 0.10400390625,
"grad_norm_var": 8.772114912668864e-05,
"learning_rate": 0.0001,
"loss": 0.2101,
"loss/crossentropy": 2.466732382774353,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.21009384095668793,
"step": 776
},
{
"epoch": 0.01118508655126498,
"grad_norm": 0.11572265625,
"grad_norm_var": 8.824268976847331e-05,
"learning_rate": 0.0001,
"loss": 0.2331,
"loss/crossentropy": 2.463024854660034,
"loss/fcd": 0.4765625,
"loss/idx": 18.0,
"loss/logits": 0.2330816239118576,
"step": 777
},
{
"epoch": 0.011199481772051679,
"grad_norm": 0.11865234375,
"grad_norm_var": 8.945067723592122e-05,
"learning_rate": 0.0001,
"loss": 0.2142,
"loss/crossentropy": 2.1225094199180603,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.2141725867986679,
"step": 778
},
{
"epoch": 0.011213876992838377,
"grad_norm": 0.11279296875,
"grad_norm_var": 8.852879206339518e-05,
"learning_rate": 0.0001,
"loss": 0.2065,
"loss/crossentropy": 2.0846282243728638,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.20651061832904816,
"step": 779
},
{
"epoch": 0.011228272213625077,
"grad_norm": 0.11376953125,
"grad_norm_var": 8.51591428120931e-05,
"learning_rate": 0.0001,
"loss": 0.2156,
"loss/crossentropy": 2.2128478288650513,
"loss/fcd": 0.3984375,
"loss/idx": 18.0,
"loss/logits": 0.21557357162237167,
"step": 780
},
{
"epoch": 0.011242667434411775,
"grad_norm": 0.10498046875,
"grad_norm_var": 7.343987623850504e-05,
"learning_rate": 0.0001,
"loss": 0.2015,
"loss/crossentropy": 2.3130797147750854,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.20153620839118958,
"step": 781
},
{
"epoch": 0.011257062655198474,
"grad_norm": 0.1025390625,
"grad_norm_var": 6.468693415323893e-05,
"learning_rate": 0.0001,
"loss": 0.2148,
"loss/crossentropy": 2.5943338871002197,
"loss/fcd": 0.458984375,
"loss/idx": 18.0,
"loss/logits": 0.2147517278790474,
"step": 782
},
{
"epoch": 0.011271457875985174,
"grad_norm": 0.107421875,
"grad_norm_var": 5.4101149241129555e-05,
"learning_rate": 0.0001,
"loss": 0.229,
"loss/crossentropy": 2.7100160121917725,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.22897624969482422,
"step": 783
},
{
"epoch": 0.011285853096771872,
"grad_norm": 0.11474609375,
"grad_norm_var": 3.060400485992432e-05,
"learning_rate": 0.0001,
"loss": 0.2299,
"loss/crossentropy": 2.500633478164673,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.22991500794887543,
"step": 784
},
{
"epoch": 0.01130024831755857,
"grad_norm": 0.10888671875,
"grad_norm_var": 3.067255020141602e-05,
"learning_rate": 0.0001,
"loss": 0.226,
"loss/crossentropy": 2.4316182136535645,
"loss/fcd": 0.423828125,
"loss/idx": 18.0,
"loss/logits": 0.22597461938858032,
"step": 785
},
{
"epoch": 0.011314643538345269,
"grad_norm": 0.1015625,
"grad_norm_var": 2.8839707374572755e-05,
"learning_rate": 0.0001,
"loss": 0.217,
"loss/crossentropy": 2.592137098312378,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.2169811800122261,
"step": 786
},
{
"epoch": 0.011329038759131969,
"grad_norm": 0.099609375,
"grad_norm_var": 3.44236691792806e-05,
"learning_rate": 0.0001,
"loss": 0.1974,
"loss/crossentropy": 2.287144422531128,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.1974037140607834,
"step": 787
},
{
"epoch": 0.011343433979918667,
"grad_norm": 0.09521484375,
"grad_norm_var": 4.4854482014973957e-05,
"learning_rate": 0.0001,
"loss": 0.2052,
"loss/crossentropy": 2.4738489389419556,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.20523115992546082,
"step": 788
},
{
"epoch": 0.011357829200705365,
"grad_norm": 0.1015625,
"grad_norm_var": 4.7318140665690105e-05,
"learning_rate": 0.0001,
"loss": 0.215,
"loss/crossentropy": 2.4524784088134766,
"loss/fcd": 0.4560546875,
"loss/idx": 18.0,
"loss/logits": 0.21502291411161423,
"step": 789
},
{
"epoch": 0.011372224421492065,
"grad_norm": 0.1005859375,
"grad_norm_var": 4.888276259104411e-05,
"learning_rate": 0.0001,
"loss": 0.1999,
"loss/crossentropy": 2.2310436964035034,
"loss/fcd": 0.3916015625,
"loss/idx": 18.0,
"loss/logits": 0.19993127137422562,
"step": 790
},
{
"epoch": 0.011386619642278764,
"grad_norm": 0.09765625,
"grad_norm_var": 5.063911279042562e-05,
"learning_rate": 0.0001,
"loss": 0.2285,
"loss/crossentropy": 2.613986611366272,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.22845745831727982,
"step": 791
},
{
"epoch": 0.011401014863065462,
"grad_norm": 0.1005859375,
"grad_norm_var": 5.238453547159831e-05,
"learning_rate": 0.0001,
"loss": 0.2083,
"loss/crossentropy": 2.5012824535369873,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.20829569548368454,
"step": 792
},
{
"epoch": 0.01141541008385216,
"grad_norm": 0.11572265625,
"grad_norm_var": 5.238453547159831e-05,
"learning_rate": 0.0001,
"loss": 0.2444,
"loss/crossentropy": 2.225709557533264,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.2444288209080696,
"step": 793
},
{
"epoch": 0.01142980530463886,
"grad_norm": 0.10009765625,
"grad_norm_var": 4.264513651529948e-05,
"learning_rate": 0.0001,
"loss": 0.2043,
"loss/crossentropy": 2.2551809549331665,
"loss/fcd": 0.3994140625,
"loss/idx": 18.0,
"loss/logits": 0.20429246127605438,
"step": 794
},
{
"epoch": 0.011444200525425559,
"grad_norm": 0.1025390625,
"grad_norm_var": 3.8368503252665204e-05,
"learning_rate": 0.0001,
"loss": 0.2353,
"loss/crossentropy": 2.4520708322525024,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.2352810874581337,
"step": 795
},
{
"epoch": 0.011458595746212257,
"grad_norm": 0.09423828125,
"grad_norm_var": 3.733535607655843e-05,
"learning_rate": 0.0001,
"loss": 0.2003,
"loss/crossentropy": 2.3560184240341187,
"loss/fcd": 0.4130859375,
"loss/idx": 18.0,
"loss/logits": 0.2003132924437523,
"step": 796
},
{
"epoch": 0.011472990966998955,
"grad_norm": 0.10009765625,
"grad_norm_var": 3.7534038225809735e-05,
"learning_rate": 0.0001,
"loss": 0.2268,
"loss/crossentropy": 2.6456328630447388,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.22680091857910156,
"step": 797
},
{
"epoch": 0.011487386187785656,
"grad_norm": 0.1064453125,
"grad_norm_var": 3.840823968251546e-05,
"learning_rate": 0.0001,
"loss": 0.2003,
"loss/crossentropy": 2.5123294591903687,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.20031608641147614,
"step": 798
},
{
"epoch": 0.011501781408572354,
"grad_norm": 0.130859375,
"grad_norm_var": 8.675952752431233e-05,
"learning_rate": 0.0001,
"loss": 0.2347,
"loss/crossentropy": 2.2425618171691895,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.23472215235233307,
"step": 799
},
{
"epoch": 0.011516176629359052,
"grad_norm": 0.11767578125,
"grad_norm_var": 9.133716424306234e-05,
"learning_rate": 0.0001,
"loss": 0.2115,
"loss/crossentropy": 2.1281662583351135,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.21148262917995453,
"step": 800
},
{
"epoch": 0.011530571850145752,
"grad_norm": 0.1083984375,
"grad_norm_var": 9.107192357381185e-05,
"learning_rate": 0.0001,
"loss": 0.2171,
"loss/crossentropy": 2.4536547660827637,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.2171497568488121,
"step": 801
},
{
"epoch": 0.01154496707093245,
"grad_norm": 0.1201171875,
"grad_norm_var": 0.00010519027709960937,
"learning_rate": 0.0001,
"loss": 0.2157,
"loss/crossentropy": 2.3697547912597656,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.21570491790771484,
"step": 802
},
{
"epoch": 0.011559362291719149,
"grad_norm": 0.11083984375,
"grad_norm_var": 0.00010393361250559489,
"learning_rate": 0.0001,
"loss": 0.2189,
"loss/crossentropy": 2.4509881734848022,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.21891363710165024,
"step": 803
},
{
"epoch": 0.011573757512505847,
"grad_norm": 0.1103515625,
"grad_norm_var": 9.564956029256185e-05,
"learning_rate": 0.0001,
"loss": 0.2175,
"loss/crossentropy": 2.1731194853782654,
"loss/fcd": 0.474609375,
"loss/idx": 18.0,
"loss/logits": 0.21748338639736176,
"step": 804
},
{
"epoch": 0.011588152733292547,
"grad_norm": 0.10791015625,
"grad_norm_var": 9.326040744781495e-05,
"learning_rate": 0.0001,
"loss": 0.2308,
"loss/crossentropy": 2.4915411472320557,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.2308463379740715,
"step": 805
},
{
"epoch": 0.011602547954079246,
"grad_norm": 0.10107421875,
"grad_norm_var": 9.280840555826823e-05,
"learning_rate": 0.0001,
"loss": 0.2106,
"loss/crossentropy": 2.4593664407730103,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.21063391864299774,
"step": 806
},
{
"epoch": 0.011616943174865944,
"grad_norm": 0.09912109375,
"grad_norm_var": 9.096364180246988e-05,
"learning_rate": 0.0001,
"loss": 0.2093,
"loss/crossentropy": 2.420872926712036,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.20925325900316238,
"step": 807
},
{
"epoch": 0.011631338395652644,
"grad_norm": 0.10009765625,
"grad_norm_var": 9.145339330037434e-05,
"learning_rate": 0.0001,
"loss": 0.2035,
"loss/crossentropy": 2.4789732694625854,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.2034958302974701,
"step": 808
},
{
"epoch": 0.011645733616439342,
"grad_norm": 0.0986328125,
"grad_norm_var": 9.176631768544515e-05,
"learning_rate": 0.0001,
"loss": 0.2055,
"loss/crossentropy": 2.356053352355957,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.20546124875545502,
"step": 809
},
{
"epoch": 0.01166012883722604,
"grad_norm": 0.1005859375,
"grad_norm_var": 9.134610493977864e-05,
"learning_rate": 0.0001,
"loss": 0.1982,
"loss/crossentropy": 2.286035180091858,
"loss/fcd": 0.40625,
"loss/idx": 18.0,
"loss/logits": 0.19822601974010468,
"step": 810
},
{
"epoch": 0.011674524058012739,
"grad_norm": 0.107421875,
"grad_norm_var": 9.005467096964518e-05,
"learning_rate": 0.0001,
"loss": 0.2307,
"loss/crossentropy": 2.546161413192749,
"loss/fcd": 0.458984375,
"loss/idx": 18.0,
"loss/logits": 0.23073262721300125,
"step": 811
},
{
"epoch": 0.01168891927879944,
"grad_norm": 0.10693359375,
"grad_norm_var": 7.832845052083334e-05,
"learning_rate": 0.0001,
"loss": 0.214,
"loss/crossentropy": 2.4045225381851196,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.21400006115436554,
"step": 812
},
{
"epoch": 0.011703314499586137,
"grad_norm": 0.10693359375,
"grad_norm_var": 7.412830988566081e-05,
"learning_rate": 0.0001,
"loss": 0.2324,
"loss/crossentropy": 2.3815245628356934,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.23237691074609756,
"step": 813
},
{
"epoch": 0.011717709720372836,
"grad_norm": 0.1064453125,
"grad_norm_var": 7.412830988566081e-05,
"learning_rate": 0.0001,
"loss": 0.197,
"loss/crossentropy": 2.2638756036758423,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.19701003283262253,
"step": 814
},
{
"epoch": 0.011732104941159536,
"grad_norm": 0.095703125,
"grad_norm_var": 4.5804182688395184e-05,
"learning_rate": 0.0001,
"loss": 0.1993,
"loss/crossentropy": 2.310957193374634,
"loss/fcd": 0.3994140625,
"loss/idx": 18.0,
"loss/logits": 0.19931814819574356,
"step": 815
},
{
"epoch": 0.011746500161946234,
"grad_norm": 0.13671875,
"grad_norm_var": 9.775857130686441e-05,
"learning_rate": 0.0001,
"loss": 0.2326,
"loss/crossentropy": 2.3524898290634155,
"loss/fcd": 0.490234375,
"loss/idx": 18.0,
"loss/logits": 0.23264919221401215,
"step": 816
},
{
"epoch": 0.011760895382732933,
"grad_norm": 0.109375,
"grad_norm_var": 9.795725345611573e-05,
"learning_rate": 0.0001,
"loss": 0.2404,
"loss/crossentropy": 2.542204737663269,
"loss/fcd": 0.45703125,
"loss/idx": 18.0,
"loss/logits": 0.2403649091720581,
"step": 817
},
{
"epoch": 0.01177529060351963,
"grad_norm": 0.1298828125,
"grad_norm_var": 0.00012048780918121338,
"learning_rate": 0.0001,
"loss": 0.2259,
"loss/crossentropy": 2.300834894180298,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.2259274125099182,
"step": 818
},
{
"epoch": 0.011789685824306331,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.0001206040382385254,
"learning_rate": 0.0001,
"loss": 0.2003,
"loss/crossentropy": 2.309138298034668,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.2003060281276703,
"step": 819
},
{
"epoch": 0.01180408104509303,
"grad_norm": 0.1083984375,
"grad_norm_var": 0.00012012720108032227,
"learning_rate": 0.0001,
"loss": 0.2192,
"loss/crossentropy": 2.516822099685669,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.2192147672176361,
"step": 820
},
{
"epoch": 0.011818476265879728,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.00012189547220865885,
"learning_rate": 0.0001,
"loss": 0.222,
"loss/crossentropy": 2.5142600536346436,
"loss/fcd": 0.478515625,
"loss/idx": 18.0,
"loss/logits": 0.22199787199497223,
"step": 821
},
{
"epoch": 0.011832871486666426,
"grad_norm": 0.10693359375,
"grad_norm_var": 0.00011879603068033854,
"learning_rate": 0.0001,
"loss": 0.2006,
"loss/crossentropy": 2.166727066040039,
"loss/fcd": 0.3984375,
"loss/idx": 18.0,
"loss/logits": 0.20062025636434555,
"step": 822
},
{
"epoch": 0.011847266707453126,
"grad_norm": 0.11572265625,
"grad_norm_var": 0.00011602640151977539,
"learning_rate": 0.0001,
"loss": 0.2171,
"loss/crossentropy": 2.2036046981811523,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.21710190176963806,
"step": 823
},
{
"epoch": 0.011861661928239824,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.00011036793390909831,
"learning_rate": 0.0001,
"loss": 0.2275,
"loss/crossentropy": 2.2625406980514526,
"loss/fcd": 0.4423828125,
"loss/idx": 18.0,
"loss/logits": 0.22749044001102448,
"step": 824
},
{
"epoch": 0.011876057149026523,
"grad_norm": 0.7890625,
"grad_norm_var": 0.028886699676513673,
"learning_rate": 0.0001,
"loss": 0.2046,
"loss/crossentropy": 1.833857238292694,
"loss/fcd": 0.5595703125,
"loss/idx": 18.0,
"loss/logits": 0.20455920696258545,
"step": 825
},
{
"epoch": 0.011890452369813223,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.028860441843668618,
"learning_rate": 0.0001,
"loss": 0.2183,
"loss/crossentropy": 2.433130979537964,
"loss/fcd": 0.423828125,
"loss/idx": 18.0,
"loss/logits": 0.21825896203517914,
"step": 826
},
{
"epoch": 0.011904847590599921,
"grad_norm": 0.1103515625,
"grad_norm_var": 0.028843144575754803,
"learning_rate": 0.0001,
"loss": 0.2252,
"loss/crossentropy": 2.3956053256988525,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.22523467242717743,
"step": 827
},
{
"epoch": 0.01191924281138662,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.028831319014231364,
"learning_rate": 0.0001,
"loss": 0.2197,
"loss/crossentropy": 2.4500895738601685,
"loss/fcd": 0.458984375,
"loss/idx": 18.0,
"loss/logits": 0.2196703627705574,
"step": 828
},
{
"epoch": 0.011933638032173318,
"grad_norm": 0.11181640625,
"grad_norm_var": 0.0288025697072347,
"learning_rate": 0.0001,
"loss": 0.2463,
"loss/crossentropy": 2.4316296577453613,
"loss/fcd": 0.470703125,
"loss/idx": 18.0,
"loss/logits": 0.24632105976343155,
"step": 829
},
{
"epoch": 0.011948033252960018,
"grad_norm": 0.11669921875,
"grad_norm_var": 0.028744553526242573,
"learning_rate": 0.0001,
"loss": 0.2168,
"loss/crossentropy": 2.432488799095154,
"loss/fcd": 0.462890625,
"loss/idx": 18.0,
"loss/logits": 0.21679828315973282,
"step": 830
},
{
"epoch": 0.011962428473746716,
"grad_norm": 0.09375,
"grad_norm_var": 0.028760058681170146,
"learning_rate": 0.0001,
"loss": 0.1881,
"loss/crossentropy": 2.3647295236587524,
"loss/fcd": 0.3896484375,
"loss/idx": 18.0,
"loss/logits": 0.18810325115919113,
"step": 831
},
{
"epoch": 0.011976823694533414,
"grad_norm": 0.11083984375,
"grad_norm_var": 0.02886225382486979,
"learning_rate": 0.0001,
"loss": 0.2179,
"loss/crossentropy": 2.4084588289260864,
"loss/fcd": 0.4345703125,
"loss/idx": 18.0,
"loss/logits": 0.2179015353322029,
"step": 832
},
{
"epoch": 0.011991218915320115,
"grad_norm": 0.10400390625,
"grad_norm_var": 0.02889500359694163,
"learning_rate": 0.0001,
"loss": 0.2007,
"loss/crossentropy": 2.3215763568878174,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.20068107545375824,
"step": 833
},
{
"epoch": 0.012005614136106813,
"grad_norm": 0.10107421875,
"grad_norm_var": 0.029032798608144124,
"learning_rate": 0.0001,
"loss": 0.1899,
"loss/crossentropy": 2.1791869401931763,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.18989010155200958,
"step": 834
},
{
"epoch": 0.012020009356893511,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.02898623843987783,
"learning_rate": 0.0001,
"loss": 0.2187,
"loss/crossentropy": 2.292221188545227,
"loss/fcd": 0.4365234375,
"loss/idx": 18.0,
"loss/logits": 0.2187333106994629,
"step": 835
},
{
"epoch": 0.01203440457768021,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.028997563322385154,
"learning_rate": 0.0001,
"loss": 0.216,
"loss/crossentropy": 2.191875457763672,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.21604549139738083,
"step": 836
},
{
"epoch": 0.01204879979846691,
"grad_norm": 0.1298828125,
"grad_norm_var": 0.028929102420806884,
"learning_rate": 0.0001,
"loss": 0.2124,
"loss/crossentropy": 2.0077582597732544,
"loss/fcd": 0.4228515625,
"loss/idx": 18.0,
"loss/logits": 0.21244481950998306,
"step": 837
},
{
"epoch": 0.012063195019253608,
"grad_norm": 0.10693359375,
"grad_norm_var": 0.028929102420806884,
"learning_rate": 0.0001,
"loss": 0.2144,
"loss/crossentropy": 2.4339540004730225,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.21442482620477676,
"step": 838
},
{
"epoch": 0.012077590240040306,
"grad_norm": 0.220703125,
"grad_norm_var": 0.0291112889846166,
"learning_rate": 0.0001,
"loss": 0.2018,
"loss/crossentropy": 1.9582195281982422,
"loss/fcd": 0.5849609375,
"loss/idx": 18.0,
"loss/logits": 0.20181410014629364,
"step": 839
},
{
"epoch": 0.012091985460827005,
"grad_norm": 0.14453125,
"grad_norm_var": 0.028948195775349937,
"learning_rate": 0.0001,
"loss": 0.2261,
"loss/crossentropy": 1.961089551448822,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.22611552476882935,
"step": 840
},
{
"epoch": 0.012106380681613705,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.0008943786223729451,
"learning_rate": 0.0001,
"loss": 0.208,
"loss/crossentropy": 2.2961446046829224,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.20795201510190964,
"step": 841
},
{
"epoch": 0.012120775902400403,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.0008890310923258464,
"learning_rate": 0.0001,
"loss": 0.2208,
"loss/crossentropy": 2.3987420797348022,
"loss/fcd": 0.4345703125,
"loss/idx": 18.0,
"loss/logits": 0.2207762897014618,
"step": 842
},
{
"epoch": 0.012135171123187101,
"grad_norm": 0.09912109375,
"grad_norm_var": 0.0009084294239679972,
"learning_rate": 0.0001,
"loss": 0.1981,
"loss/crossentropy": 2.284560799598694,
"loss/fcd": 0.4033203125,
"loss/idx": 18.0,
"loss/logits": 0.19807633757591248,
"step": 843
},
{
"epoch": 0.012149566343973801,
"grad_norm": 0.10595703125,
"grad_norm_var": 0.0009122679630915324,
"learning_rate": 0.0001,
"loss": 0.2087,
"loss/crossentropy": 2.5325080156326294,
"loss/fcd": 0.4560546875,
"loss/idx": 18.0,
"loss/logits": 0.20872415602207184,
"step": 844
},
{
"epoch": 0.0121639615647605,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.0009208361307779948,
"learning_rate": 0.0001,
"loss": 0.2326,
"loss/crossentropy": 2.4612646102905273,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.23255135864019394,
"step": 845
},
{
"epoch": 0.012178356785547198,
"grad_norm": 0.1259765625,
"grad_norm_var": 0.0009262154499689738,
"learning_rate": 0.0001,
"loss": 0.2151,
"loss/crossentropy": 2.463419198989868,
"loss/fcd": 0.4560546875,
"loss/idx": 18.0,
"loss/logits": 0.2151269093155861,
"step": 846
},
{
"epoch": 0.012192752006333896,
"grad_norm": 0.11474609375,
"grad_norm_var": 0.000887898604075114,
"learning_rate": 0.0001,
"loss": 0.2149,
"loss/crossentropy": 2.0733948945999146,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.21493042260408401,
"step": 847
},
{
"epoch": 0.012207147227120596,
"grad_norm": 0.09375,
"grad_norm_var": 0.000923815369606018,
"learning_rate": 0.0001,
"loss": 0.1887,
"loss/crossentropy": 2.6692737340927124,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.18867085129022598,
"step": 848
},
{
"epoch": 0.012221542447907295,
"grad_norm": 0.111328125,
"grad_norm_var": 0.0009139657020568847,
"learning_rate": 0.0001,
"loss": 0.2055,
"loss/crossentropy": 2.349723696708679,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.2054726406931877,
"step": 849
},
{
"epoch": 0.012235937668693993,
"grad_norm": 0.103515625,
"grad_norm_var": 0.0009088347355524699,
"learning_rate": 0.0001,
"loss": 0.2146,
"loss/crossentropy": 2.4036346673965454,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.21456415951251984,
"step": 850
},
{
"epoch": 0.012250332889480693,
"grad_norm": 0.12890625,
"grad_norm_var": 0.0009135882059733073,
"learning_rate": 0.0001,
"loss": 0.2798,
"loss/crossentropy": 2.601546049118042,
"loss/fcd": 0.515625,
"loss/idx": 18.0,
"loss/logits": 0.27978505194187164,
"step": 851
},
{
"epoch": 0.012264728110267391,
"grad_norm": 0.12109375,
"grad_norm_var": 0.0009022037188212077,
"learning_rate": 0.0001,
"loss": 0.2426,
"loss/crossentropy": 2.657235622406006,
"loss/fcd": 0.48828125,
"loss/idx": 18.0,
"loss/logits": 0.24257582426071167,
"step": 852
},
{
"epoch": 0.01227912333105409,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.0009092291196187338,
"learning_rate": 0.0001,
"loss": 0.2069,
"loss/crossentropy": 2.515699028968811,
"loss/fcd": 0.4189453125,
"loss/idx": 18.0,
"loss/logits": 0.20689593255519867,
"step": 853
},
{
"epoch": 0.012293518551840788,
"grad_norm": 0.1298828125,
"grad_norm_var": 0.0009068479140599569,
"learning_rate": 0.0001,
"loss": 0.2659,
"loss/crossentropy": 2.6577337980270386,
"loss/fcd": 0.4970703125,
"loss/idx": 18.0,
"loss/logits": 0.26587389409542084,
"step": 854
},
{
"epoch": 0.012307913772627488,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.00019206603368123373,
"learning_rate": 0.0001,
"loss": 0.2053,
"loss/crossentropy": 2.189553380012512,
"loss/fcd": 0.4228515625,
"loss/idx": 18.0,
"loss/logits": 0.2052648663520813,
"step": 855
},
{
"epoch": 0.012322308993414187,
"grad_norm": 0.111328125,
"grad_norm_var": 0.0001191099484761556,
"learning_rate": 0.0001,
"loss": 0.2515,
"loss/crossentropy": 2.7488744258880615,
"loss/fcd": 0.47265625,
"loss/idx": 18.0,
"loss/logits": 0.2514711171388626,
"step": 856
},
{
"epoch": 0.012336704214200885,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.00011858046054840088,
"learning_rate": 0.0001,
"loss": 0.2299,
"loss/crossentropy": 2.5891239643096924,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.2299317717552185,
"step": 857
},
{
"epoch": 0.012351099434987585,
"grad_norm": 0.09912109375,
"grad_norm_var": 0.00012637674808502197,
"learning_rate": 0.0001,
"loss": 0.196,
"loss/crossentropy": 2.2487235069274902,
"loss/fcd": 0.4013671875,
"loss/idx": 18.0,
"loss/logits": 0.19604943692684174,
"step": 858
},
{
"epoch": 0.012365494655774283,
"grad_norm": 0.10986328125,
"grad_norm_var": 0.00011815925439198812,
"learning_rate": 0.0001,
"loss": 0.2355,
"loss/crossentropy": 2.448120951652527,
"loss/fcd": 0.4658203125,
"loss/idx": 18.0,
"loss/logits": 0.23546921461820602,
"step": 859
},
{
"epoch": 0.012379889876560982,
"grad_norm": 0.1123046875,
"grad_norm_var": 0.00011677742004394532,
"learning_rate": 0.0001,
"loss": 0.2299,
"loss/crossentropy": 2.6376739740371704,
"loss/fcd": 0.4765625,
"loss/idx": 18.0,
"loss/logits": 0.22993575036525726,
"step": 860
},
{
"epoch": 0.01239428509734768,
"grad_norm": 0.11962890625,
"grad_norm_var": 0.00011804004510243734,
"learning_rate": 0.0001,
"loss": 0.2362,
"loss/crossentropy": 2.4012396335601807,
"loss/fcd": 0.4541015625,
"loss/idx": 18.0,
"loss/logits": 0.2361709326505661,
"step": 861
},
{
"epoch": 0.01240868031813438,
"grad_norm": 0.10546875,
"grad_norm_var": 0.0001058568557103475,
"learning_rate": 0.0001,
"loss": 0.2031,
"loss/crossentropy": 2.3792039155960083,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.2031245082616806,
"step": 862
},
{
"epoch": 0.012423075538921078,
"grad_norm": 0.1435546875,
"grad_norm_var": 0.00017355283101399738,
"learning_rate": 0.0001,
"loss": 0.1986,
"loss/crossentropy": 2.1046639680862427,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.19857460260391235,
"step": 863
},
{
"epoch": 0.012437470759707777,
"grad_norm": 0.10009765625,
"grad_norm_var": 0.00016026397546132405,
"learning_rate": 0.0001,
"loss": 0.2247,
"loss/crossentropy": 2.6387428045272827,
"loss/fcd": 0.4599609375,
"loss/idx": 18.0,
"loss/logits": 0.22474492341279984,
"step": 864
},
{
"epoch": 0.012451865980494475,
"grad_norm": 0.10498046875,
"grad_norm_var": 0.00016404787699381512,
"learning_rate": 0.0001,
"loss": 0.2216,
"loss/crossentropy": 2.420724868774414,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.22158697247505188,
"step": 865
},
{
"epoch": 0.012466261201281175,
"grad_norm": 0.11279296875,
"grad_norm_var": 0.00015840431054433188,
"learning_rate": 0.0001,
"loss": 0.2081,
"loss/crossentropy": 2.1139690279960632,
"loss/fcd": 0.4599609375,
"loss/idx": 18.0,
"loss/logits": 0.20810745656490326,
"step": 866
},
{
"epoch": 0.012480656422067873,
"grad_norm": 0.099609375,
"grad_norm_var": 0.0001499404509862264,
"learning_rate": 0.0001,
"loss": 0.208,
"loss/crossentropy": 2.540156126022339,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.2079625502228737,
"step": 867
},
{
"epoch": 0.012495051642854572,
"grad_norm": 0.10107421875,
"grad_norm_var": 0.0001485149065653483,
"learning_rate": 0.0001,
"loss": 0.2183,
"loss/crossentropy": 2.39568293094635,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.21834557503461838,
"step": 868
},
{
"epoch": 0.012509446863641272,
"grad_norm": 0.10546875,
"grad_norm_var": 0.00014786720275878907,
"learning_rate": 0.0001,
"loss": 0.2185,
"loss/crossentropy": 2.3164258003234863,
"loss/fcd": 0.4443359375,
"loss/idx": 18.0,
"loss/logits": 0.2184668406844139,
"step": 869
},
{
"epoch": 0.01252384208442797,
"grad_norm": 0.109375,
"grad_norm_var": 0.00011974573135375977,
"learning_rate": 0.0001,
"loss": 0.2123,
"loss/crossentropy": 2.3733065128326416,
"loss/fcd": 0.4384765625,
"loss/idx": 18.0,
"loss/logits": 0.21230120956897736,
"step": 870
},
{
"epoch": 0.012538237305214668,
"grad_norm": 0.10302734375,
"grad_norm_var": 0.00011893908182779948,
"learning_rate": 0.0001,
"loss": 0.217,
"loss/crossentropy": 2.5337724685668945,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.21697236597537994,
"step": 871
},
{
"epoch": 0.012552632526001367,
"grad_norm": 0.1328125,
"grad_norm_var": 0.00015513102213541666,
"learning_rate": 0.0001,
"loss": 0.2258,
"loss/crossentropy": 2.4623916149139404,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.2257620170712471,
"step": 872
},
{
"epoch": 0.012567027746788067,
"grad_norm": 0.11767578125,
"grad_norm_var": 0.00015417635440826417,
"learning_rate": 0.0001,
"loss": 0.1991,
"loss/crossentropy": 2.1633788347244263,
"loss/fcd": 0.41015625,
"loss/idx": 18.0,
"loss/logits": 0.1990898996591568,
"step": 873
},
{
"epoch": 0.012581422967574765,
"grad_norm": 0.10009765625,
"grad_norm_var": 0.00015268226464589438,
"learning_rate": 0.0001,
"loss": 0.2118,
"loss/crossentropy": 2.4863176345825195,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.21175408363342285,
"step": 874
},
{
"epoch": 0.012595818188361464,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.00015290478865305582,
"learning_rate": 0.0001,
"loss": 0.2287,
"loss/crossentropy": 2.6219388246536255,
"loss/fcd": 0.4541015625,
"loss/idx": 18.0,
"loss/logits": 0.22867251932621002,
"step": 875
},
{
"epoch": 0.012610213409148164,
"grad_norm": 0.10546875,
"grad_norm_var": 0.0001546849807103475,
"learning_rate": 0.0001,
"loss": 0.2051,
"loss/crossentropy": 2.1874676942825317,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.205148346722126,
"step": 876
},
{
"epoch": 0.012624608629934862,
"grad_norm": 0.1083984375,
"grad_norm_var": 0.00014908711115519207,
"learning_rate": 0.0001,
"loss": 0.2029,
"loss/crossentropy": 2.129917621612549,
"loss/fcd": 0.412109375,
"loss/idx": 18.0,
"loss/logits": 0.20289119333028793,
"step": 877
},
{
"epoch": 0.01263900385072156,
"grad_norm": 0.11376953125,
"grad_norm_var": 0.0001484622557957967,
"learning_rate": 0.0001,
"loss": 0.2193,
"loss/crossentropy": 2.0245165824890137,
"loss/fcd": 0.4482421875,
"loss/idx": 18.0,
"loss/logits": 0.2193296253681183,
"step": 878
},
{
"epoch": 0.012653399071508259,
"grad_norm": 0.10986328125,
"grad_norm_var": 7.06632932027181e-05,
"learning_rate": 0.0001,
"loss": 0.197,
"loss/crossentropy": 2.1348493099212646,
"loss/fcd": 0.4072265625,
"loss/idx": 18.0,
"loss/logits": 0.19701501727104187,
"step": 879
},
{
"epoch": 0.012667794292294959,
"grad_norm": 0.1171875,
"grad_norm_var": 7.014175256093343e-05,
"learning_rate": 0.0001,
"loss": 0.2465,
"loss/crossentropy": 2.4276719093322754,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.24645158648490906,
"step": 880
},
{
"epoch": 0.012682189513081657,
"grad_norm": 0.1103515625,
"grad_norm_var": 6.877581278483072e-05,
"learning_rate": 0.0001,
"loss": 0.2378,
"loss/crossentropy": 2.301609516143799,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.23781420290470123,
"step": 881
},
{
"epoch": 0.012696584733868355,
"grad_norm": 0.11083984375,
"grad_norm_var": 6.821950276692708e-05,
"learning_rate": 0.0001,
"loss": 0.2172,
"loss/crossentropy": 2.4009501934051514,
"loss/fcd": 0.4345703125,
"loss/idx": 18.0,
"loss/logits": 0.21715039014816284,
"step": 882
},
{
"epoch": 0.012710979954655055,
"grad_norm": 0.09521484375,
"grad_norm_var": 7.529159386952718e-05,
"learning_rate": 0.0001,
"loss": 0.1797,
"loss/crossentropy": 2.402838706970215,
"loss/fcd": 0.3974609375,
"loss/idx": 18.0,
"loss/logits": 0.1796911582350731,
"step": 883
},
{
"epoch": 0.012725375175441754,
"grad_norm": 0.1083984375,
"grad_norm_var": 7.056792577107748e-05,
"learning_rate": 0.0001,
"loss": 0.2222,
"loss/crossentropy": 2.573052167892456,
"loss/fcd": 0.46875,
"loss/idx": 18.0,
"loss/logits": 0.22222469747066498,
"step": 884
},
{
"epoch": 0.012739770396228452,
"grad_norm": 0.10302734375,
"grad_norm_var": 7.23510980606079e-05,
"learning_rate": 0.0001,
"loss": 0.2008,
"loss/crossentropy": 2.232303559780121,
"loss/fcd": 0.40234375,
"loss/idx": 18.0,
"loss/logits": 0.20078317821025848,
"step": 885
},
{
"epoch": 0.01275416561701515,
"grad_norm": 0.1044921875,
"grad_norm_var": 7.402002811431884e-05,
"learning_rate": 0.0001,
"loss": 0.2282,
"loss/crossentropy": 2.5020205974578857,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.22823868691921234,
"step": 886
},
{
"epoch": 0.01276856083780185,
"grad_norm": 0.1025390625,
"grad_norm_var": 7.444620132446289e-05,
"learning_rate": 0.0001,
"loss": 0.212,
"loss/crossentropy": 2.333465099334717,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.212021104991436,
"step": 887
},
{
"epoch": 0.012782956058588549,
"grad_norm": 0.10498046875,
"grad_norm_var": 3.565847873687744e-05,
"learning_rate": 0.0001,
"loss": 0.2017,
"loss/crossentropy": 2.190958023071289,
"loss/fcd": 0.404296875,
"loss/idx": 18.0,
"loss/logits": 0.20165172219276428,
"step": 888
},
{
"epoch": 0.012797351279375247,
"grad_norm": 0.10107421875,
"grad_norm_var": 3.0524532000223796e-05,
"learning_rate": 0.0001,
"loss": 0.2102,
"loss/crossentropy": 2.4441522359848022,
"loss/fcd": 0.4169921875,
"loss/idx": 18.0,
"loss/logits": 0.21022119373083115,
"step": 889
},
{
"epoch": 0.012811746500161945,
"grad_norm": 0.1044921875,
"grad_norm_var": 2.795855204264323e-05,
"learning_rate": 0.0001,
"loss": 0.2209,
"loss/crossentropy": 2.504876732826233,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.22085034102201462,
"step": 890
},
{
"epoch": 0.012826141720948646,
"grad_norm": 0.11279296875,
"grad_norm_var": 2.9993057250976562e-05,
"learning_rate": 0.0001,
"loss": 0.2144,
"loss/crossentropy": 2.03822124004364,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.2144273966550827,
"step": 891
},
{
"epoch": 0.012840536941735344,
"grad_norm": 0.09521484375,
"grad_norm_var": 3.87340784072876e-05,
"learning_rate": 0.0001,
"loss": 0.1949,
"loss/crossentropy": 2.364703059196472,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.19488562643527985,
"step": 892
},
{
"epoch": 0.012854932162522042,
"grad_norm": 0.09716796875,
"grad_norm_var": 4.364649454752604e-05,
"learning_rate": 0.0001,
"loss": 0.2072,
"loss/crossentropy": 2.37164306640625,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.2072392851114273,
"step": 893
},
{
"epoch": 0.012869327383308742,
"grad_norm": 0.1083984375,
"grad_norm_var": 3.96798054377238e-05,
"learning_rate": 0.0001,
"loss": 0.2372,
"loss/crossentropy": 2.513867974281311,
"loss/fcd": 0.474609375,
"loss/idx": 18.0,
"loss/logits": 0.2371513769030571,
"step": 894
},
{
"epoch": 0.01288372260409544,
"grad_norm": 0.11376953125,
"grad_norm_var": 4.2969981829325356e-05,
"learning_rate": 0.0001,
"loss": 0.2024,
"loss/crossentropy": 2.266432523727417,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.20237434655427933,
"step": 895
},
{
"epoch": 0.012898117824882139,
"grad_norm": 0.1083984375,
"grad_norm_var": 3.424386183420817e-05,
"learning_rate": 0.0001,
"loss": 0.2159,
"loss/crossentropy": 2.191688299179077,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.21588657796382904,
"step": 896
},
{
"epoch": 0.012912513045668837,
"grad_norm": 0.1064453125,
"grad_norm_var": 3.24477752049764e-05,
"learning_rate": 0.0001,
"loss": 0.2044,
"loss/crossentropy": 2.1034794449806213,
"loss/fcd": 0.421875,
"loss/idx": 18.0,
"loss/logits": 0.2044026404619217,
"step": 897
},
{
"epoch": 0.012926908266455537,
"grad_norm": 0.10498046875,
"grad_norm_var": 2.989669640858968e-05,
"learning_rate": 0.0001,
"loss": 0.1955,
"loss/crossentropy": 2.1368765830993652,
"loss/fcd": 0.4033203125,
"loss/idx": 18.0,
"loss/logits": 0.19553573429584503,
"step": 898
},
{
"epoch": 0.012941303487242236,
"grad_norm": 0.1044921875,
"grad_norm_var": 2.3837884267171224e-05,
"learning_rate": 0.0001,
"loss": 0.2218,
"loss/crossentropy": 2.5139960050582886,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.2217550054192543,
"step": 899
},
{
"epoch": 0.012955698708028934,
"grad_norm": 0.115234375,
"grad_norm_var": 2.9818216959635416e-05,
"learning_rate": 0.0001,
"loss": 0.224,
"loss/crossentropy": 2.2468815445899963,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.22403018921613693,
"step": 900
},
{
"epoch": 0.012970093928815634,
"grad_norm": 0.1220703125,
"grad_norm_var": 4.6284000078837076e-05,
"learning_rate": 0.0001,
"loss": 0.2005,
"loss/crossentropy": 1.8389571905136108,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.20048467069864273,
"step": 901
},
{
"epoch": 0.012984489149602332,
"grad_norm": 0.1103515625,
"grad_norm_var": 4.6736995379130046e-05,
"learning_rate": 0.0001,
"loss": 0.2181,
"loss/crossentropy": 2.47100293636322,
"loss/fcd": 0.4404296875,
"loss/idx": 18.0,
"loss/logits": 0.21807243674993515,
"step": 902
},
{
"epoch": 0.01299888437038903,
"grad_norm": 0.142578125,
"grad_norm_var": 0.00012298325697580973,
"learning_rate": 0.0001,
"loss": 0.2141,
"loss/crossentropy": 2.011506676673889,
"loss/fcd": 0.4443359375,
"loss/idx": 18.0,
"loss/logits": 0.21408168226480484,
"step": 903
},
{
"epoch": 0.013013279591175729,
"grad_norm": 0.1064453125,
"grad_norm_var": 0.00012222925821940103,
"learning_rate": 0.0001,
"loss": 0.2293,
"loss/crossentropy": 2.7555429935455322,
"loss/fcd": 0.4814453125,
"loss/idx": 18.0,
"loss/logits": 0.22929980605840683,
"step": 904
},
{
"epoch": 0.013027674811962429,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.00011736154556274414,
"learning_rate": 0.0001,
"loss": 0.2118,
"loss/crossentropy": 2.3159666061401367,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.21182847768068314,
"step": 905
},
{
"epoch": 0.013042070032749127,
"grad_norm": 0.09814453125,
"grad_norm_var": 0.0001245806614557902,
"learning_rate": 0.0001,
"loss": 0.2026,
"loss/crossentropy": 2.5774402618408203,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.20259930193424225,
"step": 906
},
{
"epoch": 0.013056465253535826,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.00012540817260742188,
"learning_rate": 0.0001,
"loss": 0.2085,
"loss/crossentropy": 2.3706518411636353,
"loss/fcd": 0.4306640625,
"loss/idx": 18.0,
"loss/logits": 0.2085341438651085,
"step": 907
},
{
"epoch": 0.013070860474322524,
"grad_norm": 0.103515625,
"grad_norm_var": 0.0001143127679824829,
"learning_rate": 0.0001,
"loss": 0.2094,
"loss/crossentropy": 2.5629345178604126,
"loss/fcd": 0.458984375,
"loss/idx": 18.0,
"loss/logits": 0.2093740627169609,
"step": 908
},
{
"epoch": 0.013085255695109224,
"grad_norm": 0.10498046875,
"grad_norm_var": 0.0001051257054011027,
"learning_rate": 0.0001,
"loss": 0.1934,
"loss/crossentropy": 2.204862952232361,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.1933920904994011,
"step": 909
},
{
"epoch": 0.013099650915895922,
"grad_norm": 0.08935546875,
"grad_norm_var": 0.00013220707575480143,
"learning_rate": 0.0001,
"loss": 0.1996,
"loss/crossentropy": 2.6174755096435547,
"loss/fcd": 0.4140625,
"loss/idx": 18.0,
"loss/logits": 0.19955138117074966,
"step": 910
},
{
"epoch": 0.01311404613668262,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.0001332561175028483,
"learning_rate": 0.0001,
"loss": 0.2163,
"loss/crossentropy": 2.3615927696228027,
"loss/fcd": 0.4580078125,
"loss/idx": 18.0,
"loss/logits": 0.21632999181747437,
"step": 911
},
{
"epoch": 0.013128441357469321,
"grad_norm": 0.10302734375,
"grad_norm_var": 0.00013492802778879802,
"learning_rate": 0.0001,
"loss": 0.2018,
"loss/crossentropy": 2.3113813400268555,
"loss/fcd": 0.435546875,
"loss/idx": 18.0,
"loss/logits": 0.20180628448724747,
"step": 912
},
{
"epoch": 0.01314283657825602,
"grad_norm": 0.10498046875,
"grad_norm_var": 0.00013534228006998697,
"learning_rate": 0.0001,
"loss": 0.2166,
"loss/crossentropy": 2.575196623802185,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.2166244387626648,
"step": 913
},
{
"epoch": 0.013157231799042718,
"grad_norm": 0.107421875,
"grad_norm_var": 0.0001348008712132772,
"learning_rate": 0.0001,
"loss": 0.2273,
"loss/crossentropy": 2.4861690998077393,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.22732173651456833,
"step": 914
},
{
"epoch": 0.013171627019829416,
"grad_norm": 0.119140625,
"grad_norm_var": 0.00014147659142812094,
"learning_rate": 0.0001,
"loss": 0.2495,
"loss/crossentropy": 2.6098480224609375,
"loss/fcd": 0.5009765625,
"loss/idx": 18.0,
"loss/logits": 0.24945074319839478,
"step": 915
},
{
"epoch": 0.013186022240616116,
"grad_norm": 0.10791015625,
"grad_norm_var": 0.00013860066731770834,
"learning_rate": 0.0001,
"loss": 0.2244,
"loss/crossentropy": 2.3869473934173584,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.2244347631931305,
"step": 916
},
{
"epoch": 0.013200417461402814,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.00012586911519368488,
"learning_rate": 0.0001,
"loss": 0.2015,
"loss/crossentropy": 2.152829647064209,
"loss/fcd": 0.3984375,
"loss/idx": 18.0,
"loss/logits": 0.20145908743143082,
"step": 917
},
{
"epoch": 0.013214812682189513,
"grad_norm": 0.1015625,
"grad_norm_var": 0.00012712081273396809,
"learning_rate": 0.0001,
"loss": 0.2334,
"loss/crossentropy": 2.607330799102783,
"loss/fcd": 0.4599609375,
"loss/idx": 18.0,
"loss/logits": 0.23341640084981918,
"step": 918
},
{
"epoch": 0.013229207902976213,
"grad_norm": 0.09619140625,
"grad_norm_var": 4.0013591448465986e-05,
"learning_rate": 0.0001,
"loss": 0.2237,
"loss/crossentropy": 2.6008039712905884,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.22374649345874786,
"step": 919
},
{
"epoch": 0.013243603123762911,
"grad_norm": 0.1259765625,
"grad_norm_var": 7.061064243316651e-05,
"learning_rate": 0.0001,
"loss": 0.2637,
"loss/crossentropy": 2.669323205947876,
"loss/fcd": 0.509765625,
"loss/idx": 18.0,
"loss/logits": 0.26368650794029236,
"step": 920
},
{
"epoch": 0.01325799834454961,
"grad_norm": 0.107421875,
"grad_norm_var": 7.044076919555664e-05,
"learning_rate": 0.0001,
"loss": 0.2218,
"loss/crossentropy": 2.663564920425415,
"loss/fcd": 0.4560546875,
"loss/idx": 18.0,
"loss/logits": 0.22182673960924149,
"step": 921
},
{
"epoch": 0.013272393565336308,
"grad_norm": 0.1015625,
"grad_norm_var": 6.802777449289957e-05,
"learning_rate": 0.0001,
"loss": 0.2233,
"loss/crossentropy": 2.503962516784668,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.22326037287712097,
"step": 922
},
{
"epoch": 0.013286788786123008,
"grad_norm": 0.10009765625,
"grad_norm_var": 6.968180338541666e-05,
"learning_rate": 0.0001,
"loss": 0.2051,
"loss/crossentropy": 2.488289475440979,
"loss/fcd": 0.4482421875,
"loss/idx": 18.0,
"loss/logits": 0.20506569743156433,
"step": 923
},
{
"epoch": 0.013301184006909706,
"grad_norm": 0.1240234375,
"grad_norm_var": 9.196201960245768e-05,
"learning_rate": 0.0001,
"loss": 0.2772,
"loss/crossentropy": 2.7090861797332764,
"loss/fcd": 0.537109375,
"loss/idx": 18.0,
"loss/logits": 0.2771788090467453,
"step": 924
},
{
"epoch": 0.013315579227696404,
"grad_norm": 0.09765625,
"grad_norm_var": 9.65664784113566e-05,
"learning_rate": 0.0001,
"loss": 0.2027,
"loss/crossentropy": 2.4453471899032593,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.202697291970253,
"step": 925
},
{
"epoch": 0.013329974448483104,
"grad_norm": 0.10205078125,
"grad_norm_var": 7.879634698232015e-05,
"learning_rate": 0.0001,
"loss": 0.2021,
"loss/crossentropy": 2.4429107904434204,
"loss/fcd": 0.4248046875,
"loss/idx": 18.0,
"loss/logits": 0.20214182883501053,
"step": 926
},
{
"epoch": 0.013344369669269803,
"grad_norm": 0.11181640625,
"grad_norm_var": 7.883608341217042e-05,
"learning_rate": 0.0001,
"loss": 0.2276,
"loss/crossentropy": 2.4106050729751587,
"loss/fcd": 0.509765625,
"loss/idx": 18.0,
"loss/logits": 0.22757098823785782,
"step": 927
},
{
"epoch": 0.013358764890056501,
"grad_norm": 0.1171875,
"grad_norm_var": 8.347431818644206e-05,
"learning_rate": 0.0001,
"loss": 0.1978,
"loss/crossentropy": 2.1090660095214844,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.197757326066494,
"step": 928
},
{
"epoch": 0.0133731601108432,
"grad_norm": 0.1044921875,
"grad_norm_var": 8.369187513987223e-05,
"learning_rate": 0.0001,
"loss": 0.2262,
"loss/crossentropy": 2.443942070007324,
"loss/fcd": 0.4521484375,
"loss/idx": 18.0,
"loss/logits": 0.22621066123247147,
"step": 929
},
{
"epoch": 0.0133875553316299,
"grad_norm": 0.1005859375,
"grad_norm_var": 8.71966282526652e-05,
"learning_rate": 0.0001,
"loss": 0.1729,
"loss/crossentropy": 2.2424585819244385,
"loss/fcd": 0.4091796875,
"loss/idx": 18.0,
"loss/logits": 0.17293807864189148,
"step": 930
},
{
"epoch": 0.013401950552416598,
"grad_norm": 0.115234375,
"grad_norm_var": 8.215804894765219e-05,
"learning_rate": 0.0001,
"loss": 0.1967,
"loss/crossentropy": 2.0759899616241455,
"loss/fcd": 0.41796875,
"loss/idx": 18.0,
"loss/logits": 0.19673413038253784,
"step": 931
},
{
"epoch": 0.013416345773203296,
"grad_norm": 0.12060546875,
"grad_norm_var": 9.310940901438395e-05,
"learning_rate": 0.0001,
"loss": 0.2183,
"loss/crossentropy": 2.1836588382720947,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.21834751218557358,
"step": 932
},
{
"epoch": 0.013430740993989995,
"grad_norm": 0.0966796875,
"grad_norm_var": 0.00010077059268951417,
"learning_rate": 0.0001,
"loss": 0.2045,
"loss/crossentropy": 2.431378960609436,
"loss/fcd": 0.408203125,
"loss/idx": 18.0,
"loss/logits": 0.2045089453458786,
"step": 933
},
{
"epoch": 0.013445136214776695,
"grad_norm": 0.1005859375,
"grad_norm_var": 0.00010162889957427978,
"learning_rate": 0.0001,
"loss": 0.2197,
"loss/crossentropy": 2.4605276584625244,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.2197011262178421,
"step": 934
},
{
"epoch": 0.013459531435563393,
"grad_norm": 0.1123046875,
"grad_norm_var": 9.326934814453125e-05,
"learning_rate": 0.0001,
"loss": 0.2397,
"loss/crossentropy": 2.551363468170166,
"loss/fcd": 0.470703125,
"loss/idx": 18.0,
"loss/logits": 0.23971816152334213,
"step": 935
},
{
"epoch": 0.013473926656350091,
"grad_norm": 0.099609375,
"grad_norm_var": 7.578134536743165e-05,
"learning_rate": 0.0001,
"loss": 0.2057,
"loss/crossentropy": 2.3025097846984863,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.20573781430721283,
"step": 936
},
{
"epoch": 0.013488321877136791,
"grad_norm": 0.10205078125,
"grad_norm_var": 7.72784153620402e-05,
"learning_rate": 0.0001,
"loss": 0.2001,
"loss/crossentropy": 2.358902096748352,
"loss/fcd": 0.4208984375,
"loss/idx": 18.0,
"loss/logits": 0.20014435052871704,
"step": 937
},
{
"epoch": 0.01350271709792349,
"grad_norm": 0.12890625,
"grad_norm_var": 0.00010542770226796468,
"learning_rate": 0.0001,
"loss": 0.2339,
"loss/crossentropy": 2.2731123566627502,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.23394957929849625,
"step": 938
},
{
"epoch": 0.013517112318710188,
"grad_norm": 0.10498046875,
"grad_norm_var": 0.00010153353214263916,
"learning_rate": 0.0001,
"loss": 0.1838,
"loss/crossentropy": 2.0903587341308594,
"loss/fcd": 0.3876953125,
"loss/idx": 18.0,
"loss/logits": 0.1838395819067955,
"step": 939
},
{
"epoch": 0.013531507539496886,
"grad_norm": 0.11083984375,
"grad_norm_var": 8.541345596313477e-05,
"learning_rate": 0.0001,
"loss": 0.205,
"loss/crossentropy": 2.3965718746185303,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.2049787938594818,
"step": 940
},
{
"epoch": 0.013545902760283586,
"grad_norm": 0.11083984375,
"grad_norm_var": 7.835924625396729e-05,
"learning_rate": 0.0001,
"loss": 0.1831,
"loss/crossentropy": 2.0947141647338867,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.1831398606300354,
"step": 941
},
{
"epoch": 0.013560297981070285,
"grad_norm": 0.1044921875,
"grad_norm_var": 7.657607396443685e-05,
"learning_rate": 0.0001,
"loss": 0.223,
"loss/crossentropy": 2.4057345390319824,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.22295735031366348,
"step": 942
},
{
"epoch": 0.013574693201856983,
"grad_norm": 0.1103515625,
"grad_norm_var": 7.612605889638265e-05,
"learning_rate": 0.0001,
"loss": 0.2116,
"loss/crossentropy": 2.507383942604065,
"loss/fcd": 0.451171875,
"loss/idx": 18.0,
"loss/logits": 0.21163207292556763,
"step": 943
},
{
"epoch": 0.013589088422643683,
"grad_norm": 0.0966796875,
"grad_norm_var": 7.929702599843344e-05,
"learning_rate": 0.0001,
"loss": 0.199,
"loss/crossentropy": 2.5167927742004395,
"loss/fcd": 0.4296875,
"loss/idx": 18.0,
"loss/logits": 0.19900661706924438,
"step": 944
},
{
"epoch": 0.013603483643430381,
"grad_norm": 0.099609375,
"grad_norm_var": 8.271435896555583e-05,
"learning_rate": 0.0001,
"loss": 0.223,
"loss/crossentropy": 2.495205879211426,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.2230018451809883,
"step": 945
},
{
"epoch": 0.01361787886421708,
"grad_norm": 0.1015625,
"grad_norm_var": 8.191963036855062e-05,
"learning_rate": 0.0001,
"loss": 0.1923,
"loss/crossentropy": 2.3246638774871826,
"loss/fcd": 0.4072265625,
"loss/idx": 18.0,
"loss/logits": 0.19232943654060364,
"step": 946
},
{
"epoch": 0.013632274085003778,
"grad_norm": 0.10791015625,
"grad_norm_var": 7.743438084920248e-05,
"learning_rate": 0.0001,
"loss": 0.2183,
"loss/crossentropy": 2.342094659805298,
"loss/fcd": 0.44140625,
"loss/idx": 18.0,
"loss/logits": 0.21831049770116806,
"step": 947
},
{
"epoch": 0.013646669305790478,
"grad_norm": 0.09765625,
"grad_norm_var": 6.79562489191691e-05,
"learning_rate": 0.0001,
"loss": 0.2115,
"loss/crossentropy": 2.395784616470337,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.2115183100104332,
"step": 948
},
{
"epoch": 0.013661064526577177,
"grad_norm": 0.09326171875,
"grad_norm_var": 7.262229919433594e-05,
"learning_rate": 0.0001,
"loss": 0.1866,
"loss/crossentropy": 2.3914581537246704,
"loss/fcd": 0.408203125,
"loss/idx": 18.0,
"loss/logits": 0.1866021454334259,
"step": 949
},
{
"epoch": 0.013675459747363875,
"grad_norm": 0.10009765625,
"grad_norm_var": 7.293124993642171e-05,
"learning_rate": 0.0001,
"loss": 0.2197,
"loss/crossentropy": 2.7165035009384155,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.21966220438480377,
"step": 950
},
{
"epoch": 0.013689854968150573,
"grad_norm": 0.0947265625,
"grad_norm_var": 7.529159386952718e-05,
"learning_rate": 0.0001,
"loss": 0.2021,
"loss/crossentropy": 2.4402034282684326,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.20212795585393906,
"step": 951
},
{
"epoch": 0.013704250188937273,
"grad_norm": 0.1201171875,
"grad_norm_var": 8.964439233144124e-05,
"learning_rate": 0.0001,
"loss": 0.2361,
"loss/crossentropy": 2.3232113122940063,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.2360788732767105,
"step": 952
},
{
"epoch": 0.013718645409723972,
"grad_norm": 0.09619140625,
"grad_norm_var": 9.429355462392171e-05,
"learning_rate": 0.0001,
"loss": 0.2187,
"loss/crossentropy": 2.51291286945343,
"loss/fcd": 0.4423828125,
"loss/idx": 18.0,
"loss/logits": 0.21865685284137726,
"step": 953
},
{
"epoch": 0.01373304063051067,
"grad_norm": 0.099609375,
"grad_norm_var": 5.412002404530843e-05,
"learning_rate": 0.0001,
"loss": 0.2028,
"loss/crossentropy": 2.311350464820862,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.20284704118967056,
"step": 954
},
{
"epoch": 0.01374743585129737,
"grad_norm": 0.10302734375,
"grad_norm_var": 5.3857763608296714e-05,
"learning_rate": 0.0001,
"loss": 0.2074,
"loss/crossentropy": 2.3779343366622925,
"loss/fcd": 0.42578125,
"loss/idx": 18.0,
"loss/logits": 0.20741773396730423,
"step": 955
},
{
"epoch": 0.013761831072084068,
"grad_norm": 0.10009765625,
"grad_norm_var": 4.9749016761779784e-05,
"learning_rate": 0.0001,
"loss": 0.1919,
"loss/crossentropy": 2.285262107849121,
"loss/fcd": 0.3984375,
"loss/idx": 18.0,
"loss/logits": 0.19189584255218506,
"step": 956
},
{
"epoch": 0.013776226292870767,
"grad_norm": 0.10498046875,
"grad_norm_var": 4.519522190093994e-05,
"learning_rate": 0.0001,
"loss": 0.2057,
"loss/crossentropy": 2.509137988090515,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.20570625364780426,
"step": 957
},
{
"epoch": 0.013790621513657465,
"grad_norm": 0.09375,
"grad_norm_var": 4.8692027727762856e-05,
"learning_rate": 0.0001,
"loss": 0.1856,
"loss/crossentropy": 2.298775553703308,
"loss/fcd": 0.400390625,
"loss/idx": 18.0,
"loss/logits": 0.18564346432685852,
"step": 958
},
{
"epoch": 0.013805016734444165,
"grad_norm": 0.1396484375,
"grad_norm_var": 0.000137979785601298,
"learning_rate": 0.0001,
"loss": 0.255,
"loss/crossentropy": 2.497642993927002,
"loss/fcd": 0.4765625,
"loss/idx": 18.0,
"loss/logits": 0.2549555003643036,
"step": 959
},
{
"epoch": 0.013819411955230863,
"grad_norm": 0.1025390625,
"grad_norm_var": 0.00013514260450998942,
"learning_rate": 0.0001,
"loss": 0.2086,
"loss/crossentropy": 2.250615358352661,
"loss/fcd": 0.427734375,
"loss/idx": 18.0,
"loss/logits": 0.20857169479131699,
"step": 960
},
{
"epoch": 0.013833807176017562,
"grad_norm": 0.1162109375,
"grad_norm_var": 0.00014392435550689698,
"learning_rate": 0.0001,
"loss": 0.2192,
"loss/crossentropy": 2.349924087524414,
"loss/fcd": 0.4716796875,
"loss/idx": 18.0,
"loss/logits": 0.2192462459206581,
"step": 961
},
{
"epoch": 0.013848202396804262,
"grad_norm": 0.11962890625,
"grad_norm_var": 0.00015734036763509113,
"learning_rate": 0.0001,
"loss": 0.2206,
"loss/crossentropy": 2.6663416624069214,
"loss/fcd": 0.46875,
"loss/idx": 18.0,
"loss/logits": 0.22060546278953552,
"step": 962
},
{
"epoch": 0.01386259761759096,
"grad_norm": 0.119140625,
"grad_norm_var": 0.00016869604587554932,
"learning_rate": 0.0001,
"loss": 0.2306,
"loss/crossentropy": 2.5041009187698364,
"loss/fcd": 0.4501953125,
"loss/idx": 18.0,
"loss/logits": 0.23058706521987915,
"step": 963
},
{
"epoch": 0.013876992838377658,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.00016364653905232747,
"learning_rate": 0.0001,
"loss": 0.2344,
"loss/crossentropy": 2.6687543392181396,
"loss/fcd": 0.4609375,
"loss/idx": 18.0,
"loss/logits": 0.2344193086028099,
"step": 964
},
{
"epoch": 0.013891388059164357,
"grad_norm": 0.091796875,
"grad_norm_var": 0.00016646285851796468,
"learning_rate": 0.0001,
"loss": 0.2073,
"loss/crossentropy": 2.6841739416122437,
"loss/fcd": 0.4150390625,
"loss/idx": 18.0,
"loss/logits": 0.2073235660791397,
"step": 965
},
{
"epoch": 0.013905783279951057,
"grad_norm": 0.10205078125,
"grad_norm_var": 0.00016492903232574464,
"learning_rate": 0.0001,
"loss": 0.23,
"loss/crossentropy": 2.511627197265625,
"loss/fcd": 0.47265625,
"loss/idx": 18.0,
"loss/logits": 0.23003337532281876,
"step": 966
},
{
"epoch": 0.013920178500737755,
"grad_norm": 0.111328125,
"grad_norm_var": 0.0001549313465754191,
"learning_rate": 0.0001,
"loss": 0.2113,
"loss/crossentropy": 2.2891138792037964,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.21126385778188705,
"step": 967
},
{
"epoch": 0.013934573721524453,
"grad_norm": 0.134765625,
"grad_norm_var": 0.0001918862263361613,
"learning_rate": 0.0001,
"loss": 0.2406,
"loss/crossentropy": 2.610072374343872,
"loss/fcd": 0.4755859375,
"loss/idx": 18.0,
"loss/logits": 0.24063490331172943,
"step": 968
},
{
"epoch": 0.013948968942311154,
"grad_norm": 0.1181640625,
"grad_norm_var": 0.0001845995585123698,
"learning_rate": 0.0001,
"loss": 0.2615,
"loss/crossentropy": 2.6200684309005737,
"loss/fcd": 0.484375,
"loss/idx": 18.0,
"loss/logits": 0.2615353539586067,
"step": 969
},
{
"epoch": 0.013963364163097852,
"grad_norm": 0.115234375,
"grad_norm_var": 0.00017747879028320312,
"learning_rate": 0.0001,
"loss": 0.2364,
"loss/crossentropy": 2.344622015953064,
"loss/fcd": 0.4375,
"loss/idx": 18.0,
"loss/logits": 0.23641209304332733,
"step": 970
},
{
"epoch": 0.01397775938388455,
"grad_norm": 0.11328125,
"grad_norm_var": 0.00017270147800445556,
"learning_rate": 0.0001,
"loss": 0.2414,
"loss/crossentropy": 2.6739622354507446,
"loss/fcd": 0.50390625,
"loss/idx": 18.0,
"loss/logits": 0.24144794046878815,
"step": 971
},
{
"epoch": 0.013992154604671249,
"grad_norm": 0.13671875,
"grad_norm_var": 0.00019855499267578124,
"learning_rate": 0.0001,
"loss": 0.1994,
"loss/crossentropy": 2.1970399618148804,
"loss/fcd": 0.544921875,
"loss/idx": 18.0,
"loss/logits": 0.19944548606872559,
"step": 972
},
{
"epoch": 0.014006549825457949,
"grad_norm": 0.1279296875,
"grad_norm_var": 0.00020308395226796468,
"learning_rate": 0.0001,
"loss": 0.2182,
"loss/crossentropy": 2.0679745078086853,
"loss/fcd": 0.4267578125,
"loss/idx": 18.0,
"loss/logits": 0.21820590645074844,
"step": 973
},
{
"epoch": 0.014020945046244647,
"grad_norm": 0.11572265625,
"grad_norm_var": 0.0001689751942952474,
"learning_rate": 0.0001,
"loss": 0.2122,
"loss/crossentropy": 2.2601789236068726,
"loss/fcd": 0.43359375,
"loss/idx": 18.0,
"loss/logits": 0.21224602311849594,
"step": 974
},
{
"epoch": 0.014035340267031345,
"grad_norm": 0.115234375,
"grad_norm_var": 0.00013271570205688476,
"learning_rate": 0.0001,
"loss": 0.2131,
"loss/crossentropy": 2.215391755104065,
"loss/fcd": 0.439453125,
"loss/idx": 18.0,
"loss/logits": 0.21311646699905396,
"step": 975
},
{
"epoch": 0.014049735487818044,
"grad_norm": 0.0908203125,
"grad_norm_var": 0.00016161203384399415,
"learning_rate": 0.0001,
"loss": 0.2055,
"loss/crossentropy": 2.593106508255005,
"loss/fcd": 0.4111328125,
"loss/idx": 18.0,
"loss/logits": 0.20546036958694458,
"step": 976
},
{
"epoch": 0.014064130708604744,
"grad_norm": 0.1044921875,
"grad_norm_var": 0.000168001651763916,
"learning_rate": 0.0001,
"loss": 0.2212,
"loss/crossentropy": 2.4123164415359497,
"loss/fcd": 0.4482421875,
"loss/idx": 18.0,
"loss/logits": 0.22117872536182404,
"step": 977
},
{
"epoch": 0.014078525929391442,
"grad_norm": 0.1357421875,
"grad_norm_var": 0.00019616186618804933,
"learning_rate": 0.0001,
"loss": 0.2037,
"loss/crossentropy": 2.111898362636566,
"loss/fcd": 0.47265625,
"loss/idx": 18.0,
"loss/logits": 0.20368139445781708,
"step": 978
},
{
"epoch": 0.01409292115017814,
"grad_norm": 0.10302734375,
"grad_norm_var": 0.00020366907119750977,
"learning_rate": 0.0001,
"loss": 0.2233,
"loss/crossentropy": 2.484625220298767,
"loss/fcd": 0.466796875,
"loss/idx": 18.0,
"loss/logits": 0.22334980964660645,
"step": 979
},
{
"epoch": 0.01410731637096484,
"grad_norm": 0.103515625,
"grad_norm_var": 0.0002091874678929647,
"learning_rate": 0.0001,
"loss": 0.2286,
"loss/crossentropy": 2.5562527179718018,
"loss/fcd": 0.455078125,
"loss/idx": 18.0,
"loss/logits": 0.22860489040613174,
"step": 980
},
{
"epoch": 0.014121711591751539,
"grad_norm": 0.109375,
"grad_norm_var": 0.0001770724852879842,
"learning_rate": 0.0001,
"loss": 0.2195,
"loss/crossentropy": 2.372304320335388,
"loss/fcd": 0.4423828125,
"loss/idx": 18.0,
"loss/logits": 0.21949142217636108,
"step": 981
},
{
"epoch": 0.014136106812538237,
"grad_norm": 0.10888671875,
"grad_norm_var": 0.00016833841800689697,
"learning_rate": 0.0001,
"loss": 0.2166,
"loss/crossentropy": 2.5525119304656982,
"loss/fcd": 0.453125,
"loss/idx": 18.0,
"loss/logits": 0.21661998331546783,
"step": 982
},
{
"epoch": 0.014150502033324935,
"grad_norm": 0.099609375,
"grad_norm_var": 0.00018307268619537352,
"learning_rate": 0.0001,
"loss": 0.205,
"loss/crossentropy": 2.346623182296753,
"loss/fcd": 0.416015625,
"loss/idx": 18.0,
"loss/logits": 0.205020934343338,
"step": 983
},
{
"epoch": 0.014164897254111635,
"grad_norm": 0.09814453125,
"grad_norm_var": 0.00016809701919555663,
"learning_rate": 0.0001,
"loss": 0.1916,
"loss/crossentropy": 2.372196674346924,
"loss/fcd": 0.408203125,
"loss/idx": 18.0,
"loss/logits": 0.19163141399621964,
"step": 984
},
{
"epoch": 0.014179292474898334,
"grad_norm": 0.11962890625,
"grad_norm_var": 0.00016938745975494384,
"learning_rate": 0.0001,
"loss": 0.2246,
"loss/crossentropy": 2.2609957456588745,
"loss/fcd": 0.447265625,
"loss/idx": 18.0,
"loss/logits": 0.2246478945016861,
"step": 985
},
{
"epoch": 0.014193687695685032,
"grad_norm": 0.1201171875,
"grad_norm_var": 0.000172765056292216,
"learning_rate": 0.0001,
"loss": 0.2191,
"loss/crossentropy": 2.2087113857269287,
"loss/fcd": 0.4287109375,
"loss/idx": 18.0,
"loss/logits": 0.21905823051929474,
"step": 986
},
{
"epoch": 0.014208082916471732,
"grad_norm": 0.1171875,
"grad_norm_var": 0.00017405251661936443,
"learning_rate": 0.0001,
"loss": 0.22,
"loss/crossentropy": 2.257576823234558,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.2199638932943344,
"step": 987
},
{
"epoch": 0.01422247813725843,
"grad_norm": 0.10693359375,
"grad_norm_var": 0.00013484557469685873,
"learning_rate": 0.0001,
"loss": 0.2398,
"loss/crossentropy": 2.626092791557312,
"loss/fcd": 0.462890625,
"loss/idx": 18.0,
"loss/logits": 0.23977234959602356,
"step": 988
},
{
"epoch": 0.014236873358045129,
"grad_norm": 0.107421875,
"grad_norm_var": 0.00011490186055501302,
"learning_rate": 0.0001,
"loss": 0.2098,
"loss/crossentropy": 2.4047662019729614,
"loss/fcd": 0.4453125,
"loss/idx": 18.0,
"loss/logits": 0.20984865725040436,
"step": 989
},
{
"epoch": 0.014251268578831827,
"grad_norm": 0.11328125,
"grad_norm_var": 0.00011332730452219645,
"learning_rate": 0.0001,
"loss": 0.2132,
"loss/crossentropy": 2.4295172691345215,
"loss/fcd": 0.4462890625,
"loss/idx": 18.0,
"loss/logits": 0.21317294985055923,
"step": 990
},
{
"epoch": 0.014265663799618527,
"grad_norm": 0.12451171875,
"grad_norm_var": 0.0001256903012593587,
"learning_rate": 0.0001,
"loss": 0.2325,
"loss/crossentropy": 2.5081902742385864,
"loss/fcd": 0.47265625,
"loss/idx": 18.0,
"loss/logits": 0.23250433802604675,
"step": 991
},
{
"epoch": 0.014280059020405226,
"grad_norm": 0.1376953125,
"grad_norm_var": 0.00014209349950154623,
"learning_rate": 0.0001,
"loss": 0.2266,
"loss/crossentropy": 2.0926729440689087,
"loss/fcd": 0.4951171875,
"loss/idx": 18.0,
"loss/logits": 0.226626954972744,
"step": 992
},
{
"epoch": 0.014294454241191924,
"grad_norm": 0.10595703125,
"grad_norm_var": 0.00014054675896962482,
"learning_rate": 0.0001,
"loss": 0.2136,
"loss/crossentropy": 2.383934497833252,
"loss/fcd": 0.4609375,
"loss/idx": 18.0,
"loss/logits": 0.21359023451805115,
"step": 993
},
{
"epoch": 0.014308849461978622,
"grad_norm": 0.11865234375,
"grad_norm_var": 0.00010741154352823893,
"learning_rate": 0.0001,
"loss": 0.2225,
"loss/crossentropy": 2.4633511304855347,
"loss/fcd": 0.443359375,
"loss/idx": 18.0,
"loss/logits": 0.2224937155842781,
"step": 994
},
{
"epoch": 0.014323244682765322,
"grad_norm": 0.10107421875,
"grad_norm_var": 0.00011001825332641601,
"learning_rate": 0.0001,
"loss": 0.1897,
"loss/crossentropy": 2.1795610189437866,
"loss/fcd": 0.3994140625,
"loss/idx": 18.0,
"loss/logits": 0.18965643644332886,
"step": 995
},
{
"epoch": 0.01433763990355202,
"grad_norm": 0.1015625,
"grad_norm_var": 0.00011246601740519205,
"learning_rate": 0.0001,
"loss": 0.1976,
"loss/crossentropy": 2.336984634399414,
"loss/fcd": 0.4033203125,
"loss/idx": 18.0,
"loss/logits": 0.1975797638297081,
"step": 996
},
{
"epoch": 0.014352035124338719,
"grad_norm": 0.099609375,
"grad_norm_var": 0.00012168486913045247,
"learning_rate": 0.0001,
"loss": 0.2147,
"loss/crossentropy": 2.664496660232544,
"loss/fcd": 0.44921875,
"loss/idx": 18.0,
"loss/logits": 0.2147291675209999,
"step": 997
},
{
"epoch": 0.014366430345125419,
"grad_norm": 0.09912109375,
"grad_norm_var": 0.00013074477513631185,
"learning_rate": 0.0001,
"loss": 0.2094,
"loss/crossentropy": 2.33840548992157,
"loss/fcd": 0.419921875,
"loss/idx": 18.0,
"loss/logits": 0.209433451294899,
"step": 998
},
{
"epoch": 0.014380825565912117,
"grad_norm": 0.10107421875,
"grad_norm_var": 0.00012872119744618735,
"learning_rate": 0.0001,
"loss": 0.2101,
"loss/crossentropy": 2.5498578548431396,
"loss/fcd": 0.431640625,
"loss/idx": 18.0,
"loss/logits": 0.210076242685318,
"step": 999
},
{
"epoch": 0.014395220786698816,
"grad_norm": 0.103515625,
"grad_norm_var": 0.00012149810791015626,
"learning_rate": 0.0001,
"loss": 0.2192,
"loss/crossentropy": 2.547055721282959,
"loss/fcd": 0.4560546875,
"loss/idx": 18.0,
"loss/logits": 0.21922268718481064,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.51753290940416e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}