{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 250, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 7.1875, "learning_rate": 1.0000000000000002e-06, "loss": 1.8199, "loss/crossentropy": 3.5243165493011475, "loss/fcd": 1.1015625, "loss/logits": 0.7183534502983093, "step": 1 }, { "epoch": 0.002, "grad_norm": 7.03125, "learning_rate": 2.0000000000000003e-06, "loss": 1.7544, "loss/crossentropy": 3.38218891620636, "loss/fcd": 1.06640625, "loss/logits": 0.6879583299160004, "step": 2 }, { "epoch": 0.003, "grad_norm": 6.90625, "learning_rate": 3e-06, "loss": 1.7466, "loss/crossentropy": 3.4039193391799927, "loss/fcd": 1.06640625, "loss/logits": 0.6802149713039398, "step": 3 }, { "epoch": 0.004, "grad_norm": 8.0, "learning_rate": 4.000000000000001e-06, "loss": 1.7976, "loss/crossentropy": 3.5470337867736816, "loss/fcd": 1.08984375, "loss/logits": 0.7077842652797699, "step": 4 }, { "epoch": 0.005, "grad_norm": 7.0, "learning_rate": 5e-06, "loss": 1.7934, "loss/crossentropy": 3.393998146057129, "loss/fcd": 1.08203125, "loss/logits": 0.7113845348358154, "step": 5 }, { "epoch": 0.006, "grad_norm": 6.25, "learning_rate": 6e-06, "loss": 1.7157, "loss/crossentropy": 3.348253607749939, "loss/fcd": 1.03125, "loss/logits": 0.684424489736557, "step": 6 }, { "epoch": 0.007, "grad_norm": 6.90625, "learning_rate": 7.000000000000001e-06, "loss": 1.7533, "loss/crossentropy": 3.370575785636902, "loss/fcd": 1.06640625, "loss/logits": 0.686925858259201, "step": 7 }, { "epoch": 0.008, "grad_norm": 5.65625, "learning_rate": 8.000000000000001e-06, "loss": 1.7326, "loss/crossentropy": 3.325868844985962, "loss/fcd": 1.06640625, "loss/logits": 0.6662170886993408, "step": 8 }, { "epoch": 0.009, "grad_norm": 6.0, "learning_rate": 9e-06, "loss": 1.7279, "loss/crossentropy": 3.2712498903274536, "loss/fcd": 1.05078125, "loss/logits": 0.6771342754364014, "step": 9 }, { "epoch": 0.01, "grad_norm": 5.15625, "learning_rate": 1e-05, "loss": 1.6636, "loss/crossentropy": 3.31475031375885, "loss/fcd": 1.037109375, "loss/logits": 0.6264936327934265, "step": 10 }, { "epoch": 0.011, "grad_norm": 5.1875, "learning_rate": 1.1000000000000001e-05, "loss": 1.6084, "loss/crossentropy": 3.252698540687561, "loss/fcd": 0.99609375, "loss/logits": 0.612336277961731, "step": 11 }, { "epoch": 0.012, "grad_norm": 4.53125, "learning_rate": 1.2e-05, "loss": 1.5937, "loss/crossentropy": 3.2348395586013794, "loss/fcd": 0.994140625, "loss/logits": 0.5995941162109375, "step": 12 }, { "epoch": 0.013, "grad_norm": 3.75, "learning_rate": 1.3000000000000001e-05, "loss": 1.5844, "loss/crossentropy": 3.2545889616012573, "loss/fcd": 0.990234375, "loss/logits": 0.5941979587078094, "step": 13 }, { "epoch": 0.014, "grad_norm": 3.09375, "learning_rate": 1.4000000000000001e-05, "loss": 1.5826, "loss/crossentropy": 3.3384639024734497, "loss/fcd": 0.970703125, "loss/logits": 0.6118558943271637, "step": 14 }, { "epoch": 0.015, "grad_norm": 2.65625, "learning_rate": 1.5e-05, "loss": 1.4314, "loss/crossentropy": 3.279728889465332, "loss/fcd": 0.90234375, "loss/logits": 0.529065728187561, "step": 15 }, { "epoch": 0.016, "grad_norm": 2.28125, "grad_norm_var": 3.1285441080729166, "learning_rate": 1.6000000000000003e-05, "loss": 1.4926, "loss/crossentropy": 3.225893259048462, "loss/fcd": 0.919921875, "loss/logits": 0.572645902633667, "step": 16 }, { "epoch": 0.017, "grad_norm": 2.078125, "grad_norm_var": 3.593244425455729, "learning_rate": 1.7000000000000003e-05, "loss": 1.4306, "loss/crossentropy": 3.2493473291397095, "loss/fcd": 0.896484375, "loss/logits": 0.534164160490036, "step": 17 }, { "epoch": 0.018, "grad_norm": 1.75, "grad_norm_var": 4.015469360351562, "learning_rate": 1.8e-05, "loss": 1.366, "loss/crossentropy": 3.346846103668213, "loss/fcd": 0.861328125, "loss/logits": 0.504672035574913, "step": 18 }, { "epoch": 0.019, "grad_norm": 1.671875, "grad_norm_var": 4.275484212239584, "learning_rate": 1.9e-05, "loss": 1.3969, "loss/crossentropy": 3.273995280265808, "loss/fcd": 0.865234375, "loss/logits": 0.5316321551799774, "step": 19 }, { "epoch": 0.02, "grad_norm": 1.609375, "grad_norm_var": 3.8440338134765626, "learning_rate": 2e-05, "loss": 1.2777, "loss/crossentropy": 3.2759294509887695, "loss/fcd": 0.802734375, "loss/logits": 0.47498229146003723, "step": 20 }, { "epoch": 0.021, "grad_norm": 1.4765625, "grad_norm_var": 3.6140703837076824, "learning_rate": 2.1e-05, "loss": 1.3094, "loss/crossentropy": 3.299839496612549, "loss/fcd": 0.81640625, "loss/logits": 0.49299056828022003, "step": 21 }, { "epoch": 0.022, "grad_norm": 1.5546875, "grad_norm_var": 3.4289784749348957, "learning_rate": 2.2000000000000003e-05, "loss": 1.3074, "loss/crossentropy": 3.3400200605392456, "loss/fcd": 0.826171875, "loss/logits": 0.4812265634536743, "step": 22 }, { "epoch": 0.023, "grad_norm": 1.3046875, "grad_norm_var": 2.816125233968099, "learning_rate": 2.3000000000000003e-05, "loss": 1.2273, "loss/crossentropy": 3.1525696516036987, "loss/fcd": 0.78125, "loss/logits": 0.4460318386554718, "step": 23 }, { "epoch": 0.024, "grad_norm": 1.4140625, "grad_norm_var": 2.500584920247396, "learning_rate": 2.4e-05, "loss": 1.2879, "loss/crossentropy": 3.345754623413086, "loss/fcd": 0.810546875, "loss/logits": 0.4773627370595932, "step": 24 }, { "epoch": 0.025, "grad_norm": 1.203125, "grad_norm_var": 1.9206502278645834, "learning_rate": 2.5e-05, "loss": 1.1679, "loss/crossentropy": 3.041555643081665, "loss/fcd": 0.740234375, "loss/logits": 0.42767438292503357, "step": 25 }, { "epoch": 0.026, "grad_norm": 1.1328125, "grad_norm_var": 1.5315345764160155, "learning_rate": 2.6000000000000002e-05, "loss": 1.203, "loss/crossentropy": 3.267328143119812, "loss/fcd": 0.763671875, "loss/logits": 0.43937453627586365, "step": 26 }, { "epoch": 0.027, "grad_norm": 1.03125, "grad_norm_var": 1.007403310139974, "learning_rate": 2.7000000000000002e-05, "loss": 1.1558, "loss/crossentropy": 3.1087806224823, "loss/fcd": 0.7421875, "loss/logits": 0.41364721953868866, "step": 27 }, { "epoch": 0.028, "grad_norm": 1.0, "grad_norm_var": 0.610827382405599, "learning_rate": 2.8000000000000003e-05, "loss": 1.1024, "loss/crossentropy": 3.087596893310547, "loss/fcd": 0.71484375, "loss/logits": 0.3875562846660614, "step": 28 }, { "epoch": 0.029, "grad_norm": 0.9765625, "grad_norm_var": 0.3752838134765625, "learning_rate": 2.9e-05, "loss": 1.0592, "loss/crossentropy": 2.9776238203048706, "loss/fcd": 0.689453125, "loss/logits": 0.36973273754119873, "step": 29 }, { "epoch": 0.03, "grad_norm": 1.03125, "grad_norm_var": 0.2412750244140625, "learning_rate": 3e-05, "loss": 1.1152, "loss/crossentropy": 3.046830892562866, "loss/fcd": 0.728515625, "loss/logits": 0.38670212030410767, "step": 30 }, { "epoch": 0.031, "grad_norm": 1.046875, "grad_norm_var": 0.1573486328125, "learning_rate": 3.1e-05, "loss": 1.1022, "loss/crossentropy": 3.1873209476470947, "loss/fcd": 0.705078125, "loss/logits": 0.39707766473293304, "step": 31 }, { "epoch": 0.032, "grad_norm": 0.8984375, "grad_norm_var": 0.11625137329101562, "learning_rate": 3.2000000000000005e-05, "loss": 1.0413, "loss/crossentropy": 3.0968486070632935, "loss/fcd": 0.685546875, "loss/logits": 0.3557591587305069, "step": 32 }, { "epoch": 0.033, "grad_norm": 0.8828125, "grad_norm_var": 0.0853179931640625, "learning_rate": 3.3e-05, "loss": 1.0181, "loss/crossentropy": 3.0179919004440308, "loss/fcd": 0.671875, "loss/logits": 0.34625276923179626, "step": 33 }, { "epoch": 0.034, "grad_norm": 0.82421875, "grad_norm_var": 0.0770456314086914, "learning_rate": 3.4000000000000007e-05, "loss": 1.023, "loss/crossentropy": 2.9905418157577515, "loss/fcd": 0.669921875, "loss/logits": 0.3531098961830139, "step": 34 }, { "epoch": 0.035, "grad_norm": 0.8359375, "grad_norm_var": 0.06714064280192057, "learning_rate": 3.5e-05, "loss": 1.0004, "loss/crossentropy": 2.974391222000122, "loss/fcd": 0.666015625, "loss/logits": 0.3343943804502487, "step": 35 }, { "epoch": 0.036, "grad_norm": 0.90234375, "grad_norm_var": 0.05403340657552083, "learning_rate": 3.6e-05, "loss": 1.0024, "loss/crossentropy": 3.1857047080993652, "loss/fcd": 0.6796875, "loss/logits": 0.3227563053369522, "step": 36 }, { "epoch": 0.037, "grad_norm": 0.890625, "grad_norm_var": 0.045660146077473956, "learning_rate": 3.7e-05, "loss": 1.0341, "loss/crossentropy": 3.1201231479644775, "loss/fcd": 0.68359375, "loss/logits": 0.35050980746746063, "step": 37 }, { "epoch": 0.038, "grad_norm": 0.78125, "grad_norm_var": 0.03183797200520833, "learning_rate": 3.8e-05, "loss": 0.9339, "loss/crossentropy": 3.1853790283203125, "loss/fcd": 0.623046875, "loss/logits": 0.3108416050672531, "step": 38 }, { "epoch": 0.039, "grad_norm": 0.87109375, "grad_norm_var": 0.02653802235921224, "learning_rate": 3.9000000000000006e-05, "loss": 1.0078, "loss/crossentropy": 3.111618757247925, "loss/fcd": 0.654296875, "loss/logits": 0.35348545014858246, "step": 39 }, { "epoch": 0.04, "grad_norm": 0.83984375, "grad_norm_var": 0.014117177327473958, "learning_rate": 4e-05, "loss": 0.9597, "loss/crossentropy": 3.0982449054718018, "loss/fcd": 0.64453125, "loss/logits": 0.3151911199092865, "step": 40 }, { "epoch": 0.041, "grad_norm": 0.72265625, "grad_norm_var": 0.012123044331868489, "learning_rate": 4.1e-05, "loss": 0.8549, "loss/crossentropy": 3.0434144735336304, "loss/fcd": 0.58984375, "loss/logits": 0.2650972008705139, "step": 41 }, { "epoch": 0.042, "grad_norm": 0.78125, "grad_norm_var": 0.009719785054524739, "learning_rate": 4.2e-05, "loss": 0.8934, "loss/crossentropy": 2.959277391433716, "loss/fcd": 0.609375, "loss/logits": 0.28400175273418427, "step": 42 }, { "epoch": 0.043, "grad_norm": 0.7421875, "grad_norm_var": 0.009682146708170573, "learning_rate": 4.3e-05, "loss": 0.8814, "loss/crossentropy": 2.9370049238204956, "loss/fcd": 0.59765625, "loss/logits": 0.28376346826553345, "step": 43 }, { "epoch": 0.044, "grad_norm": 0.75390625, "grad_norm_var": 0.009421793619791667, "learning_rate": 4.4000000000000006e-05, "loss": 0.8301, "loss/crossentropy": 3.0939581394195557, "loss/fcd": 0.568359375, "loss/logits": 0.2617700919508934, "step": 44 }, { "epoch": 0.045, "grad_norm": 0.67578125, "grad_norm_var": 0.010454750061035157, "learning_rate": 4.5e-05, "loss": 0.8153, "loss/crossentropy": 3.077065944671631, "loss/fcd": 0.5625, "loss/logits": 0.2528259754180908, "step": 45 }, { "epoch": 0.046, "grad_norm": 0.71484375, "grad_norm_var": 0.008750152587890626, "learning_rate": 4.600000000000001e-05, "loss": 0.8464, "loss/crossentropy": 3.009420156478882, "loss/fcd": 0.580078125, "loss/logits": 0.26630888134241104, "step": 46 }, { "epoch": 0.047, "grad_norm": 0.703125, "grad_norm_var": 0.005863189697265625, "learning_rate": 4.7e-05, "loss": 0.79, "loss/crossentropy": 2.916282534599304, "loss/fcd": 0.541015625, "loss/logits": 0.24900944530963898, "step": 47 }, { "epoch": 0.048, "grad_norm": 0.79296875, "grad_norm_var": 0.005191993713378906, "learning_rate": 4.8e-05, "loss": 0.866, "loss/crossentropy": 2.9061275720596313, "loss/fcd": 0.591796875, "loss/logits": 0.27421246469020844, "step": 48 }, { "epoch": 0.049, "grad_norm": 0.6953125, "grad_norm_var": 0.005185890197753906, "learning_rate": 4.9e-05, "loss": 0.8133, "loss/crossentropy": 2.9824827909469604, "loss/fcd": 0.560546875, "loss/logits": 0.25276362150907516, "step": 49 }, { "epoch": 0.05, "grad_norm": 0.703125, "grad_norm_var": 0.0054361979166666664, "learning_rate": 5e-05, "loss": 0.7799, "loss/crossentropy": 2.917865753173828, "loss/fcd": 0.537109375, "loss/logits": 0.24279683083295822, "step": 50 }, { "epoch": 0.051, "grad_norm": 0.671875, "grad_norm_var": 0.005794016520182291, "learning_rate": 5.1000000000000006e-05, "loss": 0.7346, "loss/crossentropy": 2.9159669876098633, "loss/fcd": 0.5107421875, "loss/logits": 0.22386203706264496, "step": 51 }, { "epoch": 0.052, "grad_norm": 0.71484375, "grad_norm_var": 0.004561106363932292, "learning_rate": 5.2000000000000004e-05, "loss": 0.7484, "loss/crossentropy": 2.892116069793701, "loss/fcd": 0.517578125, "loss/logits": 0.2308187410235405, "step": 52 }, { "epoch": 0.053, "grad_norm": 0.6640625, "grad_norm_var": 0.0036244710286458333, "learning_rate": 5.300000000000001e-05, "loss": 0.7393, "loss/crossentropy": 2.996389389038086, "loss/fcd": 0.521484375, "loss/logits": 0.21783078461885452, "step": 53 }, { "epoch": 0.054, "grad_norm": 0.65234375, "grad_norm_var": 0.00394128163655599, "learning_rate": 5.4000000000000005e-05, "loss": 0.7563, "loss/crossentropy": 2.931565999984741, "loss/fcd": 0.53125, "loss/logits": 0.22508002817630768, "step": 54 }, { "epoch": 0.055, "grad_norm": 1.2265625, "grad_norm_var": 0.01846898396809896, "learning_rate": 5.500000000000001e-05, "loss": 0.8031, "loss/crossentropy": 3.2799376249313354, "loss/fcd": 0.5703125, "loss/logits": 0.23281589150428772, "step": 55 }, { "epoch": 0.056, "grad_norm": 0.95703125, "grad_norm_var": 0.020677693684895835, "learning_rate": 5.6000000000000006e-05, "loss": 0.7495, "loss/crossentropy": 2.97863233089447, "loss/fcd": 0.529296875, "loss/logits": 0.22019048035144806, "step": 56 }, { "epoch": 0.057, "grad_norm": 0.71484375, "grad_norm_var": 0.02072118123372396, "learning_rate": 5.6999999999999996e-05, "loss": 0.6926, "loss/crossentropy": 2.888186454772949, "loss/fcd": 0.48828125, "loss/logits": 0.20430795848369598, "step": 57 }, { "epoch": 0.058, "grad_norm": 0.63671875, "grad_norm_var": 0.021622149149576823, "learning_rate": 5.8e-05, "loss": 0.6907, "loss/crossentropy": 3.028933882713318, "loss/fcd": 0.4921875, "loss/logits": 0.1985393539071083, "step": 58 }, { "epoch": 0.059, "grad_norm": 0.640625, "grad_norm_var": 0.02238915761311849, "learning_rate": 5.9e-05, "loss": 0.6926, "loss/crossentropy": 2.909870743751526, "loss/fcd": 0.494140625, "loss/logits": 0.19847750663757324, "step": 59 }, { "epoch": 0.06, "grad_norm": 0.76953125, "grad_norm_var": 0.022423235575358073, "learning_rate": 6e-05, "loss": 0.7102, "loss/crossentropy": 3.0572707653045654, "loss/fcd": 0.4931640625, "loss/logits": 0.21706663817167282, "step": 60 }, { "epoch": 0.061, "grad_norm": 0.7265625, "grad_norm_var": 0.0221099853515625, "learning_rate": 6.1e-05, "loss": 0.7628, "loss/crossentropy": 3.054797649383545, "loss/fcd": 0.53515625, "loss/logits": 0.22761277854442596, "step": 61 }, { "epoch": 0.062, "grad_norm": 0.83203125, "grad_norm_var": 0.022434234619140625, "learning_rate": 6.2e-05, "loss": 0.7159, "loss/crossentropy": 2.89387047290802, "loss/fcd": 0.501953125, "loss/logits": 0.21391130983829498, "step": 62 }, { "epoch": 0.063, "grad_norm": 0.69140625, "grad_norm_var": 0.022525978088378907, "learning_rate": 6.3e-05, "loss": 0.7102, "loss/crossentropy": 3.0660817623138428, "loss/fcd": 0.50390625, "loss/logits": 0.20626945048570633, "step": 63 }, { "epoch": 0.064, "grad_norm": 0.71875, "grad_norm_var": 0.0225006103515625, "learning_rate": 6.400000000000001e-05, "loss": 0.6924, "loss/crossentropy": 3.02998948097229, "loss/fcd": 0.494140625, "loss/logits": 0.19828403741121292, "step": 64 }, { "epoch": 0.065, "grad_norm": 0.79296875, "grad_norm_var": 0.022371864318847655, "learning_rate": 6.500000000000001e-05, "loss": 0.6387, "loss/crossentropy": 2.9985952377319336, "loss/fcd": 0.45703125, "loss/logits": 0.18168944120407104, "step": 65 }, { "epoch": 0.066, "grad_norm": 0.6484375, "grad_norm_var": 0.02295220692952474, "learning_rate": 6.6e-05, "loss": 0.6224, "loss/crossentropy": 2.8418047428131104, "loss/fcd": 0.455078125, "loss/logits": 0.1672864779829979, "step": 66 }, { "epoch": 0.067, "grad_norm": 0.66015625, "grad_norm_var": 0.023088582356770835, "learning_rate": 6.7e-05, "loss": 0.6538, "loss/crossentropy": 3.1374388933181763, "loss/fcd": 0.4736328125, "loss/logits": 0.18021433055400848, "step": 67 }, { "epoch": 0.068, "grad_norm": 0.6171875, "grad_norm_var": 0.02418053944905599, "learning_rate": 6.800000000000001e-05, "loss": 0.6242, "loss/crossentropy": 2.95910382270813, "loss/fcd": 0.455078125, "loss/logits": 0.1690843105316162, "step": 68 }, { "epoch": 0.069, "grad_norm": 0.71875, "grad_norm_var": 0.02376397450764974, "learning_rate": 6.9e-05, "loss": 0.6089, "loss/crossentropy": 2.8932254314422607, "loss/fcd": 0.4453125, "loss/logits": 0.16358838975429535, "step": 69 }, { "epoch": 0.07, "grad_norm": 0.6875, "grad_norm_var": 0.02338231404622396, "learning_rate": 7e-05, "loss": 0.6128, "loss/crossentropy": 2.9293062686920166, "loss/fcd": 0.4443359375, "loss/logits": 0.16845793277025223, "step": 70 }, { "epoch": 0.071, "grad_norm": 0.7265625, "grad_norm_var": 0.007399241129557292, "learning_rate": 7.1e-05, "loss": 0.6994, "loss/crossentropy": 3.072464942932129, "loss/fcd": 0.501953125, "loss/logits": 0.19741064310073853, "step": 71 }, { "epoch": 0.072, "grad_norm": 0.61328125, "grad_norm_var": 0.0039751688639322914, "learning_rate": 7.2e-05, "loss": 0.6146, "loss/crossentropy": 2.8726388216018677, "loss/fcd": 0.44921875, "loss/logits": 0.16534685343503952, "step": 72 }, { "epoch": 0.073, "grad_norm": 0.703125, "grad_norm_var": 0.0039601008097330725, "learning_rate": 7.3e-05, "loss": 0.585, "loss/crossentropy": 2.7776870727539062, "loss/fcd": 0.427734375, "loss/logits": 0.1572359874844551, "step": 73 }, { "epoch": 0.074, "grad_norm": 0.66015625, "grad_norm_var": 0.003799883524576823, "learning_rate": 7.4e-05, "loss": 0.668, "loss/crossentropy": 3.068588614463806, "loss/fcd": 0.4814453125, "loss/logits": 0.18657152354717255, "step": 74 }, { "epoch": 0.075, "grad_norm": 0.625, "grad_norm_var": 0.0039397557576497395, "learning_rate": 7.500000000000001e-05, "loss": 0.5844, "loss/crossentropy": 2.985379934310913, "loss/fcd": 0.4296875, "loss/logits": 0.1547524631023407, "step": 75 }, { "epoch": 0.076, "grad_norm": 0.6484375, "grad_norm_var": 0.0037249247233072918, "learning_rate": 7.6e-05, "loss": 0.5738, "loss/crossentropy": 2.9654282331466675, "loss/fcd": 0.419921875, "loss/logits": 0.1539217308163643, "step": 76 }, { "epoch": 0.077, "grad_norm": 0.59375, "grad_norm_var": 0.004213460286458333, "learning_rate": 7.7e-05, "loss": 0.5822, "loss/crossentropy": 3.0354888439178467, "loss/fcd": 0.4189453125, "loss/logits": 0.16322524845600128, "step": 77 }, { "epoch": 0.078, "grad_norm": 0.65625, "grad_norm_var": 0.0026656468709309895, "learning_rate": 7.800000000000001e-05, "loss": 0.6238, "loss/crossentropy": 2.976304769515991, "loss/fcd": 0.447265625, "loss/logits": 0.17654582858085632, "step": 78 }, { "epoch": 0.079, "grad_norm": 0.5859375, "grad_norm_var": 0.0030965169270833334, "learning_rate": 7.900000000000001e-05, "loss": 0.5371, "loss/crossentropy": 2.884138584136963, "loss/fcd": 0.3974609375, "loss/logits": 0.13963794708251953, "step": 79 }, { "epoch": 0.08, "grad_norm": 0.6015625, "grad_norm_var": 0.0031308492024739584, "learning_rate": 8e-05, "loss": 0.5553, "loss/crossentropy": 2.6886497735977173, "loss/fcd": 0.4140625, "loss/logits": 0.14122631400823593, "step": 80 }, { "epoch": 0.081, "grad_norm": 0.6015625, "grad_norm_var": 0.001993751525878906, "learning_rate": 8.1e-05, "loss": 0.5781, "loss/crossentropy": 2.8824827671051025, "loss/fcd": 0.4267578125, "loss/logits": 0.1513620838522911, "step": 81 }, { "epoch": 0.082, "grad_norm": 0.609375, "grad_norm_var": 0.002080217997233073, "learning_rate": 8.2e-05, "loss": 0.5702, "loss/crossentropy": 2.934627056121826, "loss/fcd": 0.419921875, "loss/logits": 0.15031378716230392, "step": 82 }, { "epoch": 0.083, "grad_norm": 0.68359375, "grad_norm_var": 0.002164141337076823, "learning_rate": 8.3e-05, "loss": 0.592, "loss/crossentropy": 3.0252050161361694, "loss/fcd": 0.4345703125, "loss/logits": 0.157474547624588, "step": 83 }, { "epoch": 0.084, "grad_norm": 0.7890625, "grad_norm_var": 0.003355852762858073, "learning_rate": 8.4e-05, "loss": 0.5616, "loss/crossentropy": 3.0032228231430054, "loss/fcd": 0.41796875, "loss/logits": 0.1436692625284195, "step": 84 }, { "epoch": 0.085, "grad_norm": 0.70703125, "grad_norm_var": 0.0032671610514322918, "learning_rate": 8.5e-05, "loss": 0.5471, "loss/crossentropy": 2.9243968725204468, "loss/fcd": 0.4033203125, "loss/logits": 0.14377443492412567, "step": 85 }, { "epoch": 0.086, "grad_norm": 0.6640625, "grad_norm_var": 0.0032023111979166668, "learning_rate": 8.6e-05, "loss": 0.5416, "loss/crossentropy": 2.8646360635757446, "loss/fcd": 0.4033203125, "loss/logits": 0.13825497776269913, "step": 86 }, { "epoch": 0.087, "grad_norm": 0.609375, "grad_norm_var": 0.0029314676920572918, "learning_rate": 8.7e-05, "loss": 0.5441, "loss/crossentropy": 2.7806389331817627, "loss/fcd": 0.4052734375, "loss/logits": 0.13886360824108124, "step": 87 }, { "epoch": 0.088, "grad_norm": 0.69140625, "grad_norm_var": 0.0029619852701822918, "learning_rate": 8.800000000000001e-05, "loss": 0.5696, "loss/crossentropy": 2.923311948776245, "loss/fcd": 0.4189453125, "loss/logits": 0.15060903877019882, "step": 88 }, { "epoch": 0.089, "grad_norm": 0.65234375, "grad_norm_var": 0.002776018778483073, "learning_rate": 8.900000000000001e-05, "loss": 0.5715, "loss/crossentropy": 2.9447062015533447, "loss/fcd": 0.4169921875, "loss/logits": 0.15446894615888596, "step": 89 }, { "epoch": 0.09, "grad_norm": 0.6171875, "grad_norm_var": 0.0028256734212239584, "learning_rate": 9e-05, "loss": 0.5464, "loss/crossentropy": 2.8143328428268433, "loss/fcd": 0.4033203125, "loss/logits": 0.14312554895877838, "step": 90 }, { "epoch": 0.091, "grad_norm": 0.6484375, "grad_norm_var": 0.0027943929036458332, "learning_rate": 9.1e-05, "loss": 0.4956, "loss/crossentropy": 2.8094122409820557, "loss/fcd": 0.375, "loss/logits": 0.12059126049280167, "step": 91 }, { "epoch": 0.092, "grad_norm": 0.671875, "grad_norm_var": 0.0028317769368489585, "learning_rate": 9.200000000000001e-05, "loss": 0.56, "loss/crossentropy": 2.8966939449310303, "loss/fcd": 0.4169921875, "loss/logits": 0.14303536713123322, "step": 92 }, { "epoch": 0.093, "grad_norm": 0.6796875, "grad_norm_var": 0.0026611328125, "learning_rate": 9.300000000000001e-05, "loss": 0.4895, "loss/crossentropy": 2.8516520261764526, "loss/fcd": 0.3662109375, "loss/logits": 0.12323926761746407, "step": 93 }, { "epoch": 0.094, "grad_norm": 0.8046875, "grad_norm_var": 0.004076894124348958, "learning_rate": 9.4e-05, "loss": 0.6001, "loss/crossentropy": 2.906438112258911, "loss/fcd": 0.4208984375, "loss/logits": 0.1791856661438942, "step": 94 }, { "epoch": 0.095, "grad_norm": 0.64453125, "grad_norm_var": 0.0036849339803059896, "learning_rate": 9.5e-05, "loss": 0.4952, "loss/crossentropy": 2.7804044485092163, "loss/fcd": 0.37109375, "loss/logits": 0.1240886002779007, "step": 95 }, { "epoch": 0.096, "grad_norm": 0.81640625, "grad_norm_var": 0.0046885172526041664, "learning_rate": 9.6e-05, "loss": 0.5221, "loss/crossentropy": 2.7704015970230103, "loss/fcd": 0.3837890625, "loss/logits": 0.13829708099365234, "step": 96 }, { "epoch": 0.097, "grad_norm": 0.66796875, "grad_norm_var": 0.004263750712076823, "learning_rate": 9.7e-05, "loss": 0.4998, "loss/crossentropy": 3.006615400314331, "loss/fcd": 0.375, "loss/logits": 0.12483775615692139, "step": 97 }, { "epoch": 0.098, "grad_norm": 0.99609375, "grad_norm_var": 0.009720865885416667, "learning_rate": 9.8e-05, "loss": 0.5155, "loss/crossentropy": 2.900958776473999, "loss/fcd": 0.3837890625, "loss/logits": 0.13175079226493835, "step": 98 }, { "epoch": 0.099, "grad_norm": 1.03125, "grad_norm_var": 0.01609795888264974, "learning_rate": 9.900000000000001e-05, "loss": 0.5303, "loss/crossentropy": 2.6889891624450684, "loss/fcd": 0.3896484375, "loss/logits": 0.14069970324635506, "step": 99 }, { "epoch": 0.1, "grad_norm": 0.703125, "grad_norm_var": 0.01589094797770182, "learning_rate": 0.0001, "loss": 0.5076, "loss/crossentropy": 2.856358528137207, "loss/fcd": 0.3857421875, "loss/logits": 0.1218658983707428, "step": 100 }, { "epoch": 0.101, "grad_norm": 1.2109375, "grad_norm_var": 0.030530802408854165, "learning_rate": 0.0001, "loss": 0.6061, "loss/crossentropy": 2.836049795150757, "loss/fcd": 0.3857421875, "loss/logits": 0.22031500563025475, "step": 101 }, { "epoch": 0.102, "grad_norm": 0.7890625, "grad_norm_var": 0.029961140950520833, "learning_rate": 0.0001, "loss": 0.5018, "loss/crossentropy": 2.9406790733337402, "loss/fcd": 0.375, "loss/logits": 0.12680745497345924, "step": 102 }, { "epoch": 0.103, "grad_norm": 0.6328125, "grad_norm_var": 0.02951024373372396, "learning_rate": 0.0001, "loss": 0.4978, "loss/crossentropy": 2.9559073448181152, "loss/fcd": 0.369140625, "loss/logits": 0.12867781147360802, "step": 103 }, { "epoch": 0.104, "grad_norm": 0.51953125, "grad_norm_var": 0.03306859334309896, "learning_rate": 0.0001, "loss": 0.4568, "loss/crossentropy": 2.858797073364258, "loss/fcd": 0.3525390625, "loss/logits": 0.10423312336206436, "step": 104 }, { "epoch": 0.105, "grad_norm": 0.59375, "grad_norm_var": 0.03408807118733724, "learning_rate": 0.0001, "loss": 0.5192, "loss/crossentropy": 2.960332751274109, "loss/fcd": 0.3876953125, "loss/logits": 0.13145895302295685, "step": 105 }, { "epoch": 0.106, "grad_norm": 0.6796875, "grad_norm_var": 0.03321119944254557, "learning_rate": 0.0001, "loss": 0.5741, "loss/crossentropy": 2.82626473903656, "loss/fcd": 0.427734375, "loss/logits": 0.14631643146276474, "step": 106 }, { "epoch": 0.107, "grad_norm": 0.60546875, "grad_norm_var": 0.033940633138020836, "learning_rate": 0.0001, "loss": 0.4548, "loss/crossentropy": 2.8321757316589355, "loss/fcd": 0.3486328125, "loss/logits": 0.10612062737345695, "step": 107 }, { "epoch": 0.108, "grad_norm": 0.6796875, "grad_norm_var": 0.03386001586914063, "learning_rate": 0.0001, "loss": 0.4943, "loss/crossentropy": 2.782582402229309, "loss/fcd": 0.3681640625, "loss/logits": 0.12610362470149994, "step": 108 }, { "epoch": 0.109, "grad_norm": 0.609375, "grad_norm_var": 0.0348602294921875, "learning_rate": 0.0001, "loss": 0.4827, "loss/crossentropy": 2.8280365467071533, "loss/fcd": 0.359375, "loss/logits": 0.1233636736869812, "step": 109 }, { "epoch": 0.11, "grad_norm": 0.6015625, "grad_norm_var": 0.035931396484375, "learning_rate": 0.0001, "loss": 0.5, "loss/crossentropy": 2.964377284049988, "loss/fcd": 0.3681640625, "loss/logits": 0.13179466873407364, "step": 110 }, { "epoch": 0.111, "grad_norm": 0.69140625, "grad_norm_var": 0.0354949951171875, "learning_rate": 0.0001, "loss": 0.4909, "loss/crossentropy": 2.8390966653823853, "loss/fcd": 0.3662109375, "loss/logits": 0.12465999647974968, "step": 111 }, { "epoch": 0.112, "grad_norm": 0.7578125, "grad_norm_var": 0.035106849670410153, "learning_rate": 0.0001, "loss": 0.527, "loss/crossentropy": 2.828911542892456, "loss/fcd": 0.392578125, "loss/logits": 0.13445420563220978, "step": 112 }, { "epoch": 0.113, "grad_norm": 0.58984375, "grad_norm_var": 0.036192766825358075, "learning_rate": 0.0001, "loss": 0.4785, "loss/crossentropy": 2.9939907789230347, "loss/fcd": 0.3671875, "loss/logits": 0.11130134016275406, "step": 113 }, { "epoch": 0.114, "grad_norm": 0.578125, "grad_norm_var": 0.032321929931640625, "learning_rate": 0.0001, "loss": 0.4889, "loss/crossentropy": 2.901363730430603, "loss/fcd": 0.3671875, "loss/logits": 0.12166618555784225, "step": 114 }, { "epoch": 0.115, "grad_norm": 0.578125, "grad_norm_var": 0.025418853759765624, "learning_rate": 0.0001, "loss": 0.5046, "loss/crossentropy": 2.9244019985198975, "loss/fcd": 0.3759765625, "loss/logits": 0.1286001205444336, "step": 115 }, { "epoch": 0.116, "grad_norm": 0.55859375, "grad_norm_var": 0.02620690663655599, "learning_rate": 0.0001, "loss": 0.4725, "loss/crossentropy": 2.8373483419418335, "loss/fcd": 0.3564453125, "loss/logits": 0.11607612296938896, "step": 116 }, { "epoch": 0.117, "grad_norm": 0.5859375, "grad_norm_var": 0.005312538146972657, "learning_rate": 0.0001, "loss": 0.4802, "loss/crossentropy": 2.862138271331787, "loss/fcd": 0.3583984375, "loss/logits": 0.1218356192111969, "step": 117 }, { "epoch": 0.118, "grad_norm": 0.5625, "grad_norm_var": 0.0036605199178059897, "learning_rate": 0.0001, "loss": 0.4794, "loss/crossentropy": 2.8210216760635376, "loss/fcd": 0.36328125, "loss/logits": 0.1161160059273243, "step": 118 }, { "epoch": 0.119, "grad_norm": 0.54296875, "grad_norm_var": 0.0039398193359375, "learning_rate": 0.0001, "loss": 0.4864, "loss/crossentropy": 2.880854606628418, "loss/fcd": 0.365234375, "loss/logits": 0.12117999419569969, "step": 119 }, { "epoch": 0.12, "grad_norm": 0.55078125, "grad_norm_var": 0.0036305745442708334, "learning_rate": 0.0001, "loss": 0.4729, "loss/crossentropy": 2.908434748649597, "loss/fcd": 0.3564453125, "loss/logits": 0.11646857485175133, "step": 120 }, { "epoch": 0.121, "grad_norm": 0.5390625, "grad_norm_var": 0.003938547770182292, "learning_rate": 0.0001, "loss": 0.4538, "loss/crossentropy": 2.8156157732009888, "loss/fcd": 0.345703125, "loss/logits": 0.108062494546175, "step": 121 }, { "epoch": 0.122, "grad_norm": 0.51953125, "grad_norm_var": 0.003988075256347656, "learning_rate": 0.0001, "loss": 0.4501, "loss/crossentropy": 2.8814542293548584, "loss/fcd": 0.33984375, "loss/logits": 0.11024175211787224, "step": 122 }, { "epoch": 0.123, "grad_norm": 0.5390625, "grad_norm_var": 0.004188028971354166, "learning_rate": 0.0001, "loss": 0.4605, "loss/crossentropy": 2.794196367263794, "loss/fcd": 0.3544921875, "loss/logits": 0.10602544993162155, "step": 123 }, { "epoch": 0.124, "grad_norm": 0.56640625, "grad_norm_var": 0.0036773045857747396, "learning_rate": 0.0001, "loss": 0.4203, "loss/crossentropy": 2.7264811992645264, "loss/fcd": 0.3232421875, "loss/logits": 0.09708360582590103, "step": 124 }, { "epoch": 0.125, "grad_norm": 0.51171875, "grad_norm_var": 0.003964996337890625, "learning_rate": 0.0001, "loss": 0.4584, "loss/crossentropy": 2.739405870437622, "loss/fcd": 0.3515625, "loss/logits": 0.10686014965176582, "step": 125 }, { "epoch": 0.126, "grad_norm": 0.546875, "grad_norm_var": 0.00399169921875, "learning_rate": 0.0001, "loss": 0.4395, "loss/crossentropy": 2.9202369451522827, "loss/fcd": 0.3359375, "loss/logits": 0.10352493822574615, "step": 126 }, { "epoch": 0.127, "grad_norm": 0.6796875, "grad_norm_var": 0.0038202285766601564, "learning_rate": 0.0001, "loss": 0.4886, "loss/crossentropy": 2.872094511985779, "loss/fcd": 0.3642578125, "loss/logits": 0.12433834001421928, "step": 127 }, { "epoch": 0.128, "grad_norm": 0.5859375, "grad_norm_var": 0.0014871597290039063, "learning_rate": 0.0001, "loss": 0.4743, "loss/crossentropy": 3.0496490001678467, "loss/fcd": 0.3583984375, "loss/logits": 0.11591921001672745, "step": 128 }, { "epoch": 0.129, "grad_norm": 0.53515625, "grad_norm_var": 0.0014907201131184897, "learning_rate": 0.0001, "loss": 0.4772, "loss/crossentropy": 3.0315643548965454, "loss/fcd": 0.3603515625, "loss/logits": 0.11689409613609314, "step": 129 }, { "epoch": 0.13, "grad_norm": 0.625, "grad_norm_var": 0.0017333348592122396, "learning_rate": 0.0001, "loss": 0.4397, "loss/crossentropy": 2.784233808517456, "loss/fcd": 0.33984375, "loss/logits": 0.0998244546353817, "step": 130 }, { "epoch": 0.131, "grad_norm": 0.59765625, "grad_norm_var": 0.0017934163411458333, "learning_rate": 0.0001, "loss": 0.4349, "loss/crossentropy": 2.7653247117996216, "loss/fcd": 0.3310546875, "loss/logits": 0.10387945547699928, "step": 131 }, { "epoch": 0.132, "grad_norm": 0.61328125, "grad_norm_var": 0.0019304911295572917, "learning_rate": 0.0001, "loss": 0.4506, "loss/crossentropy": 2.861555814743042, "loss/fcd": 0.3408203125, "loss/logits": 0.10979178920388222, "step": 132 }, { "epoch": 0.133, "grad_norm": 0.609375, "grad_norm_var": 0.002018229166666667, "learning_rate": 0.0001, "loss": 0.451, "loss/crossentropy": 2.8154799938201904, "loss/fcd": 0.34375, "loss/logits": 0.10724714770913124, "step": 133 }, { "epoch": 0.134, "grad_norm": 0.67578125, "grad_norm_var": 0.0027022679646809896, "learning_rate": 0.0001, "loss": 0.4704, "loss/crossentropy": 2.9504034519195557, "loss/fcd": 0.3525390625, "loss/logits": 0.11785488575696945, "step": 134 }, { "epoch": 0.135, "grad_norm": 0.6328125, "grad_norm_var": 0.0027943929036458332, "learning_rate": 0.0001, "loss": 0.4539, "loss/crossentropy": 2.909873127937317, "loss/fcd": 0.345703125, "loss/logits": 0.1082424707710743, "step": 135 }, { "epoch": 0.136, "grad_norm": 0.921875, "grad_norm_var": 0.00980676015218099, "learning_rate": 0.0001, "loss": 0.4273, "loss/crossentropy": 2.8628796339035034, "loss/fcd": 0.3251953125, "loss/logits": 0.10214787721633911, "step": 136 }, { "epoch": 0.137, "grad_norm": 0.50390625, "grad_norm_var": 0.010198720296223958, "learning_rate": 0.0001, "loss": 0.4094, "loss/crossentropy": 2.8360742330551147, "loss/fcd": 0.314453125, "loss/logits": 0.0949702151119709, "step": 137 }, { "epoch": 0.138, "grad_norm": 0.58203125, "grad_norm_var": 0.009738922119140625, "learning_rate": 0.0001, "loss": 0.4563, "loss/crossentropy": 2.9145069122314453, "loss/fcd": 0.3427734375, "loss/logits": 0.1135670654475689, "step": 138 }, { "epoch": 0.139, "grad_norm": 0.6015625, "grad_norm_var": 0.009409332275390625, "learning_rate": 0.0001, "loss": 0.4629, "loss/crossentropy": 2.915167808532715, "loss/fcd": 0.357421875, "loss/logits": 0.10544831678271294, "step": 139 }, { "epoch": 0.14, "grad_norm": 0.5625, "grad_norm_var": 0.009433937072753907, "learning_rate": 0.0001, "loss": 0.433, "loss/crossentropy": 2.8979417085647583, "loss/fcd": 0.330078125, "loss/logits": 0.10290464386343956, "step": 140 }, { "epoch": 0.141, "grad_norm": 0.7890625, "grad_norm_var": 0.010548909505208334, "learning_rate": 0.0001, "loss": 0.5202, "loss/crossentropy": 2.953768253326416, "loss/fcd": 0.3828125, "loss/logits": 0.13743700832128525, "step": 141 }, { "epoch": 0.142, "grad_norm": 0.890625, "grad_norm_var": 0.014174397786458333, "learning_rate": 0.0001, "loss": 0.5144, "loss/crossentropy": 2.8743395805358887, "loss/fcd": 0.376953125, "loss/logits": 0.13742298260331154, "step": 142 }, { "epoch": 0.143, "grad_norm": 0.6484375, "grad_norm_var": 0.014113362630208333, "learning_rate": 0.0001, "loss": 0.4276, "loss/crossentropy": 2.8286346197128296, "loss/fcd": 0.328125, "loss/logits": 0.09949736669659615, "step": 143 }, { "epoch": 0.144, "grad_norm": 0.79296875, "grad_norm_var": 0.015066973368326823, "learning_rate": 0.0001, "loss": 0.4918, "loss/crossentropy": 2.908802032470703, "loss/fcd": 0.3671875, "loss/logits": 0.12464811280369759, "step": 144 }, { "epoch": 0.145, "grad_norm": 1.0625, "grad_norm_var": 0.023572794596354165, "learning_rate": 0.0001, "loss": 0.4983, "loss/crossentropy": 3.0399140119552612, "loss/fcd": 0.3662109375, "loss/logits": 0.13205987215042114, "step": 145 }, { "epoch": 0.146, "grad_norm": 0.85546875, "grad_norm_var": 0.02476189931233724, "learning_rate": 0.0001, "loss": 0.4498, "loss/crossentropy": 2.7846896648406982, "loss/fcd": 0.3447265625, "loss/logits": 0.10509144142270088, "step": 146 }, { "epoch": 0.147, "grad_norm": 0.62109375, "grad_norm_var": 0.02444909413655599, "learning_rate": 0.0001, "loss": 0.4078, "loss/crossentropy": 2.7551146745681763, "loss/fcd": 0.3203125, "loss/logits": 0.0875249132514, "step": 147 }, { "epoch": 0.148, "grad_norm": 0.62890625, "grad_norm_var": 0.024262428283691406, "learning_rate": 0.0001, "loss": 0.4494, "loss/crossentropy": 3.0209089517593384, "loss/fcd": 0.3427734375, "loss/logits": 0.10662630200386047, "step": 148 }, { "epoch": 0.149, "grad_norm": 0.703125, "grad_norm_var": 0.023539161682128905, "learning_rate": 0.0001, "loss": 0.4343, "loss/crossentropy": 2.762128233909607, "loss/fcd": 0.330078125, "loss/logits": 0.10424429923295975, "step": 149 }, { "epoch": 0.15, "grad_norm": 0.671875, "grad_norm_var": 0.023561604817708335, "learning_rate": 0.0001, "loss": 0.4631, "loss/crossentropy": 2.9137667417526245, "loss/fcd": 0.3466796875, "loss/logits": 0.11645066738128662, "step": 150 }, { "epoch": 0.151, "grad_norm": 0.5390625, "grad_norm_var": 0.025160725911458334, "learning_rate": 0.0001, "loss": 0.4427, "loss/crossentropy": 2.881830334663391, "loss/fcd": 0.3369140625, "loss/logits": 0.10573653504252434, "step": 151 }, { "epoch": 0.152, "grad_norm": 0.63671875, "grad_norm_var": 0.02222283681233724, "learning_rate": 0.0001, "loss": 0.5156, "loss/crossentropy": 2.850717067718506, "loss/fcd": 0.3798828125, "loss/logits": 0.1356773041188717, "step": 152 }, { "epoch": 0.153, "grad_norm": 0.55859375, "grad_norm_var": 0.021030108133951824, "learning_rate": 0.0001, "loss": 0.4536, "loss/crossentropy": 2.8608055114746094, "loss/fcd": 0.3447265625, "loss/logits": 0.10892104357481003, "step": 153 }, { "epoch": 0.154, "grad_norm": 0.55078125, "grad_norm_var": 0.02156823476155599, "learning_rate": 0.0001, "loss": 0.4315, "loss/crossentropy": 2.800734758377075, "loss/fcd": 0.3310546875, "loss/logits": 0.1004045195877552, "step": 154 }, { "epoch": 0.155, "grad_norm": 0.58984375, "grad_norm_var": 0.021722157796223957, "learning_rate": 0.0001, "loss": 0.4312, "loss/crossentropy": 2.7723549604415894, "loss/fcd": 0.3369140625, "loss/logits": 0.09424581006169319, "step": 155 }, { "epoch": 0.156, "grad_norm": 0.6015625, "grad_norm_var": 0.0211334228515625, "learning_rate": 0.0001, "loss": 0.4285, "loss/crossentropy": 2.7112516164779663, "loss/fcd": 0.333984375, "loss/logits": 0.09450496360659599, "step": 156 }, { "epoch": 0.157, "grad_norm": 0.55078125, "grad_norm_var": 0.02173455556233724, "learning_rate": 0.0001, "loss": 0.405, "loss/crossentropy": 2.8753061294555664, "loss/fcd": 0.30859375, "loss/logits": 0.0963931567966938, "step": 157 }, { "epoch": 0.158, "grad_norm": 0.59765625, "grad_norm_var": 0.018925984700520832, "learning_rate": 0.0001, "loss": 0.4793, "loss/crossentropy": 2.859338641166687, "loss/fcd": 0.3623046875, "loss/logits": 0.1170136034488678, "step": 158 }, { "epoch": 0.159, "grad_norm": 0.59765625, "grad_norm_var": 0.01918633778889974, "learning_rate": 0.0001, "loss": 0.4427, "loss/crossentropy": 2.9007763862609863, "loss/fcd": 0.33984375, "loss/logits": 0.10285023972392082, "step": 159 }, { "epoch": 0.16, "grad_norm": 0.54296875, "grad_norm_var": 0.018657366434733074, "learning_rate": 0.0001, "loss": 0.4145, "loss/crossentropy": 2.764789581298828, "loss/fcd": 0.3193359375, "loss/logits": 0.09517998620867729, "step": 160 }, { "epoch": 0.161, "grad_norm": 0.55859375, "grad_norm_var": 0.006428782145182292, "learning_rate": 0.0001, "loss": 0.4301, "loss/crossentropy": 2.9352102279663086, "loss/fcd": 0.3291015625, "loss/logits": 0.10102442279458046, "step": 161 }, { "epoch": 0.162, "grad_norm": 0.7109375, "grad_norm_var": 0.0030577977498372397, "learning_rate": 0.0001, "loss": 0.4884, "loss/crossentropy": 2.8804049491882324, "loss/fcd": 0.3720703125, "loss/logits": 0.11631228029727936, "step": 162 }, { "epoch": 0.163, "grad_norm": 0.59765625, "grad_norm_var": 0.0030379613240559896, "learning_rate": 0.0001, "loss": 0.4418, "loss/crossentropy": 2.9045521020889282, "loss/fcd": 0.3369140625, "loss/logits": 0.10484013333916664, "step": 163 }, { "epoch": 0.164, "grad_norm": 0.51953125, "grad_norm_var": 0.003397560119628906, "learning_rate": 0.0001, "loss": 0.4228, "loss/crossentropy": 2.84330677986145, "loss/fcd": 0.3232421875, "loss/logits": 0.09951141849160194, "step": 164 }, { "epoch": 0.165, "grad_norm": 0.6171875, "grad_norm_var": 0.0026254653930664062, "learning_rate": 0.0001, "loss": 0.4573, "loss/crossentropy": 2.9289658069610596, "loss/fcd": 0.34765625, "loss/logits": 0.1096344031393528, "step": 165 }, { "epoch": 0.166, "grad_norm": 0.62109375, "grad_norm_var": 0.0022328694661458335, "learning_rate": 0.0001, "loss": 0.4325, "loss/crossentropy": 2.9142534732818604, "loss/fcd": 0.3359375, "loss/logits": 0.09654596820473671, "step": 166 }, { "epoch": 0.167, "grad_norm": 0.59765625, "grad_norm_var": 0.0020736058553059897, "learning_rate": 0.0001, "loss": 0.4141, "loss/crossentropy": 2.722415328025818, "loss/fcd": 0.318359375, "loss/logits": 0.09574693441390991, "step": 167 }, { "epoch": 0.168, "grad_norm": 0.48046875, "grad_norm_var": 0.0026381810506184897, "learning_rate": 0.0001, "loss": 0.4442, "loss/crossentropy": 2.9160542488098145, "loss/fcd": 0.337890625, "loss/logits": 0.10633675754070282, "step": 168 }, { "epoch": 0.169, "grad_norm": 0.53125, "grad_norm_var": 0.002765909830729167, "learning_rate": 0.0001, "loss": 0.437, "loss/crossentropy": 2.8252073526382446, "loss/fcd": 0.333984375, "loss/logits": 0.10301430523395538, "step": 169 }, { "epoch": 0.17, "grad_norm": 0.474609375, "grad_norm_var": 0.0034161726633707683, "learning_rate": 0.0001, "loss": 0.4145, "loss/crossentropy": 2.8485448360443115, "loss/fcd": 0.3203125, "loss/logits": 0.09413985908031464, "step": 170 }, { "epoch": 0.171, "grad_norm": 0.6953125, "grad_norm_var": 0.004329411188761393, "learning_rate": 0.0001, "loss": 0.4442, "loss/crossentropy": 2.7965201139450073, "loss/fcd": 0.3427734375, "loss/logits": 0.1014043353497982, "step": 171 }, { "epoch": 0.172, "grad_norm": 0.6328125, "grad_norm_var": 0.004476404190063477, "learning_rate": 0.0001, "loss": 0.4499, "loss/crossentropy": 2.9560967683792114, "loss/fcd": 0.3447265625, "loss/logits": 0.1051417626440525, "step": 172 }, { "epoch": 0.173, "grad_norm": 0.64453125, "grad_norm_var": 0.004624414443969727, "learning_rate": 0.0001, "loss": 0.4659, "loss/crossentropy": 2.8695074319839478, "loss/fcd": 0.35546875, "loss/logits": 0.11045186966657639, "step": 173 }, { "epoch": 0.174, "grad_norm": 0.515625, "grad_norm_var": 0.004947519302368164, "learning_rate": 0.0001, "loss": 0.3876, "loss/crossentropy": 2.6555715799331665, "loss/fcd": 0.3056640625, "loss/logits": 0.08189895376563072, "step": 174 }, { "epoch": 0.175, "grad_norm": 0.63671875, "grad_norm_var": 0.005116001764933268, "learning_rate": 0.0001, "loss": 0.4016, "loss/crossentropy": 2.8286584615707397, "loss/fcd": 0.310546875, "loss/logits": 0.09104615077376366, "step": 175 }, { "epoch": 0.176, "grad_norm": 0.5390625, "grad_norm_var": 0.005139398574829102, "learning_rate": 0.0001, "loss": 0.4242, "loss/crossentropy": 2.8632349967956543, "loss/fcd": 0.3232421875, "loss/logits": 0.10094088688492775, "step": 176 }, { "epoch": 0.177, "grad_norm": 0.55859375, "grad_norm_var": 0.005139398574829102, "learning_rate": 0.0001, "loss": 0.4185, "loss/crossentropy": 2.834444522857666, "loss/fcd": 0.322265625, "loss/logits": 0.09620294347405434, "step": 177 }, { "epoch": 0.178, "grad_norm": 0.5625, "grad_norm_var": 0.004040129979451497, "learning_rate": 0.0001, "loss": 0.3848, "loss/crossentropy": 2.7522603273391724, "loss/fcd": 0.3017578125, "loss/logits": 0.08301593363285065, "step": 178 }, { "epoch": 0.179, "grad_norm": 0.5625, "grad_norm_var": 0.004018386205037435, "learning_rate": 0.0001, "loss": 0.4228, "loss/crossentropy": 2.8634541034698486, "loss/fcd": 0.3232421875, "loss/logits": 0.09959971159696579, "step": 179 }, { "epoch": 0.18, "grad_norm": 0.52734375, "grad_norm_var": 0.003965107599894205, "learning_rate": 0.0001, "loss": 0.4212, "loss/crossentropy": 2.8671987056732178, "loss/fcd": 0.32421875, "loss/logits": 0.09694879502058029, "step": 180 }, { "epoch": 0.181, "grad_norm": 0.578125, "grad_norm_var": 0.0038398583730061848, "learning_rate": 0.0001, "loss": 0.4259, "loss/crossentropy": 2.9206082820892334, "loss/fcd": 0.32421875, "loss/logits": 0.10163932293653488, "step": 181 }, { "epoch": 0.182, "grad_norm": 0.609375, "grad_norm_var": 0.0037723382314046225, "learning_rate": 0.0001, "loss": 0.4385, "loss/crossentropy": 2.8616329431533813, "loss/fcd": 0.33203125, "loss/logits": 0.10650401189923286, "step": 182 }, { "epoch": 0.183, "grad_norm": 0.56640625, "grad_norm_var": 0.003725035985310872, "learning_rate": 0.0001, "loss": 0.4238, "loss/crossentropy": 2.743476986885071, "loss/fcd": 0.3251953125, "loss/logits": 0.09862633794546127, "step": 183 }, { "epoch": 0.184, "grad_norm": 0.5625, "grad_norm_var": 0.0031696160634358725, "learning_rate": 0.0001, "loss": 0.4195, "loss/crossentropy": 2.822865128517151, "loss/fcd": 0.322265625, "loss/logits": 0.09720025584101677, "step": 184 }, { "epoch": 0.185, "grad_norm": 0.51953125, "grad_norm_var": 0.0032462914784749347, "learning_rate": 0.0001, "loss": 0.395, "loss/crossentropy": 2.875390648841858, "loss/fcd": 0.3076171875, "loss/logits": 0.08735854551196098, "step": 185 }, { "epoch": 0.186, "grad_norm": 0.56640625, "grad_norm_var": 0.0025552749633789063, "learning_rate": 0.0001, "loss": 0.45, "loss/crossentropy": 2.821836471557617, "loss/fcd": 0.341796875, "loss/logits": 0.10815925523638725, "step": 186 }, { "epoch": 0.187, "grad_norm": 0.59375, "grad_norm_var": 0.001636187235514323, "learning_rate": 0.0001, "loss": 0.4167, "loss/crossentropy": 2.9735742807388306, "loss/fcd": 0.318359375, "loss/logits": 0.09838059172034264, "step": 187 }, { "epoch": 0.188, "grad_norm": 0.57421875, "grad_norm_var": 0.0013872782389322917, "learning_rate": 0.0001, "loss": 0.4461, "loss/crossentropy": 2.8691996335983276, "loss/fcd": 0.341796875, "loss/logits": 0.10430673509836197, "step": 188 }, { "epoch": 0.189, "grad_norm": 0.515625, "grad_norm_var": 0.001141802469889323, "learning_rate": 0.0001, "loss": 0.4334, "loss/crossentropy": 2.7962933778762817, "loss/fcd": 0.3330078125, "loss/logits": 0.1004236750304699, "step": 189 }, { "epoch": 0.19, "grad_norm": 0.5625, "grad_norm_var": 0.0009907404581705728, "learning_rate": 0.0001, "loss": 0.4124, "loss/crossentropy": 2.9746346473693848, "loss/fcd": 0.3173828125, "loss/logits": 0.09501497820019722, "step": 190 }, { "epoch": 0.191, "grad_norm": 0.6328125, "grad_norm_var": 0.0009541829427083333, "learning_rate": 0.0001, "loss": 0.4909, "loss/crossentropy": 3.0425490140914917, "loss/fcd": 0.369140625, "loss/logits": 0.12177145853638649, "step": 191 }, { "epoch": 0.192, "grad_norm": 0.6328125, "grad_norm_var": 0.0011861165364583333, "learning_rate": 0.0001, "loss": 0.4268, "loss/crossentropy": 2.8151696920394897, "loss/fcd": 0.33203125, "loss/logits": 0.09481121972203255, "step": 192 }, { "epoch": 0.193, "grad_norm": 0.70703125, "grad_norm_var": 0.0023312886555989583, "learning_rate": 0.0001, "loss": 0.4791, "loss/crossentropy": 2.8669657707214355, "loss/fcd": 0.3662109375, "loss/logits": 0.11289490759372711, "step": 193 }, { "epoch": 0.194, "grad_norm": 0.66796875, "grad_norm_var": 0.0027861913045247396, "learning_rate": 0.0001, "loss": 0.3946, "loss/crossentropy": 2.7116475105285645, "loss/fcd": 0.30859375, "loss/logits": 0.0860372893512249, "step": 194 }, { "epoch": 0.195, "grad_norm": 0.6484375, "grad_norm_var": 0.0029764175415039062, "learning_rate": 0.0001, "loss": 0.4165, "loss/crossentropy": 2.870499610900879, "loss/fcd": 0.3212890625, "loss/logits": 0.09520205855369568, "step": 195 }, { "epoch": 0.196, "grad_norm": 0.6875, "grad_norm_var": 0.003208414713541667, "learning_rate": 0.0001, "loss": 0.419, "loss/crossentropy": 2.9276784658432007, "loss/fcd": 0.318359375, "loss/logits": 0.10059179738163948, "step": 196 }, { "epoch": 0.197, "grad_norm": 0.7109375, "grad_norm_var": 0.0038958231608072916, "learning_rate": 0.0001, "loss": 0.4725, "loss/crossentropy": 2.845921277999878, "loss/fcd": 0.3505859375, "loss/logits": 0.12191061675548553, "step": 197 }, { "epoch": 0.198, "grad_norm": 0.71484375, "grad_norm_var": 0.004584185282389323, "learning_rate": 0.0001, "loss": 0.453, "loss/crossentropy": 2.8804662227630615, "loss/fcd": 0.3359375, "loss/logits": 0.11705750972032547, "step": 198 }, { "epoch": 0.199, "grad_norm": 0.53515625, "grad_norm_var": 0.00485375722249349, "learning_rate": 0.0001, "loss": 0.3999, "loss/crossentropy": 2.681739091873169, "loss/fcd": 0.3115234375, "loss/logits": 0.08841734752058983, "step": 199 }, { "epoch": 0.2, "grad_norm": 0.57421875, "grad_norm_var": 0.004781087239583333, "learning_rate": 0.0001, "loss": 0.4227, "loss/crossentropy": 2.968848705291748, "loss/fcd": 0.3212890625, "loss/logits": 0.10139760375022888, "step": 200 }, { "epoch": 0.201, "grad_norm": 0.609375, "grad_norm_var": 0.004139137268066406, "learning_rate": 0.0001, "loss": 0.4464, "loss/crossentropy": 2.8944156169891357, "loss/fcd": 0.3388671875, "loss/logits": 0.1075349859893322, "step": 201 }, { "epoch": 0.202, "grad_norm": 0.62890625, "grad_norm_var": 0.0039295832316080725, "learning_rate": 0.0001, "loss": 0.4596, "loss/crossentropy": 2.9402053356170654, "loss/fcd": 0.3447265625, "loss/logits": 0.11490758880972862, "step": 202 }, { "epoch": 0.203, "grad_norm": 0.5859375, "grad_norm_var": 0.003965695699055989, "learning_rate": 0.0001, "loss": 0.4385, "loss/crossentropy": 2.930693745613098, "loss/fcd": 0.3330078125, "loss/logits": 0.10551033169031143, "step": 203 }, { "epoch": 0.204, "grad_norm": 0.671875, "grad_norm_var": 0.003910064697265625, "learning_rate": 0.0001, "loss": 0.4432, "loss/crossentropy": 2.9876062870025635, "loss/fcd": 0.3349609375, "loss/logits": 0.10823972150683403, "step": 204 }, { "epoch": 0.205, "grad_norm": 0.828125, "grad_norm_var": 0.005232493082682292, "learning_rate": 0.0001, "loss": 0.4812, "loss/crossentropy": 2.9890397787094116, "loss/fcd": 0.3583984375, "loss/logits": 0.12285104021430016, "step": 205 }, { "epoch": 0.206, "grad_norm": 0.765625, "grad_norm_var": 0.005444081624348959, "learning_rate": 0.0001, "loss": 0.3894, "loss/crossentropy": 2.867851734161377, "loss/fcd": 0.3037109375, "loss/logits": 0.08565087616443634, "step": 206 }, { "epoch": 0.207, "grad_norm": 0.89453125, "grad_norm_var": 0.008685747782389322, "learning_rate": 0.0001, "loss": 0.441, "loss/crossentropy": 3.0004996061325073, "loss/fcd": 0.3369140625, "loss/logits": 0.10407992079854012, "step": 207 }, { "epoch": 0.208, "grad_norm": 0.82421875, "grad_norm_var": 0.009797922770182292, "learning_rate": 0.0001, "loss": 0.4171, "loss/crossentropy": 2.8082648515701294, "loss/fcd": 0.3212890625, "loss/logits": 0.09578917548060417, "step": 208 }, { "epoch": 0.209, "grad_norm": 0.69921875, "grad_norm_var": 0.009784952799479166, "learning_rate": 0.0001, "loss": 0.4368, "loss/crossentropy": 2.928029775619507, "loss/fcd": 0.33203125, "loss/logits": 0.10473304614424706, "step": 209 }, { "epoch": 0.21, "grad_norm": 0.494140625, "grad_norm_var": 0.012194045384724935, "learning_rate": 0.0001, "loss": 0.3936, "loss/crossentropy": 2.699643611907959, "loss/fcd": 0.30859375, "loss/logits": 0.0850089080631733, "step": 210 }, { "epoch": 0.211, "grad_norm": 0.57421875, "grad_norm_var": 0.01284635861714681, "learning_rate": 0.0001, "loss": 0.4002, "loss/crossentropy": 2.853596329689026, "loss/fcd": 0.3076171875, "loss/logits": 0.09262469410896301, "step": 211 }, { "epoch": 0.212, "grad_norm": 0.62890625, "grad_norm_var": 0.012962706883748372, "learning_rate": 0.0001, "loss": 0.4106, "loss/crossentropy": 2.7878748178482056, "loss/fcd": 0.3212890625, "loss/logits": 0.0892898328602314, "step": 212 }, { "epoch": 0.213, "grad_norm": 1.0859375, "grad_norm_var": 0.023735411961873374, "learning_rate": 0.0001, "loss": 0.516, "loss/crossentropy": 2.8341736793518066, "loss/fcd": 0.373046875, "loss/logits": 0.1429123878479004, "step": 213 }, { "epoch": 0.214, "grad_norm": 0.71484375, "grad_norm_var": 0.023735411961873374, "learning_rate": 0.0001, "loss": 0.417, "loss/crossentropy": 2.848291754722595, "loss/fcd": 0.3193359375, "loss/logits": 0.09763862192630768, "step": 214 }, { "epoch": 0.215, "grad_norm": 0.74609375, "grad_norm_var": 0.02202909787495931, "learning_rate": 0.0001, "loss": 0.4259, "loss/crossentropy": 3.0413947105407715, "loss/fcd": 0.3251953125, "loss/logits": 0.10072237998247147, "step": 215 }, { "epoch": 0.216, "grad_norm": 0.6484375, "grad_norm_var": 0.0210506280263265, "learning_rate": 0.0001, "loss": 0.4208, "loss/crossentropy": 2.9306408166885376, "loss/fcd": 0.318359375, "loss/logits": 0.10242888703942299, "step": 216 }, { "epoch": 0.217, "grad_norm": 0.80078125, "grad_norm_var": 0.02070794105529785, "learning_rate": 0.0001, "loss": 0.4511, "loss/crossentropy": 2.8223299980163574, "loss/fcd": 0.3408203125, "loss/logits": 0.11031396687030792, "step": 217 }, { "epoch": 0.218, "grad_norm": 0.6171875, "grad_norm_var": 0.020865869522094727, "learning_rate": 0.0001, "loss": 0.3949, "loss/crossentropy": 2.7668732404708862, "loss/fcd": 0.3095703125, "loss/logits": 0.08537604659795761, "step": 218 }, { "epoch": 0.219, "grad_norm": 0.6640625, "grad_norm_var": 0.019811741511027017, "learning_rate": 0.0001, "loss": 0.4237, "loss/crossentropy": 2.9836037158966064, "loss/fcd": 0.318359375, "loss/logits": 0.10530032590031624, "step": 219 }, { "epoch": 0.22, "grad_norm": 0.66796875, "grad_norm_var": 0.019842259089152017, "learning_rate": 0.0001, "loss": 0.3983, "loss/crossentropy": 2.747257947921753, "loss/fcd": 0.3076171875, "loss/logits": 0.09065764769911766, "step": 220 }, { "epoch": 0.221, "grad_norm": 0.6640625, "grad_norm_var": 0.019342915217081705, "learning_rate": 0.0001, "loss": 0.43, "loss/crossentropy": 2.845228672027588, "loss/fcd": 0.330078125, "loss/logits": 0.09995642304420471, "step": 221 }, { "epoch": 0.222, "grad_norm": 0.6484375, "grad_norm_var": 0.019459263483683268, "learning_rate": 0.0001, "loss": 0.4125, "loss/crossentropy": 2.8502864837646484, "loss/fcd": 0.3203125, "loss/logits": 0.09222512319684029, "step": 222 }, { "epoch": 0.223, "grad_norm": 0.53515625, "grad_norm_var": 0.018728113174438475, "learning_rate": 0.0001, "loss": 0.4058, "loss/crossentropy": 2.924101948738098, "loss/fcd": 0.3115234375, "loss/logits": 0.0942913256585598, "step": 223 }, { "epoch": 0.224, "grad_norm": 0.58984375, "grad_norm_var": 0.01791558265686035, "learning_rate": 0.0001, "loss": 0.4231, "loss/crossentropy": 2.9752752780914307, "loss/fcd": 0.326171875, "loss/logits": 0.09692845121026039, "step": 224 }, { "epoch": 0.225, "grad_norm": 0.60546875, "grad_norm_var": 0.018145990371704102, "learning_rate": 0.0001, "loss": 0.4163, "loss/crossentropy": 2.9902013540267944, "loss/fcd": 0.3203125, "loss/logits": 0.09601914137601852, "step": 225 }, { "epoch": 0.226, "grad_norm": 0.60546875, "grad_norm_var": 0.0163421630859375, "learning_rate": 0.0001, "loss": 0.4282, "loss/crossentropy": 2.9544960260391235, "loss/fcd": 0.32421875, "loss/logits": 0.10400371253490448, "step": 226 }, { "epoch": 0.227, "grad_norm": 0.52734375, "grad_norm_var": 0.017108154296875, "learning_rate": 0.0001, "loss": 0.4189, "loss/crossentropy": 2.9443225860595703, "loss/fcd": 0.322265625, "loss/logits": 0.09667930379509926, "step": 227 }, { "epoch": 0.228, "grad_norm": 0.5625, "grad_norm_var": 0.01776421864827474, "learning_rate": 0.0001, "loss": 0.4086, "loss/crossentropy": 2.8471500873565674, "loss/fcd": 0.3154296875, "loss/logits": 0.09314657375216484, "step": 228 }, { "epoch": 0.229, "grad_norm": 0.78515625, "grad_norm_var": 0.006646474202473958, "learning_rate": 0.0001, "loss": 0.4747, "loss/crossentropy": 3.0068620443344116, "loss/fcd": 0.353515625, "loss/logits": 0.1211743988096714, "step": 229 }, { "epoch": 0.23, "grad_norm": 1.0390625, "grad_norm_var": 0.01606591542561849, "learning_rate": 0.0001, "loss": 0.5054, "loss/crossentropy": 2.904601573944092, "loss/fcd": 0.357421875, "loss/logits": 0.1479528360068798, "step": 230 }, { "epoch": 0.231, "grad_norm": 0.71484375, "grad_norm_var": 0.01580651601155599, "learning_rate": 0.0001, "loss": 0.413, "loss/crossentropy": 2.8541531562805176, "loss/fcd": 0.314453125, "loss/logits": 0.09850187227129936, "step": 231 }, { "epoch": 0.232, "grad_norm": 0.8359375, "grad_norm_var": 0.01753381093343099, "learning_rate": 0.0001, "loss": 0.4665, "loss/crossentropy": 3.0907140970230103, "loss/fcd": 0.3447265625, "loss/logits": 0.12178562209010124, "step": 232 }, { "epoch": 0.233, "grad_norm": 0.78125, "grad_norm_var": 0.017240397135416665, "learning_rate": 0.0001, "loss": 0.4282, "loss/crossentropy": 2.7091563940048218, "loss/fcd": 0.3310546875, "loss/logits": 0.09712602570652962, "step": 233 }, { "epoch": 0.234, "grad_norm": 1.3046875, "grad_norm_var": 0.04123128255208333, "learning_rate": 0.0001, "loss": 0.504, "loss/crossentropy": 2.912622332572937, "loss/fcd": 0.3564453125, "loss/logits": 0.14751752838492393, "step": 234 }, { "epoch": 0.235, "grad_norm": 0.7890625, "grad_norm_var": 0.04126383463541667, "learning_rate": 0.0001, "loss": 0.4265, "loss/crossentropy": 2.7788244485855103, "loss/fcd": 0.3232421875, "loss/logits": 0.10323498025536537, "step": 235 }, { "epoch": 0.236, "grad_norm": 0.984375, "grad_norm_var": 0.044966570536295575, "learning_rate": 0.0001, "loss": 0.4324, "loss/crossentropy": 2.8856834173202515, "loss/fcd": 0.3291015625, "loss/logits": 0.10333634912967682, "step": 236 }, { "epoch": 0.237, "grad_norm": 1.0234375, "grad_norm_var": 0.049002520243326825, "learning_rate": 0.0001, "loss": 0.4056, "loss/crossentropy": 2.8181627988815308, "loss/fcd": 0.3125, "loss/logits": 0.09313894063234329, "step": 237 }, { "epoch": 0.238, "grad_norm": 0.95703125, "grad_norm_var": 0.049921671549479164, "learning_rate": 0.0001, "loss": 0.4479, "loss/crossentropy": 2.959288477897644, "loss/fcd": 0.3369140625, "loss/logits": 0.11102355644106865, "step": 238 }, { "epoch": 0.239, "grad_norm": 0.8515625, "grad_norm_var": 0.04542586008707682, "learning_rate": 0.0001, "loss": 0.4428, "loss/crossentropy": 2.8065025806427, "loss/fcd": 0.333984375, "loss/logits": 0.10878817737102509, "step": 239 }, { "epoch": 0.24, "grad_norm": 0.76171875, "grad_norm_var": 0.042231178283691405, "learning_rate": 0.0001, "loss": 0.396, "loss/crossentropy": 2.7715532779693604, "loss/fcd": 0.3095703125, "loss/logits": 0.08645889163017273, "step": 240 }, { "epoch": 0.241, "grad_norm": 0.75, "grad_norm_var": 0.03939183553059896, "learning_rate": 0.0001, "loss": 0.4564, "loss/crossentropy": 2.829145073890686, "loss/fcd": 0.3369140625, "loss/logits": 0.11950911581516266, "step": 241 }, { "epoch": 0.242, "grad_norm": 0.71484375, "grad_norm_var": 0.036871083577473956, "learning_rate": 0.0001, "loss": 0.4192, "loss/crossentropy": 2.8319283723831177, "loss/fcd": 0.326171875, "loss/logits": 0.09307562187314034, "step": 242 }, { "epoch": 0.243, "grad_norm": 0.65625, "grad_norm_var": 0.032597287495930986, "learning_rate": 0.0001, "loss": 0.4242, "loss/crossentropy": 2.817041516304016, "loss/fcd": 0.32421875, "loss/logits": 0.09995916113257408, "step": 243 }, { "epoch": 0.244, "grad_norm": 0.64453125, "grad_norm_var": 0.029933675130208334, "learning_rate": 0.0001, "loss": 0.4462, "loss/crossentropy": 2.806472897529602, "loss/fcd": 0.3408203125, "loss/logits": 0.10536561161279678, "step": 244 }, { "epoch": 0.245, "grad_norm": 0.55078125, "grad_norm_var": 0.03538106282552083, "learning_rate": 0.0001, "loss": 0.3982, "loss/crossentropy": 2.800318717956543, "loss/fcd": 0.3076171875, "loss/logits": 0.09055132418870926, "step": 245 }, { "epoch": 0.246, "grad_norm": 0.58984375, "grad_norm_var": 0.03576857248942057, "learning_rate": 0.0001, "loss": 0.402, "loss/crossentropy": 2.7864032983779907, "loss/fcd": 0.3095703125, "loss/logits": 0.09244050830602646, "step": 246 }, { "epoch": 0.247, "grad_norm": 0.63671875, "grad_norm_var": 0.03710880279541016, "learning_rate": 0.0001, "loss": 0.4454, "loss/crossentropy": 2.8640553951263428, "loss/fcd": 0.3388671875, "loss/logits": 0.10653172433376312, "step": 247 }, { "epoch": 0.248, "grad_norm": 0.68359375, "grad_norm_var": 0.037870025634765624, "learning_rate": 0.0001, "loss": 0.4301, "loss/crossentropy": 2.8637115955352783, "loss/fcd": 0.3271484375, "loss/logits": 0.10292865708470345, "step": 248 }, { "epoch": 0.249, "grad_norm": 0.80078125, "grad_norm_var": 0.03786462148030599, "learning_rate": 0.0001, "loss": 0.3903, "loss/crossentropy": 2.8047688007354736, "loss/fcd": 0.3046875, "loss/logits": 0.08557453379034996, "step": 249 }, { "epoch": 0.25, "grad_norm": 0.57421875, "grad_norm_var": 0.021445719401041667, "learning_rate": 0.0001, "loss": 0.4041, "loss/crossentropy": 2.837961196899414, "loss/fcd": 0.3125, "loss/logits": 0.09161163493990898, "step": 250 }, { "epoch": 0.251, "grad_norm": 0.84765625, "grad_norm_var": 0.021980730692545573, "learning_rate": 0.0001, "loss": 0.5253, "loss/crossentropy": 2.749048352241516, "loss/fcd": 0.3818359375, "loss/logits": 0.1434529758989811, "step": 251 }, { "epoch": 0.252, "grad_norm": 0.60546875, "grad_norm_var": 0.019199371337890625, "learning_rate": 0.0001, "loss": 0.4083, "loss/crossentropy": 3.039583444595337, "loss/fcd": 0.31640625, "loss/logits": 0.09189995378255844, "step": 252 }, { "epoch": 0.253, "grad_norm": 0.65234375, "grad_norm_var": 0.01318963368733724, "learning_rate": 0.0001, "loss": 0.4154, "loss/crossentropy": 2.83537220954895, "loss/fcd": 0.31640625, "loss/logits": 0.09897411242127419, "step": 253 }, { "epoch": 0.254, "grad_norm": 0.67578125, "grad_norm_var": 0.008676083882649739, "learning_rate": 0.0001, "loss": 0.4261, "loss/crossentropy": 3.0131027698516846, "loss/fcd": 0.326171875, "loss/logits": 0.09990369901061058, "step": 254 }, { "epoch": 0.255, "grad_norm": 0.70703125, "grad_norm_var": 0.006815338134765625, "learning_rate": 0.0001, "loss": 0.4328, "loss/crossentropy": 2.93803608417511, "loss/fcd": 0.3291015625, "loss/logits": 0.10372024774551392, "step": 255 }, { "epoch": 0.256, "grad_norm": 0.578125, "grad_norm_var": 0.006878089904785156, "learning_rate": 0.0001, "loss": 0.3991, "loss/crossentropy": 2.9447672367095947, "loss/fcd": 0.3056640625, "loss/logits": 0.09344978258013725, "step": 256 }, { "epoch": 0.257, "grad_norm": 0.78125, "grad_norm_var": 0.00728600819905599, "learning_rate": 0.0001, "loss": 0.4649, "loss/crossentropy": 2.7877269983291626, "loss/fcd": 0.3447265625, "loss/logits": 0.12016481161117554, "step": 257 }, { "epoch": 0.258, "grad_norm": 1.0625, "grad_norm_var": 0.016978963216145834, "learning_rate": 0.0001, "loss": 0.4578, "loss/crossentropy": 2.8194340467453003, "loss/fcd": 0.3515625, "loss/logits": 0.10628331080079079, "step": 258 }, { "epoch": 0.259, "grad_norm": 0.59765625, "grad_norm_var": 0.01746056874593099, "learning_rate": 0.0001, "loss": 0.4067, "loss/crossentropy": 2.864375114440918, "loss/fcd": 0.31640625, "loss/logits": 0.09034289047122002, "step": 259 }, { "epoch": 0.26, "grad_norm": 0.6171875, "grad_norm_var": 0.017661285400390626, "learning_rate": 0.0001, "loss": 0.3902, "loss/crossentropy": 2.8615986108779907, "loss/fcd": 0.302734375, "loss/logits": 0.08747856691479683, "step": 260 }, { "epoch": 0.261, "grad_norm": 0.64453125, "grad_norm_var": 0.016532135009765626, "learning_rate": 0.0001, "loss": 0.4094, "loss/crossentropy": 2.824509024620056, "loss/fcd": 0.3115234375, "loss/logits": 0.09783903509378433, "step": 261 }, { "epoch": 0.262, "grad_norm": 0.6328125, "grad_norm_var": 0.016068458557128906, "learning_rate": 0.0001, "loss": 0.4327, "loss/crossentropy": 2.802370309829712, "loss/fcd": 0.3310546875, "loss/logits": 0.10163304209709167, "step": 262 }, { "epoch": 0.263, "grad_norm": 0.7109375, "grad_norm_var": 0.015849812825520834, "learning_rate": 0.0001, "loss": 0.4364, "loss/crossentropy": 2.9660686254501343, "loss/fcd": 0.328125, "loss/logits": 0.10826770216226578, "step": 263 }, { "epoch": 0.264, "grad_norm": 0.640625, "grad_norm_var": 0.01604913075764974, "learning_rate": 0.0001, "loss": 0.4306, "loss/crossentropy": 2.970041871070862, "loss/fcd": 0.328125, "loss/logits": 0.10252038761973381, "step": 264 }, { "epoch": 0.265, "grad_norm": 0.57421875, "grad_norm_var": 0.016078631083170574, "learning_rate": 0.0001, "loss": 0.3905, "loss/crossentropy": 2.7347593307495117, "loss/fcd": 0.30078125, "loss/logits": 0.08967656269669533, "step": 265 }, { "epoch": 0.266, "grad_norm": 0.62890625, "grad_norm_var": 0.015484046936035157, "learning_rate": 0.0001, "loss": 0.3998, "loss/crossentropy": 2.6828304529190063, "loss/fcd": 0.3115234375, "loss/logits": 0.08822760730981827, "step": 266 }, { "epoch": 0.267, "grad_norm": 0.73046875, "grad_norm_var": 0.013797950744628907, "learning_rate": 0.0001, "loss": 0.5116, "loss/crossentropy": 3.124196410179138, "loss/fcd": 0.376953125, "loss/logits": 0.13468829542398453, "step": 267 }, { "epoch": 0.268, "grad_norm": 0.73828125, "grad_norm_var": 0.013625017801920573, "learning_rate": 0.0001, "loss": 0.4071, "loss/crossentropy": 2.963875889778137, "loss/fcd": 0.3134765625, "loss/logits": 0.09359101951122284, "step": 268 }, { "epoch": 0.269, "grad_norm": 0.6640625, "grad_norm_var": 0.013581339518229167, "learning_rate": 0.0001, "loss": 0.4062, "loss/crossentropy": 2.852897524833679, "loss/fcd": 0.3134765625, "loss/logits": 0.09275222197175026, "step": 269 }, { "epoch": 0.27, "grad_norm": 0.76171875, "grad_norm_var": 0.013919830322265625, "learning_rate": 0.0001, "loss": 0.4204, "loss/crossentropy": 2.8601616621017456, "loss/fcd": 0.322265625, "loss/logits": 0.09812301397323608, "step": 270 }, { "epoch": 0.271, "grad_norm": 0.7578125, "grad_norm_var": 0.014183489481608073, "learning_rate": 0.0001, "loss": 0.4126, "loss/crossentropy": 2.713176965713501, "loss/fcd": 0.3154296875, "loss/logits": 0.09714118391275406, "step": 271 }, { "epoch": 0.272, "grad_norm": 0.671875, "grad_norm_var": 0.013271013895670572, "learning_rate": 0.0001, "loss": 0.4449, "loss/crossentropy": 3.02414071559906, "loss/fcd": 0.337890625, "loss/logits": 0.10696037858724594, "step": 272 }, { "epoch": 0.273, "grad_norm": 0.6640625, "grad_norm_var": 0.012874285380045572, "learning_rate": 0.0001, "loss": 0.422, "loss/crossentropy": 2.842344880104065, "loss/fcd": 0.3251953125, "loss/logits": 0.09679682552814484, "step": 273 }, { "epoch": 0.274, "grad_norm": 0.83203125, "grad_norm_var": 0.004858144124348958, "learning_rate": 0.0001, "loss": 0.4451, "loss/crossentropy": 2.876818299293518, "loss/fcd": 0.33984375, "loss/logits": 0.10525127500295639, "step": 274 }, { "epoch": 0.275, "grad_norm": 0.91796875, "grad_norm_var": 0.0077880859375, "learning_rate": 0.0001, "loss": 0.4179, "loss/crossentropy": 2.785101294517517, "loss/fcd": 0.322265625, "loss/logits": 0.09567511081695557, "step": 275 }, { "epoch": 0.276, "grad_norm": 0.734375, "grad_norm_var": 0.007364654541015625, "learning_rate": 0.0001, "loss": 0.4274, "loss/crossentropy": 2.839262843132019, "loss/fcd": 0.33203125, "loss/logits": 0.09535057097673416, "step": 276 }, { "epoch": 0.277, "grad_norm": 0.68359375, "grad_norm_var": 0.0071370442708333336, "learning_rate": 0.0001, "loss": 0.4264, "loss/crossentropy": 2.8711353540420532, "loss/fcd": 0.326171875, "loss/logits": 0.10018761828541756, "step": 277 }, { "epoch": 0.278, "grad_norm": 0.6328125, "grad_norm_var": 0.0071370442708333336, "learning_rate": 0.0001, "loss": 0.4129, "loss/crossentropy": 2.871408224105835, "loss/fcd": 0.3173828125, "loss/logits": 0.09553812071681023, "step": 278 }, { "epoch": 0.279, "grad_norm": 0.62890625, "grad_norm_var": 0.007536252339680989, "learning_rate": 0.0001, "loss": 0.4575, "loss/crossentropy": 2.876176953315735, "loss/fcd": 0.341796875, "loss/logits": 0.11570525541901588, "step": 279 }, { "epoch": 0.28, "grad_norm": 0.61328125, "grad_norm_var": 0.007813517252604167, "learning_rate": 0.0001, "loss": 0.4354, "loss/crossentropy": 2.8264540433883667, "loss/fcd": 0.3330078125, "loss/logits": 0.1023903377354145, "step": 280 }, { "epoch": 0.281, "grad_norm": 0.69140625, "grad_norm_var": 0.006672922770182292, "learning_rate": 0.0001, "loss": 0.3931, "loss/crossentropy": 2.744624972343445, "loss/fcd": 0.3046875, "loss/logits": 0.08837807923555374, "step": 281 }, { "epoch": 0.282, "grad_norm": 0.7109375, "grad_norm_var": 0.006212298075358073, "learning_rate": 0.0001, "loss": 0.4133, "loss/crossentropy": 2.892166256904602, "loss/fcd": 0.314453125, "loss/logits": 0.09885082766413689, "step": 282 }, { "epoch": 0.283, "grad_norm": 0.7578125, "grad_norm_var": 0.0063168843587239586, "learning_rate": 0.0001, "loss": 0.4083, "loss/crossentropy": 2.8837363719940186, "loss/fcd": 0.31640625, "loss/logits": 0.0918973907828331, "step": 283 }, { "epoch": 0.284, "grad_norm": 0.67578125, "grad_norm_var": 0.0063779195149739586, "learning_rate": 0.0001, "loss": 0.4308, "loss/crossentropy": 2.7492740154266357, "loss/fcd": 0.3359375, "loss/logits": 0.09483079984784126, "step": 284 }, { "epoch": 0.285, "grad_norm": 0.55078125, "grad_norm_var": 0.00791009267171224, "learning_rate": 0.0001, "loss": 0.4066, "loss/crossentropy": 2.8545846939086914, "loss/fcd": 0.314453125, "loss/logits": 0.09213941171765327, "step": 285 }, { "epoch": 0.286, "grad_norm": 0.61328125, "grad_norm_var": 0.008171017964680989, "learning_rate": 0.0001, "loss": 0.4126, "loss/crossentropy": 2.67995822429657, "loss/fcd": 0.3251953125, "loss/logits": 0.08735696226358414, "step": 286 }, { "epoch": 0.287, "grad_norm": 0.80078125, "grad_norm_var": 0.008640289306640625, "learning_rate": 0.0001, "loss": 0.3921, "loss/crossentropy": 2.8901337385177612, "loss/fcd": 0.3056640625, "loss/logits": 0.08640312403440475, "step": 287 }, { "epoch": 0.288, "grad_norm": 0.609375, "grad_norm_var": 0.009108225504557291, "learning_rate": 0.0001, "loss": 0.4164, "loss/crossentropy": 2.8247963190078735, "loss/fcd": 0.31640625, "loss/logits": 0.10003437474370003, "step": 288 }, { "epoch": 0.289, "grad_norm": 0.5546875, "grad_norm_var": 0.010304514567057292, "learning_rate": 0.0001, "loss": 0.4182, "loss/crossentropy": 2.943819999694824, "loss/fcd": 0.3232421875, "loss/logits": 0.09490966796875, "step": 289 }, { "epoch": 0.29, "grad_norm": 0.515625, "grad_norm_var": 0.010484759012858074, "learning_rate": 0.0001, "loss": 0.3826, "loss/crossentropy": 2.9070965051651, "loss/fcd": 0.2939453125, "loss/logits": 0.0886867605149746, "step": 290 }, { "epoch": 0.291, "grad_norm": 0.62109375, "grad_norm_var": 0.006107012430826823, "learning_rate": 0.0001, "loss": 0.4199, "loss/crossentropy": 2.8656623363494873, "loss/fcd": 0.318359375, "loss/logits": 0.10157756507396698, "step": 291 }, { "epoch": 0.292, "grad_norm": 0.6484375, "grad_norm_var": 0.005597877502441406, "learning_rate": 0.0001, "loss": 0.3961, "loss/crossentropy": 2.8809529542922974, "loss/fcd": 0.3095703125, "loss/logits": 0.08648988232016563, "step": 292 }, { "epoch": 0.293, "grad_norm": 0.765625, "grad_norm_var": 0.0064483642578125, "learning_rate": 0.0001, "loss": 0.383, "loss/crossentropy": 2.6984453201293945, "loss/fcd": 0.3017578125, "loss/logits": 0.08123517036437988, "step": 293 }, { "epoch": 0.294, "grad_norm": 0.734375, "grad_norm_var": 0.006868235270182292, "learning_rate": 0.0001, "loss": 0.4093, "loss/crossentropy": 2.7411580085754395, "loss/fcd": 0.3154296875, "loss/logits": 0.09388710558414459, "step": 294 }, { "epoch": 0.295, "grad_norm": 0.64453125, "grad_norm_var": 0.006827545166015625, "learning_rate": 0.0001, "loss": 0.4139, "loss/crossentropy": 2.761058211326599, "loss/fcd": 0.31640625, "loss/logits": 0.09746293723583221, "step": 295 }, { "epoch": 0.296, "grad_norm": 0.8671875, "grad_norm_var": 0.009385617574055989, "learning_rate": 0.0001, "loss": 0.4587, "loss/crossentropy": 2.882654905319214, "loss/fcd": 0.345703125, "loss/logits": 0.11303438991308212, "step": 296 }, { "epoch": 0.297, "grad_norm": 0.75390625, "grad_norm_var": 0.009786415100097656, "learning_rate": 0.0001, "loss": 0.4656, "loss/crossentropy": 2.759294867515564, "loss/fcd": 0.353515625, "loss/logits": 0.11211657896637917, "step": 297 }, { "epoch": 0.298, "grad_norm": 0.68359375, "grad_norm_var": 0.0097076416015625, "learning_rate": 0.0001, "loss": 0.4354, "loss/crossentropy": 2.732961416244507, "loss/fcd": 0.3349609375, "loss/logits": 0.10048427432775497, "step": 298 }, { "epoch": 0.299, "grad_norm": 0.6171875, "grad_norm_var": 0.00938720703125, "learning_rate": 0.0001, "loss": 0.4397, "loss/crossentropy": 2.8611546754837036, "loss/fcd": 0.337890625, "loss/logits": 0.10177960991859436, "step": 299 }, { "epoch": 0.3, "grad_norm": 0.8203125, "grad_norm_var": 0.010880978902180989, "learning_rate": 0.0001, "loss": 0.4757, "loss/crossentropy": 2.784830689430237, "loss/fcd": 0.3515625, "loss/logits": 0.1241227462887764, "step": 300 }, { "epoch": 0.301, "grad_norm": 0.65234375, "grad_norm_var": 0.009842872619628906, "learning_rate": 0.0001, "loss": 0.429, "loss/crossentropy": 2.9347182512283325, "loss/fcd": 0.328125, "loss/logits": 0.10089103132486343, "step": 301 }, { "epoch": 0.302, "grad_norm": 0.6875, "grad_norm_var": 0.009513092041015626, "learning_rate": 0.0001, "loss": 0.4071, "loss/crossentropy": 2.816414952278137, "loss/fcd": 0.31640625, "loss/logits": 0.09071046859025955, "step": 302 }, { "epoch": 0.303, "grad_norm": 0.640625, "grad_norm_var": 0.008665911356608073, "learning_rate": 0.0001, "loss": 0.4173, "loss/crossentropy": 2.8991583585739136, "loss/fcd": 0.318359375, "loss/logits": 0.09893063083291054, "step": 303 }, { "epoch": 0.304, "grad_norm": 0.5546875, "grad_norm_var": 0.009338823954264323, "learning_rate": 0.0001, "loss": 0.4005, "loss/crossentropy": 2.894463539123535, "loss/fcd": 0.3125, "loss/logits": 0.08798391744494438, "step": 304 }, { "epoch": 0.305, "grad_norm": 0.62109375, "grad_norm_var": 0.008570353190104166, "learning_rate": 0.0001, "loss": 0.4035, "loss/crossentropy": 2.914915442466736, "loss/fcd": 0.3125, "loss/logits": 0.09100022539496422, "step": 305 }, { "epoch": 0.306, "grad_norm": 0.7109375, "grad_norm_var": 0.006758371988932292, "learning_rate": 0.0001, "loss": 0.4032, "loss/crossentropy": 2.9412468671798706, "loss/fcd": 0.314453125, "loss/logits": 0.08878887072205544, "step": 306 }, { "epoch": 0.307, "grad_norm": 0.64453125, "grad_norm_var": 0.0065806070963541664, "learning_rate": 0.0001, "loss": 0.3907, "loss/crossentropy": 2.829257607460022, "loss/fcd": 0.3056640625, "loss/logits": 0.08507547527551651, "step": 307 }, { "epoch": 0.308, "grad_norm": 0.6015625, "grad_norm_var": 0.006980387369791666, "learning_rate": 0.0001, "loss": 0.3899, "loss/crossentropy": 2.7131818532943726, "loss/fcd": 0.30078125, "loss/logits": 0.08915158361196518, "step": 308 }, { "epoch": 0.309, "grad_norm": 0.57421875, "grad_norm_var": 0.007276344299316406, "learning_rate": 0.0001, "loss": 0.3931, "loss/crossentropy": 2.820746898651123, "loss/fcd": 0.30859375, "loss/logits": 0.0845017246901989, "step": 309 }, { "epoch": 0.31, "grad_norm": 0.65625, "grad_norm_var": 0.0070449193318684895, "learning_rate": 0.0001, "loss": 0.4006, "loss/crossentropy": 2.7722835540771484, "loss/fcd": 0.3115234375, "loss/logits": 0.08902696147561073, "step": 310 }, { "epoch": 0.311, "grad_norm": 0.63671875, "grad_norm_var": 0.007075945536295573, "learning_rate": 0.0001, "loss": 0.4239, "loss/crossentropy": 2.867286205291748, "loss/fcd": 0.3271484375, "loss/logits": 0.09677955880761147, "step": 311 }, { "epoch": 0.312, "grad_norm": 0.734375, "grad_norm_var": 0.004689470926920573, "learning_rate": 0.0001, "loss": 0.4179, "loss/crossentropy": 2.7744059562683105, "loss/fcd": 0.318359375, "loss/logits": 0.09955108538269997, "step": 312 }, { "epoch": 0.313, "grad_norm": 0.66796875, "grad_norm_var": 0.004096412658691406, "learning_rate": 0.0001, "loss": 0.4488, "loss/crossentropy": 2.7806670665740967, "loss/fcd": 0.3427734375, "loss/logits": 0.10600494965910912, "step": 313 }, { "epoch": 0.314, "grad_norm": 0.56640625, "grad_norm_var": 0.004531288146972656, "learning_rate": 0.0001, "loss": 0.3871, "loss/crossentropy": 2.8486790657043457, "loss/fcd": 0.3037109375, "loss/logits": 0.08334493264555931, "step": 314 }, { "epoch": 0.315, "grad_norm": 0.63671875, "grad_norm_var": 0.004471842447916667, "learning_rate": 0.0001, "loss": 0.4111, "loss/crossentropy": 2.9870445728302, "loss/fcd": 0.318359375, "loss/logits": 0.0927395410835743, "step": 315 }, { "epoch": 0.316, "grad_norm": 0.6015625, "grad_norm_var": 0.002506510416666667, "learning_rate": 0.0001, "loss": 0.3772, "loss/crossentropy": 2.8333510160446167, "loss/fcd": 0.2958984375, "loss/logits": 0.0812879391014576, "step": 316 }, { "epoch": 0.317, "grad_norm": 0.6875, "grad_norm_var": 0.002657000223795573, "learning_rate": 0.0001, "loss": 0.4304, "loss/crossentropy": 2.783381938934326, "loss/fcd": 0.330078125, "loss/logits": 0.10034845769405365, "step": 317 }, { "epoch": 0.318, "grad_norm": 0.60546875, "grad_norm_var": 0.0025461832682291668, "learning_rate": 0.0001, "loss": 0.4136, "loss/crossentropy": 2.886239528656006, "loss/fcd": 0.3173828125, "loss/logits": 0.09618546813726425, "step": 318 }, { "epoch": 0.319, "grad_norm": 0.625, "grad_norm_var": 0.0025472005208333334, "learning_rate": 0.0001, "loss": 0.4161, "loss/crossentropy": 2.9301581382751465, "loss/fcd": 0.32421875, "loss/logits": 0.09190791472792625, "step": 319 }, { "epoch": 0.32, "grad_norm": 0.546875, "grad_norm_var": 0.002632395426432292, "learning_rate": 0.0001, "loss": 0.3861, "loss/crossentropy": 2.813088059425354, "loss/fcd": 0.30078125, "loss/logits": 0.08532163128256798, "step": 320 }, { "epoch": 0.321, "grad_norm": 0.52734375, "grad_norm_var": 0.0033220926920572917, "learning_rate": 0.0001, "loss": 0.3855, "loss/crossentropy": 2.8616257905960083, "loss/fcd": 0.298828125, "loss/logits": 0.08671489730477333, "step": 321 }, { "epoch": 0.322, "grad_norm": 0.58984375, "grad_norm_var": 0.0028746922810872397, "learning_rate": 0.0001, "loss": 0.4115, "loss/crossentropy": 2.859944701194763, "loss/fcd": 0.3193359375, "loss/logits": 0.09216945245862007, "step": 322 }, { "epoch": 0.323, "grad_norm": 0.65625, "grad_norm_var": 0.0029233296712239585, "learning_rate": 0.0001, "loss": 0.4164, "loss/crossentropy": 2.8817009925842285, "loss/fcd": 0.3193359375, "loss/logits": 0.09702347591519356, "step": 323 }, { "epoch": 0.324, "grad_norm": 0.75, "grad_norm_var": 0.00394287109375, "learning_rate": 0.0001, "loss": 0.4215, "loss/crossentropy": 2.9102847576141357, "loss/fcd": 0.330078125, "loss/logits": 0.09144152700901031, "step": 324 }, { "epoch": 0.325, "grad_norm": 0.89453125, "grad_norm_var": 0.008019765218098959, "learning_rate": 0.0001, "loss": 0.3933, "loss/crossentropy": 2.7305492162704468, "loss/fcd": 0.3056640625, "loss/logits": 0.08765168115496635, "step": 325 }, { "epoch": 0.326, "grad_norm": 0.796875, "grad_norm_var": 0.009393056233723959, "learning_rate": 0.0001, "loss": 0.3895, "loss/crossentropy": 2.8607096672058105, "loss/fcd": 0.3017578125, "loss/logits": 0.08771565556526184, "step": 326 }, { "epoch": 0.327, "grad_norm": 0.6328125, "grad_norm_var": 0.009404945373535156, "learning_rate": 0.0001, "loss": 0.4095, "loss/crossentropy": 2.9553240537643433, "loss/fcd": 0.3212890625, "loss/logits": 0.08823421597480774, "step": 327 }, { "epoch": 0.328, "grad_norm": 0.6953125, "grad_norm_var": 0.009099769592285156, "learning_rate": 0.0001, "loss": 0.4631, "loss/crossentropy": 2.838402509689331, "loss/fcd": 0.34765625, "loss/logits": 0.1154676228761673, "step": 328 }, { "epoch": 0.329, "grad_norm": 1.75, "grad_norm_var": 0.08414103190104166, "learning_rate": 0.0001, "loss": 0.4394, "loss/crossentropy": 2.756170868873596, "loss/fcd": 0.314453125, "loss/logits": 0.1249246709048748, "step": 329 }, { "epoch": 0.33, "grad_norm": 0.765625, "grad_norm_var": 0.08247114817301432, "learning_rate": 0.0001, "loss": 0.422, "loss/crossentropy": 2.8995450735092163, "loss/fcd": 0.31640625, "loss/logits": 0.10559750720858574, "step": 330 }, { "epoch": 0.331, "grad_norm": 0.796875, "grad_norm_var": 0.0819732666015625, "learning_rate": 0.0001, "loss": 0.4665, "loss/crossentropy": 2.8580288887023926, "loss/fcd": 0.341796875, "loss/logits": 0.12471328675746918, "step": 331 }, { "epoch": 0.332, "grad_norm": 0.796875, "grad_norm_var": 0.08061904907226562, "learning_rate": 0.0001, "loss": 0.4448, "loss/crossentropy": 2.831050157546997, "loss/fcd": 0.330078125, "loss/logits": 0.11475277319550514, "step": 332 }, { "epoch": 0.333, "grad_norm": 0.7421875, "grad_norm_var": 0.08029683430989583, "learning_rate": 0.0001, "loss": 0.4326, "loss/crossentropy": 2.8131872415542603, "loss/fcd": 0.328125, "loss/logits": 0.10444386303424835, "step": 333 }, { "epoch": 0.334, "grad_norm": 0.6875, "grad_norm_var": 0.07901910146077475, "learning_rate": 0.0001, "loss": 0.4318, "loss/crossentropy": 2.822066903114319, "loss/fcd": 0.32421875, "loss/logits": 0.10761934518814087, "step": 334 }, { "epoch": 0.335, "grad_norm": 0.79296875, "grad_norm_var": 0.0776275634765625, "learning_rate": 0.0001, "loss": 0.4357, "loss/crossentropy": 2.8042834997177124, "loss/fcd": 0.3359375, "loss/logits": 0.09971516206860542, "step": 335 }, { "epoch": 0.336, "grad_norm": 0.6328125, "grad_norm_var": 0.07545954386393229, "learning_rate": 0.0001, "loss": 0.4202, "loss/crossentropy": 2.8359432220458984, "loss/fcd": 0.32421875, "loss/logits": 0.0959857665002346, "step": 336 }, { "epoch": 0.337, "grad_norm": 0.57421875, "grad_norm_var": 0.0740069071451823, "learning_rate": 0.0001, "loss": 0.4091, "loss/crossentropy": 2.869974374771118, "loss/fcd": 0.31640625, "loss/logits": 0.0926552303135395, "step": 337 }, { "epoch": 0.338, "grad_norm": 0.58984375, "grad_norm_var": 0.0740069071451823, "learning_rate": 0.0001, "loss": 0.3979, "loss/crossentropy": 2.807898163795471, "loss/fcd": 0.306640625, "loss/logits": 0.09129861742258072, "step": 338 }, { "epoch": 0.339, "grad_norm": 0.63671875, "grad_norm_var": 0.07436517079671225, "learning_rate": 0.0001, "loss": 0.4291, "loss/crossentropy": 2.8850966691970825, "loss/fcd": 0.328125, "loss/logits": 0.10096083208918571, "step": 339 }, { "epoch": 0.34, "grad_norm": 0.7265625, "grad_norm_var": 0.07450402577718099, "learning_rate": 0.0001, "loss": 0.4518, "loss/crossentropy": 2.9171935319900513, "loss/fcd": 0.34375, "loss/logits": 0.10800491645932198, "step": 340 }, { "epoch": 0.341, "grad_norm": 0.703125, "grad_norm_var": 0.07392145792643229, "learning_rate": 0.0001, "loss": 0.4566, "loss/crossentropy": 2.8163551092147827, "loss/fcd": 0.34375, "loss/logits": 0.11288850009441376, "step": 341 }, { "epoch": 0.342, "grad_norm": 0.55859375, "grad_norm_var": 0.07661685943603516, "learning_rate": 0.0001, "loss": 0.3953, "loss/crossentropy": 2.788830041885376, "loss/fcd": 0.3056640625, "loss/logits": 0.08966437354683876, "step": 342 }, { "epoch": 0.343, "grad_norm": 0.63671875, "grad_norm_var": 0.07655410766601563, "learning_rate": 0.0001, "loss": 0.423, "loss/crossentropy": 2.919311761856079, "loss/fcd": 0.3251953125, "loss/logits": 0.0978211909532547, "step": 343 }, { "epoch": 0.344, "grad_norm": 0.6484375, "grad_norm_var": 0.07706680297851562, "learning_rate": 0.0001, "loss": 0.404, "loss/crossentropy": 2.8302316665649414, "loss/fcd": 0.3154296875, "loss/logits": 0.08858633041381836, "step": 344 }, { "epoch": 0.345, "grad_norm": 0.63671875, "grad_norm_var": 0.006453895568847656, "learning_rate": 0.0001, "loss": 0.4177, "loss/crossentropy": 2.918962240219116, "loss/fcd": 0.322265625, "loss/logits": 0.09547547250986099, "step": 345 }, { "epoch": 0.346, "grad_norm": 0.55859375, "grad_norm_var": 0.00684814453125, "learning_rate": 0.0001, "loss": 0.4338, "loss/crossentropy": 2.921096920967102, "loss/fcd": 0.33203125, "loss/logits": 0.10181862115859985, "step": 346 }, { "epoch": 0.347, "grad_norm": 0.58203125, "grad_norm_var": 0.0060963312784830725, "learning_rate": 0.0001, "loss": 0.4104, "loss/crossentropy": 2.638503074645996, "loss/fcd": 0.3173828125, "loss/logits": 0.09305207803845406, "step": 347 }, { "epoch": 0.348, "grad_norm": 0.73828125, "grad_norm_var": 0.005214182535807291, "learning_rate": 0.0001, "loss": 0.4034, "loss/crossentropy": 2.8353854417800903, "loss/fcd": 0.3125, "loss/logits": 0.09088730812072754, "step": 348 }, { "epoch": 0.349, "grad_norm": 0.7265625, "grad_norm_var": 0.0050432840983072914, "learning_rate": 0.0001, "loss": 0.4215, "loss/crossentropy": 2.875319242477417, "loss/fcd": 0.3271484375, "loss/logits": 0.09431769698858261, "step": 349 }, { "epoch": 0.35, "grad_norm": 0.58203125, "grad_norm_var": 0.005237261454264323, "learning_rate": 0.0001, "loss": 0.4012, "loss/crossentropy": 2.716060757637024, "loss/fcd": 0.314453125, "loss/logits": 0.08677634596824646, "step": 350 }, { "epoch": 0.351, "grad_norm": 0.61328125, "grad_norm_var": 0.0037164688110351562, "learning_rate": 0.0001, "loss": 0.4161, "loss/crossentropy": 2.9706015586853027, "loss/fcd": 0.3271484375, "loss/logits": 0.08898995071649551, "step": 351 }, { "epoch": 0.352, "grad_norm": 0.72265625, "grad_norm_var": 0.004206339518229167, "learning_rate": 0.0001, "loss": 0.4454, "loss/crossentropy": 2.9135972261428833, "loss/fcd": 0.3408203125, "loss/logits": 0.10461591929197311, "step": 352 }, { "epoch": 0.353, "grad_norm": 0.71875, "grad_norm_var": 0.00425103505452474, "learning_rate": 0.0001, "loss": 0.4235, "loss/crossentropy": 2.9371761083602905, "loss/fcd": 0.326171875, "loss/logits": 0.09735563024878502, "step": 353 }, { "epoch": 0.354, "grad_norm": 0.69140625, "grad_norm_var": 0.004098955790201823, "learning_rate": 0.0001, "loss": 0.4207, "loss/crossentropy": 2.9942837953567505, "loss/fcd": 0.326171875, "loss/logits": 0.09450989216566086, "step": 354 }, { "epoch": 0.355, "grad_norm": 0.6171875, "grad_norm_var": 0.004170481363932292, "learning_rate": 0.0001, "loss": 0.4433, "loss/crossentropy": 2.820738196372986, "loss/fcd": 0.3349609375, "loss/logits": 0.10836119577288628, "step": 355 }, { "epoch": 0.356, "grad_norm": 0.65625, "grad_norm_var": 0.0037974039713541668, "learning_rate": 0.0001, "loss": 0.4336, "loss/crossentropy": 2.8830113410949707, "loss/fcd": 0.3291015625, "loss/logits": 0.10447230935096741, "step": 356 }, { "epoch": 0.357, "grad_norm": 0.87890625, "grad_norm_var": 0.006987444559733073, "learning_rate": 0.0001, "loss": 0.4988, "loss/crossentropy": 2.950507402420044, "loss/fcd": 0.365234375, "loss/logits": 0.13359853625297546, "step": 357 }, { "epoch": 0.358, "grad_norm": 0.6640625, "grad_norm_var": 0.006251017252604167, "learning_rate": 0.0001, "loss": 0.4254, "loss/crossentropy": 3.050377130508423, "loss/fcd": 0.328125, "loss/logits": 0.09723382443189621, "step": 358 }, { "epoch": 0.359, "grad_norm": 0.61328125, "grad_norm_var": 0.006379954020182292, "learning_rate": 0.0001, "loss": 0.4174, "loss/crossentropy": 2.872576355934143, "loss/fcd": 0.3173828125, "loss/logits": 0.09998740255832672, "step": 359 }, { "epoch": 0.36, "grad_norm": 0.64453125, "grad_norm_var": 0.006389808654785156, "learning_rate": 0.0001, "loss": 0.4129, "loss/crossentropy": 2.636136770248413, "loss/fcd": 0.3193359375, "loss/logits": 0.09356288239359856, "step": 360 }, { "epoch": 0.361, "grad_norm": 0.61328125, "grad_norm_var": 0.006513404846191406, "learning_rate": 0.0001, "loss": 0.4126, "loss/crossentropy": 2.9116278886795044, "loss/fcd": 0.3173828125, "loss/logits": 0.09523866325616837, "step": 361 }, { "epoch": 0.362, "grad_norm": 0.62109375, "grad_norm_var": 0.005880673726399739, "learning_rate": 0.0001, "loss": 0.4159, "loss/crossentropy": 2.89151668548584, "loss/fcd": 0.3173828125, "loss/logits": 0.09850744158029556, "step": 362 }, { "epoch": 0.363, "grad_norm": 0.875, "grad_norm_var": 0.007897694905598959, "learning_rate": 0.0001, "loss": 0.4513, "loss/crossentropy": 2.9357181787490845, "loss/fcd": 0.3369140625, "loss/logits": 0.11434905976057053, "step": 363 }, { "epoch": 0.364, "grad_norm": 0.74609375, "grad_norm_var": 0.0079559326171875, "learning_rate": 0.0001, "loss": 0.4275, "loss/crossentropy": 3.013663649559021, "loss/fcd": 0.330078125, "loss/logits": 0.0974624715745449, "step": 364 }, { "epoch": 0.365, "grad_norm": 0.7265625, "grad_norm_var": 0.0079559326171875, "learning_rate": 0.0001, "loss": 0.4141, "loss/crossentropy": 2.7465943098068237, "loss/fcd": 0.322265625, "loss/logits": 0.09180082008242607, "step": 365 }, { "epoch": 0.366, "grad_norm": 0.63671875, "grad_norm_var": 0.007380930582682291, "learning_rate": 0.0001, "loss": 0.4263, "loss/crossentropy": 2.9539248943328857, "loss/fcd": 0.3251953125, "loss/logits": 0.10109733417630196, "step": 366 }, { "epoch": 0.367, "grad_norm": 0.63671875, "grad_norm_var": 0.0071756998697916664, "learning_rate": 0.0001, "loss": 0.4324, "loss/crossentropy": 2.89021635055542, "loss/fcd": 0.33203125, "loss/logits": 0.10038217157125473, "step": 367 }, { "epoch": 0.368, "grad_norm": 0.5703125, "grad_norm_var": 0.007991472880045572, "learning_rate": 0.0001, "loss": 0.4051, "loss/crossentropy": 2.7650424242019653, "loss/fcd": 0.3125, "loss/logits": 0.09255194291472435, "step": 368 }, { "epoch": 0.369, "grad_norm": 0.53125, "grad_norm_var": 0.009267107645670573, "learning_rate": 0.0001, "loss": 0.3949, "loss/crossentropy": 2.841827392578125, "loss/fcd": 0.3076171875, "loss/logits": 0.0872911661863327, "step": 369 }, { "epoch": 0.37, "grad_norm": 0.50390625, "grad_norm_var": 0.010933367411295573, "learning_rate": 0.0001, "loss": 0.3829, "loss/crossentropy": 2.776681661605835, "loss/fcd": 0.298828125, "loss/logits": 0.08403456583619118, "step": 370 }, { "epoch": 0.371, "grad_norm": 0.61328125, "grad_norm_var": 0.010955810546875, "learning_rate": 0.0001, "loss": 0.4569, "loss/crossentropy": 2.904550552368164, "loss/fcd": 0.349609375, "loss/logits": 0.10726456344127655, "step": 371 }, { "epoch": 0.372, "grad_norm": 0.625, "grad_norm_var": 0.011024983723958333, "learning_rate": 0.0001, "loss": 0.4291, "loss/crossentropy": 2.7700815200805664, "loss/fcd": 0.3369140625, "loss/logits": 0.09217642992734909, "step": 372 }, { "epoch": 0.373, "grad_norm": 0.68359375, "grad_norm_var": 0.007610829671223959, "learning_rate": 0.0001, "loss": 0.4092, "loss/crossentropy": 2.7967922687530518, "loss/fcd": 0.3193359375, "loss/logits": 0.0898539088666439, "step": 373 }, { "epoch": 0.374, "grad_norm": 0.87890625, "grad_norm_var": 0.011069170633951823, "learning_rate": 0.0001, "loss": 0.4631, "loss/crossentropy": 2.9314931631088257, "loss/fcd": 0.34765625, "loss/logits": 0.11540572345256805, "step": 374 }, { "epoch": 0.375, "grad_norm": 0.8125, "grad_norm_var": 0.012375895182291667, "learning_rate": 0.0001, "loss": 0.4492, "loss/crossentropy": 2.9116827249526978, "loss/fcd": 0.34375, "loss/logits": 0.10542852059006691, "step": 375 }, { "epoch": 0.376, "grad_norm": 0.81640625, "grad_norm_var": 0.013640340169270833, "learning_rate": 0.0001, "loss": 0.4443, "loss/crossentropy": 2.7188583612442017, "loss/fcd": 0.349609375, "loss/logits": 0.09467883035540581, "step": 376 }, { "epoch": 0.377, "grad_norm": 0.66015625, "grad_norm_var": 0.013356526692708334, "learning_rate": 0.0001, "loss": 0.4221, "loss/crossentropy": 2.844428539276123, "loss/fcd": 0.32421875, "loss/logits": 0.09790046513080597, "step": 377 }, { "epoch": 0.378, "grad_norm": 0.734375, "grad_norm_var": 0.013214556376139323, "learning_rate": 0.0001, "loss": 0.4092, "loss/crossentropy": 2.787495493888855, "loss/fcd": 0.31640625, "loss/logits": 0.09276452288031578, "step": 378 }, { "epoch": 0.379, "grad_norm": 0.7421875, "grad_norm_var": 0.011052894592285156, "learning_rate": 0.0001, "loss": 0.4362, "loss/crossentropy": 2.9763654470443726, "loss/fcd": 0.3349609375, "loss/logits": 0.10128062590956688, "step": 379 }, { "epoch": 0.38, "grad_norm": 0.85546875, "grad_norm_var": 0.012729835510253907, "learning_rate": 0.0001, "loss": 0.417, "loss/crossentropy": 2.846543788909912, "loss/fcd": 0.3232421875, "loss/logits": 0.09378386661410332, "step": 380 }, { "epoch": 0.381, "grad_norm": 0.83203125, "grad_norm_var": 0.013950347900390625, "learning_rate": 0.0001, "loss": 0.4281, "loss/crossentropy": 2.728231906890869, "loss/fcd": 0.3310546875, "loss/logits": 0.09700711071491241, "step": 381 }, { "epoch": 0.382, "grad_norm": 1.21875, "grad_norm_var": 0.030537859598795573, "learning_rate": 0.0001, "loss": 0.5004, "loss/crossentropy": 2.8327022790908813, "loss/fcd": 0.3544921875, "loss/logits": 0.1459062695503235, "step": 382 }, { "epoch": 0.383, "grad_norm": 0.91015625, "grad_norm_var": 0.03173058827718099, "learning_rate": 0.0001, "loss": 0.4301, "loss/crossentropy": 2.865984559059143, "loss/fcd": 0.3232421875, "loss/logits": 0.10680827498435974, "step": 383 }, { "epoch": 0.384, "grad_norm": 0.828125, "grad_norm_var": 0.02973321278889974, "learning_rate": 0.0001, "loss": 0.4345, "loss/crossentropy": 2.8023486137390137, "loss/fcd": 0.3271484375, "loss/logits": 0.10733743384480476, "step": 384 }, { "epoch": 0.385, "grad_norm": 0.89453125, "grad_norm_var": 0.026640828450520834, "learning_rate": 0.0001, "loss": 0.4466, "loss/crossentropy": 2.9903111457824707, "loss/fcd": 0.328125, "loss/logits": 0.11846619471907616, "step": 385 }, { "epoch": 0.386, "grad_norm": 0.7421875, "grad_norm_var": 0.02116082509358724, "learning_rate": 0.0001, "loss": 0.4408, "loss/crossentropy": 2.8822553157806396, "loss/fcd": 0.3369140625, "loss/logits": 0.10392668098211288, "step": 386 }, { "epoch": 0.387, "grad_norm": 0.87890625, "grad_norm_var": 0.01885217030843099, "learning_rate": 0.0001, "loss": 0.4332, "loss/crossentropy": 3.0039666891098022, "loss/fcd": 0.330078125, "loss/logits": 0.10309557244181633, "step": 387 }, { "epoch": 0.388, "grad_norm": 0.91796875, "grad_norm_var": 0.016615804036458334, "learning_rate": 0.0001, "loss": 0.4587, "loss/crossentropy": 2.811259388923645, "loss/fcd": 0.3505859375, "loss/logits": 0.10812849923968315, "step": 388 }, { "epoch": 0.389, "grad_norm": 0.69140625, "grad_norm_var": 0.016458892822265626, "learning_rate": 0.0001, "loss": 0.4188, "loss/crossentropy": 2.794381260871887, "loss/fcd": 0.3193359375, "loss/logits": 0.0994318462908268, "step": 389 }, { "epoch": 0.39, "grad_norm": 0.69140625, "grad_norm_var": 0.017642974853515625, "learning_rate": 0.0001, "loss": 0.4247, "loss/crossentropy": 2.789844274520874, "loss/fcd": 0.3251953125, "loss/logits": 0.09947730973362923, "step": 390 }, { "epoch": 0.391, "grad_norm": 0.63671875, "grad_norm_var": 0.019906044006347656, "learning_rate": 0.0001, "loss": 0.4485, "loss/crossentropy": 2.736423969268799, "loss/fcd": 0.337890625, "loss/logits": 0.11057012155652046, "step": 391 }, { "epoch": 0.392, "grad_norm": 0.66015625, "grad_norm_var": 0.021416664123535156, "learning_rate": 0.0001, "loss": 0.4431, "loss/crossentropy": 2.9826775789260864, "loss/fcd": 0.3330078125, "loss/logits": 0.11012021079659462, "step": 392 }, { "epoch": 0.393, "grad_norm": 0.66015625, "grad_norm_var": 0.021416664123535156, "learning_rate": 0.0001, "loss": 0.4056, "loss/crossentropy": 2.818847894668579, "loss/fcd": 0.3115234375, "loss/logits": 0.09412503615021706, "step": 393 }, { "epoch": 0.394, "grad_norm": 0.63671875, "grad_norm_var": 0.022944132486979168, "learning_rate": 0.0001, "loss": 0.4532, "loss/crossentropy": 2.912561774253845, "loss/fcd": 0.3408203125, "loss/logits": 0.11239226907491684, "step": 394 }, { "epoch": 0.395, "grad_norm": 0.71875, "grad_norm_var": 0.023158518473307292, "learning_rate": 0.0001, "loss": 0.4436, "loss/crossentropy": 2.7931829690933228, "loss/fcd": 0.34375, "loss/logits": 0.09981722384691238, "step": 395 }, { "epoch": 0.396, "grad_norm": 0.94140625, "grad_norm_var": 0.024274698893229165, "learning_rate": 0.0001, "loss": 0.4378, "loss/crossentropy": 2.9140548706054688, "loss/fcd": 0.333984375, "loss/logits": 0.10377934947609901, "step": 396 }, { "epoch": 0.397, "grad_norm": 0.7109375, "grad_norm_var": 0.024733924865722658, "learning_rate": 0.0001, "loss": 0.4385, "loss/crossentropy": 2.8483548164367676, "loss/fcd": 0.33203125, "loss/logits": 0.10648469254374504, "step": 397 }, { "epoch": 0.398, "grad_norm": 0.6953125, "grad_norm_var": 0.012363624572753907, "learning_rate": 0.0001, "loss": 0.4473, "loss/crossentropy": 2.7623294591903687, "loss/fcd": 0.33984375, "loss/logits": 0.10747401416301727, "step": 398 }, { "epoch": 0.399, "grad_norm": 0.5859375, "grad_norm_var": 0.012590535481770833, "learning_rate": 0.0001, "loss": 0.4196, "loss/crossentropy": 2.924271821975708, "loss/fcd": 0.326171875, "loss/logits": 0.09341751039028168, "step": 399 }, { "epoch": 0.4, "grad_norm": 0.5703125, "grad_norm_var": 0.013824208577473959, "learning_rate": 0.0001, "loss": 0.4132, "loss/crossentropy": 2.8939170837402344, "loss/fcd": 0.3232421875, "loss/logits": 0.08991139009594917, "step": 400 }, { "epoch": 0.401, "grad_norm": 0.65625, "grad_norm_var": 0.012051836649576823, "learning_rate": 0.0001, "loss": 0.4504, "loss/crossentropy": 2.948604702949524, "loss/fcd": 0.337890625, "loss/logits": 0.1125587709248066, "step": 401 }, { "epoch": 0.402, "grad_norm": 0.671875, "grad_norm_var": 0.012079302469889324, "learning_rate": 0.0001, "loss": 0.4236, "loss/crossentropy": 2.953108549118042, "loss/fcd": 0.326171875, "loss/logits": 0.09738374873995781, "step": 402 }, { "epoch": 0.403, "grad_norm": 0.640625, "grad_norm_var": 0.010190582275390625, "learning_rate": 0.0001, "loss": 0.4453, "loss/crossentropy": 2.8220441341400146, "loss/fcd": 0.341796875, "loss/logits": 0.10346821323037148, "step": 403 }, { "epoch": 0.404, "grad_norm": 0.72265625, "grad_norm_var": 0.006712849934895833, "learning_rate": 0.0001, "loss": 0.4319, "loss/crossentropy": 2.9475324153900146, "loss/fcd": 0.3310546875, "loss/logits": 0.10082582384347916, "step": 404 }, { "epoch": 0.405, "grad_norm": 0.828125, "grad_norm_var": 0.008076922098795573, "learning_rate": 0.0001, "loss": 0.4165, "loss/crossentropy": 2.79871141910553, "loss/fcd": 0.3193359375, "loss/logits": 0.09713438898324966, "step": 405 }, { "epoch": 0.406, "grad_norm": 0.89453125, "grad_norm_var": 0.010715166727701822, "learning_rate": 0.0001, "loss": 0.425, "loss/crossentropy": 2.78818678855896, "loss/fcd": 0.3271484375, "loss/logits": 0.09782487899065018, "step": 406 }, { "epoch": 0.407, "grad_norm": 0.73046875, "grad_norm_var": 0.010449663798014323, "learning_rate": 0.0001, "loss": 0.4082, "loss/crossentropy": 2.843753933906555, "loss/fcd": 0.3154296875, "loss/logits": 0.09279356896877289, "step": 407 }, { "epoch": 0.408, "grad_norm": 0.5703125, "grad_norm_var": 0.011524454752604166, "learning_rate": 0.0001, "loss": 0.4299, "loss/crossentropy": 2.911107063293457, "loss/fcd": 0.330078125, "loss/logits": 0.09986836463212967, "step": 408 }, { "epoch": 0.409, "grad_norm": 0.61328125, "grad_norm_var": 0.011924235026041667, "learning_rate": 0.0001, "loss": 0.4285, "loss/crossentropy": 2.840214490890503, "loss/fcd": 0.3310546875, "loss/logits": 0.09741492196917534, "step": 409 }, { "epoch": 0.41, "grad_norm": 0.6015625, "grad_norm_var": 0.012294451395670572, "learning_rate": 0.0001, "loss": 0.4203, "loss/crossentropy": 2.8811668157577515, "loss/fcd": 0.322265625, "loss/logits": 0.09806715697050095, "step": 410 }, { "epoch": 0.411, "grad_norm": 0.59765625, "grad_norm_var": 0.012860107421875, "learning_rate": 0.0001, "loss": 0.4131, "loss/crossentropy": 2.6903880834579468, "loss/fcd": 0.322265625, "loss/logits": 0.09085888788104057, "step": 411 }, { "epoch": 0.412, "grad_norm": 0.75, "grad_norm_var": 0.008719825744628906, "learning_rate": 0.0001, "loss": 0.4222, "loss/crossentropy": 2.8993114233016968, "loss/fcd": 0.3251953125, "loss/logits": 0.09704534709453583, "step": 412 }, { "epoch": 0.413, "grad_norm": 0.73828125, "grad_norm_var": 0.008888498942057291, "learning_rate": 0.0001, "loss": 0.4299, "loss/crossentropy": 2.9046199321746826, "loss/fcd": 0.328125, "loss/logits": 0.10177389904856682, "step": 413 }, { "epoch": 0.414, "grad_norm": 0.734375, "grad_norm_var": 0.009067789713541666, "learning_rate": 0.0001, "loss": 0.4429, "loss/crossentropy": 2.827233672142029, "loss/fcd": 0.33984375, "loss/logits": 0.10300642624497414, "step": 414 }, { "epoch": 0.415, "grad_norm": 0.6484375, "grad_norm_var": 0.008514404296875, "learning_rate": 0.0001, "loss": 0.4356, "loss/crossentropy": 2.8794647455215454, "loss/fcd": 0.3359375, "loss/logits": 0.09965472295880318, "step": 415 }, { "epoch": 0.416, "grad_norm": 0.63671875, "grad_norm_var": 0.0077697118123372395, "learning_rate": 0.0001, "loss": 0.4302, "loss/crossentropy": 2.7547887563705444, "loss/fcd": 0.3330078125, "loss/logits": 0.09721554070711136, "step": 416 }, { "epoch": 0.417, "grad_norm": 0.59765625, "grad_norm_var": 0.008245595296223958, "learning_rate": 0.0001, "loss": 0.4237, "loss/crossentropy": 2.985141396522522, "loss/fcd": 0.3251953125, "loss/logits": 0.09846670553088188, "step": 417 }, { "epoch": 0.418, "grad_norm": 0.5859375, "grad_norm_var": 0.008869425455729166, "learning_rate": 0.0001, "loss": 0.4615, "loss/crossentropy": 2.7390782833099365, "loss/fcd": 0.3544921875, "loss/logits": 0.10705351829528809, "step": 418 }, { "epoch": 0.419, "grad_norm": 0.87890625, "grad_norm_var": 0.011145973205566406, "learning_rate": 0.0001, "loss": 0.4159, "loss/crossentropy": 2.7943382263183594, "loss/fcd": 0.32421875, "loss/logits": 0.09164649993181229, "step": 419 }, { "epoch": 0.42, "grad_norm": 0.59375, "grad_norm_var": 0.01171875, "learning_rate": 0.0001, "loss": 0.4377, "loss/crossentropy": 2.912733554840088, "loss/fcd": 0.3359375, "loss/logits": 0.10175374150276184, "step": 420 }, { "epoch": 0.421, "grad_norm": 0.625, "grad_norm_var": 0.0104888916015625, "learning_rate": 0.0001, "loss": 0.4021, "loss/crossentropy": 2.6603671312332153, "loss/fcd": 0.3125, "loss/logits": 0.08958860114216805, "step": 421 }, { "epoch": 0.422, "grad_norm": 0.84375, "grad_norm_var": 0.009162330627441406, "learning_rate": 0.0001, "loss": 0.4461, "loss/crossentropy": 2.68258535861969, "loss/fcd": 0.34375, "loss/logits": 0.10239701718091965, "step": 422 }, { "epoch": 0.423, "grad_norm": 0.7734375, "grad_norm_var": 0.009614817301432292, "learning_rate": 0.0001, "loss": 0.4318, "loss/crossentropy": 2.7980507612228394, "loss/fcd": 0.330078125, "loss/logits": 0.10173183679580688, "step": 423 }, { "epoch": 0.424, "grad_norm": 0.75390625, "grad_norm_var": 0.009175554911295573, "learning_rate": 0.0001, "loss": 0.512, "loss/crossentropy": 2.661671280860901, "loss/fcd": 0.3876953125, "loss/logits": 0.12431911006569862, "step": 424 }, { "epoch": 0.425, "grad_norm": 0.84765625, "grad_norm_var": 0.010342852274576823, "learning_rate": 0.0001, "loss": 0.4684, "loss/crossentropy": 2.962968587875366, "loss/fcd": 0.3515625, "loss/logits": 0.11681900918483734, "step": 425 }, { "epoch": 0.426, "grad_norm": 0.74609375, "grad_norm_var": 0.009742991129557291, "learning_rate": 0.0001, "loss": 0.4187, "loss/crossentropy": 2.812212586402893, "loss/fcd": 0.322265625, "loss/logits": 0.09639938920736313, "step": 426 }, { "epoch": 0.427, "grad_norm": 0.5625, "grad_norm_var": 0.010344378153483073, "learning_rate": 0.0001, "loss": 0.422, "loss/crossentropy": 2.7403759956359863, "loss/fcd": 0.328125, "loss/logits": 0.09389032050967216, "step": 427 }, { "epoch": 0.428, "grad_norm": 0.62109375, "grad_norm_var": 0.010648600260416667, "learning_rate": 0.0001, "loss": 0.4535, "loss/crossentropy": 2.9029101133346558, "loss/fcd": 0.3466796875, "loss/logits": 0.10677453130483627, "step": 428 }, { "epoch": 0.429, "grad_norm": 0.625, "grad_norm_var": 0.010860633850097657, "learning_rate": 0.0001, "loss": 0.4213, "loss/crossentropy": 2.9899282455444336, "loss/fcd": 0.3251953125, "loss/logits": 0.09613670036196709, "step": 429 }, { "epoch": 0.43, "grad_norm": 0.66015625, "grad_norm_var": 0.010786946614583333, "learning_rate": 0.0001, "loss": 0.4325, "loss/crossentropy": 2.871562957763672, "loss/fcd": 0.3349609375, "loss/logits": 0.09752309322357178, "step": 430 }, { "epoch": 0.431, "grad_norm": 0.68359375, "grad_norm_var": 0.010681088765462239, "learning_rate": 0.0001, "loss": 0.4306, "loss/crossentropy": 2.6593871116638184, "loss/fcd": 0.333984375, "loss/logits": 0.09662756696343422, "step": 431 }, { "epoch": 0.432, "grad_norm": 0.7421875, "grad_norm_var": 0.010631306966145834, "learning_rate": 0.0001, "loss": 0.4289, "loss/crossentropy": 2.8526699542999268, "loss/fcd": 0.3310546875, "loss/logits": 0.09788180887699127, "step": 432 }, { "epoch": 0.433, "grad_norm": 0.8203125, "grad_norm_var": 0.01080163319905599, "learning_rate": 0.0001, "loss": 0.447, "loss/crossentropy": 2.845671057701111, "loss/fcd": 0.341796875, "loss/logits": 0.10521851852536201, "step": 433 }, { "epoch": 0.434, "grad_norm": 0.734375, "grad_norm_var": 0.009719276428222656, "learning_rate": 0.0001, "loss": 0.4569, "loss/crossentropy": 2.7953325510025024, "loss/fcd": 0.349609375, "loss/logits": 0.10731937736272812, "step": 434 }, { "epoch": 0.435, "grad_norm": 0.71484375, "grad_norm_var": 0.007914161682128907, "learning_rate": 0.0001, "loss": 0.4052, "loss/crossentropy": 2.741719603538513, "loss/fcd": 0.31640625, "loss/logits": 0.08881807327270508, "step": 435 }, { "epoch": 0.436, "grad_norm": 0.6328125, "grad_norm_var": 0.00740807851155599, "learning_rate": 0.0001, "loss": 0.4394, "loss/crossentropy": 2.8444197177886963, "loss/fcd": 0.337890625, "loss/logits": 0.10151328146457672, "step": 436 }, { "epoch": 0.437, "grad_norm": 0.63671875, "grad_norm_var": 0.007281239827473958, "learning_rate": 0.0001, "loss": 0.4549, "loss/crossentropy": 2.6685785055160522, "loss/fcd": 0.3525390625, "loss/logits": 0.1023550033569336, "step": 437 }, { "epoch": 0.438, "grad_norm": 0.74609375, "grad_norm_var": 0.006167030334472657, "learning_rate": 0.0001, "loss": 0.4555, "loss/crossentropy": 2.8580113649368286, "loss/fcd": 0.34375, "loss/logits": 0.11170928180217743, "step": 438 }, { "epoch": 0.439, "grad_norm": 0.765625, "grad_norm_var": 0.006100908915201823, "learning_rate": 0.0001, "loss": 0.4679, "loss/crossentropy": 2.7053956985473633, "loss/fcd": 0.357421875, "loss/logits": 0.11045508086681366, "step": 439 }, { "epoch": 0.44, "grad_norm": 0.83203125, "grad_norm_var": 0.006983375549316407, "learning_rate": 0.0001, "loss": 0.4574, "loss/crossentropy": 3.011350154876709, "loss/fcd": 0.3515625, "loss/logits": 0.10579814016819, "step": 440 }, { "epoch": 0.441, "grad_norm": 0.9921875, "grad_norm_var": 0.0109283447265625, "learning_rate": 0.0001, "loss": 0.4559, "loss/crossentropy": 2.850973963737488, "loss/fcd": 0.3486328125, "loss/logits": 0.10725782811641693, "step": 441 }, { "epoch": 0.442, "grad_norm": 1.265625, "grad_norm_var": 0.029624366760253908, "learning_rate": 0.0001, "loss": 0.4703, "loss/crossentropy": 2.9250357151031494, "loss/fcd": 0.35546875, "loss/logits": 0.11481418460607529, "step": 442 }, { "epoch": 0.443, "grad_norm": 0.86328125, "grad_norm_var": 0.027671051025390626, "learning_rate": 0.0001, "loss": 0.4503, "loss/crossentropy": 2.8306068181991577, "loss/fcd": 0.34375, "loss/logits": 0.10658934339880943, "step": 443 }, { "epoch": 0.444, "grad_norm": 0.9296875, "grad_norm_var": 0.02745507558186849, "learning_rate": 0.0001, "loss": 0.4421, "loss/crossentropy": 2.835876703262329, "loss/fcd": 0.3359375, "loss/logits": 0.10615797340869904, "step": 444 }, { "epoch": 0.445, "grad_norm": 0.9375, "grad_norm_var": 0.026671791076660158, "learning_rate": 0.0001, "loss": 0.4746, "loss/crossentropy": 2.862402319908142, "loss/fcd": 0.36328125, "loss/logits": 0.11136271059513092, "step": 445 }, { "epoch": 0.446, "grad_norm": 0.796875, "grad_norm_var": 0.025111897786458334, "learning_rate": 0.0001, "loss": 0.4353, "loss/crossentropy": 2.6523581743240356, "loss/fcd": 0.3369140625, "loss/logits": 0.09838449582457542, "step": 446 }, { "epoch": 0.447, "grad_norm": 0.6875, "grad_norm_var": 0.02504266103108724, "learning_rate": 0.0001, "loss": 0.4331, "loss/crossentropy": 2.8293275833129883, "loss/fcd": 0.3369140625, "loss/logits": 0.09622528776526451, "step": 447 }, { "epoch": 0.448, "grad_norm": 0.62109375, "grad_norm_var": 0.02719294230143229, "learning_rate": 0.0001, "loss": 0.4363, "loss/crossentropy": 2.827236771583557, "loss/fcd": 0.341796875, "loss/logits": 0.09454872459173203, "step": 448 }, { "epoch": 0.449, "grad_norm": 0.65234375, "grad_norm_var": 0.028748512268066406, "learning_rate": 0.0001, "loss": 0.4322, "loss/crossentropy": 2.9126251935958862, "loss/fcd": 0.3310546875, "loss/logits": 0.10111148655414581, "step": 449 }, { "epoch": 0.45, "grad_norm": 0.8203125, "grad_norm_var": 0.028451983133951822, "learning_rate": 0.0001, "loss": 0.4393, "loss/crossentropy": 2.932951331138611, "loss/fcd": 0.3359375, "loss/logits": 0.10334927961230278, "step": 450 }, { "epoch": 0.451, "grad_norm": 0.83203125, "grad_norm_var": 0.027887407938639322, "learning_rate": 0.0001, "loss": 0.4458, "loss/crossentropy": 2.7913864850997925, "loss/fcd": 0.341796875, "loss/logits": 0.1039762981235981, "step": 451 }, { "epoch": 0.452, "grad_norm": 0.70703125, "grad_norm_var": 0.02644627888997396, "learning_rate": 0.0001, "loss": 0.4415, "loss/crossentropy": 2.8713871240615845, "loss/fcd": 0.3408203125, "loss/logits": 0.10065218433737755, "step": 452 }, { "epoch": 0.453, "grad_norm": 1.0234375, "grad_norm_var": 0.02645257314046224, "learning_rate": 0.0001, "loss": 0.5023, "loss/crossentropy": 2.884295105934143, "loss/fcd": 0.36328125, "loss/logits": 0.1390550322830677, "step": 453 }, { "epoch": 0.454, "grad_norm": 0.70703125, "grad_norm_var": 0.02704766591389974, "learning_rate": 0.0001, "loss": 0.4461, "loss/crossentropy": 2.7804571390151978, "loss/fcd": 0.3447265625, "loss/logits": 0.10132800415158272, "step": 454 }, { "epoch": 0.455, "grad_norm": 0.7578125, "grad_norm_var": 0.02712853749593099, "learning_rate": 0.0001, "loss": 0.441, "loss/crossentropy": 2.756545662879944, "loss/fcd": 0.337890625, "loss/logits": 0.10309552773833275, "step": 455 }, { "epoch": 0.456, "grad_norm": 0.7890625, "grad_norm_var": 0.02728449503580729, "learning_rate": 0.0001, "loss": 0.4314, "loss/crossentropy": 2.8818628787994385, "loss/fcd": 0.3310546875, "loss/logits": 0.10030314698815346, "step": 456 }, { "epoch": 0.457, "grad_norm": 0.73046875, "grad_norm_var": 0.026130104064941408, "learning_rate": 0.0001, "loss": 0.4104, "loss/crossentropy": 2.8387563228607178, "loss/fcd": 0.3203125, "loss/logits": 0.09010699018836021, "step": 457 }, { "epoch": 0.458, "grad_norm": 0.78515625, "grad_norm_var": 0.0120147705078125, "learning_rate": 0.0001, "loss": 0.4569, "loss/crossentropy": 2.864213466644287, "loss/fcd": 0.3525390625, "loss/logits": 0.10435482114553452, "step": 458 }, { "epoch": 0.459, "grad_norm": 0.79296875, "grad_norm_var": 0.011637115478515625, "learning_rate": 0.0001, "loss": 0.423, "loss/crossentropy": 2.761037230491638, "loss/fcd": 0.3291015625, "loss/logits": 0.09392649680376053, "step": 459 }, { "epoch": 0.46, "grad_norm": 0.73828125, "grad_norm_var": 0.01025079091389974, "learning_rate": 0.0001, "loss": 0.4241, "loss/crossentropy": 2.9691224098205566, "loss/fcd": 0.3291015625, "loss/logits": 0.09495318681001663, "step": 460 }, { "epoch": 0.461, "grad_norm": 0.6484375, "grad_norm_var": 0.009159278869628907, "learning_rate": 0.0001, "loss": 0.4355, "loss/crossentropy": 2.914411187171936, "loss/fcd": 0.3359375, "loss/logits": 0.09953488409519196, "step": 461 }, { "epoch": 0.462, "grad_norm": 0.8203125, "grad_norm_var": 0.009322547912597656, "learning_rate": 0.0001, "loss": 0.4917, "loss/crossentropy": 3.1634334325790405, "loss/fcd": 0.373046875, "loss/logits": 0.11867117509245872, "step": 462 }, { "epoch": 0.463, "grad_norm": 0.6953125, "grad_norm_var": 0.009253883361816406, "learning_rate": 0.0001, "loss": 0.4236, "loss/crossentropy": 2.865732789039612, "loss/fcd": 0.3271484375, "loss/logits": 0.09641865640878677, "step": 463 }, { "epoch": 0.464, "grad_norm": 0.63671875, "grad_norm_var": 0.008984820048014323, "learning_rate": 0.0001, "loss": 0.4393, "loss/crossentropy": 2.7939807176589966, "loss/fcd": 0.3369140625, "loss/logits": 0.10240520909428596, "step": 464 }, { "epoch": 0.465, "grad_norm": 0.65625, "grad_norm_var": 0.008930460611979166, "learning_rate": 0.0001, "loss": 0.4451, "loss/crossentropy": 2.7904140949249268, "loss/fcd": 0.341796875, "loss/logits": 0.10333988070487976, "step": 465 }, { "epoch": 0.466, "grad_norm": 0.71484375, "grad_norm_var": 0.008760515848795574, "learning_rate": 0.0001, "loss": 0.4413, "loss/crossentropy": 2.782050848007202, "loss/fcd": 0.33984375, "loss/logits": 0.10147681087255478, "step": 466 }, { "epoch": 0.467, "grad_norm": 0.75390625, "grad_norm_var": 0.008310381571451824, "learning_rate": 0.0001, "loss": 0.4635, "loss/crossentropy": 2.73897647857666, "loss/fcd": 0.3486328125, "loss/logits": 0.11481858789920807, "step": 467 }, { "epoch": 0.468, "grad_norm": 0.8203125, "grad_norm_var": 0.008503977457682292, "learning_rate": 0.0001, "loss": 0.4645, "loss/crossentropy": 2.7576816082000732, "loss/fcd": 0.353515625, "loss/logits": 0.11100797355175018, "step": 468 }, { "epoch": 0.469, "grad_norm": 0.953125, "grad_norm_var": 0.0062906901041666664, "learning_rate": 0.0001, "loss": 0.5082, "loss/crossentropy": 2.9028536081314087, "loss/fcd": 0.388671875, "loss/logits": 0.11947885528206825, "step": 469 }, { "epoch": 0.47, "grad_norm": 0.8046875, "grad_norm_var": 0.006327247619628907, "learning_rate": 0.0001, "loss": 0.4491, "loss/crossentropy": 2.803042411804199, "loss/fcd": 0.3466796875, "loss/logits": 0.10240250080823898, "step": 470 }, { "epoch": 0.471, "grad_norm": 0.6875, "grad_norm_var": 0.006620216369628906, "learning_rate": 0.0001, "loss": 0.4412, "loss/crossentropy": 2.86372172832489, "loss/fcd": 0.3408203125, "loss/logits": 0.10040333867073059, "step": 471 }, { "epoch": 0.472, "grad_norm": 0.67578125, "grad_norm_var": 0.006858062744140625, "learning_rate": 0.0001, "loss": 0.4593, "loss/crossentropy": 2.8334121704101562, "loss/fcd": 0.3505859375, "loss/logits": 0.10868023708462715, "step": 472 }, { "epoch": 0.473, "grad_norm": 1.9453125, "grad_norm_var": 0.09680474599202474, "learning_rate": 0.0001, "loss": 0.5875, "loss/crossentropy": 2.7985202074050903, "loss/fcd": 0.4072265625, "loss/logits": 0.18022966012358665, "step": 473 }, { "epoch": 0.474, "grad_norm": 1.3515625, "grad_norm_var": 0.11418228149414063, "learning_rate": 0.0001, "loss": 0.5436, "loss/crossentropy": 3.055509328842163, "loss/fcd": 0.392578125, "loss/logits": 0.15097329765558243, "step": 474 }, { "epoch": 0.475, "grad_norm": 1.0546875, "grad_norm_var": 0.11626529693603516, "learning_rate": 0.0001, "loss": 0.4853, "loss/crossentropy": 2.7848037481307983, "loss/fcd": 0.365234375, "loss/logits": 0.12005244940519333, "step": 475 }, { "epoch": 0.476, "grad_norm": 0.98828125, "grad_norm_var": 0.11570377349853515, "learning_rate": 0.0001, "loss": 0.506, "loss/crossentropy": 3.017175316810608, "loss/fcd": 0.380859375, "loss/logits": 0.1251136139035225, "step": 476 }, { "epoch": 0.477, "grad_norm": 1.2734375, "grad_norm_var": 0.12015933990478515, "learning_rate": 0.0001, "loss": 0.5155, "loss/crossentropy": 2.6267424821853638, "loss/fcd": 0.388671875, "loss/logits": 0.1268685646355152, "step": 477 }, { "epoch": 0.478, "grad_norm": 0.9453125, "grad_norm_var": 0.11935774485270183, "learning_rate": 0.0001, "loss": 0.4556, "loss/crossentropy": 2.7268201112747192, "loss/fcd": 0.353515625, "loss/logits": 0.10206079855561256, "step": 478 }, { "epoch": 0.479, "grad_norm": 0.84375, "grad_norm_var": 0.11599470774332682, "learning_rate": 0.0001, "loss": 0.4605, "loss/crossentropy": 2.879013180732727, "loss/fcd": 0.353515625, "loss/logits": 0.10701316967606544, "step": 479 }, { "epoch": 0.48, "grad_norm": 0.78515625, "grad_norm_var": 0.11128838857014973, "learning_rate": 0.0001, "loss": 0.4878, "loss/crossentropy": 2.570293426513672, "loss/fcd": 0.3701171875, "loss/logits": 0.11772741004824638, "step": 480 }, { "epoch": 0.481, "grad_norm": 0.765625, "grad_norm_var": 0.10770308176676432, "learning_rate": 0.0001, "loss": 0.4587, "loss/crossentropy": 2.846108078956604, "loss/fcd": 0.349609375, "loss/logits": 0.10910140722990036, "step": 481 }, { "epoch": 0.482, "grad_norm": 0.9453125, "grad_norm_var": 0.10348307291666667, "learning_rate": 0.0001, "loss": 0.495, "loss/crossentropy": 3.0078119039535522, "loss/fcd": 0.3701171875, "loss/logits": 0.12487388029694557, "step": 482 }, { "epoch": 0.483, "grad_norm": 0.9296875, "grad_norm_var": 0.10024153391520182, "learning_rate": 0.0001, "loss": 0.4598, "loss/crossentropy": 2.8971179723739624, "loss/fcd": 0.3486328125, "loss/logits": 0.11118747293949127, "step": 483 }, { "epoch": 0.484, "grad_norm": 0.81640625, "grad_norm_var": 0.10032857259114583, "learning_rate": 0.0001, "loss": 0.4545, "loss/crossentropy": 2.7730342149734497, "loss/fcd": 0.353515625, "loss/logits": 0.10101194679737091, "step": 484 }, { "epoch": 0.485, "grad_norm": 0.73046875, "grad_norm_var": 0.10438378651936848, "learning_rate": 0.0001, "loss": 0.4642, "loss/crossentropy": 2.933692455291748, "loss/fcd": 0.3583984375, "loss/logits": 0.10578196868300438, "step": 485 }, { "epoch": 0.486, "grad_norm": 0.75, "grad_norm_var": 0.10578657786051432, "learning_rate": 0.0001, "loss": 0.4646, "loss/crossentropy": 2.8260008096694946, "loss/fcd": 0.35546875, "loss/logits": 0.1091219000518322, "step": 486 }, { "epoch": 0.487, "grad_norm": 0.765625, "grad_norm_var": 0.10324598948160807, "learning_rate": 0.0001, "loss": 0.4442, "loss/crossentropy": 2.8580541610717773, "loss/fcd": 0.33984375, "loss/logits": 0.10437064245343208, "step": 487 }, { "epoch": 0.488, "grad_norm": 0.70703125, "grad_norm_var": 0.10206902821858724, "learning_rate": 0.0001, "loss": 0.4382, "loss/crossentropy": 2.885003685951233, "loss/fcd": 0.337890625, "loss/logits": 0.10032079368829727, "step": 488 }, { "epoch": 0.489, "grad_norm": 0.79296875, "grad_norm_var": 0.035955556233723956, "learning_rate": 0.0001, "loss": 0.4776, "loss/crossentropy": 2.99890398979187, "loss/fcd": 0.3603515625, "loss/logits": 0.11722472310066223, "step": 489 }, { "epoch": 0.49, "grad_norm": 0.73828125, "grad_norm_var": 0.022769610087076824, "learning_rate": 0.0001, "loss": 0.4513, "loss/crossentropy": 2.87681782245636, "loss/fcd": 0.345703125, "loss/logits": 0.10560647398233414, "step": 490 }, { "epoch": 0.491, "grad_norm": 0.7265625, "grad_norm_var": 0.02117811838785807, "learning_rate": 0.0001, "loss": 0.4427, "loss/crossentropy": 2.8826026916503906, "loss/fcd": 0.33984375, "loss/logits": 0.1029045432806015, "step": 491 }, { "epoch": 0.492, "grad_norm": 0.78515625, "grad_norm_var": 0.019849077860514323, "learning_rate": 0.0001, "loss": 0.4723, "loss/crossentropy": 2.90323805809021, "loss/fcd": 0.365234375, "loss/logits": 0.10705263912677765, "step": 492 }, { "epoch": 0.493, "grad_norm": 0.93359375, "grad_norm_var": 0.007033030192057292, "learning_rate": 0.0001, "loss": 0.5298, "loss/crossentropy": 2.849150061607361, "loss/fcd": 0.373046875, "loss/logits": 0.15670844912528992, "step": 493 }, { "epoch": 0.494, "grad_norm": 0.84765625, "grad_norm_var": 0.005867958068847656, "learning_rate": 0.0001, "loss": 0.4689, "loss/crossentropy": 2.828599691390991, "loss/fcd": 0.3564453125, "loss/logits": 0.11246544495224953, "step": 494 }, { "epoch": 0.495, "grad_norm": 0.87109375, "grad_norm_var": 0.006059773763020833, "learning_rate": 0.0001, "loss": 0.4789, "loss/crossentropy": 2.940881133079529, "loss/fcd": 0.359375, "loss/logits": 0.11956067010760307, "step": 495 }, { "epoch": 0.496, "grad_norm": 0.95703125, "grad_norm_var": 0.007436116536458333, "learning_rate": 0.0001, "loss": 0.4811, "loss/crossentropy": 2.7961113452911377, "loss/fcd": 0.3642578125, "loss/logits": 0.11681519448757172, "step": 496 }, { "epoch": 0.497, "grad_norm": 0.88671875, "grad_norm_var": 0.007532691955566407, "learning_rate": 0.0001, "loss": 0.4886, "loss/crossentropy": 2.9191471338272095, "loss/fcd": 0.3623046875, "loss/logits": 0.12631087005138397, "step": 497 }, { "epoch": 0.498, "grad_norm": 0.74609375, "grad_norm_var": 0.0067901611328125, "learning_rate": 0.0001, "loss": 0.4617, "loss/crossentropy": 2.8428783416748047, "loss/fcd": 0.3466796875, "loss/logits": 0.11499987542629242, "step": 498 }, { "epoch": 0.499, "grad_norm": 0.640625, "grad_norm_var": 0.007458241780598959, "learning_rate": 0.0001, "loss": 0.4178, "loss/crossentropy": 2.779066324234009, "loss/fcd": 0.322265625, "loss/logits": 0.09556294605135918, "step": 499 }, { "epoch": 0.5, "grad_norm": 0.87890625, "grad_norm_var": 0.007893625895182292, "learning_rate": 0.0001, "loss": 0.5152, "loss/crossentropy": 2.9425406455993652, "loss/fcd": 0.380859375, "loss/logits": 0.13436781987547874, "step": 500 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2917680717824e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }