{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006350819255683983, "eval_steps": 1000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.1754096278419914e-06, "grad_norm": 0.427734375, "learning_rate": 0.01, "loss": 1.4779, "loss/crossentropy": 2.7208712100982666, "loss/fcd": 1.25390625, "loss/logits": 0.3100138008594513, "step": 1 }, { "epoch": 6.350819255683983e-06, "grad_norm": 0.3671875, "learning_rate": 0.01, "loss": 1.4388, "loss/crossentropy": 2.782768964767456, "loss/fcd": 1.23046875, "loss/logits": 0.32207590341567993, "step": 2 }, { "epoch": 9.526228883525975e-06, "grad_norm": 0.349609375, "learning_rate": 0.01, "loss": 1.4326, "loss/crossentropy": 2.3647608757019043, "loss/fcd": 1.14453125, "loss/logits": 0.2745417281985283, "step": 3 }, { "epoch": 1.2701638511367966e-05, "grad_norm": 0.439453125, "learning_rate": 0.01, "loss": 1.5644, "loss/crossentropy": 2.9222898483276367, "loss/fcd": 1.44140625, "loss/logits": 0.43140920996665955, "step": 4 }, { "epoch": 1.587704813920996e-05, "grad_norm": 0.337890625, "learning_rate": 0.01, "loss": 1.4104, "loss/crossentropy": 2.522671341896057, "loss/fcd": 1.21484375, "loss/logits": 0.3076182156801224, "step": 5 }, { "epoch": 1.905245776705195e-05, "grad_norm": 0.427734375, "learning_rate": 0.01, "loss": 1.5088, "loss/crossentropy": 2.7782797813415527, "loss/fcd": 1.328125, "loss/logits": 0.393465518951416, "step": 6 }, { "epoch": 2.222786739489394e-05, "grad_norm": 0.474609375, "learning_rate": 0.01, "loss": 1.4087, "loss/crossentropy": 2.718917727470398, "loss/fcd": 1.16015625, "loss/logits": 0.26949192583560944, "step": 7 }, { "epoch": 2.540327702273593e-05, "grad_norm": 0.37109375, "learning_rate": 0.01, "loss": 1.451, "loss/crossentropy": 2.461330533027649, "loss/fcd": 1.18359375, "loss/logits": 0.25975215435028076, "step": 8 }, { "epoch": 2.8578686650577926e-05, "grad_norm": 0.361328125, "learning_rate": 0.01, "loss": 1.449, "loss/crossentropy": 2.5565720796585083, "loss/fcd": 1.140625, "loss/logits": 0.2803102284669876, "step": 9 }, { "epoch": 3.175409627841992e-05, "grad_norm": 0.408203125, "learning_rate": 0.01, "loss": 1.4704, "loss/crossentropy": 2.6353514194488525, "loss/fcd": 1.1796875, "loss/logits": 0.27618807554244995, "step": 10 }, { "epoch": 3.492950590626191e-05, "grad_norm": 0.365234375, "learning_rate": 0.01, "loss": 1.4067, "loss/crossentropy": 2.4728065729141235, "loss/fcd": 1.109375, "loss/logits": 0.25887319445610046, "step": 11 }, { "epoch": 3.81049155341039e-05, "grad_norm": 0.359375, "learning_rate": 0.01, "loss": 1.4828, "loss/crossentropy": 2.481222152709961, "loss/fcd": 1.2734375, "loss/logits": 0.2573816031217575, "step": 12 }, { "epoch": 4.128032516194589e-05, "grad_norm": 0.3359375, "learning_rate": 0.01, "loss": 1.4614, "loss/crossentropy": 2.431252956390381, "loss/fcd": 1.15625, "loss/logits": 0.2829872667789459, "step": 13 }, { "epoch": 4.445573478978788e-05, "grad_norm": 0.33984375, "learning_rate": 0.01, "loss": 1.3983, "loss/crossentropy": 2.593013644218445, "loss/fcd": 1.140625, "loss/logits": 0.2456735298037529, "step": 14 }, { "epoch": 4.763114441762987e-05, "grad_norm": 0.359375, "learning_rate": 0.01, "loss": 1.429, "loss/crossentropy": 2.5329941511154175, "loss/fcd": 1.125, "loss/logits": 0.24994614720344543, "step": 15 }, { "epoch": 5.080655404547186e-05, "grad_norm": 0.33203125, "grad_norm_var": 0.0018677870432535807, "learning_rate": 0.01, "loss": 1.4013, "loss/crossentropy": 2.519964098930359, "loss/fcd": 1.1484375, "loss/logits": 0.24906611442565918, "step": 16 }, { "epoch": 5.398196367331386e-05, "grad_norm": 0.3203125, "grad_norm_var": 0.0018843968709309896, "learning_rate": 0.01, "loss": 1.4593, "loss/crossentropy": 2.5781267881393433, "loss/fcd": 1.140625, "loss/logits": 0.24220598489046097, "step": 17 }, { "epoch": 5.715737330115585e-05, "grad_norm": 0.326171875, "grad_norm_var": 0.002014907201131185, "learning_rate": 0.01, "loss": 1.4073, "loss/crossentropy": 1.85208261013031, "loss/fcd": 1.029296875, "loss/logits": 0.2166135385632515, "step": 18 }, { "epoch": 6.033278292899784e-05, "grad_norm": 0.349609375, "grad_norm_var": 0.002014907201131185, "learning_rate": 0.01, "loss": 1.4266, "loss/crossentropy": 2.798880457878113, "loss/fcd": 1.25390625, "loss/logits": 0.2895851135253906, "step": 19 }, { "epoch": 6.350819255683983e-05, "grad_norm": 0.3046875, "grad_norm_var": 0.0018887837727864583, "learning_rate": 0.01, "loss": 1.4315, "loss/crossentropy": 2.4751474857330322, "loss/fcd": 1.1484375, "loss/logits": 0.25777776539325714, "step": 20 }, { "epoch": 6.668360218468182e-05, "grad_norm": 0.3515625, "grad_norm_var": 0.0018586317698160808, "learning_rate": 0.01, "loss": 1.5195, "loss/crossentropy": 2.7463793754577637, "loss/fcd": 1.20703125, "loss/logits": 0.28716035187244415, "step": 21 }, { "epoch": 6.985901181252382e-05, "grad_norm": 0.32421875, "grad_norm_var": 0.0016168594360351563, "learning_rate": 0.01, "loss": 1.4227, "loss/crossentropy": 2.59697163105011, "loss/fcd": 1.20703125, "loss/logits": 0.2831689566373825, "step": 22 }, { "epoch": 7.30344214403658e-05, "grad_norm": 0.33203125, "grad_norm_var": 0.0006178379058837891, "learning_rate": 0.01, "loss": 1.4379, "loss/crossentropy": 2.778831362724304, "loss/fcd": 1.20703125, "loss/logits": 0.29210230708122253, "step": 23 }, { "epoch": 7.62098310682078e-05, "grad_norm": 0.337890625, "grad_norm_var": 0.0005770365397135417, "learning_rate": 0.01, "loss": 1.4904, "loss/crossentropy": 2.7090160846710205, "loss/fcd": 1.16796875, "loss/logits": 0.24318455159664154, "step": 24 }, { "epoch": 7.938524069604979e-05, "grad_norm": 0.353515625, "grad_norm_var": 0.00056304931640625, "learning_rate": 0.01, "loss": 1.5222, "loss/crossentropy": 2.7422226667404175, "loss/fcd": 1.20703125, "loss/logits": 0.28261250257492065, "step": 25 }, { "epoch": 8.256065032389178e-05, "grad_norm": 0.412109375, "grad_norm_var": 0.0005975723266601563, "learning_rate": 0.01, "loss": 1.5148, "loss/crossentropy": 2.681079149246216, "loss/fcd": 1.21484375, "loss/logits": 0.28427667915821075, "step": 26 }, { "epoch": 8.573605995173377e-05, "grad_norm": 0.353515625, "grad_norm_var": 0.000572967529296875, "learning_rate": 0.01, "loss": 1.4487, "loss/crossentropy": 2.6917777061462402, "loss/fcd": 1.1796875, "loss/logits": 0.24098193645477295, "step": 27 }, { "epoch": 8.891146957957576e-05, "grad_norm": 0.328125, "grad_norm_var": 0.000566864013671875, "learning_rate": 0.01, "loss": 1.4421, "loss/crossentropy": 2.5494598150253296, "loss/fcd": 1.16015625, "loss/logits": 0.2880377471446991, "step": 28 }, { "epoch": 9.208687920741775e-05, "grad_norm": 0.333984375, "grad_norm_var": 0.0005685011545817057, "learning_rate": 0.01, "loss": 1.3913, "loss/crossentropy": 2.0876787304878235, "loss/fcd": 1.046875, "loss/logits": 0.2351987585425377, "step": 29 }, { "epoch": 9.526228883525974e-05, "grad_norm": 0.375, "grad_norm_var": 0.0006394545237223307, "learning_rate": 0.01, "loss": 1.5336, "loss/crossentropy": 2.380417227745056, "loss/fcd": 1.29296875, "loss/logits": 0.24527715146541595, "step": 30 }, { "epoch": 9.843769846310173e-05, "grad_norm": 0.349609375, "grad_norm_var": 0.0006245930989583333, "learning_rate": 0.01, "loss": 1.4812, "loss/crossentropy": 2.5255520343780518, "loss/fcd": 1.1953125, "loss/logits": 0.2832820862531662, "step": 31 }, { "epoch": 0.00010161310809094373, "grad_norm": 0.34765625, "grad_norm_var": 0.0006174723307291667, "learning_rate": 0.01, "loss": 1.4833, "loss/crossentropy": 2.7139992713928223, "loss/fcd": 1.234375, "loss/logits": 0.29594743251800537, "step": 32 }, { "epoch": 0.00010478851771878573, "grad_norm": 0.365234375, "grad_norm_var": 0.0006032148996988932, "learning_rate": 0.01, "loss": 1.4865, "loss/crossentropy": 2.5841017961502075, "loss/fcd": 1.16796875, "loss/logits": 0.2654271051287651, "step": 33 }, { "epoch": 0.00010796392734662772, "grad_norm": 0.361328125, "grad_norm_var": 0.0005849043528238933, "learning_rate": 0.01, "loss": 1.5301, "loss/crossentropy": 2.4794301986694336, "loss/fcd": 1.19921875, "loss/logits": 0.3082548677921295, "step": 34 }, { "epoch": 0.00011113933697446971, "grad_norm": 0.333984375, "grad_norm_var": 0.0005983829498291016, "learning_rate": 0.01, "loss": 1.4986, "loss/crossentropy": 2.325202703475952, "loss/fcd": 1.2421875, "loss/logits": 0.2768325060606003, "step": 35 }, { "epoch": 0.0001143147466023117, "grad_norm": 0.375, "grad_norm_var": 0.0005033969879150391, "learning_rate": 0.01, "loss": 1.5577, "loss/crossentropy": 2.5683467388153076, "loss/fcd": 1.19140625, "loss/logits": 0.27995897829532623, "step": 36 }, { "epoch": 0.0001174901562301537, "grad_norm": 0.31640625, "grad_norm_var": 0.0005835056304931641, "learning_rate": 0.01, "loss": 1.4163, "loss/crossentropy": 2.612117886543274, "loss/fcd": 1.1171875, "loss/logits": 0.2647635191679001, "step": 37 }, { "epoch": 0.00012066556585799568, "grad_norm": 0.337890625, "grad_norm_var": 0.0005482355753580729, "learning_rate": 0.01, "loss": 1.4827, "loss/crossentropy": 2.651753783226013, "loss/fcd": 1.171875, "loss/logits": 0.28075091540813446, "step": 38 }, { "epoch": 0.00012384097548583768, "grad_norm": 0.36328125, "grad_norm_var": 0.0005309422810872396, "learning_rate": 0.01, "loss": 1.5322, "loss/crossentropy": 2.570213198661804, "loss/fcd": 1.25390625, "loss/logits": 0.3050275444984436, "step": 39 }, { "epoch": 0.00012701638511367967, "grad_norm": 0.4140625, "grad_norm_var": 0.0007423241933186849, "learning_rate": 0.01, "loss": 1.5809, "loss/crossentropy": 3.0379934310913086, "loss/fcd": 1.3984375, "loss/logits": 0.3556653559207916, "step": 40 }, { "epoch": 0.00013019179474152166, "grad_norm": 0.330078125, "grad_norm_var": 0.0007892449696858724, "learning_rate": 0.01, "loss": 1.4382, "loss/crossentropy": 2.6215317249298096, "loss/fcd": 1.109375, "loss/logits": 0.2600295692682266, "step": 41 }, { "epoch": 0.00013336720436936365, "grad_norm": 0.32421875, "grad_norm_var": 0.0006154378255208334, "learning_rate": 0.01, "loss": 1.4789, "loss/crossentropy": 2.7147910594940186, "loss/fcd": 1.16796875, "loss/logits": 0.26684051752090454, "step": 42 }, { "epoch": 0.00013654261399720564, "grad_norm": 0.3046875, "grad_norm_var": 0.0007453759511311849, "learning_rate": 0.01, "loss": 1.4962, "loss/crossentropy": 2.52204692363739, "loss/fcd": 1.17578125, "loss/logits": 0.29763706028461456, "step": 43 }, { "epoch": 0.00013971802362504763, "grad_norm": 0.369140625, "grad_norm_var": 0.0007443745930989583, "learning_rate": 0.01, "loss": 1.4925, "loss/crossentropy": 2.475005030632019, "loss/fcd": 1.37890625, "loss/logits": 0.33748185634613037, "step": 44 }, { "epoch": 0.00014289343325288962, "grad_norm": 0.38671875, "grad_norm_var": 0.0008048852284749349, "learning_rate": 0.01, "loss": 1.4622, "loss/crossentropy": 2.7766867876052856, "loss/fcd": 1.16015625, "loss/logits": 0.2700689136981964, "step": 45 }, { "epoch": 0.0001460688428807316, "grad_norm": 0.33984375, "grad_norm_var": 0.0007808526357014974, "learning_rate": 0.01, "loss": 1.4512, "loss/crossentropy": 2.5489927530288696, "loss/fcd": 1.1328125, "loss/logits": 0.2863048315048218, "step": 46 }, { "epoch": 0.0001492442525085736, "grad_norm": 0.32421875, "grad_norm_var": 0.0008265177408854166, "learning_rate": 0.01, "loss": 1.4045, "loss/crossentropy": 2.813084125518799, "loss/fcd": 1.20703125, "loss/logits": 0.2752307057380676, "step": 47 }, { "epoch": 0.0001524196621364156, "grad_norm": 0.375, "grad_norm_var": 0.0008661270141601563, "learning_rate": 0.01, "loss": 1.4582, "loss/crossentropy": 2.4155198335647583, "loss/fcd": 1.11328125, "loss/logits": 0.25202202796936035, "step": 48 }, { "epoch": 0.00015559507176425759, "grad_norm": 0.330078125, "grad_norm_var": 0.000878143310546875, "learning_rate": 0.01, "loss": 1.4603, "loss/crossentropy": 2.588241219520569, "loss/fcd": 1.18359375, "loss/logits": 0.2733621597290039, "step": 49 }, { "epoch": 0.00015877048139209958, "grad_norm": 0.328125, "grad_norm_var": 0.0008930047353108724, "learning_rate": 0.01, "loss": 1.4793, "loss/crossentropy": 2.6843411922454834, "loss/fcd": 1.23046875, "loss/logits": 0.3067089468240738, "step": 50 }, { "epoch": 0.00016194589101994157, "grad_norm": 0.34765625, "grad_norm_var": 0.0008808771769205729, "learning_rate": 0.01, "loss": 1.4386, "loss/crossentropy": 2.6553226709365845, "loss/fcd": 1.22265625, "loss/logits": 0.28511741757392883, "step": 51 }, { "epoch": 0.00016512130064778356, "grad_norm": 0.365234375, "grad_norm_var": 0.0008515516916910807, "learning_rate": 0.01, "loss": 1.4864, "loss/crossentropy": 2.7515416145324707, "loss/fcd": 1.2578125, "loss/logits": 0.29100513458251953, "step": 52 }, { "epoch": 0.00016829671027562555, "grad_norm": 0.283203125, "grad_norm_var": 0.0010571797688802083, "learning_rate": 0.01, "loss": 1.3594, "loss/crossentropy": 2.328500747680664, "loss/fcd": 1.1015625, "loss/logits": 0.22987371683120728, "step": 53 }, { "epoch": 0.00017147211990346754, "grad_norm": 0.326171875, "grad_norm_var": 0.0010772069295247396, "learning_rate": 0.01, "loss": 1.4834, "loss/crossentropy": 2.631420373916626, "loss/fcd": 1.19921875, "loss/logits": 0.27365972101688385, "step": 54 }, { "epoch": 0.00017464752953130953, "grad_norm": 0.3046875, "grad_norm_var": 0.0011449178059895833, "learning_rate": 0.01, "loss": 1.4155, "loss/crossentropy": 2.4657742977142334, "loss/fcd": 1.140625, "loss/logits": 0.24968253076076508, "step": 55 }, { "epoch": 0.00017782293915915152, "grad_norm": 0.3359375, "grad_norm_var": 0.0007634480794270833, "learning_rate": 0.01, "loss": 1.4473, "loss/crossentropy": 2.5376617908477783, "loss/fcd": 1.203125, "loss/logits": 0.2923436760902405, "step": 56 }, { "epoch": 0.0001809983487869935, "grad_norm": 0.37109375, "grad_norm_var": 0.0008365472157796223, "learning_rate": 0.01, "loss": 1.4851, "loss/crossentropy": 2.4326465129852295, "loss/fcd": 1.234375, "loss/logits": 0.29466763138771057, "step": 57 }, { "epoch": 0.0001841737584148355, "grad_norm": 0.484375, "grad_norm_var": 0.0021346886952718098, "learning_rate": 0.01, "loss": 1.5311, "loss/crossentropy": 2.5695414543151855, "loss/fcd": 1.203125, "loss/logits": 0.29695531725883484, "step": 58 }, { "epoch": 0.0001873491680426775, "grad_norm": 0.318359375, "grad_norm_var": 0.002066485087076823, "learning_rate": 0.01, "loss": 1.4586, "loss/crossentropy": 2.130593240261078, "loss/fcd": 1.2421875, "loss/logits": 0.22711820900440216, "step": 59 }, { "epoch": 0.00019052457767051949, "grad_norm": 0.326171875, "grad_norm_var": 0.0020685831705729166, "learning_rate": 0.01, "loss": 1.4716, "loss/crossentropy": 2.5455225706100464, "loss/fcd": 1.16015625, "loss/logits": 0.28349919617176056, "step": 60 }, { "epoch": 0.00019369998729836148, "grad_norm": 0.33203125, "grad_norm_var": 0.0019635518391927084, "learning_rate": 0.01, "loss": 1.3877, "loss/crossentropy": 2.743673801422119, "loss/fcd": 1.1640625, "loss/logits": 0.25648681819438934, "step": 61 }, { "epoch": 0.00019687539692620347, "grad_norm": 0.341796875, "grad_norm_var": 0.001962900161743164, "learning_rate": 0.01, "loss": 1.4577, "loss/crossentropy": 2.7351332902908325, "loss/fcd": 1.20703125, "loss/logits": 0.2789645195007324, "step": 62 }, { "epoch": 0.00020005080655404546, "grad_norm": 0.306640625, "grad_norm_var": 0.002027130126953125, "learning_rate": 0.01, "loss": 1.4575, "loss/crossentropy": 2.711727738380432, "loss/fcd": 1.203125, "loss/logits": 0.2944686710834503, "step": 63 }, { "epoch": 0.00020322621618188745, "grad_norm": 0.349609375, "grad_norm_var": 0.0019566694895426433, "learning_rate": 0.01, "loss": 1.4244, "loss/crossentropy": 2.4603357315063477, "loss/fcd": 1.1640625, "loss/logits": 0.2621646523475647, "step": 64 }, { "epoch": 0.00020640162580972947, "grad_norm": 0.365234375, "grad_norm_var": 0.0019841353098551434, "learning_rate": 0.01, "loss": 1.49, "loss/crossentropy": 2.713850259780884, "loss/fcd": 1.24609375, "loss/logits": 0.29021304845809937, "step": 65 }, { "epoch": 0.00020957703543757146, "grad_norm": 0.3125, "grad_norm_var": 0.002030165990193685, "learning_rate": 0.01, "loss": 1.3941, "loss/crossentropy": 2.4431148767471313, "loss/fcd": 1.203125, "loss/logits": 0.2903219237923622, "step": 66 }, { "epoch": 0.00021275244506541345, "grad_norm": 0.369140625, "grad_norm_var": 0.002075449625651042, "learning_rate": 0.01, "loss": 1.5125, "loss/crossentropy": 2.772100806236267, "loss/fcd": 1.18359375, "loss/logits": 0.28916171193122864, "step": 67 }, { "epoch": 0.00021592785469325544, "grad_norm": 0.359375, "grad_norm_var": 0.0020604292551676434, "learning_rate": 0.01, "loss": 1.5124, "loss/crossentropy": 2.523521900177002, "loss/fcd": 1.1640625, "loss/logits": 0.2845292091369629, "step": 68 }, { "epoch": 0.00021910326432109743, "grad_norm": 0.341796875, "grad_norm_var": 0.0018086592356363932, "learning_rate": 0.01, "loss": 1.4699, "loss/crossentropy": 2.4786165952682495, "loss/fcd": 1.1484375, "loss/logits": 0.2643960863351822, "step": 69 }, { "epoch": 0.00022227867394893942, "grad_norm": 0.328125, "grad_norm_var": 0.0018035888671875, "learning_rate": 0.01, "loss": 1.4407, "loss/crossentropy": 2.470872163772583, "loss/fcd": 1.203125, "loss/logits": 0.2760556638240814, "step": 70 }, { "epoch": 0.00022545408357678141, "grad_norm": 0.3671875, "grad_norm_var": 0.0016977945963541667, "learning_rate": 0.01, "loss": 1.5187, "loss/crossentropy": 2.667816400527954, "loss/fcd": 1.2421875, "loss/logits": 0.3126528859138489, "step": 71 }, { "epoch": 0.0002286294932046234, "grad_norm": 0.3125, "grad_norm_var": 0.0017779032389322917, "learning_rate": 0.01, "loss": 1.4036, "loss/crossentropy": 2.4847803115844727, "loss/fcd": 1.125, "loss/logits": 0.2519890144467354, "step": 72 }, { "epoch": 0.0002318049028324654, "grad_norm": 0.326171875, "grad_norm_var": 0.0017724196116129557, "learning_rate": 0.01, "loss": 1.4269, "loss/crossentropy": 2.5423706769943237, "loss/fcd": 1.0625, "loss/logits": 0.23851784318685532, "step": 73 }, { "epoch": 0.0002349803124603074, "grad_norm": 0.30859375, "grad_norm_var": 0.0004677931467692057, "learning_rate": 0.01, "loss": 1.4397, "loss/crossentropy": 2.4151382446289062, "loss/fcd": 1.15625, "loss/logits": 0.266594797372818, "step": 74 }, { "epoch": 0.00023815572208814938, "grad_norm": 0.314453125, "grad_norm_var": 0.0004775842030843099, "learning_rate": 0.01, "loss": 1.4439, "loss/crossentropy": 2.5118494033813477, "loss/fcd": 1.125, "loss/logits": 0.2745512127876282, "step": 75 }, { "epoch": 0.00024133113171599137, "grad_norm": 0.3359375, "grad_norm_var": 0.0004719416300455729, "learning_rate": 0.01, "loss": 1.4842, "loss/crossentropy": 2.6033939123153687, "loss/fcd": 1.2109375, "loss/logits": 0.2739817053079605, "step": 76 }, { "epoch": 0.00024450654134383333, "grad_norm": 0.298828125, "grad_norm_var": 0.0005570570627848307, "learning_rate": 0.01, "loss": 1.436, "loss/crossentropy": 2.5182220935821533, "loss/fcd": 1.21484375, "loss/logits": 0.2935803234577179, "step": 77 }, { "epoch": 0.00024768195097167535, "grad_norm": 0.333984375, "grad_norm_var": 0.0005523522694905598, "learning_rate": 0.01, "loss": 1.4724, "loss/crossentropy": 2.3040342330932617, "loss/fcd": 1.19921875, "loss/logits": 0.2781476825475693, "step": 78 }, { "epoch": 0.0002508573605995173, "grad_norm": 0.328125, "grad_norm_var": 0.0005053202311197917, "learning_rate": 0.01, "loss": 1.4035, "loss/crossentropy": 2.680605173110962, "loss/fcd": 1.2578125, "loss/logits": 0.2877299040555954, "step": 79 }, { "epoch": 0.00025403277022735933, "grad_norm": 0.27734375, "grad_norm_var": 0.0006858666737874348, "learning_rate": 0.01, "loss": 1.3757, "loss/crossentropy": 2.3981412649154663, "loss/fcd": 1.1171875, "loss/logits": 0.27280084788799286, "step": 80 }, { "epoch": 0.0002572081798552013, "grad_norm": 0.314453125, "grad_norm_var": 0.0006081740061442058, "learning_rate": 0.01, "loss": 1.4238, "loss/crossentropy": 2.6359516382217407, "loss/fcd": 1.1796875, "loss/logits": 0.27530917525291443, "step": 81 }, { "epoch": 0.0002603835894830433, "grad_norm": 0.322265625, "grad_norm_var": 0.0005955378214518229, "learning_rate": 0.01, "loss": 1.4237, "loss/crossentropy": 2.7130980491638184, "loss/fcd": 1.1484375, "loss/logits": 0.256900817155838, "step": 82 }, { "epoch": 0.0002635589991108853, "grad_norm": 0.3125, "grad_norm_var": 0.00048076311747233074, "learning_rate": 0.01, "loss": 1.4136, "loss/crossentropy": 2.4784183502197266, "loss/fcd": 1.09765625, "loss/logits": 0.26330579817295074, "step": 83 }, { "epoch": 0.0002667344087387273, "grad_norm": 0.3515625, "grad_norm_var": 0.0004475752512613932, "learning_rate": 0.01, "loss": 1.4648, "loss/crossentropy": 2.4099881649017334, "loss/fcd": 1.0234375, "loss/logits": 0.23014568537473679, "step": 84 }, { "epoch": 0.00026990981836656926, "grad_norm": 0.306640625, "grad_norm_var": 0.0004384199778238932, "learning_rate": 0.01, "loss": 1.4462, "loss/crossentropy": 2.627593994140625, "loss/fcd": 1.1171875, "loss/logits": 0.2508466988801956, "step": 85 }, { "epoch": 0.0002730852279944113, "grad_norm": 0.322265625, "grad_norm_var": 0.00043512980143229165, "learning_rate": 0.01, "loss": 1.4045, "loss/crossentropy": 2.5824599266052246, "loss/fcd": 1.08203125, "loss/logits": 0.24739989638328552, "step": 86 }, { "epoch": 0.0002762606376222533, "grad_norm": 0.3203125, "grad_norm_var": 0.00028254191080729165, "learning_rate": 0.01, "loss": 1.4664, "loss/crossentropy": 2.474897265434265, "loss/fcd": 1.15625, "loss/logits": 0.2485433742403984, "step": 87 }, { "epoch": 0.00027943604725009526, "grad_norm": 0.30859375, "grad_norm_var": 0.00028629302978515624, "learning_rate": 0.01, "loss": 1.3967, "loss/crossentropy": 2.564705014228821, "loss/fcd": 1.1171875, "loss/logits": 0.27985185384750366, "step": 88 }, { "epoch": 0.0002826114568779373, "grad_norm": 0.357421875, "grad_norm_var": 0.0003829320271809896, "learning_rate": 0.01, "loss": 1.4301, "loss/crossentropy": 2.464194416999817, "loss/fcd": 1.1796875, "loss/logits": 0.2522476986050606, "step": 89 }, { "epoch": 0.00028578686650577924, "grad_norm": 0.412109375, "grad_norm_var": 0.0009010155995686849, "learning_rate": 0.01, "loss": 1.5715, "loss/crossentropy": 2.643665909767151, "loss/fcd": 1.48046875, "loss/logits": 0.34832656383514404, "step": 90 }, { "epoch": 0.00028896227613362126, "grad_norm": 0.50390625, "grad_norm_var": 0.0028513590494791668, "learning_rate": 0.01, "loss": 1.5188, "loss/crossentropy": 2.824517846107483, "loss/fcd": 1.17578125, "loss/logits": 0.27408355474472046, "step": 91 }, { "epoch": 0.0002921376857614632, "grad_norm": 0.369140625, "grad_norm_var": 0.0029116153717041017, "learning_rate": 0.01, "loss": 1.52, "loss/crossentropy": 2.6980406045913696, "loss/fcd": 1.17578125, "loss/logits": 0.27827976644039154, "step": 92 }, { "epoch": 0.00029531309538930524, "grad_norm": 0.36328125, "grad_norm_var": 0.002817726135253906, "learning_rate": 0.01, "loss": 1.4666, "loss/crossentropy": 2.242801547050476, "loss/fcd": 1.109375, "loss/logits": 0.2290467768907547, "step": 93 }, { "epoch": 0.0002984885050171472, "grad_norm": 0.306640625, "grad_norm_var": 0.0029009501139322918, "learning_rate": 0.01, "loss": 1.3915, "loss/crossentropy": 2.5826021432876587, "loss/fcd": 1.109375, "loss/logits": 0.2579677700996399, "step": 94 }, { "epoch": 0.0003016639146449892, "grad_norm": 0.298828125, "grad_norm_var": 0.003009907404581706, "learning_rate": 0.01, "loss": 1.4019, "loss/crossentropy": 2.6096630096435547, "loss/fcd": 1.12109375, "loss/logits": 0.25683988630771637, "step": 95 }, { "epoch": 0.0003048393242728312, "grad_norm": 0.322265625, "grad_norm_var": 0.002758026123046875, "learning_rate": 0.01, "loss": 1.5032, "loss/crossentropy": 2.5858817100524902, "loss/fcd": 1.203125, "loss/logits": 0.2797396555542946, "step": 96 }, { "epoch": 0.0003080147339006732, "grad_norm": 0.306640625, "grad_norm_var": 0.0027918497721354167, "learning_rate": 0.01, "loss": 1.3503, "loss/crossentropy": 2.5412296056747437, "loss/fcd": 1.1484375, "loss/logits": 0.2506371811032295, "step": 97 }, { "epoch": 0.00031119014352851517, "grad_norm": 0.4140625, "grad_norm_var": 0.0030675093332926434, "learning_rate": 0.01, "loss": 1.4256, "loss/crossentropy": 2.5495457649230957, "loss/fcd": 1.11328125, "loss/logits": 0.26621289551258087, "step": 98 }, { "epoch": 0.0003143655531563572, "grad_norm": 0.337890625, "grad_norm_var": 0.0029858907063802084, "learning_rate": 0.01, "loss": 1.4694, "loss/crossentropy": 2.51397442817688, "loss/fcd": 1.25390625, "loss/logits": 0.2892449349164963, "step": 99 }, { "epoch": 0.00031754096278419915, "grad_norm": 0.318359375, "grad_norm_var": 0.0030483086903889973, "learning_rate": 0.01, "loss": 1.4546, "loss/crossentropy": 2.636772394180298, "loss/fcd": 1.24609375, "loss/logits": 0.31371480226516724, "step": 100 }, { "epoch": 0.00032071637241204117, "grad_norm": 0.314453125, "grad_norm_var": 0.00300901730855306, "learning_rate": 0.01, "loss": 1.4112, "loss/crossentropy": 2.33548641204834, "loss/fcd": 1.12890625, "loss/logits": 0.25615763664245605, "step": 101 }, { "epoch": 0.00032389178203988313, "grad_norm": 0.3203125, "grad_norm_var": 0.003016090393066406, "learning_rate": 0.01, "loss": 1.4104, "loss/crossentropy": 2.6656574010849, "loss/fcd": 1.15625, "loss/logits": 0.27898988127708435, "step": 102 }, { "epoch": 0.00032706719166772515, "grad_norm": 0.35546875, "grad_norm_var": 0.00296173095703125, "learning_rate": 0.01, "loss": 1.4547, "loss/crossentropy": 2.5015711784362793, "loss/fcd": 1.265625, "loss/logits": 0.2896997630596161, "step": 103 }, { "epoch": 0.0003302426012955671, "grad_norm": 0.353515625, "grad_norm_var": 0.0028363386789957683, "learning_rate": 0.01, "loss": 1.528, "loss/crossentropy": 2.7661324739456177, "loss/fcd": 1.18359375, "loss/logits": 0.2987857013940811, "step": 104 }, { "epoch": 0.00033341801092340913, "grad_norm": 0.326171875, "grad_norm_var": 0.0028805891672770184, "learning_rate": 0.01, "loss": 1.4684, "loss/crossentropy": 2.9468404054641724, "loss/fcd": 1.2578125, "loss/logits": 0.3031136393547058, "step": 105 }, { "epoch": 0.0003365934205512511, "grad_norm": 0.38671875, "grad_norm_var": 0.0027154922485351563, "learning_rate": 0.01, "loss": 1.5256, "loss/crossentropy": 2.6668256521224976, "loss/fcd": 1.25390625, "loss/logits": 0.2801906019449234, "step": 106 }, { "epoch": 0.0003397688301790931, "grad_norm": 0.333984375, "grad_norm_var": 0.001029825210571289, "learning_rate": 0.01, "loss": 1.3728, "loss/crossentropy": 2.2032121419906616, "loss/fcd": 1.06640625, "loss/logits": 0.23979627341032028, "step": 107 }, { "epoch": 0.0003429442398069351, "grad_norm": 0.333984375, "grad_norm_var": 0.0009668827056884766, "learning_rate": 0.01, "loss": 1.4365, "loss/crossentropy": 2.5022414922714233, "loss/fcd": 1.1875, "loss/logits": 0.2410401627421379, "step": 108 }, { "epoch": 0.0003461196494347771, "grad_norm": 0.3125, "grad_norm_var": 0.0009503523508707682, "learning_rate": 0.01, "loss": 1.4188, "loss/crossentropy": 2.535405993461609, "loss/fcd": 1.16015625, "loss/logits": 0.28362762928009033, "step": 109 }, { "epoch": 0.00034929505906261906, "grad_norm": 0.3671875, "grad_norm_var": 0.0009597142537434896, "learning_rate": 0.01, "loss": 1.4512, "loss/crossentropy": 2.6636990308761597, "loss/fcd": 1.16796875, "loss/logits": 0.2748279422521591, "step": 110 }, { "epoch": 0.0003524704686904611, "grad_norm": 0.349609375, "grad_norm_var": 0.0008580525716145833, "learning_rate": 0.01, "loss": 1.4629, "loss/crossentropy": 2.3607990741729736, "loss/fcd": 1.09375, "loss/logits": 0.24376747012138367, "step": 111 }, { "epoch": 0.00035564587831830304, "grad_norm": 0.333984375, "grad_norm_var": 0.0008376439412434896, "learning_rate": 0.01, "loss": 1.4728, "loss/crossentropy": 2.6249308586120605, "loss/fcd": 1.15625, "loss/logits": 0.27309828996658325, "step": 112 }, { "epoch": 0.00035882128794614506, "grad_norm": 0.3515625, "grad_norm_var": 0.0007546583811442058, "learning_rate": 0.01, "loss": 1.473, "loss/crossentropy": 2.418852925300598, "loss/fcd": 1.11328125, "loss/logits": 0.25403836369514465, "step": 113 }, { "epoch": 0.000361996697573987, "grad_norm": 0.34765625, "grad_norm_var": 0.0004131158192952474, "learning_rate": 0.01, "loss": 1.4853, "loss/crossentropy": 2.6179674863815308, "loss/fcd": 1.22265625, "loss/logits": 0.300196036696434, "step": 114 }, { "epoch": 0.00036517210720182904, "grad_norm": 0.34375, "grad_norm_var": 0.0004134496053059896, "learning_rate": 0.01, "loss": 1.429, "loss/crossentropy": 2.7355352640151978, "loss/fcd": 1.19140625, "loss/logits": 0.2557987570762634, "step": 115 }, { "epoch": 0.000368347516829671, "grad_norm": 0.275390625, "grad_norm_var": 0.0006561279296875, "learning_rate": 0.01, "loss": 1.4199, "loss/crossentropy": 2.570266604423523, "loss/fcd": 1.09375, "loss/logits": 0.24775272607803345, "step": 116 }, { "epoch": 0.000371522926457513, "grad_norm": 0.333984375, "grad_norm_var": 0.0006189346313476562, "learning_rate": 0.01, "loss": 1.4611, "loss/crossentropy": 2.429047107696533, "loss/fcd": 1.1484375, "loss/logits": 0.2641705647110939, "step": 117 }, { "epoch": 0.000374698336085355, "grad_norm": 0.330078125, "grad_norm_var": 0.0006004174550374349, "learning_rate": 0.01, "loss": 1.412, "loss/crossentropy": 2.774795174598694, "loss/fcd": 1.1953125, "loss/logits": 0.2932389825582504, "step": 118 }, { "epoch": 0.000377873745713197, "grad_norm": 0.36328125, "grad_norm_var": 0.0006206353505452474, "learning_rate": 0.01, "loss": 1.4485, "loss/crossentropy": 2.630544066429138, "loss/fcd": 1.1484375, "loss/logits": 0.25972336530685425, "step": 119 }, { "epoch": 0.00038104915534103897, "grad_norm": 0.34375, "grad_norm_var": 0.0006092707316080729, "learning_rate": 0.01, "loss": 1.495, "loss/crossentropy": 2.4511743783950806, "loss/fcd": 1.12890625, "loss/logits": 0.24168507009744644, "step": 120 }, { "epoch": 0.000384224564968881, "grad_norm": 0.3828125, "grad_norm_var": 0.0007083733876546224, "learning_rate": 0.01, "loss": 1.433, "loss/crossentropy": 2.5144174098968506, "loss/fcd": 1.15625, "loss/logits": 0.274233341217041, "step": 121 }, { "epoch": 0.00038739997459672295, "grad_norm": 0.337890625, "grad_norm_var": 0.0005736668904622396, "learning_rate": 0.01, "loss": 1.4649, "loss/crossentropy": 2.7152767181396484, "loss/fcd": 1.1953125, "loss/logits": 0.303632989525795, "step": 122 }, { "epoch": 0.00039057538422456497, "grad_norm": 0.3203125, "grad_norm_var": 0.0005964756011962891, "learning_rate": 0.01, "loss": 1.4423, "loss/crossentropy": 2.6179680824279785, "loss/fcd": 1.234375, "loss/logits": 0.27403272688388824, "step": 123 }, { "epoch": 0.00039375079385240694, "grad_norm": 0.41796875, "grad_norm_var": 0.0009785334269205729, "learning_rate": 0.01, "loss": 1.5361, "loss/crossentropy": 2.723619818687439, "loss/fcd": 1.25390625, "loss/logits": 0.2955824136734009, "step": 124 }, { "epoch": 0.00039692620348024895, "grad_norm": 0.318359375, "grad_norm_var": 0.0009556929270426432, "learning_rate": 0.01, "loss": 1.4518, "loss/crossentropy": 2.67808997631073, "loss/fcd": 1.18359375, "loss/logits": 0.27943629026412964, "step": 125 }, { "epoch": 0.0004001016131080909, "grad_norm": 0.306640625, "grad_norm_var": 0.001004473368326823, "learning_rate": 0.01, "loss": 1.4223, "loss/crossentropy": 2.586408495903015, "loss/fcd": 1.1640625, "loss/logits": 0.26741328090429306, "step": 126 }, { "epoch": 0.00040327702273593294, "grad_norm": 0.310546875, "grad_norm_var": 0.0010553359985351562, "learning_rate": 0.01, "loss": 1.5107, "loss/crossentropy": 2.6759918928146362, "loss/fcd": 1.30078125, "loss/logits": 0.3407515734434128, "step": 127 }, { "epoch": 0.0004064524323637749, "grad_norm": 0.302734375, "grad_norm_var": 0.0011356989542643228, "learning_rate": 0.01, "loss": 1.3945, "loss/crossentropy": 2.5387790203094482, "loss/fcd": 1.14453125, "loss/logits": 0.26654160022735596, "step": 128 }, { "epoch": 0.0004096278419916169, "grad_norm": 0.341796875, "grad_norm_var": 0.0011222680409749349, "learning_rate": 0.01, "loss": 1.4359, "loss/crossentropy": 2.5810807943344116, "loss/fcd": 1.19140625, "loss/logits": 0.2787865102291107, "step": 129 }, { "epoch": 0.00041280325161945894, "grad_norm": 0.2890625, "grad_norm_var": 0.0012462457021077474, "learning_rate": 0.01, "loss": 1.3947, "loss/crossentropy": 2.320431113243103, "loss/fcd": 1.14453125, "loss/logits": 0.24304775893688202, "step": 130 }, { "epoch": 0.0004159786612473009, "grad_norm": 0.3203125, "grad_norm_var": 0.0012451012929280598, "learning_rate": 0.01, "loss": 1.4421, "loss/crossentropy": 2.159026563167572, "loss/fcd": 1.12109375, "loss/logits": 0.22368262708187103, "step": 131 }, { "epoch": 0.0004191540708751429, "grad_norm": 0.298828125, "grad_norm_var": 0.0011058648427327473, "learning_rate": 0.01, "loss": 1.3708, "loss/crossentropy": 2.3981943130493164, "loss/fcd": 1.1953125, "loss/logits": 0.2658518999814987, "step": 132 }, { "epoch": 0.0004223294805029849, "grad_norm": 0.267578125, "grad_norm_var": 0.0013674259185791015, "learning_rate": 0.01, "loss": 1.3354, "loss/crossentropy": 2.5503530502319336, "loss/fcd": 1.0625, "loss/logits": 0.24151669442653656, "step": 133 }, { "epoch": 0.0004255048901308269, "grad_norm": 0.34765625, "grad_norm_var": 0.0013910293579101562, "learning_rate": 0.01, "loss": 1.4102, "loss/crossentropy": 2.3127379417419434, "loss/fcd": 1.1875, "loss/logits": 0.27173902094364166, "step": 134 }, { "epoch": 0.00042868029975866886, "grad_norm": 0.384765625, "grad_norm_var": 0.0015170892079671224, "learning_rate": 0.01, "loss": 1.4079, "loss/crossentropy": 2.505191922187805, "loss/fcd": 1.140625, "loss/logits": 0.2447165921330452, "step": 135 }, { "epoch": 0.0004318557093865109, "grad_norm": 0.291015625, "grad_norm_var": 0.0015990575154622396, "learning_rate": 0.01, "loss": 1.4287, "loss/crossentropy": 2.256587266921997, "loss/fcd": 1.109375, "loss/logits": 0.2564867436885834, "step": 136 }, { "epoch": 0.00043503111901435285, "grad_norm": 0.35546875, "grad_norm_var": 0.0014437357584635416, "learning_rate": 0.01, "loss": 1.5249, "loss/crossentropy": 2.60795521736145, "loss/fcd": 1.1640625, "loss/logits": 0.26928049325942993, "step": 137 }, { "epoch": 0.00043820652864219486, "grad_norm": 0.3203125, "grad_norm_var": 0.0014344374338785806, "learning_rate": 0.01, "loss": 1.4425, "loss/crossentropy": 2.5051785707473755, "loss/fcd": 1.15625, "loss/logits": 0.28340843319892883, "step": 138 }, { "epoch": 0.00044138193827003683, "grad_norm": 0.349609375, "grad_norm_var": 0.0014713923136393228, "learning_rate": 0.01, "loss": 1.4304, "loss/crossentropy": 2.663282871246338, "loss/fcd": 1.125, "loss/logits": 0.2656109929084778, "step": 139 }, { "epoch": 0.00044455734789787885, "grad_norm": 0.345703125, "grad_norm_var": 0.0009156386057535807, "learning_rate": 0.01, "loss": 1.4106, "loss/crossentropy": 2.504486560821533, "loss/fcd": 1.2578125, "loss/logits": 0.2963070571422577, "step": 140 }, { "epoch": 0.0004477327575257208, "grad_norm": 0.32421875, "grad_norm_var": 0.0009150187174479167, "learning_rate": 0.01, "loss": 1.4839, "loss/crossentropy": 2.436030149459839, "loss/fcd": 1.18359375, "loss/logits": 0.27378055453300476, "step": 141 }, { "epoch": 0.00045090816715356283, "grad_norm": 0.322265625, "grad_norm_var": 0.0008977254231770834, "learning_rate": 0.01, "loss": 1.3903, "loss/crossentropy": 2.564209818840027, "loss/fcd": 1.1640625, "loss/logits": 0.25755342841148376, "step": 142 }, { "epoch": 0.0004540835767814048, "grad_norm": 0.45703125, "grad_norm_var": 0.0019908746083577475, "learning_rate": 0.01, "loss": 1.4915, "loss/crossentropy": 2.5940905809402466, "loss/fcd": 1.21875, "loss/logits": 0.29042021930217743, "step": 143 }, { "epoch": 0.0004572589864092468, "grad_norm": 0.353515625, "grad_norm_var": 0.0019512017567952474, "learning_rate": 0.01, "loss": 1.5161, "loss/crossentropy": 2.265847086906433, "loss/fcd": 1.15234375, "loss/logits": 0.2685060203075409, "step": 144 }, { "epoch": 0.0004604343960370888, "grad_norm": 0.34375, "grad_norm_var": 0.0019530614217122396, "learning_rate": 0.01, "loss": 1.4452, "loss/crossentropy": 2.273502290248871, "loss/fcd": 1.17578125, "loss/logits": 0.2671818733215332, "step": 145 }, { "epoch": 0.0004636098056649308, "grad_norm": 0.37890625, "grad_norm_var": 0.001898956298828125, "learning_rate": 0.01, "loss": 1.4402, "loss/crossentropy": 2.471543550491333, "loss/fcd": 1.23046875, "loss/logits": 0.2653593271970749, "step": 146 }, { "epoch": 0.00046678521529277276, "grad_norm": 0.30859375, "grad_norm_var": 0.0019403457641601562, "learning_rate": 0.01, "loss": 1.4606, "loss/crossentropy": 2.6899622678756714, "loss/fcd": 1.1953125, "loss/logits": 0.2877172827720642, "step": 147 }, { "epoch": 0.0004699606249206148, "grad_norm": 0.37109375, "grad_norm_var": 0.001864480972290039, "learning_rate": 0.01, "loss": 1.4526, "loss/crossentropy": 2.956667900085449, "loss/fcd": 1.234375, "loss/logits": 0.28221777081489563, "step": 148 }, { "epoch": 0.00047313603454845674, "grad_norm": 0.6796875, "grad_norm_var": 0.00821984608968099, "learning_rate": 0.01, "loss": 1.5743, "loss/crossentropy": 2.64200758934021, "loss/fcd": 1.09765625, "loss/logits": 0.24940991401672363, "step": 149 }, { "epoch": 0.00047631144417629876, "grad_norm": 0.37890625, "grad_norm_var": 0.008184242248535156, "learning_rate": 0.01, "loss": 1.4778, "loss/crossentropy": 2.6749950647354126, "loss/fcd": 1.1953125, "loss/logits": 0.2563931345939636, "step": 150 }, { "epoch": 0.0004794868538041407, "grad_norm": 0.333984375, "grad_norm_var": 0.008264414469401042, "learning_rate": 0.01, "loss": 1.434, "loss/crossentropy": 2.1559653282165527, "loss/fcd": 1.13671875, "loss/logits": 0.2502833604812622, "step": 151 }, { "epoch": 0.00048266226343198274, "grad_norm": 0.33984375, "grad_norm_var": 0.007901620864868165, "learning_rate": 0.01, "loss": 1.4208, "loss/crossentropy": 2.569344639778137, "loss/fcd": 1.15234375, "loss/logits": 0.25542619079351425, "step": 152 }, { "epoch": 0.0004858376730598247, "grad_norm": 0.35546875, "grad_norm_var": 0.007901620864868165, "learning_rate": 0.01, "loss": 1.4734, "loss/crossentropy": 2.095423400402069, "loss/fcd": 1.27734375, "loss/logits": 0.2576697915792465, "step": 153 }, { "epoch": 0.0004890130826876667, "grad_norm": 0.419921875, "grad_norm_var": 0.00782623291015625, "learning_rate": 0.01, "loss": 1.5693, "loss/crossentropy": 2.7307355403900146, "loss/fcd": 1.234375, "loss/logits": 0.30212198197841644, "step": 154 }, { "epoch": 0.0004921884923155087, "grad_norm": 0.33203125, "grad_norm_var": 0.007914209365844726, "learning_rate": 0.01, "loss": 1.3887, "loss/crossentropy": 2.5603846311569214, "loss/fcd": 1.1015625, "loss/logits": 0.2556309178471565, "step": 155 }, { "epoch": 0.0004953639019433507, "grad_norm": 0.322265625, "grad_norm_var": 0.00804886817932129, "learning_rate": 0.01, "loss": 1.4775, "loss/crossentropy": 2.180402934551239, "loss/fcd": 1.28125, "loss/logits": 0.2792344093322754, "step": 156 }, { "epoch": 0.0004985393115711927, "grad_norm": 0.31640625, "grad_norm_var": 0.00810697873433431, "learning_rate": 0.01, "loss": 1.4205, "loss/crossentropy": 2.6520687341690063, "loss/fcd": 1.13671875, "loss/logits": 0.2700956165790558, "step": 157 }, { "epoch": 0.0005017147211990346, "grad_norm": 0.400390625, "grad_norm_var": 0.007930231094360352, "learning_rate": 0.01, "loss": 1.4971, "loss/crossentropy": 2.7188620567321777, "loss/fcd": 1.1875, "loss/logits": 0.26629623770713806, "step": 158 }, { "epoch": 0.0005048901308268767, "grad_norm": 0.337890625, "grad_norm_var": 0.007605425516764323, "learning_rate": 0.01, "loss": 1.4551, "loss/crossentropy": 2.771690011024475, "loss/fcd": 1.19921875, "loss/logits": 0.25044943392276764, "step": 159 }, { "epoch": 0.0005080655404547187, "grad_norm": 0.341796875, "grad_norm_var": 0.0076449076334635414, "learning_rate": 0.01, "loss": 1.4931, "loss/crossentropy": 2.6274183988571167, "loss/fcd": 1.19140625, "loss/logits": 0.2595004439353943, "step": 160 }, { "epoch": 0.0005112409500825606, "grad_norm": 0.349609375, "grad_norm_var": 0.0076245466868082685, "learning_rate": 0.01, "loss": 1.4441, "loss/crossentropy": 2.223161995410919, "loss/fcd": 1.099609375, "loss/logits": 0.26477208733558655, "step": 161 }, { "epoch": 0.0005144163597104026, "grad_norm": 0.37109375, "grad_norm_var": 0.007622130711873372, "learning_rate": 0.01, "loss": 1.4072, "loss/crossentropy": 2.3883336782455444, "loss/fcd": 1.1796875, "loss/logits": 0.2731279581785202, "step": 162 }, { "epoch": 0.0005175917693382447, "grad_norm": 0.353515625, "grad_norm_var": 0.007365862528483073, "learning_rate": 0.01, "loss": 1.4698, "loss/crossentropy": 2.441252112388611, "loss/fcd": 1.109375, "loss/logits": 0.24772216379642487, "step": 163 }, { "epoch": 0.0005207671789660866, "grad_norm": 0.361328125, "grad_norm_var": 0.007377227147420247, "learning_rate": 0.01, "loss": 1.4157, "loss/crossentropy": 2.1570171117782593, "loss/fcd": 1.0703125, "loss/logits": 0.24314653873443604, "step": 164 }, { "epoch": 0.0005239425885939286, "grad_norm": 0.35546875, "grad_norm_var": 0.0007598718007405599, "learning_rate": 0.01, "loss": 1.4526, "loss/crossentropy": 2.3008190393447876, "loss/fcd": 1.09765625, "loss/logits": 0.2550596594810486, "step": 165 }, { "epoch": 0.0005271179982217706, "grad_norm": 0.392578125, "grad_norm_var": 0.0008162816365559895, "learning_rate": 0.01, "loss": 1.4603, "loss/crossentropy": 2.7526636123657227, "loss/fcd": 1.23046875, "loss/logits": 0.30346739292144775, "step": 166 }, { "epoch": 0.0005302934078496126, "grad_norm": 0.345703125, "grad_norm_var": 0.0007916768391927083, "learning_rate": 0.01, "loss": 1.5179, "loss/crossentropy": 2.387694835662842, "loss/fcd": 1.09375, "loss/logits": 0.25215374678373337, "step": 167 }, { "epoch": 0.0005334688174774546, "grad_norm": 0.37109375, "grad_norm_var": 0.0007855733235677083, "learning_rate": 0.01, "loss": 1.4478, "loss/crossentropy": 2.802337408065796, "loss/fcd": 1.19921875, "loss/logits": 0.2960353344678879, "step": 168 }, { "epoch": 0.0005366442271052966, "grad_norm": 0.47265625, "grad_norm_var": 0.0016057332356770833, "learning_rate": 0.01, "loss": 1.5511, "loss/crossentropy": 2.398598551750183, "loss/fcd": 1.26171875, "loss/logits": 0.29355739057064056, "step": 169 }, { "epoch": 0.0005398196367331385, "grad_norm": 0.31640625, "grad_norm_var": 0.0015206495920817057, "learning_rate": 0.01, "loss": 1.3984, "loss/crossentropy": 1.9290159940719604, "loss/fcd": 1.10546875, "loss/logits": 0.2618868947029114, "step": 170 }, { "epoch": 0.0005429950463609806, "grad_norm": 0.392578125, "grad_norm_var": 0.001533953348795573, "learning_rate": 0.01, "loss": 1.5731, "loss/crossentropy": 2.3453359603881836, "loss/fcd": 1.1875, "loss/logits": 0.2691439539194107, "step": 171 }, { "epoch": 0.0005461704559888226, "grad_norm": 0.326171875, "grad_norm_var": 0.0015139261881510417, "learning_rate": 0.01, "loss": 1.4263, "loss/crossentropy": 2.6114273071289062, "loss/fcd": 1.125, "loss/logits": 0.2619021609425545, "step": 172 }, { "epoch": 0.0005493458656166645, "grad_norm": 0.3359375, "grad_norm_var": 0.0014169692993164062, "learning_rate": 0.01, "loss": 1.4375, "loss/crossentropy": 2.200191617012024, "loss/fcd": 1.15234375, "loss/logits": 0.26280002295970917, "step": 173 }, { "epoch": 0.0005525212752445066, "grad_norm": 0.3359375, "grad_norm_var": 0.001363992691040039, "learning_rate": 0.01, "loss": 1.4267, "loss/crossentropy": 2.5563933849334717, "loss/fcd": 1.16796875, "loss/logits": 0.2589213624596596, "step": 174 }, { "epoch": 0.0005556966848723486, "grad_norm": 0.373046875, "grad_norm_var": 0.0013376712799072266, "learning_rate": 0.01, "loss": 1.5258, "loss/crossentropy": 2.5349756479263306, "loss/fcd": 1.22265625, "loss/logits": 0.2855932116508484, "step": 175 }, { "epoch": 0.0005588720945001905, "grad_norm": 0.33984375, "grad_norm_var": 0.001343218485514323, "learning_rate": 0.01, "loss": 1.4634, "loss/crossentropy": 2.6581934690475464, "loss/fcd": 1.27734375, "loss/logits": 0.2964487373828888, "step": 176 }, { "epoch": 0.0005620475041280325, "grad_norm": 0.318359375, "grad_norm_var": 0.0014561335245768228, "learning_rate": 0.01, "loss": 1.4807, "loss/crossentropy": 2.501360297203064, "loss/fcd": 1.1484375, "loss/logits": 0.25330156832933426, "step": 177 }, { "epoch": 0.0005652229137558746, "grad_norm": 0.34375, "grad_norm_var": 0.0014628092447916666, "learning_rate": 0.01, "loss": 1.4428, "loss/crossentropy": 2.63576877117157, "loss/fcd": 1.06640625, "loss/logits": 0.2456054762005806, "step": 178 }, { "epoch": 0.0005683983233837165, "grad_norm": 0.306640625, "grad_norm_var": 0.0016306559244791666, "learning_rate": 0.01, "loss": 1.3972, "loss/crossentropy": 2.364240527153015, "loss/fcd": 1.234375, "loss/logits": 0.28903724253177643, "step": 179 }, { "epoch": 0.0005715737330115585, "grad_norm": 0.3203125, "grad_norm_var": 0.0017037550608317057, "learning_rate": 0.01, "loss": 1.4493, "loss/crossentropy": 2.532119631767273, "loss/fcd": 1.23828125, "loss/logits": 0.28427527844905853, "step": 180 }, { "epoch": 0.0005747491426394004, "grad_norm": 0.318359375, "grad_norm_var": 0.0017771402994791667, "learning_rate": 0.01, "loss": 1.4516, "loss/crossentropy": 2.647464871406555, "loss/fcd": 1.28515625, "loss/logits": 0.335851326584816, "step": 181 }, { "epoch": 0.0005779245522672425, "grad_norm": 0.3359375, "grad_norm_var": 0.0016605218251546225, "learning_rate": 0.01, "loss": 1.4675, "loss/crossentropy": 2.7925937175750732, "loss/fcd": 1.30078125, "loss/logits": 0.30253617465496063, "step": 182 }, { "epoch": 0.0005810999618950845, "grad_norm": 0.34765625, "grad_norm_var": 0.0016604105631510416, "learning_rate": 0.01, "loss": 1.479, "loss/crossentropy": 2.285614013671875, "loss/fcd": 1.2890625, "loss/logits": 0.3271857500076294, "step": 183 }, { "epoch": 0.0005842753715229264, "grad_norm": 0.3125, "grad_norm_var": 0.0016880671183268229, "learning_rate": 0.01, "loss": 1.394, "loss/crossentropy": 2.2541778087615967, "loss/fcd": 1.08984375, "loss/logits": 0.2381695881485939, "step": 184 }, { "epoch": 0.0005874507811507684, "grad_norm": 0.330078125, "grad_norm_var": 0.0005033969879150391, "learning_rate": 0.01, "loss": 1.4873, "loss/crossentropy": 2.8136264085769653, "loss/fcd": 1.32421875, "loss/logits": 0.3297244906425476, "step": 185 }, { "epoch": 0.0005906261907786105, "grad_norm": 0.3515625, "grad_norm_var": 0.0004953861236572266, "learning_rate": 0.01, "loss": 1.4212, "loss/crossentropy": 2.4847536087036133, "loss/fcd": 1.22265625, "loss/logits": 0.29252856969833374, "step": 186 }, { "epoch": 0.0005938016004064524, "grad_norm": 0.3359375, "grad_norm_var": 0.0002745946248372396, "learning_rate": 0.01, "loss": 1.4624, "loss/crossentropy": 2.4440802335739136, "loss/fcd": 1.08984375, "loss/logits": 0.2591068744659424, "step": 187 }, { "epoch": 0.0005969770100342944, "grad_norm": 0.3125, "grad_norm_var": 0.0002991835276285807, "learning_rate": 0.01, "loss": 1.3703, "loss/crossentropy": 2.7041367292404175, "loss/fcd": 1.08984375, "loss/logits": 0.24919994920492172, "step": 188 }, { "epoch": 0.0006001524196621364, "grad_norm": 0.326171875, "grad_norm_var": 0.0003005345662434896, "learning_rate": 0.01, "loss": 1.4428, "loss/crossentropy": 2.750958204269409, "loss/fcd": 1.26953125, "loss/logits": 0.32267922163009644, "step": 189 }, { "epoch": 0.0006033278292899784, "grad_norm": 0.34375, "grad_norm_var": 0.00030867258707682293, "learning_rate": 0.01, "loss": 1.4338, "loss/crossentropy": 2.6606979370117188, "loss/fcd": 1.11328125, "loss/logits": 0.2493468001484871, "step": 190 }, { "epoch": 0.0006065032389178204, "grad_norm": 0.3046875, "grad_norm_var": 0.00022912025451660156, "learning_rate": 0.01, "loss": 1.4358, "loss/crossentropy": 2.5754127502441406, "loss/fcd": 1.1328125, "loss/logits": 0.2530096769332886, "step": 191 }, { "epoch": 0.0006096786485456624, "grad_norm": 0.3203125, "grad_norm_var": 0.00022212664286295572, "learning_rate": 0.01, "loss": 1.4354, "loss/crossentropy": 2.5391939878463745, "loss/fcd": 1.11328125, "loss/logits": 0.2560339719057083, "step": 192 }, { "epoch": 0.0006128540581735043, "grad_norm": 0.39453125, "grad_norm_var": 0.0004992167154947917, "learning_rate": 0.01, "loss": 1.595, "loss/crossentropy": 2.6107431650161743, "loss/fcd": 1.25, "loss/logits": 0.31747186183929443, "step": 193 }, { "epoch": 0.0006160294678013464, "grad_norm": 0.380859375, "grad_norm_var": 0.0006456851959228515, "learning_rate": 0.01, "loss": 1.4717, "loss/crossentropy": 2.490286111831665, "loss/fcd": 1.11328125, "loss/logits": 0.2593752592802048, "step": 194 }, { "epoch": 0.0006192048774291884, "grad_norm": 0.326171875, "grad_norm_var": 0.0005986372629801433, "learning_rate": 0.01, "loss": 1.4105, "loss/crossentropy": 2.4615145921707153, "loss/fcd": 1.1953125, "loss/logits": 0.2654329538345337, "step": 195 }, { "epoch": 0.0006223802870570303, "grad_norm": 0.353515625, "grad_norm_var": 0.0006021499633789062, "learning_rate": 0.01, "loss": 1.4569, "loss/crossentropy": 2.7290648221969604, "loss/fcd": 1.15625, "loss/logits": 0.26174046099185944, "step": 196 }, { "epoch": 0.0006255556966848723, "grad_norm": 0.380859375, "grad_norm_var": 0.0006896336873372396, "learning_rate": 0.01, "loss": 1.459, "loss/crossentropy": 2.6363881826400757, "loss/fcd": 1.15234375, "loss/logits": 0.254153311252594, "step": 197 }, { "epoch": 0.0006287311063127144, "grad_norm": 0.345703125, "grad_norm_var": 0.0006889184315999349, "learning_rate": 0.01, "loss": 1.406, "loss/crossentropy": 2.7989996671676636, "loss/fcd": 1.2265625, "loss/logits": 0.30807191133499146, "step": 198 }, { "epoch": 0.0006319065159405563, "grad_norm": 0.322265625, "grad_norm_var": 0.0007089614868164062, "learning_rate": 0.01, "loss": 1.4014, "loss/crossentropy": 2.668474316596985, "loss/fcd": 1.15625, "loss/logits": 0.26886168122291565, "step": 199 }, { "epoch": 0.0006350819255683983, "grad_norm": 0.4921875, "grad_norm_var": 0.0020659764607747397, "learning_rate": 0.01, "loss": 1.5361, "loss/crossentropy": 2.406678557395935, "loss/fcd": 1.2109375, "loss/logits": 0.30439266562461853, "step": 200 }, { "epoch": 0.0006382573351962403, "grad_norm": 0.34765625, "grad_norm_var": 0.0020355065663655598, "learning_rate": 0.01, "loss": 1.3873, "loss/crossentropy": 2.499788761138916, "loss/fcd": 1.140625, "loss/logits": 0.2588384300470352, "step": 201 }, { "epoch": 0.0006414327448240823, "grad_norm": 0.3046875, "grad_norm_var": 0.0021781762440999347, "learning_rate": 0.01, "loss": 1.4279, "loss/crossentropy": 2.622753381729126, "loss/fcd": 1.20703125, "loss/logits": 0.29281996190547943, "step": 202 }, { "epoch": 0.0006446081544519243, "grad_norm": 0.33984375, "grad_norm_var": 0.002172072728474935, "learning_rate": 0.01, "loss": 1.4493, "loss/crossentropy": 2.5830085277557373, "loss/fcd": 1.09765625, "loss/logits": 0.24869374930858612, "step": 203 }, { "epoch": 0.0006477835640797663, "grad_norm": 0.34375, "grad_norm_var": 0.002077976862589518, "learning_rate": 0.01, "loss": 1.4577, "loss/crossentropy": 2.762994647026062, "loss/fcd": 1.2265625, "loss/logits": 0.28321488201618195, "step": 204 }, { "epoch": 0.0006509589737076082, "grad_norm": 0.98828125, "grad_norm_var": 0.027224985758463542, "learning_rate": 0.01, "loss": 1.4711, "loss/crossentropy": 2.5399746894836426, "loss/fcd": 1.1875, "loss/logits": 0.2770008146762848, "step": 205 }, { "epoch": 0.0006541343833354503, "grad_norm": 0.375, "grad_norm_var": 0.027080535888671875, "learning_rate": 0.01, "loss": 1.5803, "loss/crossentropy": 2.5655182600021362, "loss/fcd": 1.21875, "loss/logits": 0.27917300164699554, "step": 206 }, { "epoch": 0.0006573097929632923, "grad_norm": 0.32421875, "grad_norm_var": 0.02686913808186849, "learning_rate": 0.01, "loss": 1.4061, "loss/crossentropy": 2.4141650199890137, "loss/fcd": 1.06640625, "loss/logits": 0.2412392497062683, "step": 207 }, { "epoch": 0.0006604852025911342, "grad_norm": 0.314453125, "grad_norm_var": 0.026930602391560872, "learning_rate": 0.01, "loss": 1.4689, "loss/crossentropy": 2.5916244983673096, "loss/fcd": 1.1875, "loss/logits": 0.27838681638240814, "step": 208 }, { "epoch": 0.0006636606122189762, "grad_norm": 0.32421875, "grad_norm_var": 0.02725218137105306, "learning_rate": 0.01, "loss": 1.4522, "loss/crossentropy": 2.4947317838668823, "loss/fcd": 1.12890625, "loss/logits": 0.2453143149614334, "step": 209 }, { "epoch": 0.0006668360218468183, "grad_norm": 0.318359375, "grad_norm_var": 0.02758482297261556, "learning_rate": 0.01, "loss": 1.4263, "loss/crossentropy": 2.519081950187683, "loss/fcd": 1.25, "loss/logits": 0.30316148698329926, "step": 210 }, { "epoch": 0.0006700114314746602, "grad_norm": 0.328125, "grad_norm_var": 0.027569071451822916, "learning_rate": 0.01, "loss": 1.3868, "loss/crossentropy": 2.5940135717391968, "loss/fcd": 1.15234375, "loss/logits": 0.26943735778331757, "step": 211 }, { "epoch": 0.0006731868411025022, "grad_norm": 0.36328125, "grad_norm_var": 0.027530527114868163, "learning_rate": 0.01, "loss": 1.4702, "loss/crossentropy": 2.4450011253356934, "loss/fcd": 1.1171875, "loss/logits": 0.22430332750082016, "step": 212 }, { "epoch": 0.0006763622507303442, "grad_norm": 0.376953125, "grad_norm_var": 0.027535359064737957, "learning_rate": 0.01, "loss": 1.5672, "loss/crossentropy": 2.81851589679718, "loss/fcd": 1.328125, "loss/logits": 0.2979218512773514, "step": 213 }, { "epoch": 0.0006795376603581862, "grad_norm": 0.333984375, "grad_norm_var": 0.027610127131144205, "learning_rate": 0.01, "loss": 1.3832, "loss/crossentropy": 2.3940327167510986, "loss/fcd": 1.15234375, "loss/logits": 0.2581535875797272, "step": 214 }, { "epoch": 0.0006827130699860282, "grad_norm": 0.314453125, "grad_norm_var": 0.027681716283162437, "learning_rate": 0.01, "loss": 1.3988, "loss/crossentropy": 2.7536251544952393, "loss/fcd": 1.12890625, "loss/logits": 0.2611486464738846, "step": 215 }, { "epoch": 0.0006858884796138702, "grad_norm": 0.310546875, "grad_norm_var": 0.027192433675130207, "learning_rate": 0.01, "loss": 1.3678, "loss/crossentropy": 2.7212696075439453, "loss/fcd": 1.140625, "loss/logits": 0.2533951997756958, "step": 216 }, { "epoch": 0.0006890638892417122, "grad_norm": 0.32421875, "grad_norm_var": 0.027313741048177083, "learning_rate": 0.01, "loss": 1.4042, "loss/crossentropy": 2.3983840942382812, "loss/fcd": 1.10546875, "loss/logits": 0.2641438990831375, "step": 217 }, { "epoch": 0.0006922392988695542, "grad_norm": 0.310546875, "grad_norm_var": 0.027261718114217123, "learning_rate": 0.01, "loss": 1.4711, "loss/crossentropy": 2.5954113006591797, "loss/fcd": 1.17578125, "loss/logits": 0.26259416341781616, "step": 218 }, { "epoch": 0.0006954147084973962, "grad_norm": 0.302734375, "grad_norm_var": 0.027518717447916667, "learning_rate": 0.01, "loss": 1.4226, "loss/crossentropy": 2.5200321674346924, "loss/fcd": 1.203125, "loss/logits": 0.2933817505836487, "step": 219 }, { "epoch": 0.0006985901181252381, "grad_norm": 0.34765625, "grad_norm_var": 0.027504920959472656, "learning_rate": 0.01, "loss": 1.4569, "loss/crossentropy": 2.3375638723373413, "loss/fcd": 1.16796875, "loss/logits": 0.2861281484365463, "step": 220 }, { "epoch": 0.0007017655277530802, "grad_norm": 0.333984375, "grad_norm_var": 0.0005247592926025391, "learning_rate": 0.01, "loss": 1.4991, "loss/crossentropy": 2.851243495941162, "loss/fcd": 1.28125, "loss/logits": 0.34203924238681793, "step": 221 }, { "epoch": 0.0007049409373809222, "grad_norm": 0.345703125, "grad_norm_var": 0.000408172607421875, "learning_rate": 0.01, "loss": 1.41, "loss/crossentropy": 2.5948612689971924, "loss/fcd": 1.12109375, "loss/logits": 0.2781279981136322, "step": 222 }, { "epoch": 0.0007081163470087641, "grad_norm": 0.34765625, "grad_norm_var": 0.00042572021484375, "learning_rate": 0.01, "loss": 1.4379, "loss/crossentropy": 2.756687879562378, "loss/fcd": 1.234375, "loss/logits": 0.2924545705318451, "step": 223 }, { "epoch": 0.0007112917566366061, "grad_norm": 0.36328125, "grad_norm_var": 0.0004666487375895182, "learning_rate": 0.01, "loss": 1.441, "loss/crossentropy": 2.749671459197998, "loss/fcd": 1.2265625, "loss/logits": 0.29917286336421967, "step": 224 }, { "epoch": 0.0007144671662644482, "grad_norm": 0.306640625, "grad_norm_var": 0.0005091349283854167, "learning_rate": 0.01, "loss": 1.4103, "loss/crossentropy": 2.416172742843628, "loss/fcd": 1.1484375, "loss/logits": 0.266731821000576, "step": 225 }, { "epoch": 0.0007176425758922901, "grad_norm": 0.361328125, "grad_norm_var": 0.0005406061808268229, "learning_rate": 0.01, "loss": 1.4862, "loss/crossentropy": 2.7141880989074707, "loss/fcd": 1.2890625, "loss/logits": 0.3132380098104477, "step": 226 }, { "epoch": 0.0007208179855201321, "grad_norm": 0.333984375, "grad_norm_var": 0.0005368391672770182, "learning_rate": 0.01, "loss": 1.4914, "loss/crossentropy": 2.365302562713623, "loss/fcd": 1.22265625, "loss/logits": 0.29388483613729477, "step": 227 }, { "epoch": 0.000723993395147974, "grad_norm": 0.33203125, "grad_norm_var": 0.0004844506581624349, "learning_rate": 0.01, "loss": 1.4582, "loss/crossentropy": 2.3674185276031494, "loss/fcd": 1.24609375, "loss/logits": 0.32378852367401123, "step": 228 }, { "epoch": 0.0007271688047758161, "grad_norm": 0.314453125, "grad_norm_var": 0.0003715356190999349, "learning_rate": 0.01, "loss": 1.3911, "loss/crossentropy": 2.483814001083374, "loss/fcd": 1.171875, "loss/logits": 0.28102540969848633, "step": 229 }, { "epoch": 0.0007303442144036581, "grad_norm": 0.275390625, "grad_norm_var": 0.0005565484364827474, "learning_rate": 0.01, "loss": 1.3346, "loss/crossentropy": 2.3871065378189087, "loss/fcd": 1.07421875, "loss/logits": 0.24542085081338882, "step": 230 }, { "epoch": 0.0007335196240315, "grad_norm": 0.33984375, "grad_norm_var": 0.0005559285481770833, "learning_rate": 0.01, "loss": 1.4928, "loss/crossentropy": 2.5487005710601807, "loss/fcd": 1.23828125, "loss/logits": 0.3122349679470062, "step": 231 }, { "epoch": 0.000736695033659342, "grad_norm": 0.32421875, "grad_norm_var": 0.0005355676015218099, "learning_rate": 0.01, "loss": 1.373, "loss/crossentropy": 2.2985492944717407, "loss/fcd": 1.0546875, "loss/logits": 0.22457106411457062, "step": 232 }, { "epoch": 0.0007398704432871841, "grad_norm": 0.330078125, "grad_norm_var": 0.0005339940388997395, "learning_rate": 0.01, "loss": 1.4225, "loss/crossentropy": 2.4530088901519775, "loss/fcd": 1.0703125, "loss/logits": 0.24287613481283188, "step": 233 }, { "epoch": 0.000743045852915026, "grad_norm": 0.333984375, "grad_norm_var": 0.0005095799763997396, "learning_rate": 0.01, "loss": 1.4921, "loss/crossentropy": 2.585182785987854, "loss/fcd": 1.15625, "loss/logits": 0.26414525508880615, "step": 234 }, { "epoch": 0.000746221262542868, "grad_norm": 0.380859375, "grad_norm_var": 0.0005985895792643229, "learning_rate": 0.01, "loss": 1.5437, "loss/crossentropy": 2.9683706760406494, "loss/fcd": 1.38671875, "loss/logits": 0.38517357409000397, "step": 235 }, { "epoch": 0.00074939667217071, "grad_norm": 0.349609375, "grad_norm_var": 0.0006019433339436848, "learning_rate": 0.01, "loss": 1.4808, "loss/crossentropy": 2.6974499225616455, "loss/fcd": 1.171875, "loss/logits": 0.27078036963939667, "step": 236 }, { "epoch": 0.000752572081798552, "grad_norm": 0.337890625, "grad_norm_var": 0.0006019433339436848, "learning_rate": 0.01, "loss": 1.436, "loss/crossentropy": 2.585400700569153, "loss/fcd": 1.296875, "loss/logits": 0.3050535172224045, "step": 237 }, { "epoch": 0.000755747491426394, "grad_norm": 0.349609375, "grad_norm_var": 0.0006079196929931641, "learning_rate": 0.01, "loss": 1.3943, "loss/crossentropy": 2.4235087633132935, "loss/fcd": 1.13671875, "loss/logits": 0.2283131629228592, "step": 238 }, { "epoch": 0.000758922901054236, "grad_norm": 0.28515625, "grad_norm_var": 0.0007574558258056641, "learning_rate": 0.01, "loss": 1.384, "loss/crossentropy": 2.5592979192733765, "loss/fcd": 1.15234375, "loss/logits": 0.27737075090408325, "step": 239 }, { "epoch": 0.0007620983106820779, "grad_norm": 0.3515625, "grad_norm_var": 0.000717782974243164, "learning_rate": 0.01, "loss": 1.4614, "loss/crossentropy": 2.592607259750366, "loss/fcd": 1.203125, "loss/logits": 0.2808872312307358, "step": 240 }, { "epoch": 0.00076527372030992, "grad_norm": 0.310546875, "grad_norm_var": 0.0007057030995686849, "learning_rate": 0.01, "loss": 1.4133, "loss/crossentropy": 2.4031397104263306, "loss/fcd": 1.15625, "loss/logits": 0.26764968037605286, "step": 241 }, { "epoch": 0.000768449129937762, "grad_norm": 0.314453125, "grad_norm_var": 0.0006591637929280598, "learning_rate": 0.01, "loss": 1.4582, "loss/crossentropy": 2.512922167778015, "loss/fcd": 1.1171875, "loss/logits": 0.26913800835609436, "step": 242 }, { "epoch": 0.0007716245395656039, "grad_norm": 0.451171875, "grad_norm_var": 0.0015956719716389975, "learning_rate": 0.01, "loss": 1.518, "loss/crossentropy": 2.7079628705978394, "loss/fcd": 1.17578125, "loss/logits": 0.26650217175483704, "step": 243 }, { "epoch": 0.0007747999491934459, "grad_norm": 0.283203125, "grad_norm_var": 0.0017724990844726562, "learning_rate": 0.01, "loss": 1.3681, "loss/crossentropy": 2.364258289337158, "loss/fcd": 1.140625, "loss/logits": 0.2806752920150757, "step": 244 }, { "epoch": 0.000777975358821288, "grad_norm": 0.330078125, "grad_norm_var": 0.0017485936482747396, "learning_rate": 0.01, "loss": 1.4954, "loss/crossentropy": 2.696042060852051, "loss/fcd": 1.18359375, "loss/logits": 0.27867695689201355, "step": 245 }, { "epoch": 0.0007811507684491299, "grad_norm": 0.3125, "grad_norm_var": 0.0015435377756754557, "learning_rate": 0.01, "loss": 1.3777, "loss/crossentropy": 2.5323106050491333, "loss/fcd": 1.14453125, "loss/logits": 0.2699219509959221, "step": 246 }, { "epoch": 0.0007843261780769719, "grad_norm": 0.310546875, "grad_norm_var": 0.001584307352701823, "learning_rate": 0.01, "loss": 1.4188, "loss/crossentropy": 2.5892099142074585, "loss/fcd": 1.078125, "loss/logits": 0.24551667273044586, "step": 247 }, { "epoch": 0.0007875015877048139, "grad_norm": 0.310546875, "grad_norm_var": 0.0016151269276936848, "learning_rate": 0.01, "loss": 1.4561, "loss/crossentropy": 2.6845802068710327, "loss/fcd": 1.171875, "loss/logits": 0.26707538962364197, "step": 248 }, { "epoch": 0.0007906769973326559, "grad_norm": 0.32421875, "grad_norm_var": 0.0016202290852864584, "learning_rate": 0.01, "loss": 1.4382, "loss/crossentropy": 2.3116230964660645, "loss/fcd": 1.1875, "loss/logits": 0.2521924749016762, "step": 249 }, { "epoch": 0.0007938524069604979, "grad_norm": 0.302734375, "grad_norm_var": 0.001679229736328125, "learning_rate": 0.01, "loss": 1.4411, "loss/crossentropy": 2.585776686668396, "loss/fcd": 1.13671875, "loss/logits": 0.2590962275862694, "step": 250 }, { "epoch": 0.0007970278165883399, "grad_norm": 0.3125, "grad_norm_var": 0.0015217940012613933, "learning_rate": 0.01, "loss": 1.466, "loss/crossentropy": 2.755423426628113, "loss/fcd": 1.17578125, "loss/logits": 0.27835342288017273, "step": 251 }, { "epoch": 0.0008002032262161818, "grad_norm": 0.349609375, "grad_norm_var": 0.0015217940012613933, "learning_rate": 0.01, "loss": 1.4623, "loss/crossentropy": 2.294103503227234, "loss/fcd": 1.15625, "loss/logits": 0.2587505131959915, "step": 252 }, { "epoch": 0.0008033786358440239, "grad_norm": 0.353515625, "grad_norm_var": 0.0015591780344645183, "learning_rate": 0.01, "loss": 1.4866, "loss/crossentropy": 2.5495104789733887, "loss/fcd": 1.19921875, "loss/logits": 0.2796829491853714, "step": 253 }, { "epoch": 0.0008065540454718659, "grad_norm": 0.296875, "grad_norm_var": 0.001582781473795573, "learning_rate": 0.01, "loss": 1.439, "loss/crossentropy": 2.592455267906189, "loss/fcd": 1.1796875, "loss/logits": 0.2704998552799225, "step": 254 }, { "epoch": 0.0008097294550997078, "grad_norm": 0.314453125, "grad_norm_var": 0.0014809767405192058, "learning_rate": 0.01, "loss": 1.4944, "loss/crossentropy": 2.697207450866699, "loss/fcd": 1.171875, "loss/logits": 0.2521091401576996, "step": 255 }, { "epoch": 0.0008129048647275498, "grad_norm": 0.33984375, "grad_norm_var": 0.0014508406321207682, "learning_rate": 0.01, "loss": 1.4825, "loss/crossentropy": 2.294856309890747, "loss/fcd": 1.2890625, "loss/logits": 0.27980829775333405, "step": 256 }, { "epoch": 0.0008160802743553919, "grad_norm": 0.32421875, "grad_norm_var": 0.0014342625935872396, "learning_rate": 0.01, "loss": 1.4309, "loss/crossentropy": 2.4966965913772583, "loss/fcd": 1.14453125, "loss/logits": 0.2523074522614479, "step": 257 }, { "epoch": 0.0008192556839832338, "grad_norm": 0.322265625, "grad_norm_var": 0.0014251073201497395, "learning_rate": 0.01, "loss": 1.5155, "loss/crossentropy": 2.7268197536468506, "loss/fcd": 1.28125, "loss/logits": 0.3203650116920471, "step": 258 }, { "epoch": 0.0008224310936110758, "grad_norm": 0.36328125, "grad_norm_var": 0.0004573663075764974, "learning_rate": 0.01, "loss": 1.4252, "loss/crossentropy": 2.6139529943466187, "loss/fcd": 1.14453125, "loss/logits": 0.26910896599292755, "step": 259 }, { "epoch": 0.0008256065032389179, "grad_norm": 0.31640625, "grad_norm_var": 0.00035495758056640624, "learning_rate": 0.01, "loss": 1.4928, "loss/crossentropy": 2.4806265830993652, "loss/fcd": 1.1953125, "loss/logits": 0.29723477363586426, "step": 260 }, { "epoch": 0.0008287819128667598, "grad_norm": 0.314453125, "grad_norm_var": 0.0003575007120768229, "learning_rate": 0.01, "loss": 1.4488, "loss/crossentropy": 2.901823878288269, "loss/fcd": 1.27734375, "loss/logits": 0.30374760925769806, "step": 261 }, { "epoch": 0.0008319573224946018, "grad_norm": 0.35546875, "grad_norm_var": 0.000412750244140625, "learning_rate": 0.01, "loss": 1.4966, "loss/crossentropy": 2.629228115081787, "loss/fcd": 1.203125, "loss/logits": 0.28927190601825714, "step": 262 }, { "epoch": 0.0008351327321224438, "grad_norm": 0.314453125, "grad_norm_var": 0.0004058202107747396, "learning_rate": 0.01, "loss": 1.4468, "loss/crossentropy": 2.703573703765869, "loss/fcd": 1.20703125, "loss/logits": 0.29513630270957947, "step": 263 }, { "epoch": 0.0008383081417502858, "grad_norm": 0.337890625, "grad_norm_var": 0.00039647420247395835, "learning_rate": 0.01, "loss": 1.5088, "loss/crossentropy": 2.52901828289032, "loss/fcd": 1.1015625, "loss/logits": 0.25419046729803085, "step": 264 }, { "epoch": 0.0008414835513781278, "grad_norm": 0.3125, "grad_norm_var": 0.00041039784749348957, "learning_rate": 0.01, "loss": 1.4359, "loss/crossentropy": 2.7386242151260376, "loss/fcd": 1.203125, "loss/logits": 0.26731544733047485, "step": 265 }, { "epoch": 0.0008446589610059698, "grad_norm": 0.31640625, "grad_norm_var": 0.0003780206044514974, "learning_rate": 0.01, "loss": 1.4215, "loss/crossentropy": 2.5745092630386353, "loss/fcd": 1.1875, "loss/logits": 0.2610969841480255, "step": 266 }, { "epoch": 0.0008478343706338117, "grad_norm": 0.314453125, "grad_norm_var": 0.0003742853800455729, "learning_rate": 0.01, "loss": 1.4884, "loss/crossentropy": 2.7579007148742676, "loss/fcd": 1.125, "loss/logits": 0.2636326849460602, "step": 267 }, { "epoch": 0.0008510097802616538, "grad_norm": 0.328125, "grad_norm_var": 0.00034089088439941405, "learning_rate": 0.01, "loss": 1.4626, "loss/crossentropy": 2.6244633197784424, "loss/fcd": 1.203125, "loss/logits": 0.29265178740024567, "step": 268 }, { "epoch": 0.0008541851898894958, "grad_norm": 0.341796875, "grad_norm_var": 0.00030732154846191406, "learning_rate": 0.01, "loss": 1.4284, "loss/crossentropy": 2.9934885501861572, "loss/fcd": 1.2109375, "loss/logits": 0.26712846755981445, "step": 269 }, { "epoch": 0.0008573605995173377, "grad_norm": 0.328125, "grad_norm_var": 0.0002478122711181641, "learning_rate": 0.01, "loss": 1.4689, "loss/crossentropy": 2.727446675300598, "loss/fcd": 1.203125, "loss/logits": 0.29830583930015564, "step": 270 }, { "epoch": 0.0008605360091451797, "grad_norm": 0.34765625, "grad_norm_var": 0.00025780995686848957, "learning_rate": 0.01, "loss": 1.3933, "loss/crossentropy": 2.287726879119873, "loss/fcd": 1.171875, "loss/logits": 0.2887675315141678, "step": 271 }, { "epoch": 0.0008637114187730218, "grad_norm": 0.333984375, "grad_norm_var": 0.0002521355946858724, "learning_rate": 0.01, "loss": 1.495, "loss/crossentropy": 2.5840429067611694, "loss/fcd": 1.26953125, "loss/logits": 0.3175569176673889, "step": 272 }, { "epoch": 0.0008668868284008637, "grad_norm": 0.30078125, "grad_norm_var": 0.0003028710683186849, "learning_rate": 0.01, "loss": 1.3694, "loss/crossentropy": 2.683778405189514, "loss/fcd": 1.1484375, "loss/logits": 0.2484614998102188, "step": 273 }, { "epoch": 0.0008700622380287057, "grad_norm": 0.283203125, "grad_norm_var": 0.00042812029520670575, "learning_rate": 0.01, "loss": 1.4353, "loss/crossentropy": 2.6620240211486816, "loss/fcd": 1.11328125, "loss/logits": 0.26276877522468567, "step": 274 }, { "epoch": 0.0008732376476565477, "grad_norm": 0.3203125, "grad_norm_var": 0.0003274122873942057, "learning_rate": 0.01, "loss": 1.4477, "loss/crossentropy": 2.6749569177627563, "loss/fcd": 1.18359375, "loss/logits": 0.2814165949821472, "step": 275 }, { "epoch": 0.0008764130572843897, "grad_norm": 0.33984375, "grad_norm_var": 0.00034152666727701824, "learning_rate": 0.01, "loss": 1.4468, "loss/crossentropy": 2.7316863536834717, "loss/fcd": 1.19140625, "loss/logits": 0.2842772603034973, "step": 276 }, { "epoch": 0.0008795884669122317, "grad_norm": 0.7265625, "grad_norm_var": 0.010412851969401041, "learning_rate": 0.01, "loss": 1.5203, "loss/crossentropy": 2.503424048423767, "loss/fcd": 1.28515625, "loss/logits": 0.3325551301240921, "step": 277 }, { "epoch": 0.0008827638765400737, "grad_norm": 0.330078125, "grad_norm_var": 0.010434961318969727, "learning_rate": 0.01, "loss": 1.4553, "loss/crossentropy": 2.2831602096557617, "loss/fcd": 1.03515625, "loss/logits": 0.2164444476366043, "step": 278 }, { "epoch": 0.0008859392861679156, "grad_norm": 0.30859375, "grad_norm_var": 0.010463714599609375, "learning_rate": 0.01, "loss": 1.4021, "loss/crossentropy": 2.398961663246155, "loss/fcd": 1.1953125, "loss/logits": 0.28027981519699097, "step": 279 }, { "epoch": 0.0008891146957957577, "grad_norm": 0.314453125, "grad_norm_var": 0.01053009033203125, "learning_rate": 0.01, "loss": 1.4186, "loss/crossentropy": 2.539450526237488, "loss/fcd": 1.09375, "loss/logits": 0.23892314732074738, "step": 280 }, { "epoch": 0.0008922901054235997, "grad_norm": 0.404296875, "grad_norm_var": 0.010638411839803059, "learning_rate": 0.01, "loss": 1.461, "loss/crossentropy": 2.535378932952881, "loss/fcd": 1.1875, "loss/logits": 0.2856827974319458, "step": 281 }, { "epoch": 0.0008954655150514416, "grad_norm": 0.328125, "grad_norm_var": 0.010590728123982747, "learning_rate": 0.01, "loss": 1.3902, "loss/crossentropy": 2.68851101398468, "loss/fcd": 1.1171875, "loss/logits": 0.2427075430750847, "step": 282 }, { "epoch": 0.0008986409246792836, "grad_norm": 0.30078125, "grad_norm_var": 0.010672950744628906, "learning_rate": 0.01, "loss": 1.4097, "loss/crossentropy": 2.473228931427002, "loss/fcd": 1.06640625, "loss/logits": 0.2395230457186699, "step": 283 }, { "epoch": 0.0009018163343071257, "grad_norm": 0.30859375, "grad_norm_var": 0.010759735107421875, "learning_rate": 0.01, "loss": 1.4096, "loss/crossentropy": 2.476340174674988, "loss/fcd": 1.13671875, "loss/logits": 0.2649814188480377, "step": 284 }, { "epoch": 0.0009049917439349676, "grad_norm": 0.322265625, "grad_norm_var": 0.01080773671468099, "learning_rate": 0.01, "loss": 1.4095, "loss/crossentropy": 2.6251412630081177, "loss/fcd": 1.17578125, "loss/logits": 0.2975796312093735, "step": 285 }, { "epoch": 0.0009081671535628096, "grad_norm": 0.3203125, "grad_norm_var": 0.010834185282389323, "learning_rate": 0.01, "loss": 1.4108, "loss/crossentropy": 2.8257126808166504, "loss/fcd": 1.15234375, "loss/logits": 0.25809506326913834, "step": 286 }, { "epoch": 0.0009113425631906515, "grad_norm": 0.353515625, "grad_norm_var": 0.010834995905558269, "learning_rate": 0.01, "loss": 1.48, "loss/crossentropy": 2.63088321685791, "loss/fcd": 1.27734375, "loss/logits": 0.29609737545251846, "step": 287 }, { "epoch": 0.0009145179728184936, "grad_norm": 0.287109375, "grad_norm_var": 0.011070744196573893, "learning_rate": 0.01, "loss": 1.3795, "loss/crossentropy": 2.8399826288223267, "loss/fcd": 1.15625, "loss/logits": 0.25897857546806335, "step": 288 }, { "epoch": 0.0009176933824463356, "grad_norm": 0.349609375, "grad_norm_var": 0.010920143127441407, "learning_rate": 0.01, "loss": 1.4599, "loss/crossentropy": 2.7151553630828857, "loss/fcd": 1.16015625, "loss/logits": 0.2690805196762085, "step": 289 }, { "epoch": 0.0009208687920741775, "grad_norm": 0.31640625, "grad_norm_var": 0.010693979263305665, "learning_rate": 0.01, "loss": 1.4564, "loss/crossentropy": 2.2923821210861206, "loss/fcd": 1.09765625, "loss/logits": 0.2446538209915161, "step": 290 }, { "epoch": 0.0009240442017020195, "grad_norm": 0.337890625, "grad_norm_var": 0.010639190673828125, "learning_rate": 0.01, "loss": 1.4427, "loss/crossentropy": 2.5572006702423096, "loss/fcd": 1.19140625, "loss/logits": 0.2795237749814987, "step": 291 }, { "epoch": 0.0009272196113298616, "grad_norm": 0.458984375, "grad_norm_var": 0.011316919326782226, "learning_rate": 0.01, "loss": 1.4987, "loss/crossentropy": 2.622624158859253, "loss/fcd": 1.1875, "loss/logits": 0.2653464898467064, "step": 292 }, { "epoch": 0.0009303950209577035, "grad_norm": 0.6640625, "grad_norm_var": 0.008510319391886394, "learning_rate": 0.01, "loss": 1.6727, "loss/crossentropy": 2.358694851398468, "loss/fcd": 1.3515625, "loss/logits": 0.2982519268989563, "step": 293 }, { "epoch": 0.0009335704305855455, "grad_norm": 0.34375, "grad_norm_var": 0.008473714192708334, "learning_rate": 0.01, "loss": 1.4066, "loss/crossentropy": 2.466861605644226, "loss/fcd": 1.1328125, "loss/logits": 0.26953184604644775, "step": 294 }, { "epoch": 0.0009367458402133875, "grad_norm": 0.375, "grad_norm_var": 0.008316993713378906, "learning_rate": 0.01, "loss": 1.5221, "loss/crossentropy": 2.5451639890670776, "loss/fcd": 1.2265625, "loss/logits": 0.2941044867038727, "step": 295 }, { "epoch": 0.0009399212498412295, "grad_norm": 0.294921875, "grad_norm_var": 0.008463541666666666, "learning_rate": 0.01, "loss": 1.3671, "loss/crossentropy": 2.6134976148605347, "loss/fcd": 1.12890625, "loss/logits": 0.24579111486673355, "step": 296 }, { "epoch": 0.0009430966594690715, "grad_norm": 0.33984375, "grad_norm_var": 0.008345524470011393, "learning_rate": 0.01, "loss": 1.4349, "loss/crossentropy": 2.585867762565613, "loss/fcd": 1.140625, "loss/logits": 0.2784867584705353, "step": 297 }, { "epoch": 0.0009462720690969135, "grad_norm": 0.34765625, "grad_norm_var": 0.008295933405558268, "learning_rate": 0.01, "loss": 1.481, "loss/crossentropy": 2.6173094511032104, "loss/fcd": 1.30859375, "loss/logits": 0.33237364888191223, "step": 298 }, { "epoch": 0.0009494474787247554, "grad_norm": 0.341796875, "grad_norm_var": 0.008090655008951822, "learning_rate": 0.01, "loss": 1.4112, "loss/crossentropy": 2.62748646736145, "loss/fcd": 1.1484375, "loss/logits": 0.2712366282939911, "step": 299 }, { "epoch": 0.0009526228883525975, "grad_norm": 0.345703125, "grad_norm_var": 0.007921838760375976, "learning_rate": 0.01, "loss": 1.4636, "loss/crossentropy": 2.501181125640869, "loss/fcd": 1.2421875, "loss/logits": 0.28991882503032684, "step": 300 }, { "epoch": 0.0009557982979804395, "grad_norm": 0.349609375, "grad_norm_var": 0.007822148005167643, "learning_rate": 0.01, "loss": 1.4618, "loss/crossentropy": 2.5432281494140625, "loss/fcd": 1.25, "loss/logits": 0.2761085480451584, "step": 301 }, { "epoch": 0.0009589737076082814, "grad_norm": 0.34765625, "grad_norm_var": 0.007709105809529622, "learning_rate": 0.01, "loss": 1.4714, "loss/crossentropy": 2.834138512611389, "loss/fcd": 1.234375, "loss/logits": 0.29171431064605713, "step": 302 }, { "epoch": 0.0009621491172361235, "grad_norm": 0.39453125, "grad_norm_var": 0.0077468236287434895, "learning_rate": 0.01, "loss": 1.5679, "loss/crossentropy": 2.732903003692627, "loss/fcd": 1.28125, "loss/logits": 0.3431689292192459, "step": 303 }, { "epoch": 0.0009653245268639655, "grad_norm": 0.3203125, "grad_norm_var": 0.007455809911092123, "learning_rate": 0.01, "loss": 1.4487, "loss/crossentropy": 2.640901803970337, "loss/fcd": 1.23046875, "loss/logits": 0.28139546513557434, "step": 304 }, { "epoch": 0.0009684999364918074, "grad_norm": 0.359375, "grad_norm_var": 0.0074345906575520836, "learning_rate": 0.01, "loss": 1.5228, "loss/crossentropy": 2.5405126810073853, "loss/fcd": 1.21875, "loss/logits": 0.2743394076824188, "step": 305 }, { "epoch": 0.0009716753461196494, "grad_norm": 0.318359375, "grad_norm_var": 0.007420587539672852, "learning_rate": 0.01, "loss": 1.4612, "loss/crossentropy": 2.593175768852234, "loss/fcd": 1.203125, "loss/logits": 0.2995911240577698, "step": 306 }, { "epoch": 0.0009748507557474915, "grad_norm": 0.328125, "grad_norm_var": 0.007469940185546875, "learning_rate": 0.01, "loss": 1.4115, "loss/crossentropy": 2.631293296813965, "loss/fcd": 1.21484375, "loss/logits": 0.2403849959373474, "step": 307 }, { "epoch": 0.0009780261653753333, "grad_norm": 0.337890625, "grad_norm_var": 0.00695947011311849, "learning_rate": 0.01, "loss": 1.4568, "loss/crossentropy": 2.7287439107894897, "loss/fcd": 1.29296875, "loss/logits": 0.271475687623024, "step": 308 }, { "epoch": 0.0009812015750031754, "grad_norm": 0.44921875, "grad_norm_var": 0.0012212117513020833, "learning_rate": 0.01, "loss": 1.4888, "loss/crossentropy": 2.7232574224472046, "loss/fcd": 1.44140625, "loss/logits": 0.41784295439720154, "step": 309 }, { "epoch": 0.0009843769846310175, "grad_norm": 0.314453125, "grad_norm_var": 0.0012977441151936849, "learning_rate": 0.01, "loss": 1.4642, "loss/crossentropy": 2.504624843597412, "loss/fcd": 1.15234375, "loss/logits": 0.2769088000059128, "step": 310 }, { "epoch": 0.0009875523942588593, "grad_norm": 0.373046875, "grad_norm_var": 0.0012908935546875, "learning_rate": 0.01, "loss": 1.5034, "loss/crossentropy": 2.7473918199539185, "loss/fcd": 1.265625, "loss/logits": 0.30949144065380096, "step": 311 }, { "epoch": 0.0009907278038867014, "grad_norm": 0.330078125, "grad_norm_var": 0.0011209487915039063, "learning_rate": 0.01, "loss": 1.4195, "loss/crossentropy": 2.5405654907226562, "loss/fcd": 1.09375, "loss/logits": 0.24167446792125702, "step": 312 }, { "epoch": 0.0009939032135145435, "grad_norm": 0.33203125, "grad_norm_var": 0.0011351903279622395, "learning_rate": 0.01, "loss": 1.442, "loss/crossentropy": 2.6730507612228394, "loss/fcd": 1.1328125, "loss/logits": 0.24724073708057404, "step": 313 }, { "epoch": 0.0009970786231423853, "grad_norm": 0.32421875, "grad_norm_var": 0.0011748631795247396, "learning_rate": 0.01, "loss": 1.4261, "loss/crossentropy": 2.471889615058899, "loss/fcd": 1.171875, "loss/logits": 0.28130483627319336, "step": 314 }, { "epoch": 0.0010002540327702274, "grad_norm": 0.283203125, "grad_norm_var": 0.0014371236165364584, "learning_rate": 0.01, "loss": 1.3813, "loss/crossentropy": 2.4892624616622925, "loss/fcd": 1.14453125, "loss/logits": 0.2577434182167053, "step": 315 }, { "epoch": 0.0010034294423980693, "grad_norm": 0.365234375, "grad_norm_var": 0.0014647801717122396, "learning_rate": 0.01, "loss": 1.436, "loss/crossentropy": 2.7085033655166626, "loss/fcd": 1.140625, "loss/logits": 0.25846952199935913, "step": 316 }, { "epoch": 0.0010066048520259113, "grad_norm": 0.384765625, "grad_norm_var": 0.0015614827473958333, "learning_rate": 0.01, "loss": 1.4591, "loss/crossentropy": 2.3977363109588623, "loss/fcd": 1.1953125, "loss/logits": 0.25775520503520966, "step": 317 }, { "epoch": 0.0010097802616537534, "grad_norm": 0.3515625, "grad_norm_var": 0.0015624364217122395, "learning_rate": 0.01, "loss": 1.4333, "loss/crossentropy": 2.5633625984191895, "loss/fcd": 1.2890625, "loss/logits": 0.28313587605953217, "step": 318 }, { "epoch": 0.0010129556712815953, "grad_norm": 0.310546875, "grad_norm_var": 0.0014811038970947265, "learning_rate": 0.01, "loss": 1.444, "loss/crossentropy": 2.5945725440979004, "loss/fcd": 1.16015625, "loss/logits": 0.26434415578842163, "step": 319 }, { "epoch": 0.0010161310809094373, "grad_norm": 0.341796875, "grad_norm_var": 0.0014459609985351563, "learning_rate": 0.01, "loss": 1.4344, "loss/crossentropy": 2.671095371246338, "loss/fcd": 1.16796875, "loss/logits": 0.25914353132247925, "step": 320 }, { "epoch": 0.0010193064905372794, "grad_norm": 0.33984375, "grad_norm_var": 0.00142974853515625, "learning_rate": 0.01, "loss": 1.4069, "loss/crossentropy": 2.146224617958069, "loss/fcd": 1.046875, "loss/logits": 0.22655215859413147, "step": 321 }, { "epoch": 0.0010224819001651213, "grad_norm": 0.359375, "grad_norm_var": 0.001401376724243164, "learning_rate": 0.01, "loss": 1.4754, "loss/crossentropy": 2.6014915704727173, "loss/fcd": 1.23828125, "loss/logits": 0.25363868474960327, "step": 322 }, { "epoch": 0.0010256573097929633, "grad_norm": 0.326171875, "grad_norm_var": 0.001406097412109375, "learning_rate": 0.01, "loss": 1.4658, "loss/crossentropy": 2.6632081270217896, "loss/fcd": 1.2421875, "loss/logits": 0.284743070602417, "step": 323 }, { "epoch": 0.0010288327194208052, "grad_norm": 0.33984375, "grad_norm_var": 0.0014044284820556641, "learning_rate": 0.01, "loss": 1.4675, "loss/crossentropy": 2.61789071559906, "loss/fcd": 1.21875, "loss/logits": 0.3034716546535492, "step": 324 }, { "epoch": 0.0010320081290486473, "grad_norm": 0.294921875, "grad_norm_var": 0.0007552464803059895, "learning_rate": 0.01, "loss": 1.4427, "loss/crossentropy": 2.4929096698760986, "loss/fcd": 1.13671875, "loss/logits": 0.2669839411973953, "step": 325 }, { "epoch": 0.0010351835386764893, "grad_norm": 0.439453125, "grad_norm_var": 0.0013778050740559896, "learning_rate": 0.01, "loss": 1.4402, "loss/crossentropy": 2.617214560508728, "loss/fcd": 1.1875, "loss/logits": 0.2735295593738556, "step": 326 }, { "epoch": 0.0010383589483043312, "grad_norm": 0.3828125, "grad_norm_var": 0.0014222304026285807, "learning_rate": 0.01, "loss": 1.4732, "loss/crossentropy": 2.788111448287964, "loss/fcd": 1.2265625, "loss/logits": 0.2897178828716278, "step": 327 }, { "epoch": 0.0010415343579321733, "grad_norm": 0.32421875, "grad_norm_var": 0.0014353434244791666, "learning_rate": 0.01, "loss": 1.4346, "loss/crossentropy": 2.380117654800415, "loss/fcd": 1.0859375, "loss/logits": 0.23115020245313644, "step": 328 }, { "epoch": 0.0010447097675600153, "grad_norm": 0.302734375, "grad_norm_var": 0.0015347639719645182, "learning_rate": 0.01, "loss": 1.3784, "loss/crossentropy": 2.169575035572052, "loss/fcd": 1.083984375, "loss/logits": 0.24762088060379028, "step": 329 }, { "epoch": 0.0010478851771878572, "grad_norm": 0.36328125, "grad_norm_var": 0.0015379428863525391, "learning_rate": 0.01, "loss": 1.474, "loss/crossentropy": 2.580411434173584, "loss/fcd": 1.27734375, "loss/logits": 0.3172074258327484, "step": 330 }, { "epoch": 0.0010510605868156993, "grad_norm": 0.3046875, "grad_norm_var": 0.0013916015625, "learning_rate": 0.01, "loss": 1.4177, "loss/crossentropy": 2.675889253616333, "loss/fcd": 1.1484375, "loss/logits": 0.2542211189866066, "step": 331 }, { "epoch": 0.0010542359964435411, "grad_norm": 0.3125, "grad_norm_var": 0.001428079605102539, "learning_rate": 0.01, "loss": 1.4464, "loss/crossentropy": 2.8227070569992065, "loss/fcd": 1.24609375, "loss/logits": 0.3030836582183838, "step": 332 }, { "epoch": 0.0010574114060713832, "grad_norm": 0.296875, "grad_norm_var": 0.00141448974609375, "learning_rate": 0.01, "loss": 1.3732, "loss/crossentropy": 2.3414876461029053, "loss/fcd": 1.1484375, "loss/logits": 0.29129286110401154, "step": 333 }, { "epoch": 0.0010605868156992253, "grad_norm": 0.322265625, "grad_norm_var": 0.0014109134674072266, "learning_rate": 0.01, "loss": 1.4437, "loss/crossentropy": 2.666181445121765, "loss/fcd": 1.203125, "loss/logits": 0.28567154705524445, "step": 334 }, { "epoch": 0.0010637622253270671, "grad_norm": 0.291015625, "grad_norm_var": 0.0014986515045166016, "learning_rate": 0.01, "loss": 1.4001, "loss/crossentropy": 2.4635136127471924, "loss/fcd": 1.25, "loss/logits": 0.32767751812934875, "step": 335 }, { "epoch": 0.0010669376349549092, "grad_norm": 0.345703125, "grad_norm_var": 0.0015037377675374349, "learning_rate": 0.01, "loss": 1.5217, "loss/crossentropy": 2.43788743019104, "loss/fcd": 1.17578125, "loss/logits": 0.2790217697620392, "step": 336 }, { "epoch": 0.0010701130445827513, "grad_norm": 0.388671875, "grad_norm_var": 0.0016901016235351563, "learning_rate": 0.01, "loss": 1.53, "loss/crossentropy": 2.5477564334869385, "loss/fcd": 1.21484375, "loss/logits": 0.32547467947006226, "step": 337 }, { "epoch": 0.0010732884542105931, "grad_norm": 0.3046875, "grad_norm_var": 0.0017150243123372395, "learning_rate": 0.01, "loss": 1.4396, "loss/crossentropy": 2.5275453329086304, "loss/fcd": 1.1875, "loss/logits": 0.27268455922603607, "step": 338 }, { "epoch": 0.0010764638638384352, "grad_norm": 0.33203125, "grad_norm_var": 0.001711257298787435, "learning_rate": 0.01, "loss": 1.4771, "loss/crossentropy": 2.446451187133789, "loss/fcd": 1.22265625, "loss/logits": 0.32446105778217316, "step": 339 }, { "epoch": 0.001079639273466277, "grad_norm": 0.3203125, "grad_norm_var": 0.0017201582590738933, "learning_rate": 0.01, "loss": 1.4163, "loss/crossentropy": 2.5728654861450195, "loss/fcd": 1.0859375, "loss/logits": 0.24421326816082, "step": 340 }, { "epoch": 0.0010828146830941191, "grad_norm": 0.38671875, "grad_norm_var": 0.0017821629842122396, "learning_rate": 0.01, "loss": 1.5053, "loss/crossentropy": 2.864202618598938, "loss/fcd": 1.22265625, "loss/logits": 0.2826021537184715, "step": 341 }, { "epoch": 0.0010859900927219612, "grad_norm": 0.33203125, "grad_norm_var": 0.0010591983795166016, "learning_rate": 0.01, "loss": 1.4157, "loss/crossentropy": 2.535142660140991, "loss/fcd": 1.1875, "loss/logits": 0.2700059413909912, "step": 342 }, { "epoch": 0.001089165502349803, "grad_norm": 0.345703125, "grad_norm_var": 0.000893402099609375, "learning_rate": 0.01, "loss": 1.5093, "loss/crossentropy": 2.5049625635147095, "loss/fcd": 1.1875, "loss/logits": 0.27245666086673737, "step": 343 }, { "epoch": 0.0010923409119776451, "grad_norm": 0.361328125, "grad_norm_var": 0.0009528954823811848, "learning_rate": 0.01, "loss": 1.5928, "loss/crossentropy": 2.648856520652771, "loss/fcd": 1.37109375, "loss/logits": 0.39411380887031555, "step": 344 }, { "epoch": 0.0010955163216054872, "grad_norm": 0.361328125, "grad_norm_var": 0.0009395440419514973, "learning_rate": 0.01, "loss": 1.4595, "loss/crossentropy": 2.478541851043701, "loss/fcd": 1.140625, "loss/logits": 0.2562572583556175, "step": 345 }, { "epoch": 0.001098691731233329, "grad_norm": 0.32421875, "grad_norm_var": 0.0008905887603759766, "learning_rate": 0.01, "loss": 1.4149, "loss/crossentropy": 2.7645288705825806, "loss/fcd": 1.203125, "loss/logits": 0.27800317108631134, "step": 346 }, { "epoch": 0.0011018671408611711, "grad_norm": 0.375, "grad_norm_var": 0.0009329319000244141, "learning_rate": 0.01, "loss": 1.5472, "loss/crossentropy": 2.8698976039886475, "loss/fcd": 1.16015625, "loss/logits": 0.2746939957141876, "step": 347 }, { "epoch": 0.0011050425504890132, "grad_norm": 0.345703125, "grad_norm_var": 0.0008910497029622396, "learning_rate": 0.01, "loss": 1.4374, "loss/crossentropy": 2.5966951847076416, "loss/fcd": 1.1875, "loss/logits": 0.2681450843811035, "step": 348 }, { "epoch": 0.001108217960116855, "grad_norm": 0.318359375, "grad_norm_var": 0.0007975101470947266, "learning_rate": 0.01, "loss": 1.4788, "loss/crossentropy": 2.558975577354431, "loss/fcd": 1.21875, "loss/logits": 0.2862369269132614, "step": 349 }, { "epoch": 0.0011113933697446971, "grad_norm": 0.302734375, "grad_norm_var": 0.0008699893951416016, "learning_rate": 0.01, "loss": 1.4452, "loss/crossentropy": 2.523944854736328, "loss/fcd": 1.1328125, "loss/logits": 0.27994687855243683, "step": 350 }, { "epoch": 0.001114568779372539, "grad_norm": 0.3359375, "grad_norm_var": 0.0007043838500976563, "learning_rate": 0.01, "loss": 1.5203, "loss/crossentropy": 2.458726167678833, "loss/fcd": 1.125, "loss/logits": 0.2676275223493576, "step": 351 }, { "epoch": 0.001117744189000381, "grad_norm": 0.330078125, "grad_norm_var": 0.0007130304972330729, "learning_rate": 0.01, "loss": 1.4512, "loss/crossentropy": 2.7090238332748413, "loss/fcd": 1.13671875, "loss/logits": 0.25532904267311096, "step": 352 }, { "epoch": 0.0011209195986282231, "grad_norm": 0.306640625, "grad_norm_var": 0.0006182352701822917, "learning_rate": 0.01, "loss": 1.4357, "loss/crossentropy": 2.69818913936615, "loss/fcd": 1.2265625, "loss/logits": 0.28583139181137085, "step": 353 }, { "epoch": 0.001124095008256065, "grad_norm": 0.359375, "grad_norm_var": 0.00057373046875, "learning_rate": 0.01, "loss": 1.4114, "loss/crossentropy": 2.8350677490234375, "loss/fcd": 1.15234375, "loss/logits": 0.24179340153932571, "step": 354 }, { "epoch": 0.001127270417883907, "grad_norm": 0.3359375, "grad_norm_var": 0.0005706151326497396, "learning_rate": 0.01, "loss": 1.4431, "loss/crossentropy": 2.584088921546936, "loss/fcd": 1.1484375, "loss/logits": 0.2574824094772339, "step": 355 }, { "epoch": 0.0011304458275117491, "grad_norm": 0.7421875, "grad_norm_var": 0.010581906636555989, "learning_rate": 0.01, "loss": 1.4019, "loss/crossentropy": 2.5810946226119995, "loss/fcd": 1.046875, "loss/logits": 0.24207720905542374, "step": 356 }, { "epoch": 0.001133621237139591, "grad_norm": 0.56640625, "grad_norm_var": 0.013085365295410156, "learning_rate": 0.01, "loss": 1.4547, "loss/crossentropy": 2.77290678024292, "loss/fcd": 1.24609375, "loss/logits": 0.2914367616176605, "step": 357 }, { "epoch": 0.001136796646767433, "grad_norm": 0.3203125, "grad_norm_var": 0.013165283203125, "learning_rate": 0.01, "loss": 1.4052, "loss/crossentropy": 2.5615508556365967, "loss/fcd": 1.19140625, "loss/logits": 0.29001541435718536, "step": 358 }, { "epoch": 0.001139972056395275, "grad_norm": 0.361328125, "grad_norm_var": 0.013115437825520833, "learning_rate": 0.01, "loss": 1.427, "loss/crossentropy": 2.5820958614349365, "loss/fcd": 1.203125, "loss/logits": 0.302823007106781, "step": 359 }, { "epoch": 0.001143147466023117, "grad_norm": 0.32421875, "grad_norm_var": 0.013283650080362955, "learning_rate": 0.01, "loss": 1.4533, "loss/crossentropy": 2.56258487701416, "loss/fcd": 1.28515625, "loss/logits": 0.27707424759864807, "step": 360 }, { "epoch": 0.001146322875650959, "grad_norm": 0.453125, "grad_norm_var": 0.013635508219401042, "learning_rate": 0.01, "loss": 1.5093, "loss/crossentropy": 3.055524468421936, "loss/fcd": 1.33984375, "loss/logits": 0.3141314387321472, "step": 361 }, { "epoch": 0.001149498285278801, "grad_norm": 0.373046875, "grad_norm_var": 0.01341258684794108, "learning_rate": 0.01, "loss": 1.4136, "loss/crossentropy": 2.3731695413589478, "loss/fcd": 1.05078125, "loss/logits": 0.23925812542438507, "step": 362 }, { "epoch": 0.001152673694906643, "grad_norm": 0.353515625, "grad_norm_var": 0.013468360900878907, "learning_rate": 0.01, "loss": 1.4026, "loss/crossentropy": 2.3354973793029785, "loss/fcd": 1.15234375, "loss/logits": 0.26777036488056183, "step": 363 }, { "epoch": 0.001155849104534485, "grad_norm": 0.412109375, "grad_norm_var": 0.013413238525390624, "learning_rate": 0.01, "loss": 1.5295, "loss/crossentropy": 2.536329984664917, "loss/fcd": 1.1796875, "loss/logits": 0.2786889374256134, "step": 364 }, { "epoch": 0.001159024514162327, "grad_norm": 0.345703125, "grad_norm_var": 0.013208961486816407, "learning_rate": 0.01, "loss": 1.4399, "loss/crossentropy": 2.611635446548462, "loss/fcd": 1.15234375, "loss/logits": 0.2757887840270996, "step": 365 }, { "epoch": 0.001162199923790169, "grad_norm": 0.30859375, "grad_norm_var": 0.013143777847290039, "learning_rate": 0.01, "loss": 1.4174, "loss/crossentropy": 2.489536166191101, "loss/fcd": 1.23828125, "loss/logits": 0.29095780849456787, "step": 366 }, { "epoch": 0.0011653753334180108, "grad_norm": 0.333984375, "grad_norm_var": 0.013157908121744792, "learning_rate": 0.01, "loss": 1.4752, "loss/crossentropy": 2.7903414964675903, "loss/fcd": 1.34375, "loss/logits": 0.3479475602507591, "step": 367 }, { "epoch": 0.001168550743045853, "grad_norm": 0.328125, "grad_norm_var": 0.013173532485961915, "learning_rate": 0.01, "loss": 1.4804, "loss/crossentropy": 2.6934927701950073, "loss/fcd": 1.234375, "loss/logits": 0.29752205312252045, "step": 368 }, { "epoch": 0.001171726152673695, "grad_norm": 0.330078125, "grad_norm_var": 0.012950372695922852, "learning_rate": 0.01, "loss": 1.4398, "loss/crossentropy": 2.137880325317383, "loss/fcd": 1.05078125, "loss/logits": 0.23075462132692337, "step": 369 }, { "epoch": 0.0011749015623015368, "grad_norm": 0.3515625, "grad_norm_var": 0.01298661231994629, "learning_rate": 0.01, "loss": 1.3853, "loss/crossentropy": 2.3140697479248047, "loss/fcd": 1.15234375, "loss/logits": 0.23338378965854645, "step": 370 }, { "epoch": 0.001178076971929379, "grad_norm": 0.345703125, "grad_norm_var": 0.012922159830729167, "learning_rate": 0.01, "loss": 1.3258, "loss/crossentropy": 2.427556872367859, "loss/fcd": 1.078125, "loss/logits": 0.2503844350576401, "step": 371 }, { "epoch": 0.001181252381557221, "grad_norm": 0.2890625, "grad_norm_var": 0.004514567057291667, "learning_rate": 0.01, "loss": 1.3753, "loss/crossentropy": 2.454453945159912, "loss/fcd": 1.12109375, "loss/logits": 0.2562709003686905, "step": 372 }, { "epoch": 0.0011844277911850628, "grad_norm": 0.326171875, "grad_norm_var": 0.0015839735666910808, "learning_rate": 0.01, "loss": 1.4779, "loss/crossentropy": 2.4442111253738403, "loss/fcd": 1.12109375, "loss/logits": 0.2599013224244118, "step": 373 }, { "epoch": 0.001187603200812905, "grad_norm": 0.392578125, "grad_norm_var": 0.0016504287719726562, "learning_rate": 0.01, "loss": 1.41, "loss/crossentropy": 2.7129191160202026, "loss/fcd": 1.1796875, "loss/logits": 0.271731972694397, "step": 374 }, { "epoch": 0.0011907786104407468, "grad_norm": 0.3203125, "grad_norm_var": 0.001703500747680664, "learning_rate": 0.01, "loss": 1.4178, "loss/crossentropy": 2.528813362121582, "loss/fcd": 1.140625, "loss/logits": 0.2772262841463089, "step": 375 }, { "epoch": 0.0011939540200685888, "grad_norm": 0.322265625, "grad_norm_var": 0.0017102559407552083, "learning_rate": 0.01, "loss": 1.3679, "loss/crossentropy": 2.4332374334335327, "loss/fcd": 1.140625, "loss/logits": 0.24810632318258286, "step": 376 }, { "epoch": 0.001197129429696431, "grad_norm": 0.330078125, "grad_norm_var": 0.0009502251942952474, "learning_rate": 0.01, "loss": 1.4125, "loss/crossentropy": 2.719116449356079, "loss/fcd": 1.16796875, "loss/logits": 0.26793956756591797, "step": 377 }, { "epoch": 0.0012003048393242728, "grad_norm": 0.328125, "grad_norm_var": 0.000886980692545573, "learning_rate": 0.01, "loss": 1.4061, "loss/crossentropy": 2.5011860132217407, "loss/fcd": 1.072265625, "loss/logits": 0.2244342789053917, "step": 378 }, { "epoch": 0.0012034802489521148, "grad_norm": 0.3046875, "grad_norm_var": 0.000939035415649414, "learning_rate": 0.01, "loss": 1.4576, "loss/crossentropy": 2.3722652196884155, "loss/fcd": 1.17578125, "loss/logits": 0.28097401559352875, "step": 379 }, { "epoch": 0.001206655658579957, "grad_norm": 0.34765625, "grad_norm_var": 0.000540924072265625, "learning_rate": 0.01, "loss": 1.4147, "loss/crossentropy": 2.4277199506759644, "loss/fcd": 1.099609375, "loss/logits": 0.25263649970293045, "step": 380 }, { "epoch": 0.0012098310682077988, "grad_norm": 0.302734375, "grad_norm_var": 0.0005751927693684896, "learning_rate": 0.01, "loss": 1.3938, "loss/crossentropy": 2.6109243631362915, "loss/fcd": 1.109375, "loss/logits": 0.2633241266012192, "step": 381 }, { "epoch": 0.0012130064778356408, "grad_norm": 0.326171875, "grad_norm_var": 0.0005470116933186849, "learning_rate": 0.01, "loss": 1.4952, "loss/crossentropy": 2.5426456928253174, "loss/fcd": 1.28125, "loss/logits": 0.31167298555374146, "step": 382 }, { "epoch": 0.0012161818874634827, "grad_norm": 0.318359375, "grad_norm_var": 0.0005538781483968099, "learning_rate": 0.01, "loss": 1.3938, "loss/crossentropy": 2.4204543828964233, "loss/fcd": 1.1328125, "loss/logits": 0.25714488327503204, "step": 383 }, { "epoch": 0.0012193572970913248, "grad_norm": 0.59765625, "grad_norm_var": 0.005063613255818685, "learning_rate": 0.01, "loss": 1.549, "loss/crossentropy": 2.629058599472046, "loss/fcd": 1.203125, "loss/logits": 0.2943042516708374, "step": 384 }, { "epoch": 0.0012225327067191668, "grad_norm": 0.37109375, "grad_norm_var": 0.00508263905843099, "learning_rate": 0.01, "loss": 1.4793, "loss/crossentropy": 2.58511483669281, "loss/fcd": 1.16796875, "loss/logits": 0.2729629874229431, "step": 385 }, { "epoch": 0.0012257081163470087, "grad_norm": 0.361328125, "grad_norm_var": 0.005092732111612956, "learning_rate": 0.01, "loss": 1.4231, "loss/crossentropy": 2.822973132133484, "loss/fcd": 1.19921875, "loss/logits": 0.275088295340538, "step": 386 }, { "epoch": 0.0012288835259748508, "grad_norm": 0.318359375, "grad_norm_var": 0.00515147844950358, "learning_rate": 0.01, "loss": 1.3923, "loss/crossentropy": 2.767518162727356, "loss/fcd": 1.140625, "loss/logits": 0.2667757719755173, "step": 387 }, { "epoch": 0.0012320589356026928, "grad_norm": 0.31640625, "grad_norm_var": 0.004985920588175456, "learning_rate": 0.01, "loss": 1.4146, "loss/crossentropy": 2.4579033851623535, "loss/fcd": 1.1875, "loss/logits": 0.275812029838562, "step": 388 }, { "epoch": 0.0012352343452305347, "grad_norm": 0.3515625, "grad_norm_var": 0.004948933919270833, "learning_rate": 0.01, "loss": 1.4002, "loss/crossentropy": 2.4946489334106445, "loss/fcd": 1.14453125, "loss/logits": 0.2812535837292671, "step": 389 }, { "epoch": 0.0012384097548583768, "grad_norm": 0.3046875, "grad_norm_var": 0.004939635594685872, "learning_rate": 0.01, "loss": 1.4186, "loss/crossentropy": 2.5126614570617676, "loss/fcd": 1.15234375, "loss/logits": 0.2510756552219391, "step": 390 }, { "epoch": 0.0012415851644862188, "grad_norm": 0.330078125, "grad_norm_var": 0.004913330078125, "learning_rate": 0.01, "loss": 1.4239, "loss/crossentropy": 2.582464575767517, "loss/fcd": 1.17578125, "loss/logits": 0.27455802261829376, "step": 391 }, { "epoch": 0.0012447605741140607, "grad_norm": 0.37890625, "grad_norm_var": 0.004936838150024414, "learning_rate": 0.01, "loss": 1.4551, "loss/crossentropy": 2.4381093978881836, "loss/fcd": 1.21484375, "loss/logits": 0.28313565254211426, "step": 392 }, { "epoch": 0.0012479359837419028, "grad_norm": 0.3515625, "grad_norm_var": 0.004910786946614583, "learning_rate": 0.01, "loss": 1.433, "loss/crossentropy": 2.434713363647461, "loss/fcd": 1.1015625, "loss/logits": 0.2571691572666168, "step": 393 }, { "epoch": 0.0012511113933697446, "grad_norm": 0.322265625, "grad_norm_var": 0.004930480321248373, "learning_rate": 0.01, "loss": 1.4774, "loss/crossentropy": 2.481712579727173, "loss/fcd": 1.23828125, "loss/logits": 0.2832081615924835, "step": 394 }, { "epoch": 0.0012542868029975867, "grad_norm": 0.396484375, "grad_norm_var": 0.004899851481119792, "learning_rate": 0.01, "loss": 1.5177, "loss/crossentropy": 2.761539101600647, "loss/fcd": 1.34765625, "loss/logits": 0.3508002460002899, "step": 395 }, { "epoch": 0.0012574622126254288, "grad_norm": 0.318359375, "grad_norm_var": 0.004985920588175456, "learning_rate": 0.01, "loss": 1.4278, "loss/crossentropy": 2.652655839920044, "loss/fcd": 1.23046875, "loss/logits": 0.2886646091938019, "step": 396 }, { "epoch": 0.0012606376222532706, "grad_norm": 0.302734375, "grad_norm_var": 0.004985920588175456, "learning_rate": 0.01, "loss": 1.3711, "loss/crossentropy": 2.5145236253738403, "loss/fcd": 1.12890625, "loss/logits": 0.25199297070503235, "step": 397 }, { "epoch": 0.0012638130318811127, "grad_norm": 0.3125, "grad_norm_var": 0.005048561096191406, "learning_rate": 0.01, "loss": 1.3831, "loss/crossentropy": 2.167693614959717, "loss/fcd": 1.001953125, "loss/logits": 0.2246558740735054, "step": 398 }, { "epoch": 0.0012669884415089548, "grad_norm": 0.3125, "grad_norm_var": 0.005077981948852539, "learning_rate": 0.01, "loss": 1.4347, "loss/crossentropy": 2.3945388793945312, "loss/fcd": 1.15234375, "loss/logits": 0.2500525116920471, "step": 399 }, { "epoch": 0.0012701638511367966, "grad_norm": 0.419921875, "grad_norm_var": 0.0012522379557291666, "learning_rate": 0.01, "loss": 1.5576, "loss/crossentropy": 2.2466899156570435, "loss/fcd": 1.54296875, "loss/logits": 0.3709343522787094, "step": 400 }, { "epoch": 0.0012733392607646387, "grad_norm": 0.31640625, "grad_norm_var": 0.0012255350748697916, "learning_rate": 0.01, "loss": 1.4053, "loss/crossentropy": 2.497683882713318, "loss/fcd": 1.13671875, "loss/logits": 0.2679227739572525, "step": 401 }, { "epoch": 0.0012765146703924805, "grad_norm": 0.34375, "grad_norm_var": 0.0011910597483317058, "learning_rate": 0.01, "loss": 1.449, "loss/crossentropy": 2.5589895248413086, "loss/fcd": 1.20703125, "loss/logits": 0.29804037511348724, "step": 402 }, { "epoch": 0.0012796900800203226, "grad_norm": 0.3203125, "grad_norm_var": 0.001186370849609375, "learning_rate": 0.01, "loss": 1.5009, "loss/crossentropy": 2.510563015937805, "loss/fcd": 1.3046875, "loss/logits": 0.3212246596813202, "step": 403 }, { "epoch": 0.0012828654896481647, "grad_norm": 0.322265625, "grad_norm_var": 0.0011721134185791015, "learning_rate": 0.01, "loss": 1.488, "loss/crossentropy": 2.703429937362671, "loss/fcd": 1.23828125, "loss/logits": 0.2762700319290161, "step": 404 }, { "epoch": 0.0012860408992760065, "grad_norm": 0.357421875, "grad_norm_var": 0.0011850357055664062, "learning_rate": 0.01, "loss": 1.4679, "loss/crossentropy": 2.6150410175323486, "loss/fcd": 1.140625, "loss/logits": 0.25775669515132904, "step": 405 }, { "epoch": 0.0012892163089038486, "grad_norm": 0.3046875, "grad_norm_var": 0.0011850357055664062, "learning_rate": 0.01, "loss": 1.3751, "loss/crossentropy": 2.432300329208374, "loss/fcd": 1.12109375, "loss/logits": 0.26045307517051697, "step": 406 }, { "epoch": 0.0012923917185316907, "grad_norm": 0.337890625, "grad_norm_var": 0.0011804580688476562, "learning_rate": 0.01, "loss": 1.4527, "loss/crossentropy": 2.548518419265747, "loss/fcd": 1.18359375, "loss/logits": 0.28656983375549316, "step": 407 }, { "epoch": 0.0012955671281595325, "grad_norm": 0.28515625, "grad_norm_var": 0.0012262344360351562, "learning_rate": 0.01, "loss": 1.3608, "loss/crossentropy": 2.566452145576477, "loss/fcd": 1.125, "loss/logits": 0.24359553307294846, "step": 408 }, { "epoch": 0.0012987425377873746, "grad_norm": 0.318359375, "grad_norm_var": 0.0012119134267171224, "learning_rate": 0.01, "loss": 1.4121, "loss/crossentropy": 2.5307424068450928, "loss/fcd": 1.08984375, "loss/logits": 0.2518118619918823, "step": 409 }, { "epoch": 0.0013019179474152165, "grad_norm": 0.287109375, "grad_norm_var": 0.0013286431630452475, "learning_rate": 0.01, "loss": 1.4344, "loss/crossentropy": 2.4665204286575317, "loss/fcd": 1.12109375, "loss/logits": 0.25681471824645996, "step": 410 }, { "epoch": 0.0013050933570430585, "grad_norm": 0.330078125, "grad_norm_var": 0.0010022322336832683, "learning_rate": 0.01, "loss": 1.453, "loss/crossentropy": 2.661336064338684, "loss/fcd": 1.234375, "loss/logits": 0.29113084077835083, "step": 411 }, { "epoch": 0.0013082687666709006, "grad_norm": 0.314453125, "grad_norm_var": 0.001006301244099935, "learning_rate": 0.01, "loss": 1.4065, "loss/crossentropy": 2.2939807176589966, "loss/fcd": 1.109375, "loss/logits": 0.24707216024398804, "step": 412 }, { "epoch": 0.0013114441762987425, "grad_norm": 0.291015625, "grad_norm_var": 0.00104826291402181, "learning_rate": 0.01, "loss": 1.4161, "loss/crossentropy": 2.572722315788269, "loss/fcd": 1.171875, "loss/logits": 0.28457625210285187, "step": 413 }, { "epoch": 0.0013146195859265845, "grad_norm": 0.294921875, "grad_norm_var": 0.0010930379231770834, "learning_rate": 0.01, "loss": 1.4467, "loss/crossentropy": 2.500192165374756, "loss/fcd": 1.125, "loss/logits": 0.25375255942344666, "step": 414 }, { "epoch": 0.0013177949955544266, "grad_norm": 0.333984375, "grad_norm_var": 0.001093912124633789, "learning_rate": 0.01, "loss": 1.4462, "loss/crossentropy": 2.7540515661239624, "loss/fcd": 1.21484375, "loss/logits": 0.29294553399086, "step": 415 }, { "epoch": 0.0013209704051822685, "grad_norm": 0.32421875, "grad_norm_var": 0.00043735504150390627, "learning_rate": 0.01, "loss": 1.4603, "loss/crossentropy": 2.6406532526016235, "loss/fcd": 1.16015625, "loss/logits": 0.2780788838863373, "step": 416 }, { "epoch": 0.0013241458148101105, "grad_norm": 0.294921875, "grad_norm_var": 0.0004697004954020182, "learning_rate": 0.01, "loss": 1.4653, "loss/crossentropy": 2.6926311254501343, "loss/fcd": 1.234375, "loss/logits": 0.28893327713012695, "step": 417 }, { "epoch": 0.0013273212244379524, "grad_norm": 0.314453125, "grad_norm_var": 0.00041605631510416665, "learning_rate": 0.01, "loss": 1.4152, "loss/crossentropy": 2.7244991064071655, "loss/fcd": 1.1484375, "loss/logits": 0.2697499990463257, "step": 418 }, { "epoch": 0.0013304966340657945, "grad_norm": 0.29296875, "grad_norm_var": 0.0004414240519205729, "learning_rate": 0.01, "loss": 1.4371, "loss/crossentropy": 2.4327027797698975, "loss/fcd": 1.16796875, "loss/logits": 0.27161940932273865, "step": 419 }, { "epoch": 0.0013336720436936365, "grad_norm": 0.279296875, "grad_norm_var": 0.0005022684733072916, "learning_rate": 0.01, "loss": 1.4066, "loss/crossentropy": 2.5847145318984985, "loss/fcd": 1.1328125, "loss/logits": 0.2696874141693115, "step": 420 }, { "epoch": 0.0013368474533214784, "grad_norm": 0.33984375, "grad_norm_var": 0.0004105726877848307, "learning_rate": 0.01, "loss": 1.4465, "loss/crossentropy": 2.5688854455947876, "loss/fcd": 1.15625, "loss/logits": 0.26708918809890747, "step": 421 }, { "epoch": 0.0013400228629493205, "grad_norm": 0.314453125, "grad_norm_var": 0.00041097005208333334, "learning_rate": 0.01, "loss": 1.4145, "loss/crossentropy": 2.4451531171798706, "loss/fcd": 1.1171875, "loss/logits": 0.2400546818971634, "step": 422 }, { "epoch": 0.0013431982725771625, "grad_norm": 0.36328125, "grad_norm_var": 0.0005471388498942058, "learning_rate": 0.01, "loss": 1.4827, "loss/crossentropy": 2.847463846206665, "loss/fcd": 1.30859375, "loss/logits": 0.30222761631011963, "step": 423 }, { "epoch": 0.0013463736822050044, "grad_norm": 0.408203125, "grad_norm_var": 0.0010668436686197917, "learning_rate": 0.01, "loss": 1.464, "loss/crossentropy": 2.5831762552261353, "loss/fcd": 1.18359375, "loss/logits": 0.2909919023513794, "step": 424 }, { "epoch": 0.0013495490918328465, "grad_norm": 0.322265625, "grad_norm_var": 0.0010675430297851563, "learning_rate": 0.01, "loss": 1.4535, "loss/crossentropy": 2.518390655517578, "loss/fcd": 1.171875, "loss/logits": 0.2784492075443268, "step": 425 }, { "epoch": 0.0013527245014606883, "grad_norm": 0.3046875, "grad_norm_var": 0.0010118961334228515, "learning_rate": 0.01, "loss": 1.3997, "loss/crossentropy": 2.5067780017852783, "loss/fcd": 1.125, "loss/logits": 0.2519618421792984, "step": 426 }, { "epoch": 0.0013558999110885304, "grad_norm": 0.326171875, "grad_norm_var": 0.0010076999664306641, "learning_rate": 0.01, "loss": 1.4157, "loss/crossentropy": 2.346894145011902, "loss/fcd": 1.1484375, "loss/logits": 0.2657436281442642, "step": 427 }, { "epoch": 0.0013590753207163725, "grad_norm": 0.380859375, "grad_norm_var": 0.0012346744537353516, "learning_rate": 0.01, "loss": 1.4526, "loss/crossentropy": 2.674981951713562, "loss/fcd": 1.18359375, "loss/logits": 0.2726554870605469, "step": 428 }, { "epoch": 0.0013622507303442143, "grad_norm": 0.3203125, "grad_norm_var": 0.0011590957641601563, "learning_rate": 0.01, "loss": 1.4379, "loss/crossentropy": 2.4476983547210693, "loss/fcd": 1.125, "loss/logits": 0.26706820726394653, "step": 429 }, { "epoch": 0.0013654261399720564, "grad_norm": 0.328125, "grad_norm_var": 0.0010907332102457683, "learning_rate": 0.01, "loss": 1.4659, "loss/crossentropy": 2.641856551170349, "loss/fcd": 1.1171875, "loss/logits": 0.25720103085041046, "step": 430 }, { "epoch": 0.0013686015495998985, "grad_norm": 0.330078125, "grad_norm_var": 0.001088571548461914, "learning_rate": 0.01, "loss": 1.4985, "loss/crossentropy": 2.6238348484039307, "loss/fcd": 1.19921875, "loss/logits": 0.2798547148704529, "step": 431 }, { "epoch": 0.0013717769592277403, "grad_norm": 0.310546875, "grad_norm_var": 0.001106707255045573, "learning_rate": 0.01, "loss": 1.4339, "loss/crossentropy": 2.629550576210022, "loss/fcd": 1.1796875, "loss/logits": 0.2794565111398697, "step": 432 }, { "epoch": 0.0013749523688555824, "grad_norm": 0.294921875, "grad_norm_var": 0.001106707255045573, "learning_rate": 0.01, "loss": 1.4052, "loss/crossentropy": 2.6656163930892944, "loss/fcd": 1.21875, "loss/logits": 0.28168073296546936, "step": 433 }, { "epoch": 0.0013781277784834245, "grad_norm": 0.3046875, "grad_norm_var": 0.0011288801829020183, "learning_rate": 0.01, "loss": 1.4026, "loss/crossentropy": 2.7009440660476685, "loss/fcd": 1.1640625, "loss/logits": 0.26332157850265503, "step": 434 }, { "epoch": 0.0013813031881112663, "grad_norm": 0.31640625, "grad_norm_var": 0.0010590712229410807, "learning_rate": 0.01, "loss": 1.4266, "loss/crossentropy": 2.674599766731262, "loss/fcd": 1.140625, "loss/logits": 0.26388123631477356, "step": 435 }, { "epoch": 0.0013844785977391084, "grad_norm": 0.298828125, "grad_norm_var": 0.0009567101796468098, "learning_rate": 0.01, "loss": 1.4509, "loss/crossentropy": 2.4273698329925537, "loss/fcd": 1.1328125, "loss/logits": 0.2610742747783661, "step": 436 }, { "epoch": 0.0013876540073669502, "grad_norm": 0.361328125, "grad_norm_var": 0.0010166803995768229, "learning_rate": 0.01, "loss": 1.4906, "loss/crossentropy": 2.577130913734436, "loss/fcd": 1.125, "loss/logits": 0.27118298411369324, "step": 437 }, { "epoch": 0.0013908294169947923, "grad_norm": 0.322265625, "grad_norm_var": 0.0010039647420247396, "learning_rate": 0.01, "loss": 1.4656, "loss/crossentropy": 2.918359637260437, "loss/fcd": 1.20703125, "loss/logits": 0.2865753024816513, "step": 438 }, { "epoch": 0.0013940048266226344, "grad_norm": 0.31640625, "grad_norm_var": 0.0009383519490559895, "learning_rate": 0.01, "loss": 1.5244, "loss/crossentropy": 2.4194256067276, "loss/fcd": 1.13671875, "loss/logits": 0.2703660875558853, "step": 439 }, { "epoch": 0.0013971802362504762, "grad_norm": 0.3203125, "grad_norm_var": 0.0004798730214436849, "learning_rate": 0.01, "loss": 1.4094, "loss/crossentropy": 2.0009674429893494, "loss/fcd": 1.05859375, "loss/logits": 0.23895444720983505, "step": 440 }, { "epoch": 0.0014003556458783183, "grad_norm": 0.298828125, "grad_norm_var": 0.0005145867665608724, "learning_rate": 0.01, "loss": 1.4214, "loss/crossentropy": 2.3764067888259888, "loss/fcd": 1.15234375, "loss/logits": 0.2504069581627846, "step": 441 }, { "epoch": 0.0014035310555061604, "grad_norm": 0.341796875, "grad_norm_var": 0.00052032470703125, "learning_rate": 0.01, "loss": 1.461, "loss/crossentropy": 2.4465948343276978, "loss/fcd": 1.11328125, "loss/logits": 0.2404251992702484, "step": 442 }, { "epoch": 0.0014067064651340022, "grad_norm": 0.34765625, "grad_norm_var": 0.000557565689086914, "learning_rate": 0.01, "loss": 1.5415, "loss/crossentropy": 2.4751689434051514, "loss/fcd": 1.06640625, "loss/logits": 0.25208456814289093, "step": 443 }, { "epoch": 0.0014098818747618443, "grad_norm": 0.28515625, "grad_norm_var": 0.0004119237263997396, "learning_rate": 0.01, "loss": 1.3675, "loss/crossentropy": 2.397505283355713, "loss/fcd": 1.1484375, "loss/logits": 0.27833351492881775, "step": 444 }, { "epoch": 0.0014130572843896862, "grad_norm": 0.337890625, "grad_norm_var": 0.0004352410634358724, "learning_rate": 0.01, "loss": 1.4534, "loss/crossentropy": 2.5381107330322266, "loss/fcd": 1.13671875, "loss/logits": 0.26634952425956726, "step": 445 }, { "epoch": 0.0014162326940175282, "grad_norm": 0.41796875, "grad_norm_var": 0.00104063351949056, "learning_rate": 0.01, "loss": 1.3842, "loss/crossentropy": 2.5989311933517456, "loss/fcd": 1.25, "loss/logits": 0.2924342602491379, "step": 446 }, { "epoch": 0.0014194081036453703, "grad_norm": 0.35546875, "grad_norm_var": 0.0010970433553059895, "learning_rate": 0.01, "loss": 1.4464, "loss/crossentropy": 2.703127384185791, "loss/fcd": 1.22265625, "loss/logits": 0.27995145320892334, "step": 447 }, { "epoch": 0.0014225835132732122, "grad_norm": 0.3671875, "grad_norm_var": 0.001174020767211914, "learning_rate": 0.01, "loss": 1.5287, "loss/crossentropy": 2.8878402709960938, "loss/fcd": 1.234375, "loss/logits": 0.29731544852256775, "step": 448 }, { "epoch": 0.0014257589229010543, "grad_norm": 0.298828125, "grad_norm_var": 0.0011564731597900391, "learning_rate": 0.01, "loss": 1.4411, "loss/crossentropy": 2.336888313293457, "loss/fcd": 1.12890625, "loss/logits": 0.26908518373966217, "step": 449 }, { "epoch": 0.0014289343325288963, "grad_norm": 0.30859375, "grad_norm_var": 0.0011438846588134766, "learning_rate": 0.01, "loss": 1.4436, "loss/crossentropy": 2.5269131660461426, "loss/fcd": 1.15234375, "loss/logits": 0.24337883293628693, "step": 450 }, { "epoch": 0.0014321097421567382, "grad_norm": 0.31640625, "grad_norm_var": 0.0011438846588134766, "learning_rate": 0.01, "loss": 1.4582, "loss/crossentropy": 2.3824414014816284, "loss/fcd": 1.13671875, "loss/logits": 0.2648303210735321, "step": 451 }, { "epoch": 0.0014352851517845803, "grad_norm": 0.408203125, "grad_norm_var": 0.0014233748118082682, "learning_rate": 0.01, "loss": 1.5877, "loss/crossentropy": 2.7260658740997314, "loss/fcd": 1.16015625, "loss/logits": 0.27970214933156967, "step": 452 }, { "epoch": 0.001438460561412422, "grad_norm": 0.34765625, "grad_norm_var": 0.0013921101888020833, "learning_rate": 0.01, "loss": 1.4809, "loss/crossentropy": 2.473803997039795, "loss/fcd": 1.28515625, "loss/logits": 0.34300902485847473, "step": 453 }, { "epoch": 0.0014416359710402642, "grad_norm": 0.357421875, "grad_norm_var": 0.0014006932576497396, "learning_rate": 0.01, "loss": 1.5214, "loss/crossentropy": 2.6857175827026367, "loss/fcd": 1.1875, "loss/logits": 0.28329257667064667, "step": 454 }, { "epoch": 0.0014448113806681063, "grad_norm": 0.30859375, "grad_norm_var": 0.0014281590779622396, "learning_rate": 0.01, "loss": 1.3475, "loss/crossentropy": 2.447817802429199, "loss/fcd": 1.109375, "loss/logits": 0.26159919798374176, "step": 455 }, { "epoch": 0.001447986790295948, "grad_norm": 0.30078125, "grad_norm_var": 0.0014996846516927083, "learning_rate": 0.01, "loss": 1.3775, "loss/crossentropy": 2.558703303337097, "loss/fcd": 1.2109375, "loss/logits": 0.283077210187912, "step": 456 }, { "epoch": 0.0014511621999237902, "grad_norm": 0.35546875, "grad_norm_var": 0.0014088789621988932, "learning_rate": 0.01, "loss": 1.5396, "loss/crossentropy": 2.5725014209747314, "loss/fcd": 1.32421875, "loss/logits": 0.33682313561439514, "step": 457 }, { "epoch": 0.0014543376095516323, "grad_norm": 0.30859375, "grad_norm_var": 0.0014739990234375, "learning_rate": 0.01, "loss": 1.4163, "loss/crossentropy": 2.5333362817764282, "loss/fcd": 1.16015625, "loss/logits": 0.27569472044706345, "step": 458 }, { "epoch": 0.001457513019179474, "grad_norm": 0.30859375, "grad_norm_var": 0.001523590087890625, "learning_rate": 0.01, "loss": 1.4, "loss/crossentropy": 2.6462429761886597, "loss/fcd": 1.1328125, "loss/logits": 0.25457239151000977, "step": 459 }, { "epoch": 0.0014606884288073162, "grad_norm": 0.3828125, "grad_norm_var": 0.0014520645141601562, "learning_rate": 0.01, "loss": 1.5071, "loss/crossentropy": 2.0781235694885254, "loss/fcd": 1.0625, "loss/logits": 0.24288621544837952, "step": 460 }, { "epoch": 0.001463863838435158, "grad_norm": 0.310546875, "grad_norm_var": 0.0015157063802083333, "learning_rate": 0.01, "loss": 1.4779, "loss/crossentropy": 2.3874698877334595, "loss/fcd": 1.23046875, "loss/logits": 0.2746141031384468, "step": 461 }, { "epoch": 0.001467039248063, "grad_norm": 0.298828125, "grad_norm_var": 0.0011773268381754557, "learning_rate": 0.01, "loss": 1.4227, "loss/crossentropy": 2.5566097497940063, "loss/fcd": 1.09765625, "loss/logits": 0.24260970950126648, "step": 462 }, { "epoch": 0.0014702146576908422, "grad_norm": 0.330078125, "grad_norm_var": 0.0011428197224934895, "learning_rate": 0.01, "loss": 1.4854, "loss/crossentropy": 2.649473786354065, "loss/fcd": 1.296875, "loss/logits": 0.33814525604248047, "step": 463 }, { "epoch": 0.001473390067318684, "grad_norm": 0.3359375, "grad_norm_var": 0.001056353251139323, "learning_rate": 0.01, "loss": 1.4967, "loss/crossentropy": 2.744623064994812, "loss/fcd": 1.23828125, "loss/logits": 0.30253274738788605, "step": 464 }, { "epoch": 0.001476565476946526, "grad_norm": 0.3125, "grad_norm_var": 0.0010115146636962891, "learning_rate": 0.01, "loss": 1.4366, "loss/crossentropy": 2.663595676422119, "loss/fcd": 1.140625, "loss/logits": 0.25435324013233185, "step": 465 }, { "epoch": 0.0014797408865743682, "grad_norm": 0.326171875, "grad_norm_var": 0.0009790420532226562, "learning_rate": 0.01, "loss": 1.4308, "loss/crossentropy": 2.4967721700668335, "loss/fcd": 1.20703125, "loss/logits": 0.3036217838525772, "step": 466 }, { "epoch": 0.00148291629620221, "grad_norm": 0.3125, "grad_norm_var": 0.000988006591796875, "learning_rate": 0.01, "loss": 1.4003, "loss/crossentropy": 2.489341616630554, "loss/fcd": 1.171875, "loss/logits": 0.2605610340833664, "step": 467 }, { "epoch": 0.001486091705830052, "grad_norm": 0.3359375, "grad_norm_var": 0.0005757490793863933, "learning_rate": 0.01, "loss": 1.4721, "loss/crossentropy": 2.4876623153686523, "loss/fcd": 1.1640625, "loss/logits": 0.27536633610725403, "step": 468 }, { "epoch": 0.001489267115457894, "grad_norm": 0.330078125, "grad_norm_var": 0.0005467096964518229, "learning_rate": 0.01, "loss": 1.4804, "loss/crossentropy": 2.558952212333679, "loss/fcd": 1.1796875, "loss/logits": 0.2956201583147049, "step": 469 }, { "epoch": 0.001492442525085736, "grad_norm": 0.39453125, "grad_norm_var": 0.0007886091868082682, "learning_rate": 0.01, "loss": 1.5204, "loss/crossentropy": 2.3994314670562744, "loss/fcd": 1.14453125, "loss/logits": 0.25016407668590546, "step": 470 }, { "epoch": 0.001495617934713578, "grad_norm": 0.33203125, "grad_norm_var": 0.0007615248362223307, "learning_rate": 0.01, "loss": 1.4803, "loss/crossentropy": 2.4502326250076294, "loss/fcd": 1.21875, "loss/logits": 0.2681497782468796, "step": 471 }, { "epoch": 0.00149879334434142, "grad_norm": 0.337890625, "grad_norm_var": 0.0007044474283854166, "learning_rate": 0.01, "loss": 1.3994, "loss/crossentropy": 2.5294952392578125, "loss/fcd": 1.1640625, "loss/logits": 0.24464938044548035, "step": 472 }, { "epoch": 0.001501968753969262, "grad_norm": 0.328125, "grad_norm_var": 0.000665728251139323, "learning_rate": 0.01, "loss": 1.4051, "loss/crossentropy": 2.5350325107574463, "loss/fcd": 1.19140625, "loss/logits": 0.27912886440753937, "step": 473 }, { "epoch": 0.001505144163597104, "grad_norm": 0.34765625, "grad_norm_var": 0.0006479263305664062, "learning_rate": 0.01, "loss": 1.5036, "loss/crossentropy": 2.3504234552383423, "loss/fcd": 1.18359375, "loss/logits": 0.2695607841014862, "step": 474 }, { "epoch": 0.001508319573224946, "grad_norm": 0.302734375, "grad_norm_var": 0.0006689548492431641, "learning_rate": 0.01, "loss": 1.411, "loss/crossentropy": 2.317971110343933, "loss/fcd": 1.08203125, "loss/logits": 0.2737869620323181, "step": 475 }, { "epoch": 0.001511494982852788, "grad_norm": 0.333984375, "grad_norm_var": 0.0004897435506184896, "learning_rate": 0.01, "loss": 1.4918, "loss/crossentropy": 2.261656165122986, "loss/fcd": 1.1484375, "loss/logits": 0.24204014986753464, "step": 476 }, { "epoch": 0.00151467039248063, "grad_norm": 0.3125, "grad_norm_var": 0.00048508644104003904, "learning_rate": 0.01, "loss": 1.386, "loss/crossentropy": 2.424006700515747, "loss/fcd": 1.125, "loss/logits": 0.277113139629364, "step": 477 }, { "epoch": 0.001517845802108472, "grad_norm": 0.337890625, "grad_norm_var": 0.0004208723704020182, "learning_rate": 0.01, "loss": 1.4474, "loss/crossentropy": 2.6235703229904175, "loss/fcd": 1.16796875, "loss/logits": 0.2773337587714195, "step": 478 }, { "epoch": 0.001521021211736314, "grad_norm": 0.326171875, "grad_norm_var": 0.0004227797190348307, "learning_rate": 0.01, "loss": 1.4165, "loss/crossentropy": 2.522370934486389, "loss/fcd": 1.09375, "loss/logits": 0.2516328915953636, "step": 479 }, { "epoch": 0.0015241966213641559, "grad_norm": 0.30859375, "grad_norm_var": 0.0004539330800374349, "learning_rate": 0.01, "loss": 1.4198, "loss/crossentropy": 2.6058274507522583, "loss/fcd": 1.16015625, "loss/logits": 0.285677969455719, "step": 480 }, { "epoch": 0.001527372030991998, "grad_norm": 0.314453125, "grad_norm_var": 0.00044962565104166664, "learning_rate": 0.01, "loss": 1.4088, "loss/crossentropy": 2.4809470176696777, "loss/fcd": 1.15234375, "loss/logits": 0.2481013536453247, "step": 481 }, { "epoch": 0.00153054744061984, "grad_norm": 0.322265625, "grad_norm_var": 0.00045261383056640627, "learning_rate": 0.01, "loss": 1.4197, "loss/crossentropy": 2.798996329307556, "loss/fcd": 1.1875, "loss/logits": 0.294664591550827, "step": 482 }, { "epoch": 0.0015337228502476819, "grad_norm": 0.703125, "grad_norm_var": 0.009086545308430989, "learning_rate": 0.01, "loss": 1.5146, "loss/crossentropy": 2.822615146636963, "loss/fcd": 1.2109375, "loss/logits": 0.281253382563591, "step": 483 }, { "epoch": 0.001536898259875524, "grad_norm": 0.34375, "grad_norm_var": 0.009071286519368489, "learning_rate": 0.01, "loss": 1.4699, "loss/crossentropy": 2.71571147441864, "loss/fcd": 1.125, "loss/logits": 0.2605608403682709, "step": 484 }, { "epoch": 0.001540073669503366, "grad_norm": 0.318359375, "grad_norm_var": 0.009118398030598959, "learning_rate": 0.01, "loss": 1.4369, "loss/crossentropy": 2.7621062994003296, "loss/fcd": 1.25, "loss/logits": 0.3083229064941406, "step": 485 }, { "epoch": 0.0015432490791312079, "grad_norm": 0.337890625, "grad_norm_var": 0.009012842178344726, "learning_rate": 0.01, "loss": 1.418, "loss/crossentropy": 2.4629874229431152, "loss/fcd": 1.15625, "loss/logits": 0.27178558707237244, "step": 486 }, { "epoch": 0.00154642448875905, "grad_norm": 0.361328125, "grad_norm_var": 0.008994483947753906, "learning_rate": 0.01, "loss": 1.4466, "loss/crossentropy": 2.4104692935943604, "loss/fcd": 1.1875, "loss/logits": 0.3030923306941986, "step": 487 }, { "epoch": 0.0015495998983868918, "grad_norm": 0.318359375, "grad_norm_var": 0.009055836995442709, "learning_rate": 0.01, "loss": 1.3757, "loss/crossentropy": 2.294907569885254, "loss/fcd": 1.125, "loss/logits": 0.24228475242853165, "step": 488 }, { "epoch": 0.0015527753080147339, "grad_norm": 0.306640625, "grad_norm_var": 0.009150425593058268, "learning_rate": 0.01, "loss": 1.4264, "loss/crossentropy": 2.353929281234741, "loss/fcd": 1.10546875, "loss/logits": 0.26592618972063065, "step": 489 }, { "epoch": 0.001555950717642576, "grad_norm": 0.298828125, "grad_norm_var": 0.009312947591145834, "learning_rate": 0.01, "loss": 1.4303, "loss/crossentropy": 2.675679564476013, "loss/fcd": 1.1796875, "loss/logits": 0.26250655949115753, "step": 490 }, { "epoch": 0.0015591261272704178, "grad_norm": 0.337890625, "grad_norm_var": 0.00918420155843099, "learning_rate": 0.01, "loss": 1.4183, "loss/crossentropy": 2.7257109880447388, "loss/fcd": 1.1953125, "loss/logits": 0.2816329747438431, "step": 491 }, { "epoch": 0.0015623015368982599, "grad_norm": 0.302734375, "grad_norm_var": 0.009307289123535156, "learning_rate": 0.01, "loss": 1.3896, "loss/crossentropy": 2.630375623703003, "loss/fcd": 1.12109375, "loss/logits": 0.24010887742042542, "step": 492 }, { "epoch": 0.001565476946526102, "grad_norm": 0.333984375, "grad_norm_var": 0.009237527847290039, "learning_rate": 0.01, "loss": 1.4368, "loss/crossentropy": 2.5104730129241943, "loss/fcd": 1.140625, "loss/logits": 0.2663855701684952, "step": 493 }, { "epoch": 0.0015686523561539438, "grad_norm": 0.326171875, "grad_norm_var": 0.009262323379516602, "learning_rate": 0.01, "loss": 1.4557, "loss/crossentropy": 2.551353931427002, "loss/fcd": 1.23046875, "loss/logits": 0.2552918493747711, "step": 494 }, { "epoch": 0.0015718277657817859, "grad_norm": 0.3203125, "grad_norm_var": 0.009281158447265625, "learning_rate": 0.01, "loss": 1.467, "loss/crossentropy": 2.8752095699310303, "loss/fcd": 1.26953125, "loss/logits": 0.32157447934150696, "step": 495 }, { "epoch": 0.0015750031754096277, "grad_norm": 0.345703125, "grad_norm_var": 0.009176365534464518, "learning_rate": 0.01, "loss": 1.4466, "loss/crossentropy": 2.2843401432037354, "loss/fcd": 1.1484375, "loss/logits": 0.26545679569244385, "step": 496 }, { "epoch": 0.0015781785850374698, "grad_norm": 0.3359375, "grad_norm_var": 0.00910485585530599, "learning_rate": 0.01, "loss": 1.4711, "loss/crossentropy": 2.8034032583236694, "loss/fcd": 1.1875, "loss/logits": 0.26386207342147827, "step": 497 }, { "epoch": 0.0015813539946653119, "grad_norm": 0.3125, "grad_norm_var": 0.009148009618123372, "learning_rate": 0.01, "loss": 1.3725, "loss/crossentropy": 2.6164721250534058, "loss/fcd": 1.09765625, "loss/logits": 0.2445560097694397, "step": 498 }, { "epoch": 0.0015845294042931537, "grad_norm": 0.318359375, "grad_norm_var": 0.0002960205078125, "learning_rate": 0.01, "loss": 1.5089, "loss/crossentropy": 2.7629817724227905, "loss/fcd": 1.2734375, "loss/logits": 0.3092408627271652, "step": 499 }, { "epoch": 0.0015877048139209958, "grad_norm": 0.294921875, "grad_norm_var": 0.00033059120178222654, "learning_rate": 0.01, "loss": 1.4078, "loss/crossentropy": 2.631264328956604, "loss/fcd": 1.16796875, "loss/logits": 0.2838464677333832, "step": 500 }, { "epoch": 0.0015908802235488379, "grad_norm": 0.33203125, "grad_norm_var": 0.00033359527587890626, "learning_rate": 0.01, "loss": 1.4193, "loss/crossentropy": 2.6747204065322876, "loss/fcd": 1.1171875, "loss/logits": 0.24447203427553177, "step": 501 }, { "epoch": 0.0015940556331766797, "grad_norm": 0.359375, "grad_norm_var": 0.0004023075103759766, "learning_rate": 0.01, "loss": 1.4351, "loss/crossentropy": 2.4404605627059937, "loss/fcd": 1.12109375, "loss/logits": 0.2514181584119797, "step": 502 }, { "epoch": 0.0015972310428045218, "grad_norm": 0.30859375, "grad_norm_var": 0.00032291412353515627, "learning_rate": 0.01, "loss": 1.4454, "loss/crossentropy": 1.9665863513946533, "loss/fcd": 1.11328125, "loss/logits": 0.2470186948776245, "step": 503 }, { "epoch": 0.0016004064524323637, "grad_norm": 0.302734375, "grad_norm_var": 0.00034580230712890627, "learning_rate": 0.01, "loss": 1.4621, "loss/crossentropy": 2.3390400409698486, "loss/fcd": 1.21875, "loss/logits": 0.27564719319343567, "step": 504 }, { "epoch": 0.0016035818620602057, "grad_norm": 0.337890625, "grad_norm_var": 0.00034681955973307293, "learning_rate": 0.01, "loss": 1.4602, "loss/crossentropy": 2.573357105255127, "loss/fcd": 1.1875, "loss/logits": 0.28885987401008606, "step": 505 }, { "epoch": 0.0016067572716880478, "grad_norm": 0.345703125, "grad_norm_var": 0.0003330866495768229, "learning_rate": 0.01, "loss": 1.4615, "loss/crossentropy": 2.5696252584457397, "loss/fcd": 1.2109375, "loss/logits": 0.2851059287786484, "step": 506 }, { "epoch": 0.0016099326813158897, "grad_norm": 0.271484375, "grad_norm_var": 0.000502777099609375, "learning_rate": 0.01, "loss": 1.3481, "loss/crossentropy": 2.5426188707351685, "loss/fcd": 1.09765625, "loss/logits": 0.2500939592719078, "step": 507 }, { "epoch": 0.0016131080909437317, "grad_norm": 0.296875, "grad_norm_var": 0.0005198001861572265, "learning_rate": 0.01, "loss": 1.4045, "loss/crossentropy": 2.681436061859131, "loss/fcd": 1.11328125, "loss/logits": 0.2324848249554634, "step": 508 }, { "epoch": 0.0016162835005715738, "grad_norm": 1.3046875, "grad_norm_var": 0.06103865305582682, "learning_rate": 0.01, "loss": 1.4696, "loss/crossentropy": 2.347053289413452, "loss/fcd": 1.12109375, "loss/logits": 0.24349500238895416, "step": 509 }, { "epoch": 0.0016194589101994157, "grad_norm": 0.94921875, "grad_norm_var": 0.08065590858459473, "learning_rate": 0.01, "loss": 1.4381, "loss/crossentropy": 2.7159606218338013, "loss/fcd": 1.18359375, "loss/logits": 0.27338050305843353, "step": 510 }, { "epoch": 0.0016226343198272577, "grad_norm": 0.42578125, "grad_norm_var": 0.07993493080139161, "learning_rate": 0.01, "loss": 1.4496, "loss/crossentropy": 2.604248881340027, "loss/fcd": 1.28125, "loss/logits": 0.32629138231277466, "step": 511 }, { "epoch": 0.0016258097294550996, "grad_norm": 0.34375, "grad_norm_var": 0.07995649973551432, "learning_rate": 0.01, "loss": 1.4472, "loss/crossentropy": 2.6356743574142456, "loss/fcd": 1.2109375, "loss/logits": 0.2862953841686249, "step": 512 }, { "epoch": 0.0016289851390829417, "grad_norm": 0.34375, "grad_norm_var": 0.07986494700113932, "learning_rate": 0.01, "loss": 1.5216, "loss/crossentropy": 2.7528648376464844, "loss/fcd": 1.24609375, "loss/logits": 0.3076392263174057, "step": 513 }, { "epoch": 0.0016321605487107837, "grad_norm": 0.34375, "grad_norm_var": 0.07944482167561849, "learning_rate": 0.01, "loss": 1.4737, "loss/crossentropy": 2.4589303731918335, "loss/fcd": 1.15234375, "loss/logits": 0.26726794242858887, "step": 514 }, { "epoch": 0.0016353359583386256, "grad_norm": 0.298828125, "grad_norm_var": 0.07975921630859376, "learning_rate": 0.01, "loss": 1.3529, "loss/crossentropy": 2.2944650650024414, "loss/fcd": 1.09765625, "loss/logits": 0.24623721837997437, "step": 515 }, { "epoch": 0.0016385113679664677, "grad_norm": 0.365234375, "grad_norm_var": 0.07881393432617187, "learning_rate": 0.01, "loss": 1.4962, "loss/crossentropy": 2.560919404029846, "loss/fcd": 1.26953125, "loss/logits": 0.2942315936088562, "step": 516 }, { "epoch": 0.0016416867775943097, "grad_norm": 0.291015625, "grad_norm_var": 0.07947182655334473, "learning_rate": 0.01, "loss": 1.3961, "loss/crossentropy": 2.5325158834457397, "loss/fcd": 1.11328125, "loss/logits": 0.25655095279216766, "step": 517 }, { "epoch": 0.0016448621872221516, "grad_norm": 0.384765625, "grad_norm_var": 0.0792711893717448, "learning_rate": 0.01, "loss": 1.487, "loss/crossentropy": 2.7486977577209473, "loss/fcd": 1.25, "loss/logits": 0.27564749121665955, "step": 518 }, { "epoch": 0.0016480375968499937, "grad_norm": 0.341796875, "grad_norm_var": 0.07879319190979003, "learning_rate": 0.01, "loss": 1.4829, "loss/crossentropy": 2.573593854904175, "loss/fcd": 1.2265625, "loss/logits": 0.2879941165447235, "step": 519 }, { "epoch": 0.0016512130064778357, "grad_norm": 0.3125, "grad_norm_var": 0.07862796783447265, "learning_rate": 0.01, "loss": 1.4226, "loss/crossentropy": 2.4571280479431152, "loss/fcd": 1.1328125, "loss/logits": 0.2627527117729187, "step": 520 }, { "epoch": 0.0016543884161056776, "grad_norm": 0.345703125, "grad_norm_var": 0.07853082021077475, "learning_rate": 0.01, "loss": 1.4866, "loss/crossentropy": 2.75668728351593, "loss/fcd": 1.171875, "loss/logits": 0.2633800208568573, "step": 521 }, { "epoch": 0.0016575638257335197, "grad_norm": 0.369140625, "grad_norm_var": 0.07828515370686849, "learning_rate": 0.01, "loss": 1.4964, "loss/crossentropy": 2.5602306127548218, "loss/fcd": 1.20703125, "loss/logits": 0.288630411028862, "step": 522 }, { "epoch": 0.0016607392353613615, "grad_norm": 0.34765625, "grad_norm_var": 0.07696913083394369, "learning_rate": 0.01, "loss": 1.4636, "loss/crossentropy": 2.863381028175354, "loss/fcd": 1.2578125, "loss/logits": 0.2976280450820923, "step": 523 }, { "epoch": 0.0016639146449892036, "grad_norm": 0.291015625, "grad_norm_var": 0.07708428700764974, "learning_rate": 0.01, "loss": 1.3519, "loss/crossentropy": 2.6331528425216675, "loss/fcd": 1.10546875, "loss/logits": 0.22287855297327042, "step": 524 }, { "epoch": 0.0016670900546170457, "grad_norm": 0.326171875, "grad_norm_var": 0.02426451047261556, "learning_rate": 0.01, "loss": 1.4223, "loss/crossentropy": 2.852648377418518, "loss/fcd": 1.15625, "loss/logits": 0.2766903191804886, "step": 525 }, { "epoch": 0.0016702654642448875, "grad_norm": 0.314453125, "grad_norm_var": 0.001271820068359375, "learning_rate": 0.01, "loss": 1.4903, "loss/crossentropy": 2.2694883346557617, "loss/fcd": 1.11328125, "loss/logits": 0.25743845105171204, "step": 526 }, { "epoch": 0.0016734408738727296, "grad_norm": 0.390625, "grad_norm_var": 0.0009485244750976563, "learning_rate": 0.01, "loss": 1.4267, "loss/crossentropy": 2.4987694025039673, "loss/fcd": 1.1796875, "loss/logits": 0.25972025096416473, "step": 527 }, { "epoch": 0.0016766162835005717, "grad_norm": 0.8046875, "grad_norm_var": 0.014572588602701823, "learning_rate": 0.01, "loss": 1.4029, "loss/crossentropy": 2.7057249546051025, "loss/fcd": 1.1640625, "loss/logits": 0.2593592405319214, "step": 528 }, { "epoch": 0.0016797916931284135, "grad_norm": 0.423828125, "grad_norm_var": 0.014725732803344726, "learning_rate": 0.01, "loss": 1.5172, "loss/crossentropy": 2.707273006439209, "loss/fcd": 1.13671875, "loss/logits": 0.2673468589782715, "step": 529 }, { "epoch": 0.0016829671027562556, "grad_norm": 0.396484375, "grad_norm_var": 0.014701271057128906, "learning_rate": 0.01, "loss": 1.4816, "loss/crossentropy": 2.828641414642334, "loss/fcd": 1.2578125, "loss/logits": 0.310713455080986, "step": 530 }, { "epoch": 0.0016861425123840975, "grad_norm": 0.375, "grad_norm_var": 0.014287805557250977, "learning_rate": 0.01, "loss": 1.4477, "loss/crossentropy": 2.784609794616699, "loss/fcd": 1.21484375, "loss/logits": 0.276685893535614, "step": 531 }, { "epoch": 0.0016893179220119395, "grad_norm": 0.373046875, "grad_norm_var": 0.014276234308878581, "learning_rate": 0.01, "loss": 1.5054, "loss/crossentropy": 2.5702160596847534, "loss/fcd": 1.30078125, "loss/logits": 0.36388683319091797, "step": 532 }, { "epoch": 0.0016924933316397816, "grad_norm": 0.41796875, "grad_norm_var": 0.013768959045410156, "learning_rate": 0.01, "loss": 1.5418, "loss/crossentropy": 2.8089823722839355, "loss/fcd": 1.23046875, "loss/logits": 0.2930210679769516, "step": 533 }, { "epoch": 0.0016956687412676235, "grad_norm": 0.361328125, "grad_norm_var": 0.013814735412597656, "learning_rate": 0.01, "loss": 1.501, "loss/crossentropy": 2.4470736980438232, "loss/fcd": 1.31640625, "loss/logits": 0.332662433385849, "step": 534 }, { "epoch": 0.0016988441508954655, "grad_norm": 0.314453125, "grad_norm_var": 0.014026133219401042, "learning_rate": 0.01, "loss": 1.3948, "loss/crossentropy": 2.675025701522827, "loss/fcd": 1.1796875, "loss/logits": 0.2879149913787842, "step": 535 }, { "epoch": 0.0017020195605233076, "grad_norm": 0.353515625, "grad_norm_var": 0.013733402887980143, "learning_rate": 0.01, "loss": 1.4171, "loss/crossentropy": 2.5031471252441406, "loss/fcd": 1.1171875, "loss/logits": 0.24770331382751465, "step": 536 }, { "epoch": 0.0017051949701511495, "grad_norm": 0.3515625, "grad_norm_var": 0.013702646891276041, "learning_rate": 0.01, "loss": 1.4343, "loss/crossentropy": 2.68326735496521, "loss/fcd": 1.17578125, "loss/logits": 0.2794763445854187, "step": 537 }, { "epoch": 0.0017083703797789915, "grad_norm": 0.3203125, "grad_norm_var": 0.01397563616434733, "learning_rate": 0.01, "loss": 1.4218, "loss/crossentropy": 2.5284366607666016, "loss/fcd": 1.1640625, "loss/logits": 0.2877808213233948, "step": 538 }, { "epoch": 0.0017115457894068334, "grad_norm": 0.333984375, "grad_norm_var": 0.014055633544921875, "learning_rate": 0.01, "loss": 1.4537, "loss/crossentropy": 2.5142822265625, "loss/fcd": 1.14453125, "loss/logits": 0.2755347788333893, "step": 539 }, { "epoch": 0.0017147211990346755, "grad_norm": 0.30078125, "grad_norm_var": 0.013940159479777019, "learning_rate": 0.01, "loss": 1.4063, "loss/crossentropy": 2.613142251968384, "loss/fcd": 1.14453125, "loss/logits": 0.25680992007255554, "step": 540 }, { "epoch": 0.0017178966086625175, "grad_norm": 0.357421875, "grad_norm_var": 0.013756545384724934, "learning_rate": 0.01, "loss": 1.4218, "loss/crossentropy": 2.484408974647522, "loss/fcd": 1.11328125, "loss/logits": 0.23990514874458313, "step": 541 }, { "epoch": 0.0017210720182903594, "grad_norm": 0.361328125, "grad_norm_var": 0.01344145139058431, "learning_rate": 0.01, "loss": 1.4771, "loss/crossentropy": 2.6911842823028564, "loss/fcd": 1.17578125, "loss/logits": 0.29066312313079834, "step": 542 }, { "epoch": 0.0017242474279182015, "grad_norm": 0.29296875, "grad_norm_var": 0.014026371637980144, "learning_rate": 0.01, "loss": 1.4113, "loss/crossentropy": 2.5860735177993774, "loss/fcd": 1.0703125, "loss/logits": 0.2284604012966156, "step": 543 }, { "epoch": 0.0017274228375460435, "grad_norm": 0.306640625, "grad_norm_var": 0.0015711466471354167, "learning_rate": 0.01, "loss": 1.4392, "loss/crossentropy": 2.6202419996261597, "loss/fcd": 1.2109375, "loss/logits": 0.2744522839784622, "step": 544 }, { "epoch": 0.0017305982471738854, "grad_norm": 0.34375, "grad_norm_var": 0.0012107690175374348, "learning_rate": 0.01, "loss": 1.4582, "loss/crossentropy": 2.581278920173645, "loss/fcd": 1.1484375, "loss/logits": 0.25072699785232544, "step": 545 }, { "epoch": 0.0017337736568017275, "grad_norm": 0.3125, "grad_norm_var": 0.0011034647623697916, "learning_rate": 0.01, "loss": 1.4697, "loss/crossentropy": 2.24593323469162, "loss/fcd": 1.140625, "loss/logits": 0.272368960082531, "step": 546 }, { "epoch": 0.0017369490664295693, "grad_norm": 0.3203125, "grad_norm_var": 0.0010518391927083334, "learning_rate": 0.01, "loss": 1.4589, "loss/crossentropy": 2.5771596431732178, "loss/fcd": 1.1328125, "loss/logits": 0.2503276988863945, "step": 547 }, { "epoch": 0.0017401244760574114, "grad_norm": 0.294921875, "grad_norm_var": 0.0010772705078125, "learning_rate": 0.01, "loss": 1.3908, "loss/crossentropy": 2.4931938648223877, "loss/fcd": 1.14453125, "loss/logits": 0.2743004411458969, "step": 548 }, { "epoch": 0.0017432998856852535, "grad_norm": 0.359375, "grad_norm_var": 0.0006357192993164063, "learning_rate": 0.01, "loss": 1.4196, "loss/crossentropy": 2.6194196939468384, "loss/fcd": 1.16796875, "loss/logits": 0.25608105957508087, "step": 549 }, { "epoch": 0.0017464752953130953, "grad_norm": 0.32421875, "grad_norm_var": 0.0005683739980061849, "learning_rate": 0.01, "loss": 1.408, "loss/crossentropy": 2.6443508863449097, "loss/fcd": 1.1796875, "loss/logits": 0.2798755019903183, "step": 550 }, { "epoch": 0.0017496507049409374, "grad_norm": 0.314453125, "grad_norm_var": 0.0005683739980061849, "learning_rate": 0.01, "loss": 1.4003, "loss/crossentropy": 2.4457504749298096, "loss/fcd": 1.16015625, "loss/logits": 0.2776547968387604, "step": 551 }, { "epoch": 0.0017528261145687795, "grad_norm": 0.314453125, "grad_norm_var": 0.000530862808227539, "learning_rate": 0.01, "loss": 1.3963, "loss/crossentropy": 2.410616159439087, "loss/fcd": 1.109375, "loss/logits": 0.25483644008636475, "step": 552 }, { "epoch": 0.0017560015241966213, "grad_norm": 0.2890625, "grad_norm_var": 0.0005583286285400391, "learning_rate": 0.01, "loss": 1.3719, "loss/crossentropy": 2.743094325065613, "loss/fcd": 1.15625, "loss/logits": 0.2586950957775116, "step": 553 }, { "epoch": 0.0017591769338244634, "grad_norm": 0.294921875, "grad_norm_var": 0.0006031672159830729, "learning_rate": 0.01, "loss": 1.4165, "loss/crossentropy": 2.5197595357894897, "loss/fcd": 1.1171875, "loss/logits": 0.2635814994573593, "step": 554 }, { "epoch": 0.0017623523434523052, "grad_norm": 0.333984375, "grad_norm_var": 0.0006031672159830729, "learning_rate": 0.01, "loss": 1.4253, "loss/crossentropy": 2.327980160713196, "loss/fcd": 1.09765625, "loss/logits": 0.25148363411426544, "step": 555 }, { "epoch": 0.0017655277530801473, "grad_norm": 0.357421875, "grad_norm_var": 0.0006580193837483724, "learning_rate": 0.01, "loss": 1.4837, "loss/crossentropy": 2.6026593446731567, "loss/fcd": 1.2421875, "loss/logits": 0.31184278428554535, "step": 556 }, { "epoch": 0.0017687031627079894, "grad_norm": 0.326171875, "grad_norm_var": 0.0005781650543212891, "learning_rate": 0.01, "loss": 1.517, "loss/crossentropy": 2.6219998598098755, "loss/fcd": 1.30078125, "loss/logits": 0.31159134209156036, "step": 557 }, { "epoch": 0.0017718785723358312, "grad_norm": 0.337890625, "grad_norm_var": 0.0004885196685791016, "learning_rate": 0.01, "loss": 1.4981, "loss/crossentropy": 2.541996479034424, "loss/fcd": 1.1796875, "loss/logits": 0.2920401245355606, "step": 558 }, { "epoch": 0.0017750539819636733, "grad_norm": 0.3125, "grad_norm_var": 0.00044147173563639325, "learning_rate": 0.01, "loss": 1.4294, "loss/crossentropy": 2.537493109703064, "loss/fcd": 1.18359375, "loss/logits": 0.2771601378917694, "step": 559 }, { "epoch": 0.0017782293915915154, "grad_norm": 0.357421875, "grad_norm_var": 0.000502634048461914, "learning_rate": 0.01, "loss": 1.4418, "loss/crossentropy": 2.2772059440612793, "loss/fcd": 1.0546875, "loss/logits": 0.21753863990306854, "step": 560 }, { "epoch": 0.0017814048012193572, "grad_norm": 0.357421875, "grad_norm_var": 0.0005492528279622395, "learning_rate": 0.01, "loss": 1.4031, "loss/crossentropy": 2.5261411666870117, "loss/fcd": 1.21484375, "loss/logits": 0.2889343798160553, "step": 561 }, { "epoch": 0.0017845802108471993, "grad_norm": 0.306640625, "grad_norm_var": 0.0005615075429280599, "learning_rate": 0.01, "loss": 1.4255, "loss/crossentropy": 2.7319823503494263, "loss/fcd": 1.140625, "loss/logits": 0.2600414454936981, "step": 562 }, { "epoch": 0.0017877556204750414, "grad_norm": 0.3046875, "grad_norm_var": 0.0005866845448811848, "learning_rate": 0.01, "loss": 1.374, "loss/crossentropy": 2.31773442029953, "loss/fcd": 1.09375, "loss/logits": 0.24347911775112152, "step": 563 }, { "epoch": 0.0017909310301028832, "grad_norm": 0.37109375, "grad_norm_var": 0.0006530125935872396, "learning_rate": 0.01, "loss": 1.4424, "loss/crossentropy": 2.505849838256836, "loss/fcd": 1.1640625, "loss/logits": 0.2696918398141861, "step": 564 }, { "epoch": 0.0017941064397307253, "grad_norm": 0.357421875, "grad_norm_var": 0.000645303726196289, "learning_rate": 0.01, "loss": 1.4955, "loss/crossentropy": 2.4948384761810303, "loss/fcd": 1.17578125, "loss/logits": 0.2777717113494873, "step": 565 }, { "epoch": 0.0017972818493585672, "grad_norm": 0.3828125, "grad_norm_var": 0.000824594497680664, "learning_rate": 0.01, "loss": 1.5011, "loss/crossentropy": 2.968953847885132, "loss/fcd": 1.31640625, "loss/logits": 0.3360586166381836, "step": 566 }, { "epoch": 0.0018004572589864092, "grad_norm": 0.31640625, "grad_norm_var": 0.000820159912109375, "learning_rate": 0.01, "loss": 1.4092, "loss/crossentropy": 2.330485701560974, "loss/fcd": 1.05859375, "loss/logits": 0.2367456927895546, "step": 567 }, { "epoch": 0.0018036326686142513, "grad_norm": 0.35546875, "grad_norm_var": 0.0008265018463134765, "learning_rate": 0.01, "loss": 1.467, "loss/crossentropy": 2.3054102659225464, "loss/fcd": 1.19921875, "loss/logits": 0.25663553178310394, "step": 568 }, { "epoch": 0.0018068080782420932, "grad_norm": 0.341796875, "grad_norm_var": 0.000676727294921875, "learning_rate": 0.01, "loss": 1.433, "loss/crossentropy": 2.4399633407592773, "loss/fcd": 1.09375, "loss/logits": 0.22539356350898743, "step": 569 }, { "epoch": 0.0018099834878699352, "grad_norm": 0.369140625, "grad_norm_var": 0.0005909601847330729, "learning_rate": 0.01, "loss": 1.5069, "loss/crossentropy": 2.59701406955719, "loss/fcd": 1.2734375, "loss/logits": 0.30646806955337524, "step": 570 }, { "epoch": 0.0018131588974977773, "grad_norm": 0.326171875, "grad_norm_var": 0.0006041844685872396, "learning_rate": 0.01, "loss": 1.4505, "loss/crossentropy": 2.680655598640442, "loss/fcd": 1.18359375, "loss/logits": 0.2872551530599594, "step": 571 }, { "epoch": 0.0018163343071256192, "grad_norm": 0.322265625, "grad_norm_var": 0.0006116231282552084, "learning_rate": 0.01, "loss": 1.3987, "loss/crossentropy": 2.4663596153259277, "loss/fcd": 1.1171875, "loss/logits": 0.24074437469244003, "step": 572 }, { "epoch": 0.0018195097167534612, "grad_norm": 0.333984375, "grad_norm_var": 0.0006006876627604167, "learning_rate": 0.01, "loss": 1.4039, "loss/crossentropy": 2.4291341304779053, "loss/fcd": 1.125, "loss/logits": 0.2391839101910591, "step": 573 }, { "epoch": 0.001822685126381303, "grad_norm": 0.30078125, "grad_norm_var": 0.0007012526194254558, "learning_rate": 0.01, "loss": 1.3876, "loss/crossentropy": 2.355897307395935, "loss/fcd": 1.1484375, "loss/logits": 0.25034745037555695, "step": 574 }, { "epoch": 0.0018258605360091452, "grad_norm": 0.318359375, "grad_norm_var": 0.0006830851236979167, "learning_rate": 0.01, "loss": 1.5194, "loss/crossentropy": 2.3032115697860718, "loss/fcd": 1.140625, "loss/logits": 0.23866456001996994, "step": 575 }, { "epoch": 0.0018290359456369872, "grad_norm": 0.30859375, "grad_norm_var": 0.0007112979888916016, "learning_rate": 0.01, "loss": 1.4578, "loss/crossentropy": 2.6512651443481445, "loss/fcd": 1.1875, "loss/logits": 0.29944591224193573, "step": 576 }, { "epoch": 0.001832211355264829, "grad_norm": 0.376953125, "grad_norm_var": 0.0007914066314697266, "learning_rate": 0.01, "loss": 1.43, "loss/crossentropy": 2.230044901371002, "loss/fcd": 1.16796875, "loss/logits": 0.24575766921043396, "step": 577 }, { "epoch": 0.0018353867648926712, "grad_norm": 0.349609375, "grad_norm_var": 0.0007326602935791016, "learning_rate": 0.01, "loss": 1.4654, "loss/crossentropy": 2.741807699203491, "loss/fcd": 1.19921875, "loss/logits": 0.2666233777999878, "step": 578 }, { "epoch": 0.0018385621745205132, "grad_norm": 0.4375, "grad_norm_var": 0.0012147108713785808, "learning_rate": 0.01, "loss": 1.4397, "loss/crossentropy": 2.496319532394409, "loss/fcd": 1.09375, "loss/logits": 0.24659114331007004, "step": 579 }, { "epoch": 0.001841737584148355, "grad_norm": 0.326171875, "grad_norm_var": 0.0012026468912760416, "learning_rate": 0.01, "loss": 1.4324, "loss/crossentropy": 2.6676981449127197, "loss/fcd": 1.17578125, "loss/logits": 0.2621256411075592, "step": 580 }, { "epoch": 0.0018449129937761972, "grad_norm": 0.298828125, "grad_norm_var": 0.0013218561808268229, "learning_rate": 0.01, "loss": 1.3545, "loss/crossentropy": 2.163087010383606, "loss/fcd": 1.0546875, "loss/logits": 0.22825468331575394, "step": 581 }, { "epoch": 0.001848088403404039, "grad_norm": 0.42578125, "grad_norm_var": 0.0016736348470052084, "learning_rate": 0.01, "loss": 1.4465, "loss/crossentropy": 3.120112419128418, "loss/fcd": 1.265625, "loss/logits": 0.2956196069717407, "step": 582 }, { "epoch": 0.001851263813031881, "grad_norm": 0.32421875, "grad_norm_var": 0.0016484578450520833, "learning_rate": 0.01, "loss": 1.451, "loss/crossentropy": 2.5830010175704956, "loss/fcd": 1.20703125, "loss/logits": 0.29329709708690643, "step": 583 }, { "epoch": 0.0018544392226597232, "grad_norm": 0.375, "grad_norm_var": 0.0017002741495768228, "learning_rate": 0.01, "loss": 1.4339, "loss/crossentropy": 2.678964376449585, "loss/fcd": 1.12109375, "loss/logits": 0.2558753862977028, "step": 584 }, { "epoch": 0.001857614632287565, "grad_norm": 0.3125, "grad_norm_var": 0.0017701307932535806, "learning_rate": 0.01, "loss": 1.4185, "loss/crossentropy": 2.3820152282714844, "loss/fcd": 1.171875, "loss/logits": 0.26917869597673416, "step": 585 }, { "epoch": 0.001860790041915407, "grad_norm": 0.31640625, "grad_norm_var": 0.0017679850260416666, "learning_rate": 0.01, "loss": 1.4387, "loss/crossentropy": 2.5672844648361206, "loss/fcd": 1.1484375, "loss/logits": 0.258611723780632, "step": 586 }, { "epoch": 0.0018639654515432492, "grad_norm": 0.298828125, "grad_norm_var": 0.0018681208292643228, "learning_rate": 0.01, "loss": 1.4066, "loss/crossentropy": 2.4626046419143677, "loss/fcd": 1.09375, "loss/logits": 0.25758662819862366, "step": 587 }, { "epoch": 0.001867140861171091, "grad_norm": 0.33203125, "grad_norm_var": 0.0018521467844645181, "learning_rate": 0.01, "loss": 1.4533, "loss/crossentropy": 2.604653835296631, "loss/fcd": 1.17578125, "loss/logits": 0.2763136774301529, "step": 588 }, { "epoch": 0.001870316270798933, "grad_norm": 0.314453125, "grad_norm_var": 0.0018909295399983724, "learning_rate": 0.01, "loss": 1.4396, "loss/crossentropy": 2.775049090385437, "loss/fcd": 1.17578125, "loss/logits": 0.26809555292129517, "step": 589 }, { "epoch": 0.001873491680426775, "grad_norm": 0.302734375, "grad_norm_var": 0.0018813451131184896, "learning_rate": 0.01, "loss": 1.3484, "loss/crossentropy": 2.436235785484314, "loss/fcd": 1.125, "loss/logits": 0.2635025605559349, "step": 590 }, { "epoch": 0.001876667090054617, "grad_norm": 0.48828125, "grad_norm_var": 0.0032268365224202475, "learning_rate": 0.01, "loss": 1.5049, "loss/crossentropy": 2.6901278495788574, "loss/fcd": 1.23046875, "loss/logits": 0.2944987267255783, "step": 591 }, { "epoch": 0.001879842499682459, "grad_norm": 0.310546875, "grad_norm_var": 0.0032164891560872394, "learning_rate": 0.01, "loss": 1.4463, "loss/crossentropy": 2.3182199001312256, "loss/fcd": 1.109375, "loss/logits": 0.2564455643296242, "step": 592 }, { "epoch": 0.001883017909310301, "grad_norm": 0.3046875, "grad_norm_var": 0.0032770633697509766, "learning_rate": 0.01, "loss": 1.4154, "loss/crossentropy": 2.633136034011841, "loss/fcd": 1.19921875, "loss/logits": 0.2994384318590164, "step": 593 }, { "epoch": 0.001886193318938143, "grad_norm": 0.294921875, "grad_norm_var": 0.003429269790649414, "learning_rate": 0.01, "loss": 1.3809, "loss/crossentropy": 2.471627354621887, "loss/fcd": 1.12890625, "loss/logits": 0.24968092143535614, "step": 594 }, { "epoch": 0.001889368728565985, "grad_norm": 0.337890625, "grad_norm_var": 0.002773475646972656, "learning_rate": 0.01, "loss": 1.5153, "loss/crossentropy": 2.423085331916809, "loss/fcd": 1.2421875, "loss/logits": 0.2946561872959137, "step": 595 }, { "epoch": 0.001892544138193827, "grad_norm": 0.341796875, "grad_norm_var": 0.002769915262858073, "learning_rate": 0.01, "loss": 1.4629, "loss/crossentropy": 2.705485224723816, "loss/fcd": 1.14453125, "loss/logits": 0.2634154409170151, "step": 596 }, { "epoch": 0.001895719547821669, "grad_norm": 0.314453125, "grad_norm_var": 0.002707354227701823, "learning_rate": 0.01, "loss": 1.4277, "loss/crossentropy": 2.5945022106170654, "loss/fcd": 1.14453125, "loss/logits": 0.268547847867012, "step": 597 }, { "epoch": 0.0018988949574495109, "grad_norm": 0.91796875, "grad_norm_var": 0.023663775126139323, "learning_rate": 0.01, "loss": 1.5652, "loss/crossentropy": 2.545665144920349, "loss/fcd": 1.15625, "loss/logits": 0.2540304958820343, "step": 598 }, { "epoch": 0.001902070367077353, "grad_norm": 0.337890625, "grad_norm_var": 0.023595794041951498, "learning_rate": 0.01, "loss": 1.3804, "loss/crossentropy": 2.5585720539093018, "loss/fcd": 1.16796875, "loss/logits": 0.2599219083786011, "step": 599 }, { "epoch": 0.001905245776705195, "grad_norm": 0.357421875, "grad_norm_var": 0.02360051472981771, "learning_rate": 0.01, "loss": 1.4922, "loss/crossentropy": 2.3954579830169678, "loss/fcd": 1.18359375, "loss/logits": 0.2601335644721985, "step": 600 }, { "epoch": 0.0019084211863330369, "grad_norm": 0.314453125, "grad_norm_var": 0.023586384455362954, "learning_rate": 0.01, "loss": 1.412, "loss/crossentropy": 2.3822191953659058, "loss/fcd": 1.13671875, "loss/logits": 0.2504914551973343, "step": 601 }, { "epoch": 0.001911596595960879, "grad_norm": 0.76953125, "grad_norm_var": 0.033314116795857746, "learning_rate": 0.01, "loss": 1.4561, "loss/crossentropy": 2.870268940925598, "loss/fcd": 1.20703125, "loss/logits": 0.28852511942386627, "step": 602 }, { "epoch": 0.001914772005588721, "grad_norm": 0.43359375, "grad_norm_var": 0.03270104726155599, "learning_rate": 0.01, "loss": 1.5516, "loss/crossentropy": 2.524397850036621, "loss/fcd": 1.12890625, "loss/logits": 0.257030688226223, "step": 603 }, { "epoch": 0.0019179474152165629, "grad_norm": 0.31640625, "grad_norm_var": 0.03286736806233724, "learning_rate": 0.01, "loss": 1.4158, "loss/crossentropy": 2.5887163877487183, "loss/fcd": 1.20703125, "loss/logits": 0.2740444391965866, "step": 604 }, { "epoch": 0.001921122824844405, "grad_norm": 0.32421875, "grad_norm_var": 0.03275729815165202, "learning_rate": 0.01, "loss": 1.3928, "loss/crossentropy": 2.5203968286514282, "loss/fcd": 1.09765625, "loss/logits": 0.23689764738082886, "step": 605 }, { "epoch": 0.001924298234472247, "grad_norm": 0.337890625, "grad_norm_var": 0.032359043757120766, "learning_rate": 0.01, "loss": 1.426, "loss/crossentropy": 2.7229238748550415, "loss/fcd": 1.16015625, "loss/logits": 0.2714942395687103, "step": 606 }, { "epoch": 0.0019274736441000889, "grad_norm": 0.357421875, "grad_norm_var": 0.032000160217285155, "learning_rate": 0.01, "loss": 1.4557, "loss/crossentropy": 2.5514140129089355, "loss/fcd": 1.1328125, "loss/logits": 0.27445633709430695, "step": 607 }, { "epoch": 0.001930649053727931, "grad_norm": 0.337890625, "grad_norm_var": 0.03172734578450521, "learning_rate": 0.01, "loss": 1.51, "loss/crossentropy": 2.622326135635376, "loss/fcd": 1.32421875, "loss/logits": 0.32083553075790405, "step": 608 }, { "epoch": 0.0019338244633557728, "grad_norm": 0.298828125, "grad_norm_var": 0.03180387814839681, "learning_rate": 0.01, "loss": 1.3486, "loss/crossentropy": 2.5313292741775513, "loss/fcd": 1.09765625, "loss/logits": 0.2378472462296486, "step": 609 }, { "epoch": 0.0019369998729836149, "grad_norm": 0.322265625, "grad_norm_var": 0.031469202041625975, "learning_rate": 0.01, "loss": 1.4026, "loss/crossentropy": 2.4071370363235474, "loss/fcd": 1.08203125, "loss/logits": 0.2657105177640915, "step": 610 }, { "epoch": 0.001940175282611457, "grad_norm": 0.3046875, "grad_norm_var": 0.031818580627441403, "learning_rate": 0.01, "loss": 1.4167, "loss/crossentropy": 2.405640125274658, "loss/fcd": 1.19140625, "loss/logits": 0.2946714460849762, "step": 611 }, { "epoch": 0.0019433506922392988, "grad_norm": 0.33203125, "grad_norm_var": 0.031899245580037434, "learning_rate": 0.01, "loss": 1.4588, "loss/crossentropy": 2.3646132946014404, "loss/fcd": 1.28125, "loss/logits": 0.3089660108089447, "step": 612 }, { "epoch": 0.0019465261018671409, "grad_norm": 0.3046875, "grad_norm_var": 0.032014719645182294, "learning_rate": 0.01, "loss": 1.3631, "loss/crossentropy": 2.4248170852661133, "loss/fcd": 1.12890625, "loss/logits": 0.25212036073207855, "step": 613 }, { "epoch": 0.001949701511494983, "grad_norm": 0.333984375, "grad_norm_var": 0.012838474909464518, "learning_rate": 0.01, "loss": 1.4496, "loss/crossentropy": 2.5949106216430664, "loss/fcd": 1.1640625, "loss/logits": 0.28406448662281036, "step": 614 }, { "epoch": 0.0019528769211228248, "grad_norm": 0.33203125, "grad_norm_var": 0.012859026590983072, "learning_rate": 0.01, "loss": 1.4813, "loss/crossentropy": 2.7447903156280518, "loss/fcd": 1.24609375, "loss/logits": 0.298361673951149, "step": 615 }, { "epoch": 0.0019560523307506667, "grad_norm": 0.326171875, "grad_norm_var": 0.012935320536295572, "learning_rate": 0.01, "loss": 1.3875, "loss/crossentropy": 2.359969735145569, "loss/fcd": 1.08984375, "loss/logits": 0.24715407937765121, "step": 616 }, { "epoch": 0.001959227740378509, "grad_norm": 0.328125, "grad_norm_var": 0.012865559260050455, "learning_rate": 0.01, "loss": 1.4656, "loss/crossentropy": 2.637782573699951, "loss/fcd": 1.21484375, "loss/logits": 0.32779377698898315, "step": 617 }, { "epoch": 0.001962403150006351, "grad_norm": 0.302734375, "grad_norm_var": 0.0009943008422851562, "learning_rate": 0.01, "loss": 1.4211, "loss/crossentropy": 2.5846203565597534, "loss/fcd": 1.2109375, "loss/logits": 0.30234503746032715, "step": 618 }, { "epoch": 0.0019655785596341927, "grad_norm": 0.314453125, "grad_norm_var": 0.0002487023671468099, "learning_rate": 0.01, "loss": 1.4833, "loss/crossentropy": 2.716583251953125, "loss/fcd": 1.1875, "loss/logits": 0.2635141611099243, "step": 619 }, { "epoch": 0.001968753969262035, "grad_norm": 0.3125, "grad_norm_var": 0.0002532800038655599, "learning_rate": 0.01, "loss": 1.4597, "loss/crossentropy": 2.3238495588302612, "loss/fcd": 1.140625, "loss/logits": 0.2640545666217804, "step": 620 }, { "epoch": 0.001971929378889877, "grad_norm": 0.3125, "grad_norm_var": 0.0002601464589436849, "learning_rate": 0.01, "loss": 1.4681, "loss/crossentropy": 2.7349835634231567, "loss/fcd": 1.1875, "loss/logits": 0.28571614623069763, "step": 621 }, { "epoch": 0.0019751047885177187, "grad_norm": 0.2890625, "grad_norm_var": 0.0003082275390625, "learning_rate": 0.01, "loss": 1.3909, "loss/crossentropy": 2.5959324836730957, "loss/fcd": 1.17578125, "loss/logits": 0.2575627267360687, "step": 622 }, { "epoch": 0.001978280198145561, "grad_norm": 0.333984375, "grad_norm_var": 0.000223541259765625, "learning_rate": 0.01, "loss": 1.4718, "loss/crossentropy": 2.50488817691803, "loss/fcd": 1.078125, "loss/logits": 0.26162026822566986, "step": 623 }, { "epoch": 0.001981455607773403, "grad_norm": 0.341796875, "grad_norm_var": 0.0002349217732747396, "learning_rate": 0.01, "loss": 1.3964, "loss/crossentropy": 2.354801058769226, "loss/fcd": 1.04296875, "loss/logits": 0.22930779308080673, "step": 624 }, { "epoch": 0.0019846310174012447, "grad_norm": 0.302734375, "grad_norm_var": 0.000225830078125, "learning_rate": 0.01, "loss": 1.4053, "loss/crossentropy": 2.3710676431655884, "loss/fcd": 1.1484375, "loss/logits": 0.26326730847358704, "step": 625 }, { "epoch": 0.001987806427029087, "grad_norm": 0.34765625, "grad_norm_var": 0.00027934710184733075, "learning_rate": 0.01, "loss": 1.5165, "loss/crossentropy": 2.4967103004455566, "loss/fcd": 1.13671875, "loss/logits": 0.2597675621509552, "step": 626 }, { "epoch": 0.001990981836656929, "grad_norm": 0.294921875, "grad_norm_var": 0.00030517578125, "learning_rate": 0.01, "loss": 1.3852, "loss/crossentropy": 2.4198557138442993, "loss/fcd": 1.119140625, "loss/logits": 0.2657344192266464, "step": 627 }, { "epoch": 0.0019941572462847707, "grad_norm": 0.34765625, "grad_norm_var": 0.00034688313802083335, "learning_rate": 0.01, "loss": 1.4237, "loss/crossentropy": 2.6308071613311768, "loss/fcd": 1.04296875, "loss/logits": 0.24051672220230103, "step": 628 }, { "epoch": 0.0019973326559126125, "grad_norm": 0.3125, "grad_norm_var": 0.0003344217936197917, "learning_rate": 0.01, "loss": 1.4174, "loss/crossentropy": 2.283755898475647, "loss/fcd": 1.16015625, "loss/logits": 0.24410566687583923, "step": 629 }, { "epoch": 0.002000508065540455, "grad_norm": 0.3125, "grad_norm_var": 0.0003255049387613932, "learning_rate": 0.01, "loss": 1.4492, "loss/crossentropy": 2.2229156494140625, "loss/fcd": 1.1484375, "loss/logits": 0.2539399638772011, "step": 630 }, { "epoch": 0.0020036834751682967, "grad_norm": 0.32421875, "grad_norm_var": 0.0003162225087483724, "learning_rate": 0.01, "loss": 1.4671, "loss/crossentropy": 2.5931116342544556, "loss/fcd": 1.2421875, "loss/logits": 0.2689453810453415, "step": 631 }, { "epoch": 0.0020068588847961385, "grad_norm": 0.357421875, "grad_norm_var": 0.0004072666168212891, "learning_rate": 0.01, "loss": 1.4177, "loss/crossentropy": 2.5985732078552246, "loss/fcd": 1.24609375, "loss/logits": 0.2685824930667877, "step": 632 }, { "epoch": 0.002010034294423981, "grad_norm": 0.37890625, "grad_norm_var": 0.0006172021230061848, "learning_rate": 0.01, "loss": 1.5423, "loss/crossentropy": 2.6069706678390503, "loss/fcd": 1.2421875, "loss/logits": 0.2848104387521744, "step": 633 }, { "epoch": 0.0020132097040518227, "grad_norm": 0.3359375, "grad_norm_var": 0.0005915323893229167, "learning_rate": 0.01, "loss": 1.5052, "loss/crossentropy": 2.5721887350082397, "loss/fcd": 1.2890625, "loss/logits": 0.3021959960460663, "step": 634 }, { "epoch": 0.0020163851136796645, "grad_norm": 0.36328125, "grad_norm_var": 0.0006642500559488932, "learning_rate": 0.01, "loss": 1.5034, "loss/crossentropy": 2.6439785957336426, "loss/fcd": 1.31640625, "loss/logits": 0.30735690891742706, "step": 635 }, { "epoch": 0.002019560523307507, "grad_norm": 0.7421875, "grad_norm_var": 0.01124558448791504, "learning_rate": 0.01, "loss": 1.435, "loss/crossentropy": 2.5221176147460938, "loss/fcd": 1.08984375, "loss/logits": 0.2539301812648773, "step": 636 }, { "epoch": 0.0020227359329353487, "grad_norm": 0.333984375, "grad_norm_var": 0.01114959716796875, "learning_rate": 0.01, "loss": 1.41, "loss/crossentropy": 2.700472831726074, "loss/fcd": 1.1484375, "loss/logits": 0.2744401842355728, "step": 637 }, { "epoch": 0.0020259113425631905, "grad_norm": 0.359375, "grad_norm_var": 0.010817718505859376, "learning_rate": 0.01, "loss": 1.5295, "loss/crossentropy": 2.5189380645751953, "loss/fcd": 1.18359375, "loss/logits": 0.2685580551624298, "step": 638 }, { "epoch": 0.002029086752191033, "grad_norm": 0.30078125, "grad_norm_var": 0.011009836196899414, "learning_rate": 0.01, "loss": 1.4192, "loss/crossentropy": 2.3201377987861633, "loss/fcd": 1.1484375, "loss/logits": 0.257222980260849, "step": 639 }, { "epoch": 0.0020322621618188747, "grad_norm": 0.302734375, "grad_norm_var": 0.011198663711547851, "learning_rate": 0.01, "loss": 1.3745, "loss/crossentropy": 2.53426992893219, "loss/fcd": 1.10546875, "loss/logits": 0.24480676651000977, "step": 640 }, { "epoch": 0.0020354375714467165, "grad_norm": 0.328125, "grad_norm_var": 0.011054229736328126, "learning_rate": 0.01, "loss": 1.5066, "loss/crossentropy": 2.820248007774353, "loss/fcd": 1.22265625, "loss/logits": 0.29276375472545624, "step": 641 }, { "epoch": 0.002038612981074559, "grad_norm": 0.298828125, "grad_norm_var": 0.01127635637919108, "learning_rate": 0.01, "loss": 1.4417, "loss/crossentropy": 2.373296022415161, "loss/fcd": 1.19921875, "loss/logits": 0.2656339704990387, "step": 642 }, { "epoch": 0.0020417883907024007, "grad_norm": 0.34765625, "grad_norm_var": 0.011021868387858073, "learning_rate": 0.01, "loss": 1.4794, "loss/crossentropy": 2.6577214002609253, "loss/fcd": 1.1328125, "loss/logits": 0.2590339481830597, "step": 643 }, { "epoch": 0.0020449638003302425, "grad_norm": 0.330078125, "grad_norm_var": 0.011068073908487956, "learning_rate": 0.01, "loss": 1.4814, "loss/crossentropy": 2.433274507522583, "loss/fcd": 1.26953125, "loss/logits": 0.30881068110466003, "step": 644 }, { "epoch": 0.002048139209958085, "grad_norm": 0.34375, "grad_norm_var": 0.010939391454060872, "learning_rate": 0.01, "loss": 1.4719, "loss/crossentropy": 2.7279518842697144, "loss/fcd": 1.19140625, "loss/logits": 0.28452087938785553, "step": 645 }, { "epoch": 0.0020513146195859267, "grad_norm": 0.3671875, "grad_norm_var": 0.010780064264933269, "learning_rate": 0.01, "loss": 1.4783, "loss/crossentropy": 2.6356825828552246, "loss/fcd": 1.2578125, "loss/logits": 0.3002449870109558, "step": 646 }, { "epoch": 0.0020544900292137685, "grad_norm": 0.53515625, "grad_norm_var": 0.01245891253153483, "learning_rate": 0.01, "loss": 1.5708, "loss/crossentropy": 2.6528849601745605, "loss/fcd": 1.4296875, "loss/logits": 0.37231509387493134, "step": 647 }, { "epoch": 0.0020576654388416104, "grad_norm": 0.337890625, "grad_norm_var": 0.012532663345336915, "learning_rate": 0.01, "loss": 1.5023, "loss/crossentropy": 2.6329965591430664, "loss/fcd": 1.26953125, "loss/logits": 0.29040248692035675, "step": 648 }, { "epoch": 0.0020608408484694527, "grad_norm": 0.298828125, "grad_norm_var": 0.012895647684733074, "learning_rate": 0.01, "loss": 1.3782, "loss/crossentropy": 2.3247926235198975, "loss/fcd": 1.08984375, "loss/logits": 0.238947331905365, "step": 649 }, { "epoch": 0.0020640162580972945, "grad_norm": 0.306640625, "grad_norm_var": 0.013083759943644207, "learning_rate": 0.01, "loss": 1.4313, "loss/crossentropy": 2.667474627494812, "loss/fcd": 1.234375, "loss/logits": 0.29370683431625366, "step": 650 }, { "epoch": 0.0020671916677251364, "grad_norm": 0.33984375, "grad_norm_var": 0.013134495417277018, "learning_rate": 0.01, "loss": 1.4507, "loss/crossentropy": 2.687560558319092, "loss/fcd": 1.1640625, "loss/logits": 0.2673056125640869, "step": 651 }, { "epoch": 0.0020703670773529787, "grad_norm": 0.34765625, "grad_norm_var": 0.003129943211873372, "learning_rate": 0.01, "loss": 1.4838, "loss/crossentropy": 2.6628637313842773, "loss/fcd": 1.2109375, "loss/logits": 0.2936474680900574, "step": 652 }, { "epoch": 0.0020735424869808205, "grad_norm": 0.314453125, "grad_norm_var": 0.003175719579060872, "learning_rate": 0.01, "loss": 1.4761, "loss/crossentropy": 2.5043389797210693, "loss/fcd": 1.1484375, "loss/logits": 0.2613539472222328, "step": 653 }, { "epoch": 0.0020767178966086624, "grad_norm": 0.33984375, "grad_norm_var": 0.003152195612589518, "learning_rate": 0.01, "loss": 1.4584, "loss/crossentropy": 2.564815640449524, "loss/fcd": 1.24609375, "loss/logits": 0.2867933213710785, "step": 654 }, { "epoch": 0.0020798933062365047, "grad_norm": 0.322265625, "grad_norm_var": 0.0030687967936197918, "learning_rate": 0.01, "loss": 1.3907, "loss/crossentropy": 2.521752119064331, "loss/fcd": 1.0625, "loss/logits": 0.24879375100135803, "step": 655 }, { "epoch": 0.0020830687158643465, "grad_norm": 0.361328125, "grad_norm_var": 0.002982012430826823, "learning_rate": 0.01, "loss": 1.4402, "loss/crossentropy": 2.33803927898407, "loss/fcd": 1.05078125, "loss/logits": 0.22626015543937683, "step": 656 }, { "epoch": 0.0020862441254921884, "grad_norm": 0.337890625, "grad_norm_var": 0.0029660383860270183, "learning_rate": 0.01, "loss": 1.4732, "loss/crossentropy": 2.4174643754959106, "loss/fcd": 1.21484375, "loss/logits": 0.27812931686639786, "step": 657 }, { "epoch": 0.0020894195351200307, "grad_norm": 0.384765625, "grad_norm_var": 0.0028919061024983723, "learning_rate": 0.01, "loss": 1.4966, "loss/crossentropy": 2.628118634223938, "loss/fcd": 1.19921875, "loss/logits": 0.27514004707336426, "step": 658 }, { "epoch": 0.0020925949447478725, "grad_norm": 0.337890625, "grad_norm_var": 0.0029021581013997395, "learning_rate": 0.01, "loss": 1.4109, "loss/crossentropy": 2.334811568260193, "loss/fcd": 1.1796875, "loss/logits": 0.27129536867141724, "step": 659 }, { "epoch": 0.0020957703543757144, "grad_norm": 0.30859375, "grad_norm_var": 0.002989053726196289, "learning_rate": 0.01, "loss": 1.4195, "loss/crossentropy": 2.149420142173767, "loss/fcd": 1.171875, "loss/logits": 0.2680782228708267, "step": 660 }, { "epoch": 0.0020989457640035567, "grad_norm": 0.328125, "grad_norm_var": 0.0030152479807535808, "learning_rate": 0.01, "loss": 1.4554, "loss/crossentropy": 2.656423568725586, "loss/fcd": 1.26953125, "loss/logits": 0.325935959815979, "step": 661 }, { "epoch": 0.0021021211736313985, "grad_norm": 0.359375, "grad_norm_var": 0.002999099095662435, "learning_rate": 0.01, "loss": 1.4312, "loss/crossentropy": 2.470800042152405, "loss/fcd": 1.1015625, "loss/logits": 0.25754159688949585, "step": 662 }, { "epoch": 0.0021052965832592404, "grad_norm": 0.37109375, "grad_norm_var": 0.0005771478017171223, "learning_rate": 0.01, "loss": 1.4342, "loss/crossentropy": 2.6609132289886475, "loss/fcd": 1.09765625, "loss/logits": 0.2304501011967659, "step": 663 }, { "epoch": 0.0021084719928870822, "grad_norm": 0.3125, "grad_norm_var": 0.0006153742472330729, "learning_rate": 0.01, "loss": 1.4536, "loss/crossentropy": 2.381262183189392, "loss/fcd": 1.14453125, "loss/logits": 0.24152355641126633, "step": 664 }, { "epoch": 0.0021116474025149245, "grad_norm": 0.33984375, "grad_norm_var": 0.0005189100901285807, "learning_rate": 0.01, "loss": 1.4686, "loss/crossentropy": 2.541628360748291, "loss/fcd": 1.15234375, "loss/logits": 0.2741599529981613, "step": 665 }, { "epoch": 0.0021148228121427664, "grad_norm": 0.34765625, "grad_norm_var": 0.00045115152994791666, "learning_rate": 0.01, "loss": 1.5625, "loss/crossentropy": 2.6435221433639526, "loss/fcd": 1.09765625, "loss/logits": 0.2723618447780609, "step": 666 }, { "epoch": 0.0021179982217706082, "grad_norm": 0.318359375, "grad_norm_var": 0.00048279762268066406, "learning_rate": 0.01, "loss": 1.4574, "loss/crossentropy": 2.561619997024536, "loss/fcd": 1.1640625, "loss/logits": 0.2773497402667999, "step": 667 }, { "epoch": 0.0021211736313984505, "grad_norm": 0.3359375, "grad_norm_var": 0.0004786014556884766, "learning_rate": 0.01, "loss": 1.4274, "loss/crossentropy": 2.344738245010376, "loss/fcd": 1.28515625, "loss/logits": 0.3049396872520447, "step": 668 }, { "epoch": 0.0021243490410262924, "grad_norm": 0.318359375, "grad_norm_var": 0.0004669030507405599, "learning_rate": 0.01, "loss": 1.4727, "loss/crossentropy": 2.2849442958831787, "loss/fcd": 1.140625, "loss/logits": 0.25174686312675476, "step": 669 }, { "epoch": 0.0021275244506541342, "grad_norm": 0.91015625, "grad_norm_var": 0.02086040178934733, "learning_rate": 0.01, "loss": 1.3991, "loss/crossentropy": 2.5691081285476685, "loss/fcd": 1.15625, "loss/logits": 0.2566594257950783, "step": 670 }, { "epoch": 0.0021306998602819765, "grad_norm": 0.50390625, "grad_norm_var": 0.021654192606608072, "learning_rate": 0.01, "loss": 1.568, "loss/crossentropy": 2.643552303314209, "loss/fcd": 1.1328125, "loss/logits": 0.25350765138864517, "step": 671 }, { "epoch": 0.0021338752699098184, "grad_norm": 0.3671875, "grad_norm_var": 0.02163707415262858, "learning_rate": 0.01, "loss": 1.3725, "loss/crossentropy": 2.505910038948059, "loss/fcd": 1.09375, "loss/logits": 0.23823265731334686, "step": 672 }, { "epoch": 0.0021370506795376602, "grad_norm": 0.2734375, "grad_norm_var": 0.02231318155924479, "learning_rate": 0.01, "loss": 1.3368, "loss/crossentropy": 2.520551562309265, "loss/fcd": 1.03515625, "loss/logits": 0.2299058511853218, "step": 673 }, { "epoch": 0.0021402260891655025, "grad_norm": 0.353515625, "grad_norm_var": 0.022364044189453126, "learning_rate": 0.01, "loss": 1.5173, "loss/crossentropy": 2.587928533554077, "loss/fcd": 1.1796875, "loss/logits": 0.2683887481689453, "step": 674 }, { "epoch": 0.0021434014987933444, "grad_norm": 0.3203125, "grad_norm_var": 0.022482919692993163, "learning_rate": 0.01, "loss": 1.4092, "loss/crossentropy": 2.409998297691345, "loss/fcd": 1.09765625, "loss/logits": 0.2581988647580147, "step": 675 }, { "epoch": 0.0021465769084211862, "grad_norm": 0.333984375, "grad_norm_var": 0.022283935546875, "learning_rate": 0.01, "loss": 1.4516, "loss/crossentropy": 2.5237138271331787, "loss/fcd": 1.1328125, "loss/logits": 0.25686848908662796, "step": 676 }, { "epoch": 0.0021497523180490285, "grad_norm": 0.359375, "grad_norm_var": 0.022125244140625, "learning_rate": 0.01, "loss": 1.4731, "loss/crossentropy": 2.3801904916763306, "loss/fcd": 1.1953125, "loss/logits": 0.26870596408843994, "step": 677 }, { "epoch": 0.0021529277276768704, "grad_norm": 0.328125, "grad_norm_var": 0.022283935546875, "learning_rate": 0.01, "loss": 1.448, "loss/crossentropy": 2.6016610860824585, "loss/fcd": 1.21875, "loss/logits": 0.29165321588516235, "step": 678 }, { "epoch": 0.0021561031373047122, "grad_norm": 0.3125, "grad_norm_var": 0.022574806213378908, "learning_rate": 0.01, "loss": 1.4234, "loss/crossentropy": 2.5239063501358032, "loss/fcd": 1.1171875, "loss/logits": 0.24302402883768082, "step": 679 }, { "epoch": 0.002159278546932554, "grad_norm": 0.302734375, "grad_norm_var": 0.022665007909138998, "learning_rate": 0.01, "loss": 1.4235, "loss/crossentropy": 2.3897151947021484, "loss/fcd": 1.0546875, "loss/logits": 0.23649680614471436, "step": 680 }, { "epoch": 0.0021624539565603964, "grad_norm": 0.328125, "grad_norm_var": 0.02273100217183431, "learning_rate": 0.01, "loss": 1.427, "loss/crossentropy": 2.730047106742859, "loss/fcd": 1.14453125, "loss/logits": 0.2729177922010422, "step": 681 }, { "epoch": 0.0021656293661882382, "grad_norm": 0.33203125, "grad_norm_var": 0.022805007298787434, "learning_rate": 0.01, "loss": 1.4067, "loss/crossentropy": 2.509036660194397, "loss/fcd": 1.16015625, "loss/logits": 0.2633977085351944, "step": 682 }, { "epoch": 0.00216880477581608, "grad_norm": 0.396484375, "grad_norm_var": 0.022597742080688477, "learning_rate": 0.01, "loss": 1.5531, "loss/crossentropy": 2.6248340606689453, "loss/fcd": 1.15234375, "loss/logits": 0.26121625304222107, "step": 683 }, { "epoch": 0.0021719801854439224, "grad_norm": 0.337890625, "grad_norm_var": 0.022586568196614584, "learning_rate": 0.01, "loss": 1.4064, "loss/crossentropy": 2.32330060005188, "loss/fcd": 1.16015625, "loss/logits": 0.26456931233406067, "step": 684 }, { "epoch": 0.0021751555950717642, "grad_norm": 0.306640625, "grad_norm_var": 0.02269128163655599, "learning_rate": 0.01, "loss": 1.4071, "loss/crossentropy": 2.5149052143096924, "loss/fcd": 1.08984375, "loss/logits": 0.24568501859903336, "step": 685 }, { "epoch": 0.002178331004699606, "grad_norm": 0.35546875, "grad_norm_var": 0.0026488622029622396, "learning_rate": 0.01, "loss": 1.4628, "loss/crossentropy": 2.7665618658065796, "loss/fcd": 1.23046875, "loss/logits": 0.31343936920166016, "step": 686 }, { "epoch": 0.0021815064143274484, "grad_norm": 0.376953125, "grad_norm_var": 0.0009576002756754557, "learning_rate": 0.01, "loss": 1.4537, "loss/crossentropy": 2.459946870803833, "loss/fcd": 1.140625, "loss/logits": 0.24365650117397308, "step": 687 }, { "epoch": 0.0021846818239552902, "grad_norm": 0.34765625, "grad_norm_var": 0.0009016513824462891, "learning_rate": 0.01, "loss": 1.4642, "loss/crossentropy": 2.626524329185486, "loss/fcd": 1.18359375, "loss/logits": 0.2959468364715576, "step": 688 }, { "epoch": 0.002187857233583132, "grad_norm": 0.306640625, "grad_norm_var": 0.000696563720703125, "learning_rate": 0.01, "loss": 1.3954, "loss/crossentropy": 2.618670344352722, "loss/fcd": 1.15625, "loss/logits": 0.26519955694675446, "step": 689 }, { "epoch": 0.0021910326432109744, "grad_norm": 0.341796875, "grad_norm_var": 0.0006799697875976562, "learning_rate": 0.01, "loss": 1.4594, "loss/crossentropy": 2.4219008684158325, "loss/fcd": 1.2109375, "loss/logits": 0.2737656831741333, "step": 690 }, { "epoch": 0.0021942080528388162, "grad_norm": 0.34375, "grad_norm_var": 0.0006631851196289062, "learning_rate": 0.01, "loss": 1.4401, "loss/crossentropy": 2.6064971685409546, "loss/fcd": 1.1171875, "loss/logits": 0.24395552277565002, "step": 691 }, { "epoch": 0.002197383462466658, "grad_norm": 0.33984375, "grad_norm_var": 0.0006620883941650391, "learning_rate": 0.01, "loss": 1.4849, "loss/crossentropy": 2.8410667181015015, "loss/fcd": 1.2109375, "loss/logits": 0.287314236164093, "step": 692 }, { "epoch": 0.0022005588720945004, "grad_norm": 0.35546875, "grad_norm_var": 0.0006521701812744141, "learning_rate": 0.01, "loss": 1.571, "loss/crossentropy": 2.5582644939422607, "loss/fcd": 1.19921875, "loss/logits": 0.2957863509654999, "step": 693 }, { "epoch": 0.0022037342817223422, "grad_norm": 0.40625, "grad_norm_var": 0.0009280999501546223, "learning_rate": 0.01, "loss": 1.4531, "loss/crossentropy": 2.511031985282898, "loss/fcd": 1.15234375, "loss/logits": 0.261308878660202, "step": 694 }, { "epoch": 0.002206909691350184, "grad_norm": 0.302734375, "grad_norm_var": 0.0009739557902018229, "learning_rate": 0.01, "loss": 1.3986, "loss/crossentropy": 2.73672616481781, "loss/fcd": 1.140625, "loss/logits": 0.25674088299274445, "step": 695 }, { "epoch": 0.0022100851009780264, "grad_norm": 0.3359375, "grad_norm_var": 0.0008666833241780599, "learning_rate": 0.01, "loss": 1.4276, "loss/crossentropy": 2.6394673585891724, "loss/fcd": 1.28125, "loss/logits": 0.31712327897548676, "step": 696 }, { "epoch": 0.0022132605106058682, "grad_norm": 0.353515625, "grad_norm_var": 0.0008511861165364583, "learning_rate": 0.01, "loss": 1.4581, "loss/crossentropy": 2.45247745513916, "loss/fcd": 1.11328125, "loss/logits": 0.25778521597385406, "step": 697 }, { "epoch": 0.00221643592023371, "grad_norm": 0.373046875, "grad_norm_var": 0.0008788903554280599, "learning_rate": 0.01, "loss": 1.4689, "loss/crossentropy": 2.920904517173767, "loss/fcd": 1.23828125, "loss/logits": 0.2662116140127182, "step": 698 }, { "epoch": 0.002219611329861552, "grad_norm": 0.37109375, "grad_norm_var": 0.000757598876953125, "learning_rate": 0.01, "loss": 1.5753, "loss/crossentropy": 2.6180105209350586, "loss/fcd": 1.21484375, "loss/logits": 0.29333923757076263, "step": 699 }, { "epoch": 0.0022227867394893942, "grad_norm": 0.3203125, "grad_norm_var": 0.0007986545562744141, "learning_rate": 0.01, "loss": 1.4051, "loss/crossentropy": 2.27789843082428, "loss/fcd": 1.15625, "loss/logits": 0.2633768767118454, "step": 700 }, { "epoch": 0.002225962149117236, "grad_norm": 0.296875, "grad_norm_var": 0.0008559544881184896, "learning_rate": 0.01, "loss": 1.4069, "loss/crossentropy": 2.6427571773529053, "loss/fcd": 1.23828125, "loss/logits": 0.3034712225198746, "step": 701 }, { "epoch": 0.002229137558745078, "grad_norm": 0.3046875, "grad_norm_var": 0.0009493509928385417, "learning_rate": 0.01, "loss": 1.4219, "loss/crossentropy": 2.2899880409240723, "loss/fcd": 1.09375, "loss/logits": 0.22581543773412704, "step": 702 }, { "epoch": 0.0022323129683729202, "grad_norm": 0.32421875, "grad_norm_var": 0.0008793989817301432, "learning_rate": 0.01, "loss": 1.4828, "loss/crossentropy": 2.5826855897903442, "loss/fcd": 1.28125, "loss/logits": 0.30267195403575897, "step": 703 }, { "epoch": 0.002235488378000762, "grad_norm": 0.328125, "grad_norm_var": 0.0008806705474853516, "learning_rate": 0.01, "loss": 1.4049, "loss/crossentropy": 2.7553216218948364, "loss/fcd": 1.09375, "loss/logits": 0.2631734013557434, "step": 704 }, { "epoch": 0.002238663787628604, "grad_norm": 0.310546875, "grad_norm_var": 0.0008654117584228516, "learning_rate": 0.01, "loss": 1.4386, "loss/crossentropy": 2.5291916131973267, "loss/fcd": 1.23046875, "loss/logits": 0.31132782995700836, "step": 705 }, { "epoch": 0.0022418391972564462, "grad_norm": 0.376953125, "grad_norm_var": 0.000960397720336914, "learning_rate": 0.01, "loss": 1.4633, "loss/crossentropy": 2.658027172088623, "loss/fcd": 1.203125, "loss/logits": 0.28219635784626007, "step": 706 }, { "epoch": 0.002245014606884288, "grad_norm": 0.3671875, "grad_norm_var": 0.0010057926177978516, "learning_rate": 0.01, "loss": 1.4268, "loss/crossentropy": 2.5007067918777466, "loss/fcd": 1.16796875, "loss/logits": 0.2701251655817032, "step": 707 }, { "epoch": 0.00224819001651213, "grad_norm": 0.3046875, "grad_norm_var": 0.0010916233062744141, "learning_rate": 0.01, "loss": 1.4204, "loss/crossentropy": 2.3672842979431152, "loss/fcd": 1.1640625, "loss/logits": 0.2820379063487053, "step": 708 }, { "epoch": 0.0022513654261399722, "grad_norm": 0.326171875, "grad_norm_var": 0.0010828018188476563, "learning_rate": 0.01, "loss": 1.4588, "loss/crossentropy": 2.2994272708892822, "loss/fcd": 1.16015625, "loss/logits": 0.2736620306968689, "step": 709 }, { "epoch": 0.002254540835767814, "grad_norm": 0.384765625, "grad_norm_var": 0.0009151299794514974, "learning_rate": 0.01, "loss": 1.4186, "loss/crossentropy": 2.307642340660095, "loss/fcd": 1.11328125, "loss/logits": 0.24960515648126602, "step": 710 }, { "epoch": 0.002257716245395656, "grad_norm": 0.474609375, "grad_norm_var": 0.0019921461741129556, "learning_rate": 0.01, "loss": 1.5403, "loss/crossentropy": 2.3663251399993896, "loss/fcd": 1.27734375, "loss/logits": 0.3394431471824646, "step": 711 }, { "epoch": 0.0022608916550234982, "grad_norm": 0.369140625, "grad_norm_var": 0.002011871337890625, "learning_rate": 0.01, "loss": 1.4133, "loss/crossentropy": 2.486906886100769, "loss/fcd": 1.1015625, "loss/logits": 0.24058721959590912, "step": 712 }, { "epoch": 0.00226406706465134, "grad_norm": 0.34375, "grad_norm_var": 0.0020121097564697265, "learning_rate": 0.01, "loss": 1.4163, "loss/crossentropy": 2.720227003097534, "loss/fcd": 1.140625, "loss/logits": 0.2607147991657257, "step": 713 }, { "epoch": 0.002267242474279182, "grad_norm": 0.34375, "grad_norm_var": 0.00196990966796875, "learning_rate": 0.01, "loss": 1.3965, "loss/crossentropy": 2.6770442724227905, "loss/fcd": 1.21875, "loss/logits": 0.29529672861099243, "step": 714 }, { "epoch": 0.002270417883907024, "grad_norm": 0.337890625, "grad_norm_var": 0.0019307295481363932, "learning_rate": 0.01, "loss": 1.4449, "loss/crossentropy": 2.6055012941360474, "loss/fcd": 1.20703125, "loss/logits": 0.28090208768844604, "step": 715 }, { "epoch": 0.002273593293534866, "grad_norm": 0.328125, "grad_norm_var": 0.0019092400868733724, "learning_rate": 0.01, "loss": 1.4553, "loss/crossentropy": 2.4810702800750732, "loss/fcd": 1.07421875, "loss/logits": 0.23952434957027435, "step": 716 }, { "epoch": 0.002276768703162708, "grad_norm": 0.302734375, "grad_norm_var": 0.0018737157185872396, "learning_rate": 0.01, "loss": 1.3923, "loss/crossentropy": 2.501761555671692, "loss/fcd": 1.1328125, "loss/logits": 0.28178179264068604, "step": 717 }, { "epoch": 0.00227994411279055, "grad_norm": 0.314453125, "grad_norm_var": 0.0018265883127848307, "learning_rate": 0.01, "loss": 1.4206, "loss/crossentropy": 2.4999107122421265, "loss/fcd": 1.1640625, "loss/logits": 0.2527662664651871, "step": 718 }, { "epoch": 0.002283119522418392, "grad_norm": 0.33203125, "grad_norm_var": 0.0018076419830322266, "learning_rate": 0.01, "loss": 1.4722, "loss/crossentropy": 2.7011624574661255, "loss/fcd": 1.16015625, "loss/logits": 0.26353636384010315, "step": 719 }, { "epoch": 0.002286294932046234, "grad_norm": 0.326171875, "grad_norm_var": 0.0018126805623372396, "learning_rate": 0.01, "loss": 1.3935, "loss/crossentropy": 2.4508912563323975, "loss/fcd": 1.09375, "loss/logits": 0.2523498311638832, "step": 720 }, { "epoch": 0.002289470341674076, "grad_norm": 0.37109375, "grad_norm_var": 0.0017520745595296225, "learning_rate": 0.01, "loss": 1.4527, "loss/crossentropy": 2.5403130054473877, "loss/fcd": 1.15625, "loss/logits": 0.26142075657844543, "step": 721 }, { "epoch": 0.002292645751301918, "grad_norm": 0.3359375, "grad_norm_var": 0.0017110188802083334, "learning_rate": 0.01, "loss": 1.4589, "loss/crossentropy": 2.5185729265213013, "loss/fcd": 1.1328125, "loss/logits": 0.2594883292913437, "step": 722 }, { "epoch": 0.00229582116092976, "grad_norm": 0.306640625, "grad_norm_var": 0.0017824649810791015, "learning_rate": 0.01, "loss": 1.4087, "loss/crossentropy": 2.57889723777771, "loss/fcd": 1.140625, "loss/logits": 0.259462371468544, "step": 723 }, { "epoch": 0.002298996570557602, "grad_norm": 0.33203125, "grad_norm_var": 0.0016863346099853516, "learning_rate": 0.01, "loss": 1.4154, "loss/crossentropy": 2.7318270206451416, "loss/fcd": 1.14453125, "loss/logits": 0.2645667642354965, "step": 724 }, { "epoch": 0.002302171980185444, "grad_norm": 0.30859375, "grad_norm_var": 0.0017511367797851563, "learning_rate": 0.01, "loss": 1.4326, "loss/crossentropy": 2.5983744859695435, "loss/fcd": 1.2109375, "loss/logits": 0.2835640013217926, "step": 725 }, { "epoch": 0.002305347389813286, "grad_norm": 0.337890625, "grad_norm_var": 0.0016366958618164063, "learning_rate": 0.01, "loss": 1.4169, "loss/crossentropy": 2.7769795656204224, "loss/fcd": 1.125, "loss/logits": 0.2684818506240845, "step": 726 }, { "epoch": 0.002308522799441128, "grad_norm": 0.345703125, "grad_norm_var": 0.000388336181640625, "learning_rate": 0.01, "loss": 1.4024, "loss/crossentropy": 2.6922361850738525, "loss/fcd": 1.1015625, "loss/logits": 0.2499493882060051, "step": 727 }, { "epoch": 0.00231169820906897, "grad_norm": 0.314453125, "grad_norm_var": 0.0003153483072916667, "learning_rate": 0.01, "loss": 1.4284, "loss/crossentropy": 2.3640514612197876, "loss/fcd": 1.08984375, "loss/logits": 0.2648880034685135, "step": 728 }, { "epoch": 0.002314873618696812, "grad_norm": 0.318359375, "grad_norm_var": 0.0003093560536702474, "learning_rate": 0.01, "loss": 1.4294, "loss/crossentropy": 2.4070149660110474, "loss/fcd": 1.171875, "loss/logits": 0.29105672240257263, "step": 729 }, { "epoch": 0.002318049028324654, "grad_norm": 0.326171875, "grad_norm_var": 0.0002929051717122396, "learning_rate": 0.01, "loss": 1.3682, "loss/crossentropy": 2.5560896396636963, "loss/fcd": 1.16015625, "loss/logits": 0.254703588783741, "step": 730 }, { "epoch": 0.002321224437952496, "grad_norm": 0.35546875, "grad_norm_var": 0.0003368218739827474, "learning_rate": 0.01, "loss": 1.4322, "loss/crossentropy": 2.7137579917907715, "loss/fcd": 1.15625, "loss/logits": 0.26079654693603516, "step": 731 }, { "epoch": 0.002324399847580338, "grad_norm": 0.283203125, "grad_norm_var": 0.00046513875325520835, "learning_rate": 0.01, "loss": 1.3774, "loss/crossentropy": 2.848947286605835, "loss/fcd": 1.11328125, "loss/logits": 0.2647935599088669, "step": 732 }, { "epoch": 0.00232757525720818, "grad_norm": 0.361328125, "grad_norm_var": 0.0005004247029622396, "learning_rate": 0.01, "loss": 1.4829, "loss/crossentropy": 2.6093363761901855, "loss/fcd": 1.16015625, "loss/logits": 0.27085070312023163, "step": 733 }, { "epoch": 0.0023307506668360217, "grad_norm": 0.328125, "grad_norm_var": 0.0004849592844645182, "learning_rate": 0.01, "loss": 1.4308, "loss/crossentropy": 2.4548600912094116, "loss/fcd": 1.23046875, "loss/logits": 0.3042246103286743, "step": 734 }, { "epoch": 0.002333926076463864, "grad_norm": 0.4765625, "grad_norm_var": 0.0018258253733317057, "learning_rate": 0.01, "loss": 1.4918, "loss/crossentropy": 2.677885055541992, "loss/fcd": 1.19140625, "loss/logits": 0.2653072327375412, "step": 735 }, { "epoch": 0.002337101486091706, "grad_norm": 0.333984375, "grad_norm_var": 0.0018160343170166016, "learning_rate": 0.01, "loss": 1.4528, "loss/crossentropy": 2.5845121145248413, "loss/fcd": 1.2109375, "loss/logits": 0.29135359823703766, "step": 736 }, { "epoch": 0.0023402768957195477, "grad_norm": 0.37890625, "grad_norm_var": 0.0018525282541910807, "learning_rate": 0.01, "loss": 1.5023, "loss/crossentropy": 2.4463754892349243, "loss/fcd": 1.37109375, "loss/logits": 0.35034844279289246, "step": 737 }, { "epoch": 0.00234345230534739, "grad_norm": 0.373046875, "grad_norm_var": 0.0019174575805664062, "learning_rate": 0.01, "loss": 1.4284, "loss/crossentropy": 2.6639740467071533, "loss/fcd": 1.203125, "loss/logits": 0.2613166868686676, "step": 738 }, { "epoch": 0.002346627714975232, "grad_norm": 0.357421875, "grad_norm_var": 0.00183563232421875, "learning_rate": 0.01, "loss": 1.4026, "loss/crossentropy": 2.1357060074806213, "loss/fcd": 1.05859375, "loss/logits": 0.22983689606189728, "step": 739 }, { "epoch": 0.0023498031246030737, "grad_norm": 0.279296875, "grad_norm_var": 0.002105569839477539, "learning_rate": 0.01, "loss": 1.3374, "loss/crossentropy": 2.4799500703811646, "loss/fcd": 1.01953125, "loss/logits": 0.21136115491390228, "step": 740 }, { "epoch": 0.002352978534230916, "grad_norm": 0.3359375, "grad_norm_var": 0.0020290215810139975, "learning_rate": 0.01, "loss": 1.474, "loss/crossentropy": 2.559669256210327, "loss/fcd": 1.2109375, "loss/logits": 0.2721591740846634, "step": 741 }, { "epoch": 0.002356153943858758, "grad_norm": 0.341796875, "grad_norm_var": 0.0020267327626546225, "learning_rate": 0.01, "loss": 1.4579, "loss/crossentropy": 2.6848400831222534, "loss/fcd": 1.1875, "loss/logits": 0.2639164924621582, "step": 742 }, { "epoch": 0.0023593293534865997, "grad_norm": 0.37890625, "grad_norm_var": 0.0021015803019205728, "learning_rate": 0.01, "loss": 1.544, "loss/crossentropy": 2.2969682216644287, "loss/fcd": 1.18359375, "loss/logits": 0.2828761488199234, "step": 743 }, { "epoch": 0.002362504763114442, "grad_norm": 0.326171875, "grad_norm_var": 0.002060190836588542, "learning_rate": 0.01, "loss": 1.4455, "loss/crossentropy": 2.55966579914093, "loss/fcd": 1.17578125, "loss/logits": 0.26350104808807373, "step": 744 }, { "epoch": 0.002365680172742284, "grad_norm": 0.3203125, "grad_norm_var": 0.002052927017211914, "learning_rate": 0.01, "loss": 1.4065, "loss/crossentropy": 2.59329891204834, "loss/fcd": 1.171875, "loss/logits": 0.27897585928440094, "step": 745 }, { "epoch": 0.0023688555823701257, "grad_norm": 0.322265625, "grad_norm_var": 0.0020648797353108725, "learning_rate": 0.01, "loss": 1.4161, "loss/crossentropy": 2.6247501373291016, "loss/fcd": 1.28125, "loss/logits": 0.31912797689437866, "step": 746 }, { "epoch": 0.002372030991997968, "grad_norm": 0.302734375, "grad_norm_var": 0.0021794637044270835, "learning_rate": 0.01, "loss": 1.4608, "loss/crossentropy": 2.291262149810791, "loss/fcd": 1.13671875, "loss/logits": 0.25881047546863556, "step": 747 }, { "epoch": 0.00237520640162581, "grad_norm": 0.333984375, "grad_norm_var": 0.001930681864420573, "learning_rate": 0.01, "loss": 1.5039, "loss/crossentropy": 2.586389422416687, "loss/fcd": 1.265625, "loss/logits": 0.30161425471305847, "step": 748 }, { "epoch": 0.0023783818112536517, "grad_norm": 0.3125, "grad_norm_var": 0.0019859155019124348, "learning_rate": 0.01, "loss": 1.433, "loss/crossentropy": 2.4427740573883057, "loss/fcd": 1.140625, "loss/logits": 0.28813768923282623, "step": 749 }, { "epoch": 0.0023815572208814935, "grad_norm": 0.36328125, "grad_norm_var": 0.001989348729451497, "learning_rate": 0.01, "loss": 1.5594, "loss/crossentropy": 2.66691517829895, "loss/fcd": 1.1875, "loss/logits": 0.2686517834663391, "step": 750 }, { "epoch": 0.002384732630509336, "grad_norm": 0.330078125, "grad_norm_var": 0.0007817586263020833, "learning_rate": 0.01, "loss": 1.4638, "loss/crossentropy": 2.6278196573257446, "loss/fcd": 1.13671875, "loss/logits": 0.2773092985153198, "step": 751 }, { "epoch": 0.0023879080401371777, "grad_norm": 0.298828125, "grad_norm_var": 0.0008727391560872396, "learning_rate": 0.01, "loss": 1.3777, "loss/crossentropy": 2.3818631172180176, "loss/fcd": 1.07421875, "loss/logits": 0.24697883427143097, "step": 752 }, { "epoch": 0.0023910834497650195, "grad_norm": 0.34375, "grad_norm_var": 0.0007428487141927083, "learning_rate": 0.01, "loss": 1.4447, "loss/crossentropy": 2.7443277835845947, "loss/fcd": 1.1796875, "loss/logits": 0.2773742824792862, "step": 753 }, { "epoch": 0.002394258859392862, "grad_norm": 0.34765625, "grad_norm_var": 0.0006459395090738933, "learning_rate": 0.01, "loss": 1.4976, "loss/crossentropy": 2.5605225563049316, "loss/fcd": 1.07421875, "loss/logits": 0.24911946058273315, "step": 754 }, { "epoch": 0.0023974342690207037, "grad_norm": 0.337890625, "grad_norm_var": 0.0006007989247639974, "learning_rate": 0.01, "loss": 1.4058, "loss/crossentropy": 2.374634265899658, "loss/fcd": 1.1484375, "loss/logits": 0.265060618519783, "step": 755 }, { "epoch": 0.0024006096786485455, "grad_norm": 0.30859375, "grad_norm_var": 0.00045750935872395835, "learning_rate": 0.01, "loss": 1.5082, "loss/crossentropy": 2.953762650489807, "loss/fcd": 1.265625, "loss/logits": 0.2881181836128235, "step": 756 }, { "epoch": 0.002403785088276388, "grad_norm": 0.3359375, "grad_norm_var": 0.00045750935872395835, "learning_rate": 0.01, "loss": 1.5397, "loss/crossentropy": 2.7358083724975586, "loss/fcd": 1.25390625, "loss/logits": 0.30809466540813446, "step": 757 }, { "epoch": 0.0024069604979042297, "grad_norm": 0.34375, "grad_norm_var": 0.0004604180653889974, "learning_rate": 0.01, "loss": 1.5027, "loss/crossentropy": 2.64107608795166, "loss/fcd": 1.22265625, "loss/logits": 0.32376445829868317, "step": 758 }, { "epoch": 0.0024101359075320715, "grad_norm": 0.31640625, "grad_norm_var": 0.0003108819325764974, "learning_rate": 0.01, "loss": 1.4585, "loss/crossentropy": 2.4790984392166138, "loss/fcd": 1.24609375, "loss/logits": 0.27474017441272736, "step": 759 }, { "epoch": 0.002413311317159914, "grad_norm": 0.34765625, "grad_norm_var": 0.00033518473307291664, "learning_rate": 0.01, "loss": 1.4676, "loss/crossentropy": 2.7548564672470093, "loss/fcd": 1.25, "loss/logits": 0.29049740731716156, "step": 760 }, { "epoch": 0.0024164867267877557, "grad_norm": 0.3203125, "grad_norm_var": 0.00033518473307291664, "learning_rate": 0.01, "loss": 1.4389, "loss/crossentropy": 2.6543887853622437, "loss/fcd": 1.1640625, "loss/logits": 0.2856620103120804, "step": 761 }, { "epoch": 0.0024196621364155975, "grad_norm": 0.3046875, "grad_norm_var": 0.0003705183664957682, "learning_rate": 0.01, "loss": 1.4033, "loss/crossentropy": 2.5761606693267822, "loss/fcd": 1.14453125, "loss/logits": 0.27416975051164627, "step": 762 }, { "epoch": 0.00242283754604344, "grad_norm": 0.302734375, "grad_norm_var": 0.0003705183664957682, "learning_rate": 0.01, "loss": 1.4355, "loss/crossentropy": 2.407198429107666, "loss/fcd": 1.19140625, "loss/logits": 0.2781771272420883, "step": 763 }, { "epoch": 0.0024260129556712817, "grad_norm": 0.36328125, "grad_norm_var": 0.0004475275675455729, "learning_rate": 0.01, "loss": 1.4951, "loss/crossentropy": 2.7275702953338623, "loss/fcd": 1.1171875, "loss/logits": 0.24413208663463593, "step": 764 }, { "epoch": 0.0024291883652991235, "grad_norm": 0.39453125, "grad_norm_var": 0.0006785074869791667, "learning_rate": 0.01, "loss": 1.4653, "loss/crossentropy": 2.611600637435913, "loss/fcd": 1.28515625, "loss/logits": 0.2880173772573471, "step": 765 }, { "epoch": 0.0024323637749269654, "grad_norm": 0.34765625, "grad_norm_var": 0.000634765625, "learning_rate": 0.01, "loss": 1.4796, "loss/crossentropy": 2.603045701980591, "loss/fcd": 1.16796875, "loss/logits": 0.2837670296430588, "step": 766 }, { "epoch": 0.0024355391845548077, "grad_norm": 0.291015625, "grad_norm_var": 0.0007504781087239583, "learning_rate": 0.01, "loss": 1.3624, "loss/crossentropy": 2.4649842977523804, "loss/fcd": 1.0859375, "loss/logits": 0.26569877564907074, "step": 767 }, { "epoch": 0.0024387145941826495, "grad_norm": 0.306640625, "grad_norm_var": 0.00072021484375, "learning_rate": 0.01, "loss": 1.399, "loss/crossentropy": 2.565674066543579, "loss/fcd": 1.12890625, "loss/logits": 0.24269527196884155, "step": 768 }, { "epoch": 0.0024418900038104914, "grad_norm": 0.326171875, "grad_norm_var": 0.0007120609283447265, "learning_rate": 0.01, "loss": 1.4208, "loss/crossentropy": 2.7142174243927, "loss/fcd": 1.17578125, "loss/logits": 0.2695314288139343, "step": 769 }, { "epoch": 0.0024450654134383337, "grad_norm": 0.384765625, "grad_norm_var": 0.0008808771769205729, "learning_rate": 0.01, "loss": 1.5473, "loss/crossentropy": 2.489791750907898, "loss/fcd": 1.22265625, "loss/logits": 0.287143737077713, "step": 770 }, { "epoch": 0.0024482408230661755, "grad_norm": 0.302734375, "grad_norm_var": 0.0009363810221354167, "learning_rate": 0.01, "loss": 1.3888, "loss/crossentropy": 2.420745015144348, "loss/fcd": 1.12890625, "loss/logits": 0.2364453598856926, "step": 771 }, { "epoch": 0.0024514162326940174, "grad_norm": 0.3359375, "grad_norm_var": 0.0009012222290039062, "learning_rate": 0.01, "loss": 1.4635, "loss/crossentropy": 2.5093045234680176, "loss/fcd": 1.2109375, "loss/logits": 0.27113473415374756, "step": 772 }, { "epoch": 0.0024545916423218597, "grad_norm": 0.294921875, "grad_norm_var": 0.0009890079498291015, "learning_rate": 0.01, "loss": 1.4166, "loss/crossentropy": 2.390032649040222, "loss/fcd": 1.1484375, "loss/logits": 0.2553604692220688, "step": 773 }, { "epoch": 0.0024577670519497015, "grad_norm": 0.39453125, "grad_norm_var": 0.001241922378540039, "learning_rate": 0.01, "loss": 1.5225, "loss/crossentropy": 2.57145619392395, "loss/fcd": 1.09375, "loss/logits": 0.2458021640777588, "step": 774 }, { "epoch": 0.0024609424615775434, "grad_norm": 0.3359375, "grad_norm_var": 0.0012215773264567058, "learning_rate": 0.01, "loss": 1.4457, "loss/crossentropy": 2.715102791786194, "loss/fcd": 1.140625, "loss/logits": 0.2620503529906273, "step": 775 }, { "epoch": 0.0024641178712053857, "grad_norm": 0.333984375, "grad_norm_var": 0.0012094497680664063, "learning_rate": 0.01, "loss": 1.3942, "loss/crossentropy": 2.3614325523376465, "loss/fcd": 1.09765625, "loss/logits": 0.25080475211143494, "step": 776 }, { "epoch": 0.0024672932808332275, "grad_norm": 0.5546875, "grad_norm_var": 0.004223060607910156, "learning_rate": 0.01, "loss": 1.4208, "loss/crossentropy": 2.558432459831238, "loss/fcd": 1.1875, "loss/logits": 0.26947685331106186, "step": 777 }, { "epoch": 0.0024704686904610694, "grad_norm": 0.322265625, "grad_norm_var": 0.004139947891235352, "learning_rate": 0.01, "loss": 1.4039, "loss/crossentropy": 2.397109270095825, "loss/fcd": 1.17578125, "loss/logits": 0.2804575711488724, "step": 778 }, { "epoch": 0.0024736441000889117, "grad_norm": 0.310546875, "grad_norm_var": 0.004095061620076498, "learning_rate": 0.01, "loss": 1.4409, "loss/crossentropy": 2.4883055686950684, "loss/fcd": 1.07421875, "loss/logits": 0.23739789426326752, "step": 779 }, { "epoch": 0.0024768195097167535, "grad_norm": 0.28515625, "grad_norm_var": 0.004337930679321289, "learning_rate": 0.01, "loss": 1.4261, "loss/crossentropy": 2.5086852312088013, "loss/fcd": 1.1484375, "loss/logits": 0.2631785273551941, "step": 780 }, { "epoch": 0.0024799949193445954, "grad_norm": 0.3046875, "grad_norm_var": 0.004250192642211914, "learning_rate": 0.01, "loss": 1.3997, "loss/crossentropy": 2.764965057373047, "loss/fcd": 1.16015625, "loss/logits": 0.24434641003608704, "step": 781 }, { "epoch": 0.0024831703289724377, "grad_norm": 0.302734375, "grad_norm_var": 0.00432732899983724, "learning_rate": 0.01, "loss": 1.3959, "loss/crossentropy": 2.45246958732605, "loss/fcd": 1.1875, "loss/logits": 0.2930505871772766, "step": 782 }, { "epoch": 0.0024863457386002795, "grad_norm": 0.314453125, "grad_norm_var": 0.00421899159749349, "learning_rate": 0.01, "loss": 1.374, "loss/crossentropy": 2.5466792583465576, "loss/fcd": 1.15625, "loss/logits": 0.2562691420316696, "step": 783 }, { "epoch": 0.0024895211482281214, "grad_norm": 0.353515625, "grad_norm_var": 0.00415948232014974, "learning_rate": 0.01, "loss": 1.4296, "loss/crossentropy": 2.330942988395691, "loss/fcd": 1.15234375, "loss/logits": 0.26256877183914185, "step": 784 }, { "epoch": 0.0024926965578559632, "grad_norm": 0.306640625, "grad_norm_var": 0.00422210693359375, "learning_rate": 0.01, "loss": 1.3987, "loss/crossentropy": 2.604798197746277, "loss/fcd": 1.23828125, "loss/logits": 0.303231805562973, "step": 785 }, { "epoch": 0.0024958719674838055, "grad_norm": 0.306640625, "grad_norm_var": 0.004135640462239584, "learning_rate": 0.01, "loss": 1.3553, "loss/crossentropy": 2.3989371061325073, "loss/fcd": 1.1484375, "loss/logits": 0.23527930676937103, "step": 786 }, { "epoch": 0.0024990473771116474, "grad_norm": 0.384765625, "grad_norm_var": 0.00420373280843099, "learning_rate": 0.01, "loss": 1.4521, "loss/crossentropy": 2.483570694923401, "loss/fcd": 1.25, "loss/logits": 0.29801447689533234, "step": 787 }, { "epoch": 0.0025022227867394892, "grad_norm": 0.333984375, "grad_norm_var": 0.004205052057902018, "learning_rate": 0.01, "loss": 1.4335, "loss/crossentropy": 2.8643245697021484, "loss/fcd": 1.16015625, "loss/logits": 0.2896386682987213, "step": 788 }, { "epoch": 0.0025053981963673315, "grad_norm": 0.34375, "grad_norm_var": 0.004060808817545573, "learning_rate": 0.01, "loss": 1.4421, "loss/crossentropy": 2.5948187112808228, "loss/fcd": 1.1484375, "loss/logits": 0.2736224979162216, "step": 789 }, { "epoch": 0.0025085736059951734, "grad_norm": 0.361328125, "grad_norm_var": 0.00390165646870931, "learning_rate": 0.01, "loss": 1.4923, "loss/crossentropy": 2.8671985864639282, "loss/fcd": 1.24609375, "loss/logits": 0.3032727986574173, "step": 790 }, { "epoch": 0.0025117490156230152, "grad_norm": 0.3125, "grad_norm_var": 0.003951629002888997, "learning_rate": 0.01, "loss": 1.3923, "loss/crossentropy": 2.593184471130371, "loss/fcd": 1.11328125, "loss/logits": 0.258040115237236, "step": 791 }, { "epoch": 0.0025149244252508575, "grad_norm": 0.37109375, "grad_norm_var": 0.004010518391927083, "learning_rate": 0.01, "loss": 1.5228, "loss/crossentropy": 2.678791642189026, "loss/fcd": 1.27734375, "loss/logits": 0.3173917233943939, "step": 792 }, { "epoch": 0.0025180998348786994, "grad_norm": 0.32421875, "grad_norm_var": 0.0007883071899414063, "learning_rate": 0.01, "loss": 1.456, "loss/crossentropy": 2.5349851846694946, "loss/fcd": 1.078125, "loss/logits": 0.24774370342493057, "step": 793 }, { "epoch": 0.0025212752445065412, "grad_norm": 0.3671875, "grad_norm_var": 0.0008837223052978515, "learning_rate": 0.01, "loss": 1.4855, "loss/crossentropy": 2.494810700416565, "loss/fcd": 1.25390625, "loss/logits": 0.29925431311130524, "step": 794 }, { "epoch": 0.0025244506541343835, "grad_norm": 0.59375, "grad_norm_var": 0.00515435536702474, "learning_rate": 0.01, "loss": 1.4659, "loss/crossentropy": 2.837399959564209, "loss/fcd": 1.23046875, "loss/logits": 0.2899967133998871, "step": 795 }, { "epoch": 0.0025276260637622254, "grad_norm": 0.388671875, "grad_norm_var": 0.0049580732981363935, "learning_rate": 0.01, "loss": 1.4312, "loss/crossentropy": 2.285482406616211, "loss/fcd": 1.1484375, "loss/logits": 0.23734137415885925, "step": 796 }, { "epoch": 0.0025308014733900672, "grad_norm": 0.330078125, "grad_norm_var": 0.004830169677734375, "learning_rate": 0.01, "loss": 1.3838, "loss/crossentropy": 2.7041385173797607, "loss/fcd": 1.1484375, "loss/logits": 0.26254376769065857, "step": 797 }, { "epoch": 0.0025339768830179095, "grad_norm": 0.326171875, "grad_norm_var": 0.00469818115234375, "learning_rate": 0.01, "loss": 1.4342, "loss/crossentropy": 2.3834526538848877, "loss/fcd": 1.1484375, "loss/logits": 0.26749204099178314, "step": 798 }, { "epoch": 0.0025371522926457514, "grad_norm": 0.349609375, "grad_norm_var": 0.004574012756347656, "learning_rate": 0.01, "loss": 1.4899, "loss/crossentropy": 2.626059412956238, "loss/fcd": 1.19140625, "loss/logits": 0.26551854610443115, "step": 799 }, { "epoch": 0.0025403277022735932, "grad_norm": 0.34375, "grad_norm_var": 0.004587920506795248, "learning_rate": 0.01, "loss": 1.4416, "loss/crossentropy": 2.5694613456726074, "loss/fcd": 1.1796875, "loss/logits": 0.2925996407866478, "step": 800 }, { "epoch": 0.002543503111901435, "grad_norm": 0.30859375, "grad_norm_var": 0.00457452138264974, "learning_rate": 0.01, "loss": 1.419, "loss/crossentropy": 2.376332402229309, "loss/fcd": 1.16015625, "loss/logits": 0.25811731815338135, "step": 801 }, { "epoch": 0.0025466785215292774, "grad_norm": 0.34765625, "grad_norm_var": 0.004392608006795248, "learning_rate": 0.01, "loss": 1.4578, "loss/crossentropy": 2.6415032148361206, "loss/fcd": 1.1796875, "loss/logits": 0.2876610606908798, "step": 802 }, { "epoch": 0.0025498539311571192, "grad_norm": 0.314453125, "grad_norm_var": 0.004485305150349935, "learning_rate": 0.01, "loss": 1.425, "loss/crossentropy": 2.6944425106048584, "loss/fcd": 1.17578125, "loss/logits": 0.26216862350702286, "step": 803 }, { "epoch": 0.002553029340784961, "grad_norm": 0.294921875, "grad_norm_var": 0.0047021071116129555, "learning_rate": 0.01, "loss": 1.3646, "loss/crossentropy": 2.462023138999939, "loss/fcd": 1.12890625, "loss/logits": 0.2676442861557007, "step": 804 }, { "epoch": 0.0025562047504128034, "grad_norm": 0.32421875, "grad_norm_var": 0.004754877090454102, "learning_rate": 0.01, "loss": 1.462, "loss/crossentropy": 2.4774625301361084, "loss/fcd": 1.15234375, "loss/logits": 0.256020151078701, "step": 805 }, { "epoch": 0.0025593801600406452, "grad_norm": 0.3046875, "grad_norm_var": 0.004897308349609375, "learning_rate": 0.01, "loss": 1.3615, "loss/crossentropy": 2.5109862089157104, "loss/fcd": 1.12109375, "loss/logits": 0.26087169349193573, "step": 806 }, { "epoch": 0.002562555569668487, "grad_norm": 0.302734375, "grad_norm_var": 0.004952224095662435, "learning_rate": 0.01, "loss": 1.3917, "loss/crossentropy": 2.2008323669433594, "loss/fcd": 1.0546875, "loss/logits": 0.23423786461353302, "step": 807 }, { "epoch": 0.0025657309792963294, "grad_norm": 0.314453125, "grad_norm_var": 0.00498956044514974, "learning_rate": 0.01, "loss": 1.4296, "loss/crossentropy": 2.6296935081481934, "loss/fcd": 1.1796875, "loss/logits": 0.2653568387031555, "step": 808 }, { "epoch": 0.0025689063889241712, "grad_norm": 0.3046875, "grad_norm_var": 0.0050699869791666664, "learning_rate": 0.01, "loss": 1.3866, "loss/crossentropy": 2.3815841674804688, "loss/fcd": 1.125, "loss/logits": 0.24855978786945343, "step": 809 }, { "epoch": 0.002572081798552013, "grad_norm": 0.318359375, "grad_norm_var": 0.005072768529256185, "learning_rate": 0.01, "loss": 1.4052, "loss/crossentropy": 2.758337616920471, "loss/fcd": 1.24609375, "loss/logits": 0.2657631188631058, "step": 810 }, { "epoch": 0.0025752572081798554, "grad_norm": 0.34375, "grad_norm_var": 0.0005765120188395182, "learning_rate": 0.01, "loss": 1.4637, "loss/crossentropy": 2.55074143409729, "loss/fcd": 1.171875, "loss/logits": 0.2729620784521103, "step": 811 }, { "epoch": 0.0025784326178076972, "grad_norm": 0.32421875, "grad_norm_var": 0.0002979914347330729, "learning_rate": 0.01, "loss": 1.4226, "loss/crossentropy": 2.7302236557006836, "loss/fcd": 1.22265625, "loss/logits": 0.2935919612646103, "step": 812 }, { "epoch": 0.002581608027435539, "grad_norm": 0.353515625, "grad_norm_var": 0.0003575007120768229, "learning_rate": 0.01, "loss": 1.4596, "loss/crossentropy": 2.5811444520950317, "loss/fcd": 1.20703125, "loss/logits": 0.2931455224752426, "step": 813 }, { "epoch": 0.0025847834370633814, "grad_norm": 0.34765625, "grad_norm_var": 0.0003940423329671224, "learning_rate": 0.01, "loss": 1.5095, "loss/crossentropy": 2.576464295387268, "loss/fcd": 1.1328125, "loss/logits": 0.24814368784427643, "step": 814 }, { "epoch": 0.0025879588466912232, "grad_norm": 0.31640625, "grad_norm_var": 0.000353240966796875, "learning_rate": 0.01, "loss": 1.4242, "loss/crossentropy": 2.5269323587417603, "loss/fcd": 1.1640625, "loss/logits": 0.26689252257347107, "step": 815 }, { "epoch": 0.002591134256319065, "grad_norm": 0.38671875, "grad_norm_var": 0.0005889256795247396, "learning_rate": 0.01, "loss": 1.4666, "loss/crossentropy": 2.7156922817230225, "loss/fcd": 1.3671875, "loss/logits": 0.3547069877386093, "step": 816 }, { "epoch": 0.0025943096659469074, "grad_norm": 0.33203125, "grad_norm_var": 0.0005706151326497396, "learning_rate": 0.01, "loss": 1.4707, "loss/crossentropy": 2.677256464958191, "loss/fcd": 1.15625, "loss/logits": 0.27609871327877045, "step": 817 }, { "epoch": 0.0025974850755747492, "grad_norm": 0.35546875, "grad_norm_var": 0.0005960464477539062, "learning_rate": 0.01, "loss": 1.4908, "loss/crossentropy": 2.414480209350586, "loss/fcd": 1.11328125, "loss/logits": 0.2341102808713913, "step": 818 }, { "epoch": 0.002600660485202591, "grad_norm": 0.322265625, "grad_norm_var": 0.0005863825480143229, "learning_rate": 0.01, "loss": 1.3889, "loss/crossentropy": 2.241674780845642, "loss/fcd": 1.12109375, "loss/logits": 0.2622302696108818, "step": 819 }, { "epoch": 0.002603835894830433, "grad_norm": 0.333984375, "grad_norm_var": 0.0005100886027018229, "learning_rate": 0.01, "loss": 1.4562, "loss/crossentropy": 2.620813488960266, "loss/fcd": 1.19140625, "loss/logits": 0.28008031845092773, "step": 820 }, { "epoch": 0.0026070113044582752, "grad_norm": 0.298828125, "grad_norm_var": 0.0005710442860921224, "learning_rate": 0.01, "loss": 1.4406, "loss/crossentropy": 2.397587776184082, "loss/fcd": 1.09765625, "loss/logits": 0.24911227077245712, "step": 821 }, { "epoch": 0.002610186714086117, "grad_norm": 0.3515625, "grad_norm_var": 0.0005580743153889973, "learning_rate": 0.01, "loss": 1.4318, "loss/crossentropy": 2.5856579542160034, "loss/fcd": 1.19921875, "loss/logits": 0.26554256677627563, "step": 822 }, { "epoch": 0.002613362123713959, "grad_norm": 0.328125, "grad_norm_var": 0.0005004247029622396, "learning_rate": 0.01, "loss": 1.4188, "loss/crossentropy": 2.4827295541763306, "loss/fcd": 1.078125, "loss/logits": 0.24997267872095108, "step": 823 }, { "epoch": 0.0026165375333418012, "grad_norm": 0.306640625, "grad_norm_var": 0.0005238215128580729, "learning_rate": 0.01, "loss": 1.3941, "loss/crossentropy": 2.719724178314209, "loss/fcd": 1.24609375, "loss/logits": 0.2936270534992218, "step": 824 }, { "epoch": 0.002619712942969643, "grad_norm": 0.287109375, "grad_norm_var": 0.0006089369455973307, "learning_rate": 0.01, "loss": 1.4224, "loss/crossentropy": 2.45254123210907, "loss/fcd": 1.11328125, "loss/logits": 0.24862974882125854, "step": 825 }, { "epoch": 0.002622888352597485, "grad_norm": 0.451171875, "grad_norm_var": 0.0014757633209228516, "learning_rate": 0.01, "loss": 1.4833, "loss/crossentropy": 2.5249361991882324, "loss/fcd": 1.1015625, "loss/logits": 0.2535742521286011, "step": 826 }, { "epoch": 0.0026260637622253272, "grad_norm": 0.337890625, "grad_norm_var": 0.0014749526977539062, "learning_rate": 0.01, "loss": 1.4121, "loss/crossentropy": 2.583114743232727, "loss/fcd": 1.13671875, "loss/logits": 0.27725452929735184, "step": 827 }, { "epoch": 0.002629239171853169, "grad_norm": 0.314453125, "grad_norm_var": 0.0015009403228759765, "learning_rate": 0.01, "loss": 1.4386, "loss/crossentropy": 2.4620174169540405, "loss/fcd": 1.12109375, "loss/logits": 0.2715196758508682, "step": 828 }, { "epoch": 0.002632414581481011, "grad_norm": 0.31640625, "grad_norm_var": 0.0015151341756184896, "learning_rate": 0.01, "loss": 1.4343, "loss/crossentropy": 2.4475836753845215, "loss/fcd": 1.12109375, "loss/logits": 0.24357828497886658, "step": 829 }, { "epoch": 0.0026355899911088532, "grad_norm": 0.326171875, "grad_norm_var": 0.0015125115712483724, "learning_rate": 0.01, "loss": 1.396, "loss/crossentropy": 2.9068440198898315, "loss/fcd": 1.1640625, "loss/logits": 0.25082169473171234, "step": 830 }, { "epoch": 0.002638765400736695, "grad_norm": 0.3203125, "grad_norm_var": 0.0015036106109619141, "learning_rate": 0.01, "loss": 1.4762, "loss/crossentropy": 2.502850890159607, "loss/fcd": 1.1796875, "loss/logits": 0.2955174744129181, "step": 831 }, { "epoch": 0.002641940810364537, "grad_norm": 0.63671875, "grad_norm_var": 0.007114775975545247, "learning_rate": 0.01, "loss": 1.5688, "loss/crossentropy": 2.6306108236312866, "loss/fcd": 1.16796875, "loss/logits": 0.26905806362628937, "step": 832 }, { "epoch": 0.0026451162199923792, "grad_norm": 0.322265625, "grad_norm_var": 0.00714569091796875, "learning_rate": 0.01, "loss": 1.4429, "loss/crossentropy": 2.7270872592926025, "loss/fcd": 1.1640625, "loss/logits": 0.2583453506231308, "step": 833 }, { "epoch": 0.002648291629620221, "grad_norm": 0.349609375, "grad_norm_var": 0.007144021987915039, "learning_rate": 0.01, "loss": 1.4516, "loss/crossentropy": 2.4301480054855347, "loss/fcd": 1.12109375, "loss/logits": 0.2268705815076828, "step": 834 }, { "epoch": 0.002651467039248063, "grad_norm": 0.3515625, "grad_norm_var": 0.007088470458984375, "learning_rate": 0.01, "loss": 1.4651, "loss/crossentropy": 2.5820902585983276, "loss/fcd": 1.2265625, "loss/logits": 0.2804351896047592, "step": 835 }, { "epoch": 0.002654642448875905, "grad_norm": 0.333984375, "grad_norm_var": 0.007088470458984375, "learning_rate": 0.01, "loss": 1.4907, "loss/crossentropy": 2.4791622161865234, "loss/fcd": 1.19140625, "loss/logits": 0.2818540930747986, "step": 836 }, { "epoch": 0.002657817858503747, "grad_norm": 0.345703125, "grad_norm_var": 0.006893157958984375, "learning_rate": 0.01, "loss": 1.4604, "loss/crossentropy": 2.688996911048889, "loss/fcd": 1.203125, "loss/logits": 0.2950032204389572, "step": 837 }, { "epoch": 0.002660993268131589, "grad_norm": 0.3203125, "grad_norm_var": 0.006968434651692708, "learning_rate": 0.01, "loss": 1.4625, "loss/crossentropy": 2.627940058708191, "loss/fcd": 1.1796875, "loss/logits": 0.2809496968984604, "step": 838 }, { "epoch": 0.002664168677759431, "grad_norm": 0.328125, "grad_norm_var": 0.006968434651692708, "learning_rate": 0.01, "loss": 1.415, "loss/crossentropy": 2.53342342376709, "loss/fcd": 1.13671875, "loss/logits": 0.25208134949207306, "step": 839 }, { "epoch": 0.002667344087387273, "grad_norm": 0.375, "grad_norm_var": 0.006837701797485352, "learning_rate": 0.01, "loss": 1.4873, "loss/crossentropy": 2.443286418914795, "loss/fcd": 1.2578125, "loss/logits": 0.28253769874572754, "step": 840 }, { "epoch": 0.002670519497015115, "grad_norm": 0.333984375, "grad_norm_var": 0.006536340713500977, "learning_rate": 0.01, "loss": 1.4441, "loss/crossentropy": 2.526942253112793, "loss/fcd": 1.16796875, "loss/logits": 0.2894390672445297, "step": 841 }, { "epoch": 0.002673694906642957, "grad_norm": 0.31640625, "grad_norm_var": 0.006037330627441407, "learning_rate": 0.01, "loss": 1.4093, "loss/crossentropy": 2.517907738685608, "loss/fcd": 1.18359375, "loss/logits": 0.25637828558683395, "step": 842 }, { "epoch": 0.002676870316270799, "grad_norm": 0.302734375, "grad_norm_var": 0.0061798095703125, "learning_rate": 0.01, "loss": 1.3762, "loss/crossentropy": 2.454347848892212, "loss/fcd": 1.18359375, "loss/logits": 0.26797422766685486, "step": 843 }, { "epoch": 0.002680045725898641, "grad_norm": 0.294921875, "grad_norm_var": 0.006295204162597656, "learning_rate": 0.01, "loss": 1.3618, "loss/crossentropy": 2.2708157300949097, "loss/fcd": 1.05078125, "loss/logits": 0.22855685651302338, "step": 844 }, { "epoch": 0.002683221135526483, "grad_norm": 0.361328125, "grad_norm_var": 0.006229766209920247, "learning_rate": 0.01, "loss": 1.4618, "loss/crossentropy": 2.2917895913124084, "loss/fcd": 1.18359375, "loss/logits": 0.2567787766456604, "step": 845 }, { "epoch": 0.002686396545154325, "grad_norm": 0.357421875, "grad_norm_var": 0.006186532974243164, "learning_rate": 0.01, "loss": 1.4602, "loss/crossentropy": 2.8456441164016724, "loss/fcd": 1.19140625, "loss/logits": 0.2906789779663086, "step": 846 }, { "epoch": 0.002689571954782167, "grad_norm": 0.3046875, "grad_norm_var": 0.006270202000935873, "learning_rate": 0.01, "loss": 1.4181, "loss/crossentropy": 2.4699090719223022, "loss/fcd": 1.16015625, "loss/logits": 0.26574426889419556, "step": 847 }, { "epoch": 0.002692747364410009, "grad_norm": 0.32421875, "grad_norm_var": 0.0005176385243733724, "learning_rate": 0.01, "loss": 1.4665, "loss/crossentropy": 2.5497175455093384, "loss/fcd": 1.26953125, "loss/logits": 0.3088984936475754, "step": 848 }, { "epoch": 0.002695922774037851, "grad_norm": 0.314453125, "grad_norm_var": 0.0005322615305582682, "learning_rate": 0.01, "loss": 1.3841, "loss/crossentropy": 1.979094922542572, "loss/fcd": 1.056640625, "loss/logits": 0.22718285024166107, "step": 849 }, { "epoch": 0.002699098183665693, "grad_norm": 0.3125, "grad_norm_var": 0.0005319595336914062, "learning_rate": 0.01, "loss": 1.4201, "loss/crossentropy": 2.7100846767425537, "loss/fcd": 1.1796875, "loss/logits": 0.2712179571390152, "step": 850 }, { "epoch": 0.002702273593293535, "grad_norm": 0.322265625, "grad_norm_var": 0.0005007266998291015, "learning_rate": 0.01, "loss": 1.4194, "loss/crossentropy": 2.4905487298965454, "loss/fcd": 1.1875, "loss/logits": 0.27177831530570984, "step": 851 }, { "epoch": 0.0027054490029213766, "grad_norm": 0.3203125, "grad_norm_var": 0.0005015055338541667, "learning_rate": 0.01, "loss": 1.4136, "loss/crossentropy": 2.6221119165420532, "loss/fcd": 1.08203125, "loss/logits": 0.26027603447437286, "step": 852 }, { "epoch": 0.002708624412549219, "grad_norm": 0.27734375, "grad_norm_var": 0.0006244500478108724, "learning_rate": 0.01, "loss": 1.4032, "loss/crossentropy": 2.3643332719802856, "loss/fcd": 1.08984375, "loss/logits": 0.2513630613684654, "step": 853 }, { "epoch": 0.002711799822177061, "grad_norm": 0.330078125, "grad_norm_var": 0.0006270726521809896, "learning_rate": 0.01, "loss": 1.4191, "loss/crossentropy": 2.443013310432434, "loss/fcd": 1.12109375, "loss/logits": 0.2727329283952713, "step": 854 }, { "epoch": 0.0027149752318049026, "grad_norm": 0.404296875, "grad_norm_var": 0.001036818822224935, "learning_rate": 0.01, "loss": 1.5809, "loss/crossentropy": 2.763969898223877, "loss/fcd": 1.22265625, "loss/logits": 0.27742110192775726, "step": 855 }, { "epoch": 0.002718150641432745, "grad_norm": 0.30859375, "grad_norm_var": 0.0008984724680582682, "learning_rate": 0.01, "loss": 1.4433, "loss/crossentropy": 2.837620258331299, "loss/fcd": 1.24609375, "loss/logits": 0.2985023260116577, "step": 856 }, { "epoch": 0.002721326051060587, "grad_norm": 0.29296875, "grad_norm_var": 0.0009495417277018229, "learning_rate": 0.01, "loss": 1.3734, "loss/crossentropy": 2.5134434700012207, "loss/fcd": 1.1015625, "loss/logits": 0.25185875594615936, "step": 857 }, { "epoch": 0.0027245014606884286, "grad_norm": 0.349609375, "grad_norm_var": 0.0009957472483317056, "learning_rate": 0.01, "loss": 1.4092, "loss/crossentropy": 2.582868814468384, "loss/fcd": 1.21484375, "loss/logits": 0.27443407475948334, "step": 858 }, { "epoch": 0.002727676870316271, "grad_norm": 0.310546875, "grad_norm_var": 0.0009778181711832682, "learning_rate": 0.01, "loss": 1.4308, "loss/crossentropy": 2.435882806777954, "loss/fcd": 1.15234375, "loss/logits": 0.2678230404853821, "step": 859 }, { "epoch": 0.002730852279944113, "grad_norm": 0.34765625, "grad_norm_var": 0.0009464899698893229, "learning_rate": 0.01, "loss": 1.4419, "loss/crossentropy": 2.4174832105636597, "loss/fcd": 1.119140625, "loss/logits": 0.25999006628990173, "step": 860 }, { "epoch": 0.0027340276895719546, "grad_norm": 0.3359375, "grad_norm_var": 0.0008718967437744141, "learning_rate": 0.01, "loss": 1.425, "loss/crossentropy": 2.6464943885803223, "loss/fcd": 1.14453125, "loss/logits": 0.27115726470947266, "step": 861 }, { "epoch": 0.002737203099199797, "grad_norm": 0.29296875, "grad_norm_var": 0.000859832763671875, "learning_rate": 0.01, "loss": 1.3665, "loss/crossentropy": 2.288085401058197, "loss/fcd": 1.0546875, "loss/logits": 0.24431900680065155, "step": 862 }, { "epoch": 0.002740378508827639, "grad_norm": 0.30859375, "grad_norm_var": 0.0008518854777018229, "learning_rate": 0.01, "loss": 1.4844, "loss/crossentropy": 2.3463457822799683, "loss/fcd": 1.09765625, "loss/logits": 0.242444708943367, "step": 863 }, { "epoch": 0.0027435539184554806, "grad_norm": 0.39453125, "grad_norm_var": 0.0011814753214518228, "learning_rate": 0.01, "loss": 1.4526, "loss/crossentropy": 2.678402900695801, "loss/fcd": 1.16796875, "loss/logits": 0.28724005818367004, "step": 864 }, { "epoch": 0.002746729328083323, "grad_norm": 0.30078125, "grad_norm_var": 0.0012149651845296224, "learning_rate": 0.01, "loss": 1.4303, "loss/crossentropy": 2.6553531885147095, "loss/fcd": 1.12890625, "loss/logits": 0.25995686650276184, "step": 865 }, { "epoch": 0.002749904737711165, "grad_norm": 0.30859375, "grad_norm_var": 0.0012227217356363931, "learning_rate": 0.01, "loss": 1.4211, "loss/crossentropy": 2.5615280866622925, "loss/fcd": 1.19921875, "loss/logits": 0.2804127335548401, "step": 866 }, { "epoch": 0.0027530801473390066, "grad_norm": 0.330078125, "grad_norm_var": 0.0012233575185139974, "learning_rate": 0.01, "loss": 1.4315, "loss/crossentropy": 2.463539242744446, "loss/fcd": 1.21484375, "loss/logits": 0.25969094038009644, "step": 867 }, { "epoch": 0.002756255556966849, "grad_norm": 0.3203125, "grad_norm_var": 0.0012233575185139974, "learning_rate": 0.01, "loss": 1.4127, "loss/crossentropy": 2.588402032852173, "loss/fcd": 1.140625, "loss/logits": 0.2672501355409622, "step": 868 }, { "epoch": 0.002759430966594691, "grad_norm": 0.287109375, "grad_norm_var": 0.0011662165323893228, "learning_rate": 0.01, "loss": 1.3485, "loss/crossentropy": 2.7573047876358032, "loss/fcd": 1.09765625, "loss/logits": 0.24214741587638855, "step": 869 }, { "epoch": 0.0027626063762225326, "grad_norm": 0.33984375, "grad_norm_var": 0.0011769453684488931, "learning_rate": 0.01, "loss": 1.4414, "loss/crossentropy": 2.596221089363098, "loss/fcd": 1.3359375, "loss/logits": 0.35063809156417847, "step": 870 }, { "epoch": 0.0027657817858503745, "grad_norm": 0.40234375, "grad_norm_var": 0.001157061258951823, "learning_rate": 0.01, "loss": 1.5527, "loss/crossentropy": 2.480383038520813, "loss/fcd": 1.28125, "loss/logits": 0.2909218743443489, "step": 871 }, { "epoch": 0.002768957195478217, "grad_norm": 0.306640625, "grad_norm_var": 0.0011620680491129557, "learning_rate": 0.01, "loss": 1.3781, "loss/crossentropy": 2.716557264328003, "loss/fcd": 1.1171875, "loss/logits": 0.25886131823062897, "step": 872 }, { "epoch": 0.0027721326051060586, "grad_norm": 0.333984375, "grad_norm_var": 0.0010822931925455728, "learning_rate": 0.01, "loss": 1.4704, "loss/crossentropy": 2.611976981163025, "loss/fcd": 1.1328125, "loss/logits": 0.2547585070133209, "step": 873 }, { "epoch": 0.0027753080147339005, "grad_norm": 0.3359375, "grad_norm_var": 0.0010570367177327475, "learning_rate": 0.01, "loss": 1.4086, "loss/crossentropy": 2.223254084587097, "loss/fcd": 1.125, "loss/logits": 0.2478521689772606, "step": 874 }, { "epoch": 0.002778483424361743, "grad_norm": 0.328125, "grad_norm_var": 0.0010342915852864583, "learning_rate": 0.01, "loss": 1.4322, "loss/crossentropy": 2.3783280849456787, "loss/fcd": 1.30078125, "loss/logits": 0.30417361855506897, "step": 875 }, { "epoch": 0.0027816588339895846, "grad_norm": 0.328125, "grad_norm_var": 0.0010110855102539063, "learning_rate": 0.01, "loss": 1.4755, "loss/crossentropy": 2.6082438230514526, "loss/fcd": 1.12890625, "loss/logits": 0.2530500888824463, "step": 876 }, { "epoch": 0.0027848342436174265, "grad_norm": 0.341796875, "grad_norm_var": 0.0010191440582275391, "learning_rate": 0.01, "loss": 1.4435, "loss/crossentropy": 2.5703450441360474, "loss/fcd": 1.28125, "loss/logits": 0.30038726329803467, "step": 877 }, { "epoch": 0.002788009653245269, "grad_norm": 0.296875, "grad_norm_var": 0.0010014692942301431, "learning_rate": 0.01, "loss": 1.3996, "loss/crossentropy": 2.516785979270935, "loss/fcd": 1.11328125, "loss/logits": 0.2592373341321945, "step": 878 }, { "epoch": 0.0027911850628731106, "grad_norm": 0.3125, "grad_norm_var": 0.00099180539449056, "learning_rate": 0.01, "loss": 1.4439, "loss/crossentropy": 2.5470499992370605, "loss/fcd": 1.19921875, "loss/logits": 0.26968318223953247, "step": 879 }, { "epoch": 0.0027943604725009525, "grad_norm": 0.33203125, "grad_norm_var": 0.0006917158762613933, "learning_rate": 0.01, "loss": 1.4456, "loss/crossentropy": 2.4465125799179077, "loss/fcd": 1.2265625, "loss/logits": 0.2825637459754944, "step": 880 }, { "epoch": 0.002797535882128795, "grad_norm": 0.36328125, "grad_norm_var": 0.0007313887278238932, "learning_rate": 0.01, "loss": 1.4199, "loss/crossentropy": 2.6791458129882812, "loss/fcd": 1.12890625, "loss/logits": 0.25636987388134, "step": 881 }, { "epoch": 0.0028007112917566366, "grad_norm": 0.3046875, "grad_norm_var": 0.0007430871327718098, "learning_rate": 0.01, "loss": 1.394, "loss/crossentropy": 2.476168990135193, "loss/fcd": 1.0703125, "loss/logits": 0.23859203606843948, "step": 882 }, { "epoch": 0.0028038867013844785, "grad_norm": 0.337890625, "grad_norm_var": 0.0007480462392171224, "learning_rate": 0.01, "loss": 1.4832, "loss/crossentropy": 2.4546685218811035, "loss/fcd": 1.33203125, "loss/logits": 0.28794096410274506, "step": 883 }, { "epoch": 0.002807062111012321, "grad_norm": 0.310546875, "grad_norm_var": 0.0007659276326497396, "learning_rate": 0.01, "loss": 1.4336, "loss/crossentropy": 2.5521098375320435, "loss/fcd": 1.19140625, "loss/logits": 0.28022629767656326, "step": 884 }, { "epoch": 0.0028102375206401626, "grad_norm": 0.314453125, "grad_norm_var": 0.0006604512532552083, "learning_rate": 0.01, "loss": 1.4673, "loss/crossentropy": 2.688372492790222, "loss/fcd": 1.26953125, "loss/logits": 0.3135879784822464, "step": 885 }, { "epoch": 0.0028134129302680045, "grad_norm": 0.3203125, "grad_norm_var": 0.0006601333618164063, "learning_rate": 0.01, "loss": 1.4317, "loss/crossentropy": 2.485527992248535, "loss/fcd": 1.140625, "loss/logits": 0.2601539343595505, "step": 886 }, { "epoch": 0.0028165883398958464, "grad_norm": 0.3046875, "grad_norm_var": 0.00030568440755208336, "learning_rate": 0.01, "loss": 1.4513, "loss/crossentropy": 2.6165852546691895, "loss/fcd": 1.1796875, "loss/logits": 0.274740993976593, "step": 887 }, { "epoch": 0.0028197637495236886, "grad_norm": 0.31640625, "grad_norm_var": 0.00029002825419108074, "learning_rate": 0.01, "loss": 1.4586, "loss/crossentropy": 2.602621555328369, "loss/fcd": 1.11328125, "loss/logits": 0.27761097252368927, "step": 888 }, { "epoch": 0.0028229391591515305, "grad_norm": 0.298828125, "grad_norm_var": 0.0003197828928629557, "learning_rate": 0.01, "loss": 1.3324, "loss/crossentropy": 2.4203325510025024, "loss/fcd": 1.1015625, "loss/logits": 0.2277730330824852, "step": 889 }, { "epoch": 0.0028261145687793724, "grad_norm": 0.3359375, "grad_norm_var": 0.0003197828928629557, "learning_rate": 0.01, "loss": 1.3539, "loss/crossentropy": 2.5617196559906006, "loss/fcd": 1.08984375, "loss/logits": 0.23476186394691467, "step": 890 }, { "epoch": 0.0028292899784072146, "grad_norm": 0.3046875, "grad_norm_var": 0.00033389727274576824, "learning_rate": 0.01, "loss": 1.4448, "loss/crossentropy": 2.471903085708618, "loss/fcd": 1.10546875, "loss/logits": 0.23556457459926605, "step": 891 }, { "epoch": 0.0028324653880350565, "grad_norm": 0.2734375, "grad_norm_var": 0.00046296119689941405, "learning_rate": 0.01, "loss": 1.3837, "loss/crossentropy": 2.3533374071121216, "loss/fcd": 1.04296875, "loss/logits": 0.23106467723846436, "step": 892 }, { "epoch": 0.0028356407976628984, "grad_norm": 0.296875, "grad_norm_var": 0.00043919881184895836, "learning_rate": 0.01, "loss": 1.464, "loss/crossentropy": 2.291712164878845, "loss/fcd": 1.25390625, "loss/logits": 0.27682557702064514, "step": 893 }, { "epoch": 0.0028388162072907406, "grad_norm": 0.3125, "grad_norm_var": 0.000418853759765625, "learning_rate": 0.01, "loss": 1.4181, "loss/crossentropy": 2.5978509187698364, "loss/fcd": 1.09765625, "loss/logits": 0.24423115700483322, "step": 894 }, { "epoch": 0.0028419916169185825, "grad_norm": 0.3671875, "grad_norm_var": 0.0005879720052083333, "learning_rate": 0.01, "loss": 1.4442, "loss/crossentropy": 2.315797448158264, "loss/fcd": 1.09375, "loss/logits": 0.257804811000824, "step": 895 }, { "epoch": 0.0028451670265464244, "grad_norm": 0.34765625, "grad_norm_var": 0.0006317138671875, "learning_rate": 0.01, "loss": 1.5356, "loss/crossentropy": 2.6648523807525635, "loss/fcd": 1.28125, "loss/logits": 0.30204954743385315, "step": 896 }, { "epoch": 0.0028483424361742666, "grad_norm": 0.30859375, "grad_norm_var": 0.000498199462890625, "learning_rate": 0.01, "loss": 1.4561, "loss/crossentropy": 2.4717235565185547, "loss/fcd": 1.171875, "loss/logits": 0.2729053795337677, "step": 897 }, { "epoch": 0.0028515178458021085, "grad_norm": 0.5546875, "grad_norm_var": 0.004030100504557292, "learning_rate": 0.01, "loss": 1.5499, "loss/crossentropy": 3.113871932029724, "loss/fcd": 1.39453125, "loss/logits": 0.3611048758029938, "step": 898 }, { "epoch": 0.0028546932554299504, "grad_norm": 0.30859375, "grad_norm_var": 0.004058949152628581, "learning_rate": 0.01, "loss": 1.3453, "loss/crossentropy": 2.314198851585388, "loss/fcd": 1.09765625, "loss/logits": 0.24821141362190247, "step": 899 }, { "epoch": 0.0028578686650577926, "grad_norm": 0.322265625, "grad_norm_var": 0.00403758684794108, "learning_rate": 0.01, "loss": 1.4586, "loss/crossentropy": 2.719810128211975, "loss/fcd": 1.20703125, "loss/logits": 0.285122886300087, "step": 900 }, { "epoch": 0.0028610440746856345, "grad_norm": 0.302734375, "grad_norm_var": 0.004071156183878581, "learning_rate": 0.01, "loss": 1.3929, "loss/crossentropy": 2.3923213481903076, "loss/fcd": 1.21875, "loss/logits": 0.27605894207954407, "step": 901 }, { "epoch": 0.0028642194843134764, "grad_norm": 0.36328125, "grad_norm_var": 0.004132699966430664, "learning_rate": 0.01, "loss": 1.4446, "loss/crossentropy": 2.7804335355758667, "loss/fcd": 1.16796875, "loss/logits": 0.2534557655453682, "step": 902 }, { "epoch": 0.0028673948939413186, "grad_norm": 0.412109375, "grad_norm_var": 0.00445702870686849, "learning_rate": 0.01, "loss": 1.6171, "loss/crossentropy": 2.685579299926758, "loss/fcd": 1.4921875, "loss/logits": 0.3942929208278656, "step": 903 }, { "epoch": 0.0028705703035691605, "grad_norm": 0.26953125, "grad_norm_var": 0.004736264546712239, "learning_rate": 0.01, "loss": 1.3019, "loss/crossentropy": 2.5210787057876587, "loss/fcd": 1.08984375, "loss/logits": 0.24153603613376617, "step": 904 }, { "epoch": 0.0028737457131970024, "grad_norm": 0.302734375, "grad_norm_var": 0.004717763264973958, "learning_rate": 0.01, "loss": 1.4107, "loss/crossentropy": 2.492193818092346, "loss/fcd": 1.068359375, "loss/logits": 0.2364477664232254, "step": 905 }, { "epoch": 0.002876921122824844, "grad_norm": 0.330078125, "grad_norm_var": 0.004720290501912435, "learning_rate": 0.01, "loss": 1.4085, "loss/crossentropy": 2.4808989763259888, "loss/fcd": 1.0546875, "loss/logits": 0.24788396060466766, "step": 906 }, { "epoch": 0.0028800965324526865, "grad_norm": 0.302734375, "grad_norm_var": 0.00472869873046875, "learning_rate": 0.01, "loss": 1.4217, "loss/crossentropy": 2.5834563970565796, "loss/fcd": 1.2109375, "loss/logits": 0.2806694507598877, "step": 907 }, { "epoch": 0.0028832719420805284, "grad_norm": 0.287109375, "grad_norm_var": 0.00462644894917806, "learning_rate": 0.01, "loss": 1.3935, "loss/crossentropy": 2.3288962841033936, "loss/fcd": 1.21875, "loss/logits": 0.2856874167919159, "step": 908 }, { "epoch": 0.00288644735170837, "grad_norm": 0.333984375, "grad_norm_var": 0.004515012105305989, "learning_rate": 0.01, "loss": 1.4676, "loss/crossentropy": 2.6342945098876953, "loss/fcd": 1.21875, "loss/logits": 0.2851613834500313, "step": 909 }, { "epoch": 0.0028896227613362125, "grad_norm": 0.310546875, "grad_norm_var": 0.004522180557250977, "learning_rate": 0.01, "loss": 1.4644, "loss/crossentropy": 2.5430550575256348, "loss/fcd": 1.20703125, "loss/logits": 0.29332470893859863, "step": 910 }, { "epoch": 0.0028927981709640544, "grad_norm": 0.330078125, "grad_norm_var": 0.004468727111816406, "learning_rate": 0.01, "loss": 1.4622, "loss/crossentropy": 2.6547353267669678, "loss/fcd": 1.1015625, "loss/logits": 0.2597702741622925, "step": 911 }, { "epoch": 0.002895973580591896, "grad_norm": 0.3671875, "grad_norm_var": 0.00452117919921875, "learning_rate": 0.01, "loss": 1.4547, "loss/crossentropy": 2.401389479637146, "loss/fcd": 1.16796875, "loss/logits": 0.25198329240083694, "step": 912 }, { "epoch": 0.0028991489902197385, "grad_norm": 0.34375, "grad_norm_var": 0.004461097717285156, "learning_rate": 0.01, "loss": 1.457, "loss/crossentropy": 2.744496464729309, "loss/fcd": 1.1875, "loss/logits": 0.2659071385860443, "step": 913 }, { "epoch": 0.0029023243998475804, "grad_norm": 0.392578125, "grad_norm_var": 0.0014650821685791016, "learning_rate": 0.01, "loss": 1.4321, "loss/crossentropy": 2.878139853477478, "loss/fcd": 1.19921875, "loss/logits": 0.3004954159259796, "step": 914 }, { "epoch": 0.002905499809475422, "grad_norm": 0.3203125, "grad_norm_var": 0.001440286636352539, "learning_rate": 0.01, "loss": 1.4528, "loss/crossentropy": 2.594788074493408, "loss/fcd": 1.2421875, "loss/logits": 0.3067094385623932, "step": 915 }, { "epoch": 0.0029086752191032645, "grad_norm": 0.31640625, "grad_norm_var": 0.0014490127563476563, "learning_rate": 0.01, "loss": 1.4456, "loss/crossentropy": 2.4159862995147705, "loss/fcd": 1.13671875, "loss/logits": 0.2698502242565155, "step": 916 }, { "epoch": 0.0029118506287311064, "grad_norm": 0.349609375, "grad_norm_var": 0.0014139175415039062, "learning_rate": 0.01, "loss": 1.5096, "loss/crossentropy": 2.7082713842391968, "loss/fcd": 1.37109375, "loss/logits": 0.31943124532699585, "step": 917 }, { "epoch": 0.002915026038358948, "grad_norm": 0.306640625, "grad_norm_var": 0.001387643814086914, "learning_rate": 0.01, "loss": 1.4261, "loss/crossentropy": 2.5128647089004517, "loss/fcd": 1.1796875, "loss/logits": 0.2783215194940567, "step": 918 }, { "epoch": 0.0029182014479867905, "grad_norm": 0.32421875, "grad_norm_var": 0.00090484619140625, "learning_rate": 0.01, "loss": 1.4686, "loss/crossentropy": 2.6365870237350464, "loss/fcd": 1.30078125, "loss/logits": 0.29506950080394745, "step": 919 }, { "epoch": 0.0029213768576146324, "grad_norm": 0.31640625, "grad_norm_var": 0.00070037841796875, "learning_rate": 0.01, "loss": 1.4489, "loss/crossentropy": 2.3003333806991577, "loss/fcd": 1.04296875, "loss/logits": 0.21715252101421356, "step": 920 }, { "epoch": 0.002924552267242474, "grad_norm": 0.32421875, "grad_norm_var": 0.0006592909495035808, "learning_rate": 0.01, "loss": 1.4679, "loss/crossentropy": 2.3930691480636597, "loss/fcd": 1.1875, "loss/logits": 0.29751165211200714, "step": 921 }, { "epoch": 0.002927727676870316, "grad_norm": 0.3046875, "grad_norm_var": 0.0006942113240559895, "learning_rate": 0.01, "loss": 1.4147, "loss/crossentropy": 2.6179150342941284, "loss/fcd": 1.1796875, "loss/logits": 0.2832389771938324, "step": 922 }, { "epoch": 0.0029309030864981584, "grad_norm": 0.3359375, "grad_norm_var": 0.0006561120351155599, "learning_rate": 0.01, "loss": 1.4828, "loss/crossentropy": 2.6063419580459595, "loss/fcd": 1.1953125, "loss/logits": 0.29597169160842896, "step": 923 }, { "epoch": 0.002934078496126, "grad_norm": 0.3125, "grad_norm_var": 0.000554656982421875, "learning_rate": 0.01, "loss": 1.402, "loss/crossentropy": 2.688423752784729, "loss/fcd": 1.2265625, "loss/logits": 0.30215703696012497, "step": 924 }, { "epoch": 0.002937253905753842, "grad_norm": 0.31640625, "grad_norm_var": 0.0005659580230712891, "learning_rate": 0.01, "loss": 1.3957, "loss/crossentropy": 2.4074472188949585, "loss/fcd": 1.15234375, "loss/logits": 0.23468804359436035, "step": 925 }, { "epoch": 0.0029404293153816844, "grad_norm": 0.341796875, "grad_norm_var": 0.0005481561024983724, "learning_rate": 0.01, "loss": 1.3932, "loss/crossentropy": 2.6345489025115967, "loss/fcd": 1.13671875, "loss/logits": 0.2546175494790077, "step": 926 }, { "epoch": 0.002943604725009526, "grad_norm": 0.302734375, "grad_norm_var": 0.0005997816721598307, "learning_rate": 0.01, "loss": 1.3703, "loss/crossentropy": 2.6155985593795776, "loss/fcd": 1.1015625, "loss/logits": 0.2561270222067833, "step": 927 }, { "epoch": 0.002946780134637368, "grad_norm": 0.322265625, "grad_norm_var": 0.0005014419555664062, "learning_rate": 0.01, "loss": 1.4661, "loss/crossentropy": 2.834420084953308, "loss/fcd": 1.23046875, "loss/logits": 0.2918202877044678, "step": 928 }, { "epoch": 0.0029499555442652104, "grad_norm": 0.408203125, "grad_norm_var": 0.0009058475494384766, "learning_rate": 0.01, "loss": 1.465, "loss/crossentropy": 2.3837335109710693, "loss/fcd": 1.09375, "loss/logits": 0.24453241378068924, "step": 929 }, { "epoch": 0.002953130953893052, "grad_norm": 0.32421875, "grad_norm_var": 0.0006360371907552083, "learning_rate": 0.01, "loss": 1.4427, "loss/crossentropy": 2.5774593353271484, "loss/fcd": 1.16796875, "loss/logits": 0.2620382457971573, "step": 930 }, { "epoch": 0.002956306363520894, "grad_norm": 0.279296875, "grad_norm_var": 0.0007758935292561849, "learning_rate": 0.01, "loss": 1.4075, "loss/crossentropy": 2.4388264417648315, "loss/fcd": 1.18359375, "loss/logits": 0.28906485438346863, "step": 931 }, { "epoch": 0.0029594817731487364, "grad_norm": 0.38671875, "grad_norm_var": 0.0010127862294514974, "learning_rate": 0.01, "loss": 1.5417, "loss/crossentropy": 2.6522743701934814, "loss/fcd": 1.2890625, "loss/logits": 0.30069686472415924, "step": 932 }, { "epoch": 0.002962657182776578, "grad_norm": 0.35546875, "grad_norm_var": 0.0010314305623372396, "learning_rate": 0.01, "loss": 1.5195, "loss/crossentropy": 2.641858696937561, "loss/fcd": 1.1015625, "loss/logits": 0.245390385389328, "step": 933 }, { "epoch": 0.00296583259240442, "grad_norm": 0.51953125, "grad_norm_var": 0.003233448664347331, "learning_rate": 0.01, "loss": 1.4171, "loss/crossentropy": 2.656904935836792, "loss/fcd": 1.14453125, "loss/logits": 0.25563862919807434, "step": 934 }, { "epoch": 0.0029690080020322624, "grad_norm": 0.453125, "grad_norm_var": 0.003963581720987956, "learning_rate": 0.01, "loss": 1.5476, "loss/crossentropy": 2.3813849687576294, "loss/fcd": 1.390625, "loss/logits": 0.2858322858810425, "step": 935 }, { "epoch": 0.002972183411660104, "grad_norm": 0.32421875, "grad_norm_var": 0.00393217404683431, "learning_rate": 0.01, "loss": 1.4042, "loss/crossentropy": 2.0524495244026184, "loss/fcd": 1.15234375, "loss/logits": 0.2592615410685539, "step": 936 }, { "epoch": 0.002975358821287946, "grad_norm": 0.34375, "grad_norm_var": 0.003887033462524414, "learning_rate": 0.01, "loss": 1.3911, "loss/crossentropy": 2.323083281517029, "loss/fcd": 1.01953125, "loss/logits": 0.2301131784915924, "step": 937 }, { "epoch": 0.002978534230915788, "grad_norm": 0.318359375, "grad_norm_var": 0.0038125991821289064, "learning_rate": 0.01, "loss": 1.4334, "loss/crossentropy": 2.8026427030563354, "loss/fcd": 1.21875, "loss/logits": 0.2876330763101578, "step": 938 }, { "epoch": 0.00298170964054363, "grad_norm": 0.375, "grad_norm_var": 0.0038202285766601564, "learning_rate": 0.01, "loss": 1.4534, "loss/crossentropy": 2.8406271934509277, "loss/fcd": 1.2265625, "loss/logits": 0.30728158354759216, "step": 939 }, { "epoch": 0.002984885050171472, "grad_norm": 0.322265625, "grad_norm_var": 0.0037705580393473308, "learning_rate": 0.01, "loss": 1.4381, "loss/crossentropy": 2.3877947330474854, "loss/fcd": 1.1328125, "loss/logits": 0.2693821042776108, "step": 940 }, { "epoch": 0.002988060459799314, "grad_norm": 0.33984375, "grad_norm_var": 0.0036816755930582684, "learning_rate": 0.01, "loss": 1.4882, "loss/crossentropy": 2.621238350868225, "loss/fcd": 1.16796875, "loss/logits": 0.2864304333925247, "step": 941 }, { "epoch": 0.002991235869427156, "grad_norm": 0.322265625, "grad_norm_var": 0.003745889663696289, "learning_rate": 0.01, "loss": 1.4198, "loss/crossentropy": 2.700785517692566, "loss/fcd": 1.140625, "loss/logits": 0.24927233904600143, "step": 942 }, { "epoch": 0.002994411279054998, "grad_norm": 0.328125, "grad_norm_var": 0.0036055882771809894, "learning_rate": 0.01, "loss": 1.383, "loss/crossentropy": 2.4637316465377808, "loss/fcd": 1.12109375, "loss/logits": 0.24573469907045364, "step": 943 }, { "epoch": 0.00299758668868284, "grad_norm": 0.29296875, "grad_norm_var": 0.003797515233357747, "learning_rate": 0.01, "loss": 1.3902, "loss/crossentropy": 2.4451873302459717, "loss/fcd": 1.14453125, "loss/logits": 0.2588518261909485, "step": 944 }, { "epoch": 0.003000762098310682, "grad_norm": 0.365234375, "grad_norm_var": 0.0036128838857014973, "learning_rate": 0.01, "loss": 1.5196, "loss/crossentropy": 2.537835717201233, "loss/fcd": 1.1484375, "loss/logits": 0.2622714936733246, "step": 945 }, { "epoch": 0.003003937507938524, "grad_norm": 0.328125, "grad_norm_var": 0.003598769505818685, "learning_rate": 0.01, "loss": 1.5059, "loss/crossentropy": 2.683961033821106, "loss/fcd": 1.265625, "loss/logits": 0.2954079210758209, "step": 946 }, { "epoch": 0.003007112917566366, "grad_norm": 0.302734375, "grad_norm_var": 0.0034015496571858724, "learning_rate": 0.01, "loss": 1.3637, "loss/crossentropy": 2.6332361698150635, "loss/fcd": 1.18359375, "loss/logits": 0.2805543690919876, "step": 947 }, { "epoch": 0.003010288327194208, "grad_norm": 0.302734375, "grad_norm_var": 0.0034856160481770834, "learning_rate": 0.01, "loss": 1.4121, "loss/crossentropy": 2.66991651058197, "loss/fcd": 1.11328125, "loss/logits": 0.2659083902835846, "step": 948 }, { "epoch": 0.00301346373682205, "grad_norm": 0.337890625, "grad_norm_var": 0.00349119504292806, "learning_rate": 0.01, "loss": 1.4366, "loss/crossentropy": 2.2836010456085205, "loss/fcd": 1.08984375, "loss/logits": 0.2548503130674362, "step": 949 }, { "epoch": 0.003016639146449892, "grad_norm": 0.294921875, "grad_norm_var": 0.0015225728352864583, "learning_rate": 0.01, "loss": 1.3926, "loss/crossentropy": 2.6424695253372192, "loss/fcd": 1.12890625, "loss/logits": 0.24301359802484512, "step": 950 }, { "epoch": 0.003019814556077734, "grad_norm": 0.33203125, "grad_norm_var": 0.0005233128865559896, "learning_rate": 0.01, "loss": 1.4374, "loss/crossentropy": 2.536555051803589, "loss/fcd": 1.1484375, "loss/logits": 0.2718455195426941, "step": 951 }, { "epoch": 0.003022989965705576, "grad_norm": 0.341796875, "grad_norm_var": 0.0005363305409749349, "learning_rate": 0.01, "loss": 1.4333, "loss/crossentropy": 2.5725537538528442, "loss/fcd": 1.140625, "loss/logits": 0.27416522800922394, "step": 952 }, { "epoch": 0.003026165375333418, "grad_norm": 0.337890625, "grad_norm_var": 0.0005261739095052083, "learning_rate": 0.01, "loss": 1.4737, "loss/crossentropy": 2.7799192667007446, "loss/fcd": 1.18359375, "loss/logits": 0.2824050039052963, "step": 953 }, { "epoch": 0.00302934078496126, "grad_norm": 0.328125, "grad_norm_var": 0.0005200544993082683, "learning_rate": 0.01, "loss": 1.476, "loss/crossentropy": 2.4345229864120483, "loss/fcd": 1.23046875, "loss/logits": 0.28268595039844513, "step": 954 }, { "epoch": 0.003032516194589102, "grad_norm": 0.6015625, "grad_norm_var": 0.005140542984008789, "learning_rate": 0.01, "loss": 1.4534, "loss/crossentropy": 2.3001020550727844, "loss/fcd": 1.23046875, "loss/logits": 0.251981720328331, "step": 955 }, { "epoch": 0.003035691604216944, "grad_norm": 0.375, "grad_norm_var": 0.0051727294921875, "learning_rate": 0.01, "loss": 1.407, "loss/crossentropy": 2.913905620574951, "loss/fcd": 1.29296875, "loss/logits": 0.3022526204586029, "step": 956 }, { "epoch": 0.0030388670138447858, "grad_norm": 0.33203125, "grad_norm_var": 0.005182647705078125, "learning_rate": 0.01, "loss": 1.4016, "loss/crossentropy": 2.5670337677001953, "loss/fcd": 1.2109375, "loss/logits": 0.2593180239200592, "step": 957 }, { "epoch": 0.003042042423472628, "grad_norm": 0.345703125, "grad_norm_var": 0.005145263671875, "learning_rate": 0.01, "loss": 1.398, "loss/crossentropy": 2.2563215494155884, "loss/fcd": 1.07421875, "loss/logits": 0.23961275815963745, "step": 958 }, { "epoch": 0.00304521783310047, "grad_norm": 0.3828125, "grad_norm_var": 0.005196889241536458, "learning_rate": 0.01, "loss": 1.4604, "loss/crossentropy": 2.3349900245666504, "loss/fcd": 1.19921875, "loss/logits": 0.25280235707759857, "step": 959 }, { "epoch": 0.0030483932427283118, "grad_norm": 0.333984375, "grad_norm_var": 0.00498960812886556, "learning_rate": 0.01, "loss": 1.4631, "loss/crossentropy": 2.7368998527526855, "loss/fcd": 1.23828125, "loss/logits": 0.2871920168399811, "step": 960 }, { "epoch": 0.003051568652356154, "grad_norm": 0.353515625, "grad_norm_var": 0.004978545506795247, "learning_rate": 0.01, "loss": 1.4908, "loss/crossentropy": 2.7697826623916626, "loss/fcd": 1.2578125, "loss/logits": 0.29911352694034576, "step": 961 }, { "epoch": 0.003054744061983996, "grad_norm": 0.36328125, "grad_norm_var": 0.004944213231404622, "learning_rate": 0.01, "loss": 1.3917, "loss/crossentropy": 2.340360403060913, "loss/fcd": 1.16796875, "loss/logits": 0.25422149896621704, "step": 962 }, { "epoch": 0.0030579194716118378, "grad_norm": 0.34375, "grad_norm_var": 0.00476830800374349, "learning_rate": 0.01, "loss": 1.4393, "loss/crossentropy": 2.7381350994110107, "loss/fcd": 1.125, "loss/logits": 0.2546796202659607, "step": 963 }, { "epoch": 0.00306109488123968, "grad_norm": 0.435546875, "grad_norm_var": 0.004915301005045573, "learning_rate": 0.01, "loss": 1.4986, "loss/crossentropy": 2.784728765487671, "loss/fcd": 1.20703125, "loss/logits": 0.26286639273166656, "step": 964 }, { "epoch": 0.003064270290867522, "grad_norm": 0.337890625, "grad_norm_var": 0.004915301005045573, "learning_rate": 0.01, "loss": 1.4682, "loss/crossentropy": 2.8558719158172607, "loss/fcd": 1.3203125, "loss/logits": 0.3325531631708145, "step": 965 }, { "epoch": 0.0030674457004953638, "grad_norm": 0.35546875, "grad_norm_var": 0.0045787652333577475, "learning_rate": 0.01, "loss": 1.5253, "loss/crossentropy": 2.7826974391937256, "loss/fcd": 1.5078125, "loss/logits": 0.3498596251010895, "step": 966 }, { "epoch": 0.003070621110123206, "grad_norm": 0.345703125, "grad_norm_var": 0.004523468017578125, "learning_rate": 0.01, "loss": 1.4103, "loss/crossentropy": 2.738588571548462, "loss/fcd": 1.10546875, "loss/logits": 0.2412184327840805, "step": 967 }, { "epoch": 0.003073796519751048, "grad_norm": 0.31640625, "grad_norm_var": 0.0046579837799072266, "learning_rate": 0.01, "loss": 1.3924, "loss/crossentropy": 2.625987410545349, "loss/fcd": 1.12890625, "loss/logits": 0.2591940835118294, "step": 968 }, { "epoch": 0.0030769719293788898, "grad_norm": 0.375, "grad_norm_var": 0.004594866434733073, "learning_rate": 0.01, "loss": 1.448, "loss/crossentropy": 2.872772216796875, "loss/fcd": 1.171875, "loss/logits": 0.28268587589263916, "step": 969 }, { "epoch": 0.003080147339006732, "grad_norm": 0.291015625, "grad_norm_var": 0.004889917373657226, "learning_rate": 0.01, "loss": 1.4409, "loss/crossentropy": 2.5798784494400024, "loss/fcd": 1.20703125, "loss/logits": 0.2871846854686737, "step": 970 }, { "epoch": 0.003083322748634574, "grad_norm": 0.341796875, "grad_norm_var": 0.0010192235310872396, "learning_rate": 0.01, "loss": 1.438, "loss/crossentropy": 2.5781710147857666, "loss/fcd": 1.18359375, "loss/logits": 0.2937946915626526, "step": 971 }, { "epoch": 0.0030864981582624158, "grad_norm": 0.37109375, "grad_norm_var": 0.0010080973307291666, "learning_rate": 0.01, "loss": 1.5058, "loss/crossentropy": 2.2210591435432434, "loss/fcd": 1.1015625, "loss/logits": 0.24351733922958374, "step": 972 }, { "epoch": 0.0030896735678902576, "grad_norm": 0.345703125, "grad_norm_var": 0.00098417599995931, "learning_rate": 0.01, "loss": 1.4781, "loss/crossentropy": 2.6322258710861206, "loss/fcd": 1.16796875, "loss/logits": 0.2937764972448349, "step": 973 }, { "epoch": 0.0030928489775181, "grad_norm": 0.3046875, "grad_norm_var": 0.0011260350545247396, "learning_rate": 0.01, "loss": 1.3773, "loss/crossentropy": 2.5671868324279785, "loss/fcd": 1.09765625, "loss/logits": 0.25163160264492035, "step": 974 }, { "epoch": 0.0030960243871459418, "grad_norm": 0.322265625, "grad_norm_var": 0.0010890801747639974, "learning_rate": 0.01, "loss": 1.4999, "loss/crossentropy": 2.754633903503418, "loss/fcd": 1.203125, "loss/logits": 0.28180718421936035, "step": 975 }, { "epoch": 0.0030991997967737836, "grad_norm": 0.2890625, "grad_norm_var": 0.0012875874837239584, "learning_rate": 0.01, "loss": 1.4037, "loss/crossentropy": 2.6361886262893677, "loss/fcd": 1.18359375, "loss/logits": 0.2723146229982376, "step": 976 }, { "epoch": 0.003102375206401626, "grad_norm": 0.33203125, "grad_norm_var": 0.0012870629628499348, "learning_rate": 0.01, "loss": 1.4562, "loss/crossentropy": 2.637445569038391, "loss/fcd": 1.21484375, "loss/logits": 0.30439358949661255, "step": 977 }, { "epoch": 0.0031055506160294678, "grad_norm": 0.302734375, "grad_norm_var": 0.0013437271118164062, "learning_rate": 0.01, "loss": 1.4091, "loss/crossentropy": 2.5362322330474854, "loss/fcd": 1.1171875, "loss/logits": 0.24747908115386963, "step": 978 }, { "epoch": 0.0031087260256573096, "grad_norm": 0.3125, "grad_norm_var": 0.001381365458170573, "learning_rate": 0.01, "loss": 1.4282, "loss/crossentropy": 2.4734596014022827, "loss/fcd": 1.1328125, "loss/logits": 0.26764166355133057, "step": 979 }, { "epoch": 0.003111901435285152, "grad_norm": 0.31640625, "grad_norm_var": 0.0006900628407796224, "learning_rate": 0.01, "loss": 1.4026, "loss/crossentropy": 2.739744544029236, "loss/fcd": 1.15625, "loss/logits": 0.27686062455177307, "step": 980 }, { "epoch": 0.0031150768449129938, "grad_norm": 0.337890625, "grad_norm_var": 0.0006900628407796224, "learning_rate": 0.01, "loss": 1.3851, "loss/crossentropy": 2.1636370420455933, "loss/fcd": 1.0390625, "loss/logits": 0.2328188493847847, "step": 981 }, { "epoch": 0.0031182522545408356, "grad_norm": 0.3359375, "grad_norm_var": 0.0006442864735921224, "learning_rate": 0.01, "loss": 1.438, "loss/crossentropy": 2.546046495437622, "loss/fcd": 1.15625, "loss/logits": 0.26181843876838684, "step": 982 }, { "epoch": 0.003121427664168678, "grad_norm": 0.287109375, "grad_norm_var": 0.0007167657216389974, "learning_rate": 0.01, "loss": 1.3737, "loss/crossentropy": 2.5789296627044678, "loss/fcd": 1.12109375, "loss/logits": 0.24968606233596802, "step": 983 }, { "epoch": 0.0031246030737965198, "grad_norm": 0.310546875, "grad_norm_var": 0.0007247289021809895, "learning_rate": 0.01, "loss": 1.4595, "loss/crossentropy": 2.515397310256958, "loss/fcd": 1.35546875, "loss/logits": 0.3120555281639099, "step": 984 }, { "epoch": 0.0031277784834243616, "grad_norm": 0.37109375, "grad_norm_var": 0.0006988525390625, "learning_rate": 0.01, "loss": 1.536, "loss/crossentropy": 2.334409475326538, "loss/fcd": 1.23046875, "loss/logits": 0.2596924602985382, "step": 985 }, { "epoch": 0.003130953893052204, "grad_norm": 0.322265625, "grad_norm_var": 0.0006256103515625, "learning_rate": 0.01, "loss": 1.3841, "loss/crossentropy": 2.4108821153640747, "loss/fcd": 1.1015625, "loss/logits": 0.24592270702123642, "step": 986 }, { "epoch": 0.0031341293026800458, "grad_norm": 0.34375, "grad_norm_var": 0.0006301720937093099, "learning_rate": 0.01, "loss": 1.4378, "loss/crossentropy": 2.685761332511902, "loss/fcd": 1.140625, "loss/logits": 0.271460197865963, "step": 987 }, { "epoch": 0.0031373047123078876, "grad_norm": 0.33203125, "grad_norm_var": 0.0004871209462483724, "learning_rate": 0.01, "loss": 1.4952, "loss/crossentropy": 2.3752769231796265, "loss/fcd": 1.125, "loss/logits": 0.2527815103530884, "step": 988 }, { "epoch": 0.00314048012193573, "grad_norm": 0.3359375, "grad_norm_var": 0.0004633585611979167, "learning_rate": 0.01, "loss": 1.4448, "loss/crossentropy": 2.775411605834961, "loss/fcd": 1.21484375, "loss/logits": 0.2925758957862854, "step": 989 }, { "epoch": 0.0031436555315635718, "grad_norm": 0.298828125, "grad_norm_var": 0.0004792372385660807, "learning_rate": 0.01, "loss": 1.4216, "loss/crossentropy": 2.5800126791000366, "loss/fcd": 1.1796875, "loss/logits": 0.2921972870826721, "step": 990 }, { "epoch": 0.0031468309411914136, "grad_norm": 0.30859375, "grad_norm_var": 0.000490252176920573, "learning_rate": 0.01, "loss": 1.3915, "loss/crossentropy": 2.578204393386841, "loss/fcd": 1.07421875, "loss/logits": 0.2426690310239792, "step": 991 }, { "epoch": 0.0031500063508192555, "grad_norm": 0.30859375, "grad_norm_var": 0.00043080647786458335, "learning_rate": 0.01, "loss": 1.4585, "loss/crossentropy": 2.6083006858825684, "loss/fcd": 1.203125, "loss/logits": 0.2811434864997864, "step": 992 }, { "epoch": 0.0031531817604470978, "grad_norm": 0.31640625, "grad_norm_var": 0.00042572021484375, "learning_rate": 0.01, "loss": 1.4765, "loss/crossentropy": 3.125910520553589, "loss/fcd": 1.2890625, "loss/logits": 0.30498407781124115, "step": 993 }, { "epoch": 0.0031563571700749396, "grad_norm": 0.357421875, "grad_norm_var": 0.00047734578450520836, "learning_rate": 0.01, "loss": 1.4862, "loss/crossentropy": 2.6022582054138184, "loss/fcd": 1.1328125, "loss/logits": 0.2557816356420517, "step": 994 }, { "epoch": 0.0031595325797027815, "grad_norm": 0.326171875, "grad_norm_var": 0.00046677589416503905, "learning_rate": 0.01, "loss": 1.4439, "loss/crossentropy": 2.7221599817276, "loss/fcd": 1.19140625, "loss/logits": 0.26561397314071655, "step": 995 }, { "epoch": 0.0031627079893306238, "grad_norm": 0.30078125, "grad_norm_var": 0.0005011081695556641, "learning_rate": 0.01, "loss": 1.4181, "loss/crossentropy": 2.575050950050354, "loss/fcd": 1.09375, "loss/logits": 0.24419991672039032, "step": 996 }, { "epoch": 0.0031658833989584656, "grad_norm": 0.310546875, "grad_norm_var": 0.0004993279774983724, "learning_rate": 0.01, "loss": 1.4263, "loss/crossentropy": 2.599568247795105, "loss/fcd": 1.31640625, "loss/logits": 0.29872435331344604, "step": 997 }, { "epoch": 0.0031690588085863075, "grad_norm": 0.36328125, "grad_norm_var": 0.0005936781565348307, "learning_rate": 0.01, "loss": 1.5182, "loss/crossentropy": 2.9009666442871094, "loss/fcd": 1.23046875, "loss/logits": 0.25188739597797394, "step": 998 }, { "epoch": 0.0031722342182141498, "grad_norm": 0.29296875, "grad_norm_var": 0.000566546122233073, "learning_rate": 0.01, "loss": 1.4238, "loss/crossentropy": 2.509885787963867, "loss/fcd": 1.12890625, "loss/logits": 0.2513066530227661, "step": 999 }, { "epoch": 0.0031754096278419916, "grad_norm": 0.34375, "grad_norm_var": 0.0005716800689697266, "learning_rate": 0.01, "loss": 1.382, "loss/crossentropy": 2.4817991256713867, "loss/fcd": 1.1796875, "loss/logits": 0.26043565571308136, "step": 1000 }, { "epoch": 0.0031785850374698335, "grad_norm": 0.31640625, "grad_norm_var": 0.0004372755686442057, "learning_rate": 0.01, "loss": 1.4035, "loss/crossentropy": 2.5738685131073, "loss/fcd": 1.12109375, "loss/logits": 0.2485879585146904, "step": 1001 }, { "epoch": 0.0031817604470976758, "grad_norm": 0.33203125, "grad_norm_var": 0.00044148763020833334, "learning_rate": 0.01, "loss": 1.3984, "loss/crossentropy": 2.8606088161468506, "loss/fcd": 1.20703125, "loss/logits": 0.3094464838504791, "step": 1002 }, { "epoch": 0.0031849358567255176, "grad_norm": 0.330078125, "grad_norm_var": 0.0004175662994384766, "learning_rate": 0.01, "loss": 1.4294, "loss/crossentropy": 2.789549946784973, "loss/fcd": 1.2421875, "loss/logits": 0.28422877192497253, "step": 1003 }, { "epoch": 0.0031881112663533595, "grad_norm": 0.361328125, "grad_norm_var": 0.00050506591796875, "learning_rate": 0.01, "loss": 1.5646, "loss/crossentropy": 2.5885664224624634, "loss/fcd": 1.34375, "loss/logits": 0.3584621697664261, "step": 1004 }, { "epoch": 0.0031912866759812018, "grad_norm": 0.298828125, "grad_norm_var": 0.0005379835764567057, "learning_rate": 0.01, "loss": 1.3991, "loss/crossentropy": 2.549331307411194, "loss/fcd": 1.08984375, "loss/logits": 0.24038879573345184, "step": 1005 }, { "epoch": 0.0031944620856090436, "grad_norm": 0.400390625, "grad_norm_var": 0.0008570194244384765, "learning_rate": 0.01, "loss": 1.4878, "loss/crossentropy": 2.7082276344299316, "loss/fcd": 1.17578125, "loss/logits": 0.30082835257053375, "step": 1006 }, { "epoch": 0.0031976374952368855, "grad_norm": 0.296875, "grad_norm_var": 0.0008978366851806641, "learning_rate": 0.01, "loss": 1.3724, "loss/crossentropy": 2.3085896968841553, "loss/fcd": 1.1171875, "loss/logits": 0.24059658497571945, "step": 1007 }, { "epoch": 0.0032008129048647273, "grad_norm": 0.3359375, "grad_norm_var": 0.0008720239003499349, "learning_rate": 0.01, "loss": 1.3669, "loss/crossentropy": 2.4333022832870483, "loss/fcd": 1.06640625, "loss/logits": 0.2479088455438614, "step": 1008 }, { "epoch": 0.0032039883144925696, "grad_norm": 0.298828125, "grad_norm_var": 0.0009236653645833333, "learning_rate": 0.01, "loss": 1.359, "loss/crossentropy": 2.3898542523384094, "loss/fcd": 1.09375, "loss/logits": 0.23970109224319458, "step": 1009 }, { "epoch": 0.0032071637241204115, "grad_norm": 0.484375, "grad_norm_var": 0.0024103641510009764, "learning_rate": 0.01, "loss": 1.4962, "loss/crossentropy": 2.512204885482788, "loss/fcd": 1.16796875, "loss/logits": 0.28015749156475067, "step": 1010 }, { "epoch": 0.0032103391337482533, "grad_norm": 0.283203125, "grad_norm_var": 0.0025880018870035808, "learning_rate": 0.01, "loss": 1.3787, "loss/crossentropy": 2.4823403358459473, "loss/fcd": 1.1484375, "loss/logits": 0.25986043363809586, "step": 1011 }, { "epoch": 0.0032135145433760956, "grad_norm": 0.306640625, "grad_norm_var": 0.002563921610514323, "learning_rate": 0.01, "loss": 1.4249, "loss/crossentropy": 2.563522219657898, "loss/fcd": 1.13671875, "loss/logits": 0.261352501809597, "step": 1012 }, { "epoch": 0.0032166899530039375, "grad_norm": 0.314453125, "grad_norm_var": 0.002552286783854167, "learning_rate": 0.01, "loss": 1.4218, "loss/crossentropy": 2.785019040107727, "loss/fcd": 1.140625, "loss/logits": 0.27389781177043915, "step": 1013 }, { "epoch": 0.0032198653626317793, "grad_norm": 0.322265625, "grad_norm_var": 0.002502552668253581, "learning_rate": 0.01, "loss": 1.463, "loss/crossentropy": 2.7358250617980957, "loss/fcd": 1.171875, "loss/logits": 0.26100394129753113, "step": 1014 }, { "epoch": 0.0032230407722596216, "grad_norm": 0.328125, "grad_norm_var": 0.0023949782053629557, "learning_rate": 0.01, "loss": 1.4417, "loss/crossentropy": 2.480009913444519, "loss/fcd": 1.1484375, "loss/logits": 0.26029807329177856, "step": 1015 }, { "epoch": 0.0032262161818874635, "grad_norm": 0.349609375, "grad_norm_var": 0.0024042765299479166, "learning_rate": 0.01, "loss": 1.4224, "loss/crossentropy": 2.6159327030181885, "loss/fcd": 1.1953125, "loss/logits": 0.2823094576597214, "step": 1016 }, { "epoch": 0.0032293915915153053, "grad_norm": 0.33984375, "grad_norm_var": 0.002380625406901042, "learning_rate": 0.01, "loss": 1.4616, "loss/crossentropy": 2.6160258054733276, "loss/fcd": 1.1796875, "loss/logits": 0.27377942204475403, "step": 1017 }, { "epoch": 0.0032325670011431476, "grad_norm": 0.361328125, "grad_norm_var": 0.002417103449503581, "learning_rate": 0.01, "loss": 1.4397, "loss/crossentropy": 2.4114701747894287, "loss/fcd": 1.13671875, "loss/logits": 0.25822656601667404, "step": 1018 }, { "epoch": 0.0032357424107709895, "grad_norm": 0.55859375, "grad_norm_var": 0.005431620279947916, "learning_rate": 0.01, "loss": 1.4536, "loss/crossentropy": 2.4348167181015015, "loss/fcd": 1.1328125, "loss/logits": 0.2747166305780411, "step": 1019 }, { "epoch": 0.0032389178203988313, "grad_norm": 0.32421875, "grad_norm_var": 0.005474201838175456, "learning_rate": 0.01, "loss": 1.3954, "loss/crossentropy": 2.6336867809295654, "loss/fcd": 1.140625, "loss/logits": 0.2648170441389084, "step": 1020 }, { "epoch": 0.0032420932300266736, "grad_norm": 0.310546875, "grad_norm_var": 0.005402485529581706, "learning_rate": 0.01, "loss": 1.4155, "loss/crossentropy": 2.7610650062561035, "loss/fcd": 1.17578125, "loss/logits": 0.27431049942970276, "step": 1021 }, { "epoch": 0.0032452686396545155, "grad_norm": 0.310546875, "grad_norm_var": 0.005314747492472331, "learning_rate": 0.01, "loss": 1.481, "loss/crossentropy": 2.3607091903686523, "loss/fcd": 1.16015625, "loss/logits": 0.2790570333600044, "step": 1022 }, { "epoch": 0.0032484440492823573, "grad_norm": 0.283203125, "grad_norm_var": 0.005414772033691406, "learning_rate": 0.01, "loss": 1.3969, "loss/crossentropy": 2.547972083091736, "loss/fcd": 1.10546875, "loss/logits": 0.2501314952969551, "step": 1023 }, { "epoch": 0.003251619458910199, "grad_norm": 0.361328125, "grad_norm_var": 0.00542613665262858, "learning_rate": 0.01, "loss": 1.4363, "loss/crossentropy": 2.212569236755371, "loss/fcd": 1.25, "loss/logits": 0.30927540361881256, "step": 1024 }, { "epoch": 0.0032547948685380415, "grad_norm": 0.337890625, "grad_norm_var": 0.005275456110636393, "learning_rate": 0.01, "loss": 1.4214, "loss/crossentropy": 2.341022491455078, "loss/fcd": 1.05859375, "loss/logits": 0.23233956098556519, "step": 1025 }, { "epoch": 0.0032579702781658833, "grad_norm": 0.32421875, "grad_norm_var": 0.0039773146311442055, "learning_rate": 0.01, "loss": 1.4292, "loss/crossentropy": 2.4576566219329834, "loss/fcd": 1.109375, "loss/logits": 0.25485286861658096, "step": 1026 }, { "epoch": 0.003261145687793725, "grad_norm": 0.341796875, "grad_norm_var": 0.003759876887003581, "learning_rate": 0.01, "loss": 1.5254, "loss/crossentropy": 2.7307602167129517, "loss/fcd": 1.296875, "loss/logits": 0.3331608921289444, "step": 1027 }, { "epoch": 0.0032643210974215675, "grad_norm": 0.3046875, "grad_norm_var": 0.003769365946451823, "learning_rate": 0.01, "loss": 1.4455, "loss/crossentropy": 2.401656150817871, "loss/fcd": 1.0703125, "loss/logits": 0.23490934073925018, "step": 1028 }, { "epoch": 0.0032674965070494093, "grad_norm": 0.3125, "grad_norm_var": 0.0037767887115478516, "learning_rate": 0.01, "loss": 1.4307, "loss/crossentropy": 2.6836836338043213, "loss/fcd": 1.1640625, "loss/logits": 0.26738440990448, "step": 1029 }, { "epoch": 0.003270671916677251, "grad_norm": 0.33984375, "grad_norm_var": 0.003750038146972656, "learning_rate": 0.01, "loss": 1.4284, "loss/crossentropy": 2.783128023147583, "loss/fcd": 1.1640625, "loss/logits": 0.25613802671432495, "step": 1030 }, { "epoch": 0.0032738473263050935, "grad_norm": 0.302734375, "grad_norm_var": 0.0038407484690348307, "learning_rate": 0.01, "loss": 1.4206, "loss/crossentropy": 2.419298768043518, "loss/fcd": 1.1484375, "loss/logits": 0.2692125141620636, "step": 1031 }, { "epoch": 0.0032770227359329353, "grad_norm": 0.314453125, "grad_norm_var": 0.003879658381144206, "learning_rate": 0.01, "loss": 1.4301, "loss/crossentropy": 2.6618926525115967, "loss/fcd": 1.2109375, "loss/logits": 0.311976820230484, "step": 1032 }, { "epoch": 0.003280198145560777, "grad_norm": 0.30859375, "grad_norm_var": 0.003938150405883789, "learning_rate": 0.01, "loss": 1.3934, "loss/crossentropy": 2.1765432357788086, "loss/fcd": 1.0546875, "loss/logits": 0.23913481086492538, "step": 1033 }, { "epoch": 0.0032833735551886195, "grad_norm": 0.36328125, "grad_norm_var": 0.0039446512858072914, "learning_rate": 0.01, "loss": 1.5611, "loss/crossentropy": 2.5830307006835938, "loss/fcd": 1.28515625, "loss/logits": 0.27587559819221497, "step": 1034 }, { "epoch": 0.0032865489648164613, "grad_norm": 0.353515625, "grad_norm_var": 0.0005250136057535807, "learning_rate": 0.01, "loss": 1.4545, "loss/crossentropy": 2.630650520324707, "loss/fcd": 1.234375, "loss/logits": 0.2913895398378372, "step": 1035 }, { "epoch": 0.003289724374444303, "grad_norm": 0.43359375, "grad_norm_var": 0.0012673536936442058, "learning_rate": 0.01, "loss": 1.5054, "loss/crossentropy": 2.890590190887451, "loss/fcd": 1.3046875, "loss/logits": 0.3052217364311218, "step": 1036 }, { "epoch": 0.0032928997840721455, "grad_norm": 0.328125, "grad_norm_var": 0.0012377421061197916, "learning_rate": 0.01, "loss": 1.4378, "loss/crossentropy": 2.127647042274475, "loss/fcd": 1.08984375, "loss/logits": 0.2593713104724884, "step": 1037 }, { "epoch": 0.0032960751936999873, "grad_norm": 0.30078125, "grad_norm_var": 0.0012723128000895181, "learning_rate": 0.01, "loss": 1.3719, "loss/crossentropy": 2.423389196395874, "loss/fcd": 1.107421875, "loss/logits": 0.26104626059532166, "step": 1038 }, { "epoch": 0.003299250603327829, "grad_norm": 0.330078125, "grad_norm_var": 0.0011052290598551433, "learning_rate": 0.01, "loss": 1.4692, "loss/crossentropy": 2.485539436340332, "loss/fcd": 1.24609375, "loss/logits": 0.29236532747745514, "step": 1039 }, { "epoch": 0.0033024260129556715, "grad_norm": 0.349609375, "grad_norm_var": 0.0010724226633707682, "learning_rate": 0.01, "loss": 1.3985, "loss/crossentropy": 2.106372117996216, "loss/fcd": 1.05859375, "loss/logits": 0.2526446133852005, "step": 1040 }, { "epoch": 0.0033056014225835133, "grad_norm": 0.353515625, "grad_norm_var": 0.0010955651601155599, "learning_rate": 0.01, "loss": 1.4717, "loss/crossentropy": 2.7554266452789307, "loss/fcd": 1.296875, "loss/logits": 0.3235396295785904, "step": 1041 }, { "epoch": 0.003308776832211355, "grad_norm": 0.322265625, "grad_norm_var": 0.0010986328125, "learning_rate": 0.01, "loss": 1.4257, "loss/crossentropy": 2.9350199699401855, "loss/fcd": 1.14453125, "loss/logits": 0.2595931738615036, "step": 1042 }, { "epoch": 0.003311952241839197, "grad_norm": 0.33203125, "grad_norm_var": 0.0010956923166910808, "learning_rate": 0.01, "loss": 1.4358, "loss/crossentropy": 2.8299973011016846, "loss/fcd": 1.171875, "loss/logits": 0.27031268179416656, "step": 1043 }, { "epoch": 0.0033151276514670393, "grad_norm": 0.322265625, "grad_norm_var": 0.0010454813639322917, "learning_rate": 0.01, "loss": 1.4252, "loss/crossentropy": 2.59855055809021, "loss/fcd": 1.19140625, "loss/logits": 0.26659615337848663, "step": 1044 }, { "epoch": 0.003318303061094881, "grad_norm": 0.296875, "grad_norm_var": 0.001108551025390625, "learning_rate": 0.01, "loss": 1.3453, "loss/crossentropy": 2.3809969425201416, "loss/fcd": 1.11328125, "loss/logits": 0.25864427536726, "step": 1045 }, { "epoch": 0.003321478470722723, "grad_norm": 0.32421875, "grad_norm_var": 0.0011126200358072917, "learning_rate": 0.01, "loss": 1.447, "loss/crossentropy": 2.435014486312866, "loss/fcd": 1.109375, "loss/logits": 0.25052157044410706, "step": 1046 }, { "epoch": 0.0033246538803505653, "grad_norm": 0.306640625, "grad_norm_var": 0.0010975519816080728, "learning_rate": 0.01, "loss": 1.3931, "loss/crossentropy": 2.3470256328582764, "loss/fcd": 1.11328125, "loss/logits": 0.2404329925775528, "step": 1047 }, { "epoch": 0.003327829289978407, "grad_norm": 0.333984375, "grad_norm_var": 0.0010711669921875, "learning_rate": 0.01, "loss": 1.4373, "loss/crossentropy": 2.6233872175216675, "loss/fcd": 1.17578125, "loss/logits": 0.26587191224098206, "step": 1048 }, { "epoch": 0.003331004699606249, "grad_norm": 0.31640625, "grad_norm_var": 0.001047515869140625, "learning_rate": 0.01, "loss": 1.4413, "loss/crossentropy": 2.737269639968872, "loss/fcd": 1.14453125, "loss/logits": 0.2643962651491165, "step": 1049 }, { "epoch": 0.0033341801092340913, "grad_norm": 0.3203125, "grad_norm_var": 0.0010034561157226563, "learning_rate": 0.01, "loss": 1.4228, "loss/crossentropy": 2.4701744318008423, "loss/fcd": 1.16796875, "loss/logits": 0.2651461884379387, "step": 1050 }, { "epoch": 0.003337355518861933, "grad_norm": 0.330078125, "grad_norm_var": 0.0009729385375976563, "learning_rate": 0.01, "loss": 1.4346, "loss/crossentropy": 2.6813589334487915, "loss/fcd": 1.140625, "loss/logits": 0.26212960481643677, "step": 1051 }, { "epoch": 0.003340530928489775, "grad_norm": 0.322265625, "grad_norm_var": 0.00022912025451660156, "learning_rate": 0.01, "loss": 1.4939, "loss/crossentropy": 2.502977132797241, "loss/fcd": 1.171875, "loss/logits": 0.2674940675497055, "step": 1052 }, { "epoch": 0.0033437063381176173, "grad_norm": 0.322265625, "grad_norm_var": 0.00022830963134765626, "learning_rate": 0.01, "loss": 1.465, "loss/crossentropy": 2.6414841413497925, "loss/fcd": 1.19140625, "loss/logits": 0.2858681082725525, "step": 1053 }, { "epoch": 0.003346881747745459, "grad_norm": 0.328125, "grad_norm_var": 0.00019048055013020832, "learning_rate": 0.01, "loss": 1.4448, "loss/crossentropy": 2.855597138404846, "loss/fcd": 1.1484375, "loss/logits": 0.2675042301416397, "step": 1054 }, { "epoch": 0.003350057157373301, "grad_norm": 0.33203125, "grad_norm_var": 0.0001918633778889974, "learning_rate": 0.01, "loss": 1.4568, "loss/crossentropy": 2.3461395502090454, "loss/fcd": 1.125, "loss/logits": 0.26019398123025894, "step": 1055 }, { "epoch": 0.0033532325670011433, "grad_norm": 0.310546875, "grad_norm_var": 0.0001632531483968099, "learning_rate": 0.01, "loss": 1.4069, "loss/crossentropy": 2.5465755462646484, "loss/fcd": 1.171875, "loss/logits": 0.29447929561138153, "step": 1056 }, { "epoch": 0.003356407976628985, "grad_norm": 0.3125, "grad_norm_var": 0.00010350545247395833, "learning_rate": 0.01, "loss": 1.3506, "loss/crossentropy": 2.2298638820648193, "loss/fcd": 1.09765625, "loss/logits": 0.2639864385128021, "step": 1057 }, { "epoch": 0.003359583386256827, "grad_norm": 0.296875, "grad_norm_var": 0.0001388390858968099, "learning_rate": 0.01, "loss": 1.4068, "loss/crossentropy": 2.5928457975387573, "loss/fcd": 1.1015625, "loss/logits": 0.2589241564273834, "step": 1058 }, { "epoch": 0.003362758795884669, "grad_norm": 0.318359375, "grad_norm_var": 0.00012715657552083334, "learning_rate": 0.01, "loss": 1.4881, "loss/crossentropy": 2.5440473556518555, "loss/fcd": 1.31640625, "loss/logits": 0.3831995874643326, "step": 1059 }, { "epoch": 0.003365934205512511, "grad_norm": 0.365234375, "grad_norm_var": 0.00026493072509765626, "learning_rate": 0.01, "loss": 1.4868, "loss/crossentropy": 2.4000766277313232, "loss/fcd": 1.171875, "loss/logits": 0.2690870612859726, "step": 1060 }, { "epoch": 0.003369109615140353, "grad_norm": 0.33203125, "grad_norm_var": 0.0002288818359375, "learning_rate": 0.01, "loss": 1.4505, "loss/crossentropy": 2.398478150367737, "loss/fcd": 1.140625, "loss/logits": 0.26252414286136627, "step": 1061 }, { "epoch": 0.003372285024768195, "grad_norm": 0.310546875, "grad_norm_var": 0.0002387841542561849, "learning_rate": 0.01, "loss": 1.4852, "loss/crossentropy": 2.5086846351623535, "loss/fcd": 1.16796875, "loss/logits": 0.26408642530441284, "step": 1062 }, { "epoch": 0.003375460434396037, "grad_norm": 0.3671875, "grad_norm_var": 0.0003407796223958333, "learning_rate": 0.01, "loss": 1.4317, "loss/crossentropy": 2.3301165103912354, "loss/fcd": 1.171875, "loss/logits": 0.2675746977329254, "step": 1063 }, { "epoch": 0.003378635844023879, "grad_norm": 0.33984375, "grad_norm_var": 0.0003490289052327474, "learning_rate": 0.01, "loss": 1.4774, "loss/crossentropy": 2.5806082487106323, "loss/fcd": 1.12890625, "loss/logits": 0.2480749562382698, "step": 1064 }, { "epoch": 0.003381811253651721, "grad_norm": 0.32421875, "grad_norm_var": 0.00034228960673014325, "learning_rate": 0.01, "loss": 1.3972, "loss/crossentropy": 2.5531054735183716, "loss/fcd": 1.08984375, "loss/logits": 0.24970652163028717, "step": 1065 }, { "epoch": 0.003384986663279563, "grad_norm": 0.34765625, "grad_norm_var": 0.00036454200744628906, "learning_rate": 0.01, "loss": 1.4121, "loss/crossentropy": 2.5553399324417114, "loss/fcd": 1.15625, "loss/logits": 0.25267454236745834, "step": 1066 }, { "epoch": 0.003388162072907405, "grad_norm": 0.458984375, "grad_norm_var": 0.0014261722564697266, "learning_rate": 0.01, "loss": 1.6507, "loss/crossentropy": 2.406322479248047, "loss/fcd": 1.2109375, "loss/logits": 0.3105914294719696, "step": 1067 }, { "epoch": 0.003391337482535247, "grad_norm": 0.37109375, "grad_norm_var": 0.0014806111653645833, "learning_rate": 0.01, "loss": 1.5117, "loss/crossentropy": 2.5899935960769653, "loss/fcd": 1.1796875, "loss/logits": 0.27687887847423553, "step": 1068 }, { "epoch": 0.003394512892163089, "grad_norm": 0.3671875, "grad_norm_var": 0.00150144894917806, "learning_rate": 0.01, "loss": 1.4477, "loss/crossentropy": 2.7163758277893066, "loss/fcd": 1.2734375, "loss/logits": 0.28674206137657166, "step": 1069 }, { "epoch": 0.003397688301790931, "grad_norm": 0.330078125, "grad_norm_var": 0.0014979044596354167, "learning_rate": 0.01, "loss": 1.3914, "loss/crossentropy": 2.517053008079529, "loss/fcd": 1.07421875, "loss/logits": 0.25124162435531616, "step": 1070 }, { "epoch": 0.003400863711418773, "grad_norm": 0.30859375, "grad_norm_var": 0.0015658060709635416, "learning_rate": 0.01, "loss": 1.3254, "loss/crossentropy": 2.39577579498291, "loss/fcd": 1.12109375, "loss/logits": 0.25313539803028107, "step": 1071 }, { "epoch": 0.003404039121046615, "grad_norm": 0.37109375, "grad_norm_var": 0.0015465895334879558, "learning_rate": 0.01, "loss": 1.5438, "loss/crossentropy": 2.654159665107727, "loss/fcd": 1.2421875, "loss/logits": 0.30044449865818024, "step": 1072 }, { "epoch": 0.003407214530674457, "grad_norm": 0.29296875, "grad_norm_var": 0.0016553084055582683, "learning_rate": 0.01, "loss": 1.3846, "loss/crossentropy": 2.460574746131897, "loss/fcd": 1.203125, "loss/logits": 0.2475254088640213, "step": 1073 }, { "epoch": 0.003410389940302299, "grad_norm": 0.37109375, "grad_norm_var": 0.0015345096588134765, "learning_rate": 0.01, "loss": 1.4338, "loss/crossentropy": 2.8238651752471924, "loss/fcd": 1.203125, "loss/logits": 0.267177551984787, "step": 1074 }, { "epoch": 0.003413565349930141, "grad_norm": 0.3046875, "grad_norm_var": 0.0016011555989583334, "learning_rate": 0.01, "loss": 1.4286, "loss/crossentropy": 2.879183769226074, "loss/fcd": 1.23828125, "loss/logits": 0.2946863919496536, "step": 1075 }, { "epoch": 0.003416740759557983, "grad_norm": 0.328125, "grad_norm_var": 0.0016002496083577474, "learning_rate": 0.01, "loss": 1.4593, "loss/crossentropy": 2.233182191848755, "loss/fcd": 1.072265625, "loss/logits": 0.2457396686077118, "step": 1076 }, { "epoch": 0.003419916169185825, "grad_norm": 0.3125, "grad_norm_var": 0.0016587416330973307, "learning_rate": 0.01, "loss": 1.4043, "loss/crossentropy": 2.1949434876441956, "loss/fcd": 1.06640625, "loss/logits": 0.22846803814172745, "step": 1077 }, { "epoch": 0.0034230915788136668, "grad_norm": 0.318359375, "grad_norm_var": 0.0016275882720947266, "learning_rate": 0.01, "loss": 1.3916, "loss/crossentropy": 2.6788476705551147, "loss/fcd": 1.265625, "loss/logits": 0.3040081709623337, "step": 1078 }, { "epoch": 0.003426266988441509, "grad_norm": 0.3203125, "grad_norm_var": 0.0016237735748291016, "learning_rate": 0.01, "loss": 1.4729, "loss/crossentropy": 2.5255725383758545, "loss/fcd": 1.25, "loss/logits": 0.30991658568382263, "step": 1079 }, { "epoch": 0.003429442398069351, "grad_norm": 0.30859375, "grad_norm_var": 0.0016924381256103516, "learning_rate": 0.01, "loss": 1.3775, "loss/crossentropy": 2.339387893676758, "loss/fcd": 1.078125, "loss/logits": 0.254665307700634, "step": 1080 }, { "epoch": 0.0034326178076971928, "grad_norm": 0.314453125, "grad_norm_var": 0.0017185846964518229, "learning_rate": 0.01, "loss": 1.3908, "loss/crossentropy": 2.440613031387329, "loss/fcd": 1.140625, "loss/logits": 0.26881206035614014, "step": 1081 }, { "epoch": 0.003435793217325035, "grad_norm": 0.337890625, "grad_norm_var": 0.0017134189605712891, "learning_rate": 0.01, "loss": 1.5371, "loss/crossentropy": 2.817227005958557, "loss/fcd": 1.2421875, "loss/logits": 0.2958840876817703, "step": 1082 }, { "epoch": 0.003438968626952877, "grad_norm": 0.322265625, "grad_norm_var": 0.0006853580474853515, "learning_rate": 0.01, "loss": 1.4194, "loss/crossentropy": 2.8334412574768066, "loss/fcd": 1.1953125, "loss/logits": 0.28198152780532837, "step": 1083 }, { "epoch": 0.0034421440365807188, "grad_norm": 0.314453125, "grad_norm_var": 0.0005751927693684896, "learning_rate": 0.01, "loss": 1.4076, "loss/crossentropy": 2.2784258127212524, "loss/fcd": 1.0703125, "loss/logits": 0.2574286684393883, "step": 1084 }, { "epoch": 0.003445319446208561, "grad_norm": 0.296875, "grad_norm_var": 0.0005019505818684896, "learning_rate": 0.01, "loss": 1.4527, "loss/crossentropy": 2.6472941637039185, "loss/fcd": 1.203125, "loss/logits": 0.2890602648258209, "step": 1085 }, { "epoch": 0.003448494855836403, "grad_norm": 0.310546875, "grad_norm_var": 0.0005048116048177084, "learning_rate": 0.01, "loss": 1.3779, "loss/crossentropy": 2.602465271949768, "loss/fcd": 1.109375, "loss/logits": 0.24668119102716446, "step": 1086 }, { "epoch": 0.0034516702654642448, "grad_norm": 0.3046875, "grad_norm_var": 0.0005121231079101562, "learning_rate": 0.01, "loss": 1.4004, "loss/crossentropy": 2.684326410293579, "loss/fcd": 1.14453125, "loss/logits": 0.25191009789705276, "step": 1087 }, { "epoch": 0.003454845675092087, "grad_norm": 0.2890625, "grad_norm_var": 0.00037994384765625, "learning_rate": 0.01, "loss": 1.4051, "loss/crossentropy": 2.5544471740722656, "loss/fcd": 1.140625, "loss/logits": 0.2585143595933914, "step": 1088 }, { "epoch": 0.003458021084719929, "grad_norm": 0.427734375, "grad_norm_var": 0.001111459732055664, "learning_rate": 0.01, "loss": 1.5067, "loss/crossentropy": 2.349204659461975, "loss/fcd": 1.20703125, "loss/logits": 0.2645433247089386, "step": 1089 }, { "epoch": 0.0034611964943477708, "grad_norm": 0.376953125, "grad_norm_var": 0.0011505126953125, "learning_rate": 0.01, "loss": 1.431, "loss/crossentropy": 2.887730121612549, "loss/fcd": 1.17578125, "loss/logits": 0.2853116989135742, "step": 1090 }, { "epoch": 0.003464371903975613, "grad_norm": 0.31640625, "grad_norm_var": 0.0011285781860351563, "learning_rate": 0.01, "loss": 1.4454, "loss/crossentropy": 2.500343084335327, "loss/fcd": 1.109375, "loss/logits": 0.2553185969591141, "step": 1091 }, { "epoch": 0.003467547313603455, "grad_norm": 0.3671875, "grad_norm_var": 0.0012404759724934896, "learning_rate": 0.01, "loss": 1.4313, "loss/crossentropy": 2.887548565864563, "loss/fcd": 1.1796875, "loss/logits": 0.27073781192302704, "step": 1092 }, { "epoch": 0.0034707227232312968, "grad_norm": 0.330078125, "grad_norm_var": 0.0012248833974202475, "learning_rate": 0.01, "loss": 1.4509, "loss/crossentropy": 2.6366779804229736, "loss/fcd": 1.1796875, "loss/logits": 0.28337880969047546, "step": 1093 }, { "epoch": 0.0034738981328591386, "grad_norm": 0.296875, "grad_norm_var": 0.0012827555338541666, "learning_rate": 0.01, "loss": 1.422, "loss/crossentropy": 2.5746976137161255, "loss/fcd": 1.16796875, "loss/logits": 0.27548307180404663, "step": 1094 }, { "epoch": 0.003477073542486981, "grad_norm": 0.376953125, "grad_norm_var": 0.0014316399892171225, "learning_rate": 0.01, "loss": 1.4594, "loss/crossentropy": 2.867998957633972, "loss/fcd": 1.19921875, "loss/logits": 0.27039359509944916, "step": 1095 }, { "epoch": 0.0034802489521148228, "grad_norm": 0.30859375, "grad_norm_var": 0.0014316399892171225, "learning_rate": 0.01, "loss": 1.4462, "loss/crossentropy": 2.5660641193389893, "loss/fcd": 1.13671875, "loss/logits": 0.26347437500953674, "step": 1096 }, { "epoch": 0.0034834243617426646, "grad_norm": 1.078125, "grad_norm_var": 0.036228179931640625, "learning_rate": 0.01, "loss": 1.5798, "loss/crossentropy": 2.6996599435806274, "loss/fcd": 1.14453125, "loss/logits": 0.27632369101047516, "step": 1097 }, { "epoch": 0.003486599771370507, "grad_norm": 0.50390625, "grad_norm_var": 0.03705366452534994, "learning_rate": 0.01, "loss": 1.5491, "loss/crossentropy": 2.7906017303466797, "loss/fcd": 1.19921875, "loss/logits": 0.27025599777698517, "step": 1098 }, { "epoch": 0.0034897751809983488, "grad_norm": 0.396484375, "grad_norm_var": 0.03673958778381348, "learning_rate": 0.01, "loss": 1.4586, "loss/crossentropy": 2.329752564430237, "loss/fcd": 1.140625, "loss/logits": 0.2620559558272362, "step": 1099 }, { "epoch": 0.0034929505906261906, "grad_norm": 0.53515625, "grad_norm_var": 0.03745981852213542, "learning_rate": 0.01, "loss": 1.5193, "loss/crossentropy": 2.733720302581787, "loss/fcd": 1.19140625, "loss/logits": 0.24466054886579514, "step": 1100 }, { "epoch": 0.003496126000254033, "grad_norm": 0.359375, "grad_norm_var": 0.03678436279296875, "learning_rate": 0.01, "loss": 1.3678, "loss/crossentropy": 2.31599223613739, "loss/fcd": 1.08984375, "loss/logits": 0.25845472514629364, "step": 1101 }, { "epoch": 0.0034993014098818748, "grad_norm": 0.45703125, "grad_norm_var": 0.03616089820861816, "learning_rate": 0.01, "loss": 1.5432, "loss/crossentropy": 2.339303970336914, "loss/fcd": 1.16796875, "loss/logits": 0.257125549018383, "step": 1102 }, { "epoch": 0.0035024768195097166, "grad_norm": 0.328125, "grad_norm_var": 0.0358339786529541, "learning_rate": 0.01, "loss": 1.4531, "loss/crossentropy": 2.5037355422973633, "loss/fcd": 1.09765625, "loss/logits": 0.24255798757076263, "step": 1103 }, { "epoch": 0.003505652229137559, "grad_norm": 0.341796875, "grad_norm_var": 0.035074806213378905, "learning_rate": 0.01, "loss": 1.4918, "loss/crossentropy": 2.7007148265838623, "loss/fcd": 1.09765625, "loss/logits": 0.25020332634449005, "step": 1104 }, { "epoch": 0.0035088276387654008, "grad_norm": 0.31640625, "grad_norm_var": 0.035809564590454104, "learning_rate": 0.01, "loss": 1.4169, "loss/crossentropy": 2.5569344758987427, "loss/fcd": 1.24609375, "loss/logits": 0.2790044695138931, "step": 1105 }, { "epoch": 0.0035120030483932426, "grad_norm": 0.365234375, "grad_norm_var": 0.035882425308227536, "learning_rate": 0.01, "loss": 1.4332, "loss/crossentropy": 2.5127986669540405, "loss/fcd": 1.19140625, "loss/logits": 0.26853661239147186, "step": 1106 }, { "epoch": 0.003515178458021085, "grad_norm": 0.353515625, "grad_norm_var": 0.03546899159749349, "learning_rate": 0.01, "loss": 1.4554, "loss/crossentropy": 2.4131299257278442, "loss/fcd": 1.12109375, "loss/logits": 0.25159407407045364, "step": 1107 }, { "epoch": 0.0035183538676489268, "grad_norm": 0.345703125, "grad_norm_var": 0.03564820289611816, "learning_rate": 0.01, "loss": 1.4349, "loss/crossentropy": 2.687081217765808, "loss/fcd": 1.12109375, "loss/logits": 0.26226551830768585, "step": 1108 }, { "epoch": 0.0035215292772767686, "grad_norm": 0.33203125, "grad_norm_var": 0.035625457763671875, "learning_rate": 0.01, "loss": 1.4258, "loss/crossentropy": 2.7863974571228027, "loss/fcd": 1.2578125, "loss/logits": 0.2852618992328644, "step": 1109 }, { "epoch": 0.0035247046869046105, "grad_norm": 0.34765625, "grad_norm_var": 0.03496341705322266, "learning_rate": 0.01, "loss": 1.4504, "loss/crossentropy": 2.146250069141388, "loss/fcd": 1.05859375, "loss/logits": 0.2503824681043625, "step": 1110 }, { "epoch": 0.0035278800965324528, "grad_norm": 0.345703125, "grad_norm_var": 0.035210609436035156, "learning_rate": 0.01, "loss": 1.4255, "loss/crossentropy": 2.7362266778945923, "loss/fcd": 1.2109375, "loss/logits": 0.3060525804758072, "step": 1111 }, { "epoch": 0.0035310555061602946, "grad_norm": 0.30859375, "grad_norm_var": 0.035210609436035156, "learning_rate": 0.01, "loss": 1.403, "loss/crossentropy": 2.55770480632782, "loss/fcd": 1.15234375, "loss/logits": 0.2610636502504349, "step": 1112 }, { "epoch": 0.0035342309157881365, "grad_norm": 0.328125, "grad_norm_var": 0.004522132873535156, "learning_rate": 0.01, "loss": 1.3738, "loss/crossentropy": 2.266708016395569, "loss/fcd": 1.1484375, "loss/logits": 0.24880962073802948, "step": 1113 }, { "epoch": 0.0035374063254159788, "grad_norm": 0.3984375, "grad_norm_var": 0.00337371826171875, "learning_rate": 0.01, "loss": 1.5027, "loss/crossentropy": 2.7558518648147583, "loss/fcd": 1.14453125, "loss/logits": 0.27063145488500595, "step": 1114 }, { "epoch": 0.0035405817350438206, "grad_norm": 0.421875, "grad_norm_var": 0.0035164992014567057, "learning_rate": 0.01, "loss": 1.521, "loss/crossentropy": 2.878845453262329, "loss/fcd": 1.30078125, "loss/logits": 0.33267462253570557, "step": 1115 }, { "epoch": 0.0035437571446716625, "grad_norm": 0.34765625, "grad_norm_var": 0.0015298048655192056, "learning_rate": 0.01, "loss": 1.4089, "loss/crossentropy": 2.51344895362854, "loss/fcd": 1.1484375, "loss/logits": 0.24806544929742813, "step": 1116 }, { "epoch": 0.0035469325542995048, "grad_norm": 0.3125, "grad_norm_var": 0.0016465346018473307, "learning_rate": 0.01, "loss": 1.4445, "loss/crossentropy": 2.537601590156555, "loss/fcd": 1.1640625, "loss/logits": 0.2726535201072693, "step": 1117 }, { "epoch": 0.0035501079639273466, "grad_norm": 0.341796875, "grad_norm_var": 0.0008803685506184896, "learning_rate": 0.01, "loss": 1.5131, "loss/crossentropy": 2.5657413005828857, "loss/fcd": 1.21484375, "loss/logits": 0.30701732635498047, "step": 1118 }, { "epoch": 0.0035532833735551885, "grad_norm": 0.390625, "grad_norm_var": 0.0009759902954101562, "learning_rate": 0.01, "loss": 1.4705, "loss/crossentropy": 2.332374691963196, "loss/fcd": 1.1953125, "loss/logits": 0.2707579508423805, "step": 1119 }, { "epoch": 0.0035564587831830308, "grad_norm": 0.333984375, "grad_norm_var": 0.0009881973266601563, "learning_rate": 0.01, "loss": 1.492, "loss/crossentropy": 2.6399765014648438, "loss/fcd": 1.22265625, "loss/logits": 0.29872867465019226, "step": 1120 }, { "epoch": 0.0035596341928108726, "grad_norm": 0.314453125, "grad_norm_var": 0.000997018814086914, "learning_rate": 0.01, "loss": 1.4214, "loss/crossentropy": 2.404382109642029, "loss/fcd": 1.2109375, "loss/logits": 0.2955154851078987, "step": 1121 }, { "epoch": 0.0035628096024387145, "grad_norm": 0.345703125, "grad_norm_var": 0.0009792168935139974, "learning_rate": 0.01, "loss": 1.5027, "loss/crossentropy": 2.434885621070862, "loss/fcd": 1.18359375, "loss/logits": 0.26397769153118134, "step": 1122 }, { "epoch": 0.0035659850120665568, "grad_norm": 0.322265625, "grad_norm_var": 0.0010173638661702474, "learning_rate": 0.01, "loss": 1.4434, "loss/crossentropy": 2.53593647480011, "loss/fcd": 1.28515625, "loss/logits": 0.3394608050584793, "step": 1123 }, { "epoch": 0.0035691604216943986, "grad_norm": 0.296875, "grad_norm_var": 0.0011687596638997396, "learning_rate": 0.01, "loss": 1.4066, "loss/crossentropy": 2.5355199575424194, "loss/fcd": 1.140625, "loss/logits": 0.25259605795145035, "step": 1124 }, { "epoch": 0.0035723358313222405, "grad_norm": 0.294921875, "grad_norm_var": 0.0013091882069905599, "learning_rate": 0.01, "loss": 1.4249, "loss/crossentropy": 2.5614614486694336, "loss/fcd": 1.11328125, "loss/logits": 0.2524689584970474, "step": 1125 }, { "epoch": 0.0035755112409500828, "grad_norm": 0.33203125, "grad_norm_var": 0.0013099511464436848, "learning_rate": 0.01, "loss": 1.4552, "loss/crossentropy": 2.5092010498046875, "loss/fcd": 1.28125, "loss/logits": 0.3110498636960983, "step": 1126 }, { "epoch": 0.0035786866505779246, "grad_norm": 0.27734375, "grad_norm_var": 0.0015474955240885417, "learning_rate": 0.01, "loss": 1.3622, "loss/crossentropy": 2.4361236095428467, "loss/fcd": 1.11328125, "loss/logits": 0.2490287721157074, "step": 1127 }, { "epoch": 0.0035818620602057665, "grad_norm": 0.294921875, "grad_norm_var": 0.001608133316040039, "learning_rate": 0.01, "loss": 1.341, "loss/crossentropy": 2.5093648433685303, "loss/fcd": 1.13671875, "loss/logits": 0.2512170225381851, "step": 1128 }, { "epoch": 0.0035850374698336083, "grad_norm": 0.3515625, "grad_norm_var": 0.0016222476959228515, "learning_rate": 0.01, "loss": 1.457, "loss/crossentropy": 2.4601742029190063, "loss/fcd": 1.19140625, "loss/logits": 0.2746012806892395, "step": 1129 }, { "epoch": 0.0035882128794614506, "grad_norm": 0.40234375, "grad_norm_var": 0.0016556898752848306, "learning_rate": 0.01, "loss": 1.527, "loss/crossentropy": 2.5778605937957764, "loss/fcd": 1.109375, "loss/logits": 0.2506415694952011, "step": 1130 }, { "epoch": 0.0035913882890892925, "grad_norm": 0.328125, "grad_norm_var": 0.0011353651682535807, "learning_rate": 0.01, "loss": 1.4656, "loss/crossentropy": 2.8056509494781494, "loss/fcd": 1.3203125, "loss/logits": 0.3543570637702942, "step": 1131 }, { "epoch": 0.0035945636987171343, "grad_norm": 0.306640625, "grad_norm_var": 0.0011463801066080729, "learning_rate": 0.01, "loss": 1.3919, "loss/crossentropy": 2.2685288190841675, "loss/fcd": 1.052734375, "loss/logits": 0.24789728969335556, "step": 1132 }, { "epoch": 0.0035977391083449766, "grad_norm": 0.302734375, "grad_norm_var": 0.0011723677317301432, "learning_rate": 0.01, "loss": 1.4475, "loss/crossentropy": 2.7631455659866333, "loss/fcd": 1.1328125, "loss/logits": 0.2655462175607681, "step": 1133 }, { "epoch": 0.0036009145179728185, "grad_norm": 0.32421875, "grad_norm_var": 0.0011576334635416666, "learning_rate": 0.01, "loss": 1.4515, "loss/crossentropy": 2.483760356903076, "loss/fcd": 1.1796875, "loss/logits": 0.2760060429573059, "step": 1134 }, { "epoch": 0.0036040899276006603, "grad_norm": 0.353515625, "grad_norm_var": 0.0009247938791910808, "learning_rate": 0.01, "loss": 1.4528, "loss/crossentropy": 2.5544368028640747, "loss/fcd": 1.23828125, "loss/logits": 0.27079054713249207, "step": 1135 }, { "epoch": 0.0036072653372285026, "grad_norm": 0.318359375, "grad_norm_var": 0.0009189446767171224, "learning_rate": 0.01, "loss": 1.4598, "loss/crossentropy": 2.7300420999526978, "loss/fcd": 1.2265625, "loss/logits": 0.30273695290088654, "step": 1136 }, { "epoch": 0.0036104407468563445, "grad_norm": 0.29296875, "grad_norm_var": 0.0009719212849934896, "learning_rate": 0.01, "loss": 1.4023, "loss/crossentropy": 2.455302119255066, "loss/fcd": 1.07421875, "loss/logits": 0.24349002540111542, "step": 1137 }, { "epoch": 0.0036136161564841863, "grad_norm": 0.302734375, "grad_norm_var": 0.0009488423665364583, "learning_rate": 0.01, "loss": 1.4074, "loss/crossentropy": 2.286602735519409, "loss/fcd": 1.2734375, "loss/logits": 0.26189613342285156, "step": 1138 }, { "epoch": 0.0036167915661120286, "grad_norm": 0.333984375, "grad_norm_var": 0.0009627660115559896, "learning_rate": 0.01, "loss": 1.3884, "loss/crossentropy": 2.611856460571289, "loss/fcd": 1.22265625, "loss/logits": 0.27988147735595703, "step": 1139 }, { "epoch": 0.0036199669757398705, "grad_norm": 0.3203125, "grad_norm_var": 0.0009261449178059896, "learning_rate": 0.01, "loss": 1.4244, "loss/crossentropy": 2.638223886489868, "loss/fcd": 1.17578125, "loss/logits": 0.30142582952976227, "step": 1140 }, { "epoch": 0.0036231423853677123, "grad_norm": 0.3203125, "grad_norm_var": 0.000878000259399414, "learning_rate": 0.01, "loss": 1.4393, "loss/crossentropy": 2.5843143463134766, "loss/fcd": 1.1171875, "loss/logits": 0.25653908401727676, "step": 1141 }, { "epoch": 0.0036263177949955546, "grad_norm": 0.359375, "grad_norm_var": 0.0009589989980061849, "learning_rate": 0.01, "loss": 1.393, "loss/crossentropy": 2.6501078605651855, "loss/fcd": 1.19140625, "loss/logits": 0.25969812273979187, "step": 1142 }, { "epoch": 0.0036294932046233965, "grad_norm": 0.3359375, "grad_norm_var": 0.0008064111073811849, "learning_rate": 0.01, "loss": 1.4579, "loss/crossentropy": 2.6016554832458496, "loss/fcd": 1.19921875, "loss/logits": 0.28507116436958313, "step": 1143 }, { "epoch": 0.0036326686142512383, "grad_norm": 1.2734375, "grad_norm_var": 0.056333669026692706, "learning_rate": 0.01, "loss": 1.4899, "loss/crossentropy": 2.667087197303772, "loss/fcd": 1.11328125, "loss/logits": 0.24600353091955185, "step": 1144 }, { "epoch": 0.00363584402387908, "grad_norm": 0.45703125, "grad_norm_var": 0.056500180562337236, "learning_rate": 0.01, "loss": 1.5425, "loss/crossentropy": 2.4002511501312256, "loss/fcd": 1.1875, "loss/logits": 0.2555523067712784, "step": 1145 }, { "epoch": 0.0036390194335069225, "grad_norm": 0.341796875, "grad_norm_var": 0.05667608578999837, "learning_rate": 0.01, "loss": 1.4625, "loss/crossentropy": 2.6032490730285645, "loss/fcd": 1.18359375, "loss/logits": 0.27435505390167236, "step": 1146 }, { "epoch": 0.0036421948431347643, "grad_norm": 0.34765625, "grad_norm_var": 0.05653367042541504, "learning_rate": 0.01, "loss": 1.4204, "loss/crossentropy": 2.4651230573654175, "loss/fcd": 1.125, "loss/logits": 0.2620478793978691, "step": 1147 }, { "epoch": 0.003645370252762606, "grad_norm": 0.3046875, "grad_norm_var": 0.056556447347005205, "learning_rate": 0.01, "loss": 1.3949, "loss/crossentropy": 2.58842933177948, "loss/fcd": 1.1484375, "loss/logits": 0.2604793608188629, "step": 1148 }, { "epoch": 0.0036485456623904485, "grad_norm": 0.337890625, "grad_norm_var": 0.056210263570149736, "learning_rate": 0.01, "loss": 1.4123, "loss/crossentropy": 2.255470633506775, "loss/fcd": 1.1328125, "loss/logits": 0.25727608799934387, "step": 1149 }, { "epoch": 0.0036517210720182903, "grad_norm": 0.33984375, "grad_norm_var": 0.05607751210530599, "learning_rate": 0.01, "loss": 1.438, "loss/crossentropy": 2.4568862915039062, "loss/fcd": 1.1484375, "loss/logits": 0.24860578775405884, "step": 1150 }, { "epoch": 0.003654896481646132, "grad_norm": 0.310546875, "grad_norm_var": 0.05643768310546875, "learning_rate": 0.01, "loss": 1.4201, "loss/crossentropy": 2.601644992828369, "loss/fcd": 1.21875, "loss/logits": 0.3009377270936966, "step": 1151 }, { "epoch": 0.0036580718912739745, "grad_norm": 0.322265625, "grad_norm_var": 0.05639947255452474, "learning_rate": 0.01, "loss": 1.469, "loss/crossentropy": 2.463584780693054, "loss/fcd": 1.11328125, "loss/logits": 0.26111412048339844, "step": 1152 }, { "epoch": 0.0036612473009018163, "grad_norm": 0.357421875, "grad_norm_var": 0.05579260190327962, "learning_rate": 0.01, "loss": 1.4458, "loss/crossentropy": 2.610145092010498, "loss/fcd": 1.11328125, "loss/logits": 0.2534957379102707, "step": 1153 }, { "epoch": 0.003664422710529658, "grad_norm": 0.330078125, "grad_norm_var": 0.05549263954162598, "learning_rate": 0.01, "loss": 1.4248, "loss/crossentropy": 2.7191637754440308, "loss/fcd": 1.19921875, "loss/logits": 0.28314654529094696, "step": 1154 }, { "epoch": 0.0036675981201575005, "grad_norm": 0.31640625, "grad_norm_var": 0.05566558837890625, "learning_rate": 0.01, "loss": 1.4213, "loss/crossentropy": 2.5929548740386963, "loss/fcd": 1.203125, "loss/logits": 0.2869616895914078, "step": 1155 }, { "epoch": 0.0036707735297853423, "grad_norm": 0.3203125, "grad_norm_var": 0.05566558837890625, "learning_rate": 0.01, "loss": 1.4441, "loss/crossentropy": 2.391794204711914, "loss/fcd": 1.171875, "loss/logits": 0.2582762539386749, "step": 1156 }, { "epoch": 0.003673948939413184, "grad_norm": 0.33984375, "grad_norm_var": 0.055485979715983076, "learning_rate": 0.01, "loss": 1.4683, "loss/crossentropy": 2.6816611289978027, "loss/fcd": 1.21875, "loss/logits": 0.2884572744369507, "step": 1157 }, { "epoch": 0.0036771243490410265, "grad_norm": 0.337890625, "grad_norm_var": 0.05563022295633952, "learning_rate": 0.01, "loss": 1.4622, "loss/crossentropy": 2.510971784591675, "loss/fcd": 1.125, "loss/logits": 0.2614917606115341, "step": 1158 }, { "epoch": 0.0036802997586688683, "grad_norm": 0.330078125, "grad_norm_var": 0.055681101481119794, "learning_rate": 0.01, "loss": 1.4685, "loss/crossentropy": 2.598991870880127, "loss/fcd": 1.1796875, "loss/logits": 0.2745528370141983, "step": 1159 }, { "epoch": 0.00368347516829671, "grad_norm": 0.349609375, "grad_norm_var": 0.0011821587880452473, "learning_rate": 0.01, "loss": 1.422, "loss/crossentropy": 2.6106759309768677, "loss/fcd": 1.25390625, "loss/logits": 0.3237348794937134, "step": 1160 }, { "epoch": 0.0036866505779245525, "grad_norm": 0.361328125, "grad_norm_var": 0.0002639134724934896, "learning_rate": 0.01, "loss": 1.4486, "loss/crossentropy": 2.519943952560425, "loss/fcd": 1.0703125, "loss/logits": 0.25225630402565, "step": 1161 }, { "epoch": 0.0036898259875523943, "grad_norm": 0.326171875, "grad_norm_var": 0.00026340484619140624, "learning_rate": 0.01, "loss": 1.4599, "loss/crossentropy": 2.6115747690200806, "loss/fcd": 1.1328125, "loss/logits": 0.25031594932079315, "step": 1162 }, { "epoch": 0.003693001397180236, "grad_norm": 0.435546875, "grad_norm_var": 0.0009150028228759766, "learning_rate": 0.01, "loss": 1.4833, "loss/crossentropy": 3.348997116088867, "loss/fcd": 1.37890625, "loss/logits": 0.3237081617116928, "step": 1163 }, { "epoch": 0.003696176806808078, "grad_norm": 0.451171875, "grad_norm_var": 0.0015909194946289063, "learning_rate": 0.01, "loss": 1.4294, "loss/crossentropy": 2.493046522140503, "loss/fcd": 1.0859375, "loss/logits": 0.24104822427034378, "step": 1164 }, { "epoch": 0.0036993522164359203, "grad_norm": 0.330078125, "grad_norm_var": 0.0016051610310872397, "learning_rate": 0.01, "loss": 1.4157, "loss/crossentropy": 2.6022807359695435, "loss/fcd": 1.1875, "loss/logits": 0.28911933302879333, "step": 1165 }, { "epoch": 0.003702527626063762, "grad_norm": 0.333984375, "grad_norm_var": 0.0016132195790608723, "learning_rate": 0.01, "loss": 1.4095, "loss/crossentropy": 2.4118971824645996, "loss/fcd": 1.14453125, "loss/logits": 0.254654124379158, "step": 1166 }, { "epoch": 0.003705703035691604, "grad_norm": 0.302734375, "grad_norm_var": 0.0016550540924072266, "learning_rate": 0.01, "loss": 1.41, "loss/crossentropy": 2.495501756668091, "loss/fcd": 1.1015625, "loss/logits": 0.27784664928913116, "step": 1167 }, { "epoch": 0.0037088784453194463, "grad_norm": 0.3125, "grad_norm_var": 0.0016926447550455728, "learning_rate": 0.01, "loss": 1.472, "loss/crossentropy": 2.8153276443481445, "loss/fcd": 1.3125, "loss/logits": 0.3257535398006439, "step": 1168 }, { "epoch": 0.003712053854947288, "grad_norm": 0.392578125, "grad_norm_var": 0.0018236796061197917, "learning_rate": 0.01, "loss": 1.4367, "loss/crossentropy": 3.1086610555648804, "loss/fcd": 1.26171875, "loss/logits": 0.2940136045217514, "step": 1169 }, { "epoch": 0.00371522926457513, "grad_norm": 0.375, "grad_norm_var": 0.001841592788696289, "learning_rate": 0.01, "loss": 1.58, "loss/crossentropy": 2.70519483089447, "loss/fcd": 1.23828125, "loss/logits": 0.27308498322963715, "step": 1170 }, { "epoch": 0.0037184046742029723, "grad_norm": 0.53125, "grad_norm_var": 0.0037368615468343098, "learning_rate": 0.01, "loss": 1.4199, "loss/crossentropy": 2.6044522523880005, "loss/fcd": 1.19921875, "loss/logits": 0.28372132778167725, "step": 1171 }, { "epoch": 0.003721580083830814, "grad_norm": 0.349609375, "grad_norm_var": 0.0036183675130208332, "learning_rate": 0.01, "loss": 1.3992, "loss/crossentropy": 2.6556203365325928, "loss/fcd": 1.19140625, "loss/logits": 0.2799815833568573, "step": 1172 }, { "epoch": 0.003724755493458656, "grad_norm": 0.375, "grad_norm_var": 0.0035720189412434896, "learning_rate": 0.01, "loss": 1.4651, "loss/crossentropy": 2.3634873628616333, "loss/fcd": 1.23828125, "loss/logits": 0.25702004134655, "step": 1173 }, { "epoch": 0.0037279309030864983, "grad_norm": 0.419921875, "grad_norm_var": 0.0036588033040364583, "learning_rate": 0.01, "loss": 1.5503, "loss/crossentropy": 2.7055184841156006, "loss/fcd": 1.234375, "loss/logits": 0.2950890064239502, "step": 1174 }, { "epoch": 0.00373110631271434, "grad_norm": 0.3828125, "grad_norm_var": 0.0035270531972249348, "learning_rate": 0.01, "loss": 1.4235, "loss/crossentropy": 2.6725181341171265, "loss/fcd": 1.109375, "loss/logits": 0.2383343204855919, "step": 1175 }, { "epoch": 0.003734281722342182, "grad_norm": 0.3046875, "grad_norm_var": 0.00381622314453125, "learning_rate": 0.01, "loss": 1.4212, "loss/crossentropy": 2.4386379718780518, "loss/fcd": 1.1171875, "loss/logits": 0.259709395468235, "step": 1176 }, { "epoch": 0.0037374571319700243, "grad_norm": 0.349609375, "grad_norm_var": 0.0038446426391601563, "learning_rate": 0.01, "loss": 1.4798, "loss/crossentropy": 2.544153571128845, "loss/fcd": 1.23046875, "loss/logits": 0.2750667631626129, "step": 1177 }, { "epoch": 0.003740632541597866, "grad_norm": 0.35546875, "grad_norm_var": 0.0037142276763916016, "learning_rate": 0.01, "loss": 1.4649, "loss/crossentropy": 2.6141992807388306, "loss/fcd": 1.15625, "loss/logits": 0.2785794734954834, "step": 1178 }, { "epoch": 0.003743807951225708, "grad_norm": 0.337890625, "grad_norm_var": 0.0035234928131103516, "learning_rate": 0.01, "loss": 1.4687, "loss/crossentropy": 2.446426749229431, "loss/fcd": 1.26171875, "loss/logits": 0.27905312180519104, "step": 1179 }, { "epoch": 0.00374698336085355, "grad_norm": 0.33984375, "grad_norm_var": 0.003078651428222656, "learning_rate": 0.01, "loss": 1.4664, "loss/crossentropy": 2.2693610191345215, "loss/fcd": 1.09375, "loss/logits": 0.25085025280714035, "step": 1180 }, { "epoch": 0.003750158770481392, "grad_norm": 0.3359375, "grad_norm_var": 0.0030558109283447266, "learning_rate": 0.01, "loss": 1.4078, "loss/crossentropy": 2.4158496856689453, "loss/fcd": 1.0703125, "loss/logits": 0.23332488536834717, "step": 1181 }, { "epoch": 0.003753334180109234, "grad_norm": 0.484375, "grad_norm_var": 0.003899065653483073, "learning_rate": 0.01, "loss": 1.4347, "loss/crossentropy": 2.5204185247421265, "loss/fcd": 1.14453125, "loss/logits": 0.2420320212841034, "step": 1182 }, { "epoch": 0.003756509589737076, "grad_norm": 0.3125, "grad_norm_var": 0.0038150628407796223, "learning_rate": 0.01, "loss": 1.4886, "loss/crossentropy": 2.595679521560669, "loss/fcd": 1.171875, "loss/logits": 0.2582363486289978, "step": 1183 }, { "epoch": 0.003759684999364918, "grad_norm": 0.31640625, "grad_norm_var": 0.003784799575805664, "learning_rate": 0.01, "loss": 1.3742, "loss/crossentropy": 2.304825782775879, "loss/fcd": 1.08203125, "loss/logits": 0.2323903813958168, "step": 1184 }, { "epoch": 0.00376286040899276, "grad_norm": 0.3359375, "grad_norm_var": 0.0038350423177083335, "learning_rate": 0.01, "loss": 1.4899, "loss/crossentropy": 2.619007110595703, "loss/fcd": 1.1640625, "loss/logits": 0.2710985243320465, "step": 1185 }, { "epoch": 0.003766035818620602, "grad_norm": 0.33984375, "grad_norm_var": 0.0038848241170247396, "learning_rate": 0.01, "loss": 1.4657, "loss/crossentropy": 2.3884021043777466, "loss/fcd": 1.09375, "loss/logits": 0.24730277061462402, "step": 1186 }, { "epoch": 0.003769211228248444, "grad_norm": 0.376953125, "grad_norm_var": 0.001992527643839518, "learning_rate": 0.01, "loss": 1.4555, "loss/crossentropy": 2.5815001726150513, "loss/fcd": 1.13671875, "loss/logits": 0.23558396100997925, "step": 1187 }, { "epoch": 0.003772386637876286, "grad_norm": 0.349609375, "grad_norm_var": 0.001992527643839518, "learning_rate": 0.01, "loss": 1.4906, "loss/crossentropy": 2.62746798992157, "loss/fcd": 1.16796875, "loss/logits": 0.2597718983888626, "step": 1188 }, { "epoch": 0.003775562047504128, "grad_norm": 0.359375, "grad_norm_var": 0.0019709110260009766, "learning_rate": 0.01, "loss": 1.4444, "loss/crossentropy": 2.595371127128601, "loss/fcd": 1.23828125, "loss/logits": 0.34847554564476013, "step": 1189 }, { "epoch": 0.00377873745713197, "grad_norm": 0.32421875, "grad_norm_var": 0.0017318089803059896, "learning_rate": 0.01, "loss": 1.4189, "loss/crossentropy": 2.5984610319137573, "loss/fcd": 1.13671875, "loss/logits": 0.2660645693540573, "step": 1190 }, { "epoch": 0.003781912866759812, "grad_norm": 0.34375, "grad_norm_var": 0.0016580581665039062, "learning_rate": 0.01, "loss": 1.4607, "loss/crossentropy": 2.313586950302124, "loss/fcd": 1.17578125, "loss/logits": 0.2551446408033371, "step": 1191 }, { "epoch": 0.003785088276387654, "grad_norm": 0.3203125, "grad_norm_var": 0.0015832901000976563, "learning_rate": 0.01, "loss": 1.4547, "loss/crossentropy": 2.315441131591797, "loss/fcd": 1.09375, "loss/logits": 0.2538810521364212, "step": 1192 }, { "epoch": 0.003788263686015496, "grad_norm": 0.283203125, "grad_norm_var": 0.0018524169921875, "learning_rate": 0.01, "loss": 1.4022, "loss/crossentropy": 2.549360990524292, "loss/fcd": 1.19921875, "loss/logits": 0.28008708357810974, "step": 1193 }, { "epoch": 0.003791439095643338, "grad_norm": 0.345703125, "grad_norm_var": 0.0018443902333577474, "learning_rate": 0.01, "loss": 1.4605, "loss/crossentropy": 2.635983109474182, "loss/fcd": 1.18359375, "loss/logits": 0.277266263961792, "step": 1194 }, { "epoch": 0.00379461450527118, "grad_norm": 0.36328125, "grad_norm_var": 0.0018636067708333334, "learning_rate": 0.01, "loss": 1.5264, "loss/crossentropy": 2.5787789821624756, "loss/fcd": 1.12890625, "loss/logits": 0.24805738031864166, "step": 1195 }, { "epoch": 0.0037977899148990218, "grad_norm": 0.287109375, "grad_norm_var": 0.0020786126454671225, "learning_rate": 0.01, "loss": 1.4094, "loss/crossentropy": 2.428821086883545, "loss/fcd": 1.0625, "loss/logits": 0.24732781201601028, "step": 1196 }, { "epoch": 0.003800965324526864, "grad_norm": 0.294921875, "grad_norm_var": 0.0022191365559895834, "learning_rate": 0.01, "loss": 1.4643, "loss/crossentropy": 2.7191922664642334, "loss/fcd": 1.33203125, "loss/logits": 0.36607836186885834, "step": 1197 }, { "epoch": 0.003804140734154706, "grad_norm": 0.306640625, "grad_norm_var": 0.0007683912913004558, "learning_rate": 0.01, "loss": 1.4274, "loss/crossentropy": 2.2969586849212646, "loss/fcd": 1.1640625, "loss/logits": 0.2711605355143547, "step": 1198 }, { "epoch": 0.0038073161437825478, "grad_norm": 0.369140625, "grad_norm_var": 0.0008462905883789062, "learning_rate": 0.01, "loss": 1.4923, "loss/crossentropy": 2.5898343324661255, "loss/fcd": 1.421875, "loss/logits": 0.38097694516181946, "step": 1199 }, { "epoch": 0.00381049155341039, "grad_norm": 0.29296875, "grad_norm_var": 0.0009302139282226562, "learning_rate": 0.01, "loss": 1.3919, "loss/crossentropy": 2.4159947633743286, "loss/fcd": 1.08984375, "loss/logits": 0.23935139924287796, "step": 1200 }, { "epoch": 0.003813666963038232, "grad_norm": 0.3046875, "grad_norm_var": 0.0009698867797851562, "learning_rate": 0.01, "loss": 1.3903, "loss/crossentropy": 2.5132360458374023, "loss/fcd": 1.07421875, "loss/logits": 0.23953358083963394, "step": 1201 }, { "epoch": 0.0038168423726660738, "grad_norm": 0.306640625, "grad_norm_var": 0.000990152359008789, "learning_rate": 0.01, "loss": 1.4423, "loss/crossentropy": 2.67066490650177, "loss/fcd": 1.1875, "loss/logits": 0.24720671027898788, "step": 1202 }, { "epoch": 0.003820017782293916, "grad_norm": 0.294921875, "grad_norm_var": 0.0008619785308837891, "learning_rate": 0.01, "loss": 1.3986, "loss/crossentropy": 2.5758577585220337, "loss/fcd": 1.1171875, "loss/logits": 0.2567456513643265, "step": 1203 }, { "epoch": 0.003823193191921758, "grad_norm": 0.2890625, "grad_norm_var": 0.0008654276529947917, "learning_rate": 0.01, "loss": 1.4, "loss/crossentropy": 2.3205143213272095, "loss/fcd": 1.07421875, "loss/logits": 0.2225775122642517, "step": 1204 }, { "epoch": 0.0038263686015495998, "grad_norm": 0.333984375, "grad_norm_var": 0.0007652123769124349, "learning_rate": 0.01, "loss": 1.4523, "loss/crossentropy": 2.6907862424850464, "loss/fcd": 1.171875, "loss/logits": 0.25760623812675476, "step": 1205 }, { "epoch": 0.003829544011177442, "grad_norm": 0.37890625, "grad_norm_var": 0.001009988784790039, "learning_rate": 0.01, "loss": 1.4808, "loss/crossentropy": 2.2297027111053467, "loss/fcd": 1.06640625, "loss/logits": 0.24180932343006134, "step": 1206 }, { "epoch": 0.003832719420805284, "grad_norm": 0.33984375, "grad_norm_var": 0.0009984175364176433, "learning_rate": 0.01, "loss": 1.441, "loss/crossentropy": 2.418247699737549, "loss/fcd": 1.2734375, "loss/logits": 0.299650639295578, "step": 1207 }, { "epoch": 0.0038358948304331258, "grad_norm": 0.283203125, "grad_norm_var": 0.0010802586873372396, "learning_rate": 0.01, "loss": 1.4242, "loss/crossentropy": 2.539069175720215, "loss/fcd": 1.12890625, "loss/logits": 0.24756956845521927, "step": 1208 }, { "epoch": 0.003839070240060968, "grad_norm": 0.333984375, "grad_norm_var": 0.00101165771484375, "learning_rate": 0.01, "loss": 1.4575, "loss/crossentropy": 2.5147788524627686, "loss/fcd": 1.140625, "loss/logits": 0.26732076704502106, "step": 1209 }, { "epoch": 0.00384224564968881, "grad_norm": 0.36328125, "grad_norm_var": 0.0010904788970947266, "learning_rate": 0.01, "loss": 1.483, "loss/crossentropy": 2.5722566843032837, "loss/fcd": 1.3203125, "loss/logits": 0.33432814478874207, "step": 1210 }, { "epoch": 0.0038454210593166518, "grad_norm": 0.341796875, "grad_norm_var": 0.0009993871053059896, "learning_rate": 0.01, "loss": 1.4951, "loss/crossentropy": 2.5750855207443237, "loss/fcd": 1.2421875, "loss/logits": 0.27135491371154785, "step": 1211 }, { "epoch": 0.003848596468944494, "grad_norm": 0.375, "grad_norm_var": 0.0010959466298421225, "learning_rate": 0.01, "loss": 1.4701, "loss/crossentropy": 2.5221216678619385, "loss/fcd": 1.19921875, "loss/logits": 0.27493688464164734, "step": 1212 }, { "epoch": 0.003851771878572336, "grad_norm": 0.306640625, "grad_norm_var": 0.001056655248006185, "learning_rate": 0.01, "loss": 1.4395, "loss/crossentropy": 2.450919985771179, "loss/fcd": 1.1015625, "loss/logits": 0.24379052966833115, "step": 1213 }, { "epoch": 0.0038549472882001778, "grad_norm": 0.30078125, "grad_norm_var": 0.0010741551717122395, "learning_rate": 0.01, "loss": 1.413, "loss/crossentropy": 2.6099480390548706, "loss/fcd": 1.3046875, "loss/logits": 0.32142966985702515, "step": 1214 }, { "epoch": 0.0038581226978280196, "grad_norm": 0.42578125, "grad_norm_var": 0.0016010125478108724, "learning_rate": 0.01, "loss": 1.4761, "loss/crossentropy": 2.912670850753784, "loss/fcd": 1.2109375, "loss/logits": 0.28954024612903595, "step": 1215 }, { "epoch": 0.003861298107455862, "grad_norm": 0.310546875, "grad_norm_var": 0.0015347798665364583, "learning_rate": 0.01, "loss": 1.4224, "loss/crossentropy": 2.7353790998458862, "loss/fcd": 1.14453125, "loss/logits": 0.2762772664427757, "step": 1216 }, { "epoch": 0.0038644735170837038, "grad_norm": 0.359375, "grad_norm_var": 0.0015329996744791667, "learning_rate": 0.01, "loss": 1.4699, "loss/crossentropy": 2.4240529537200928, "loss/fcd": 1.15234375, "loss/logits": 0.2758324146270752, "step": 1217 }, { "epoch": 0.0038676489267115456, "grad_norm": 0.33203125, "grad_norm_var": 0.0014807224273681641, "learning_rate": 0.01, "loss": 1.4624, "loss/crossentropy": 2.537565231323242, "loss/fcd": 1.15625, "loss/logits": 0.28351983428001404, "step": 1218 }, { "epoch": 0.003870824336339388, "grad_norm": 0.36328125, "grad_norm_var": 0.00140228271484375, "learning_rate": 0.01, "loss": 1.465, "loss/crossentropy": 2.8240954875946045, "loss/fcd": 1.21875, "loss/logits": 0.2765521854162216, "step": 1219 }, { "epoch": 0.0038739997459672298, "grad_norm": 0.337890625, "grad_norm_var": 0.00122068723042806, "learning_rate": 0.01, "loss": 1.4895, "loss/crossentropy": 2.3731629848480225, "loss/fcd": 1.15625, "loss/logits": 0.26921818405389786, "step": 1220 }, { "epoch": 0.0038771751555950716, "grad_norm": 0.337890625, "grad_norm_var": 0.0012169996897379556, "learning_rate": 0.01, "loss": 1.465, "loss/crossentropy": 2.8345093727111816, "loss/fcd": 1.33984375, "loss/logits": 0.3159555047750473, "step": 1221 }, { "epoch": 0.003880350565222914, "grad_norm": 0.3046875, "grad_norm_var": 0.0012073357899983724, "learning_rate": 0.01, "loss": 1.4224, "loss/crossentropy": 2.381617784500122, "loss/fcd": 1.193359375, "loss/logits": 0.31565016508102417, "step": 1222 }, { "epoch": 0.0038835259748507558, "grad_norm": 0.37109375, "grad_norm_var": 0.001273965835571289, "learning_rate": 0.01, "loss": 1.4324, "loss/crossentropy": 2.487219214439392, "loss/fcd": 1.08984375, "loss/logits": 0.23848335444927216, "step": 1223 }, { "epoch": 0.0038867013844785976, "grad_norm": 0.345703125, "grad_norm_var": 0.0010410149892171223, "learning_rate": 0.01, "loss": 1.4393, "loss/crossentropy": 2.5517133474349976, "loss/fcd": 1.25390625, "loss/logits": 0.28321684151887894, "step": 1224 }, { "epoch": 0.00388987679410644, "grad_norm": 0.328125, "grad_norm_var": 0.0010512669881184895, "learning_rate": 0.01, "loss": 1.4263, "loss/crossentropy": 2.6097363233566284, "loss/fcd": 1.125, "loss/logits": 0.26069173216819763, "step": 1225 }, { "epoch": 0.0038930522037342818, "grad_norm": 0.337890625, "grad_norm_var": 0.0010262648264567056, "learning_rate": 0.01, "loss": 1.4804, "loss/crossentropy": 2.659387707710266, "loss/fcd": 1.2421875, "loss/logits": 0.3225764334201813, "step": 1226 }, { "epoch": 0.0038962276133621236, "grad_norm": 0.490234375, "grad_norm_var": 0.0023912906646728514, "learning_rate": 0.01, "loss": 1.6508, "loss/crossentropy": 2.743789553642273, "loss/fcd": 1.20703125, "loss/logits": 0.29360631108283997, "step": 1227 }, { "epoch": 0.003899403022989966, "grad_norm": 0.353515625, "grad_norm_var": 0.0023533503214518228, "learning_rate": 0.01, "loss": 1.5089, "loss/crossentropy": 2.6374075412750244, "loss/fcd": 1.1953125, "loss/logits": 0.3003646731376648, "step": 1228 }, { "epoch": 0.0039025784326178078, "grad_norm": 0.33984375, "grad_norm_var": 0.0022287845611572267, "learning_rate": 0.01, "loss": 1.4435, "loss/crossentropy": 2.4471116065979004, "loss/fcd": 1.10546875, "loss/logits": 0.26029494404792786, "step": 1229 }, { "epoch": 0.0039057538422456496, "grad_norm": 0.31640625, "grad_norm_var": 0.0021364688873291016, "learning_rate": 0.01, "loss": 1.435, "loss/crossentropy": 2.551659107208252, "loss/fcd": 1.21484375, "loss/logits": 0.29492516815662384, "step": 1230 }, { "epoch": 0.003908929251873492, "grad_norm": 0.330078125, "grad_norm_var": 0.0017852147420247396, "learning_rate": 0.01, "loss": 1.4387, "loss/crossentropy": 2.413479208946228, "loss/fcd": 1.15625, "loss/logits": 0.2588907405734062, "step": 1231 }, { "epoch": 0.003912104661501333, "grad_norm": 0.357421875, "grad_norm_var": 0.0016921361287434895, "learning_rate": 0.01, "loss": 1.5254, "loss/crossentropy": 2.34735369682312, "loss/fcd": 1.21875, "loss/logits": 0.2507110461592674, "step": 1232 }, { "epoch": 0.003915280071129176, "grad_norm": 0.328125, "grad_norm_var": 0.0017155329386393228, "learning_rate": 0.01, "loss": 1.4269, "loss/crossentropy": 2.565745711326599, "loss/fcd": 1.16015625, "loss/logits": 0.25614041835069656, "step": 1233 }, { "epoch": 0.003918455480757018, "grad_norm": 0.3203125, "grad_norm_var": 0.0017496744791666667, "learning_rate": 0.01, "loss": 1.3575, "loss/crossentropy": 2.4729398488998413, "loss/fcd": 1.140625, "loss/logits": 0.24720896780490875, "step": 1234 }, { "epoch": 0.003921630890384859, "grad_norm": 0.31640625, "grad_norm_var": 0.0017893473307291667, "learning_rate": 0.01, "loss": 1.3981, "loss/crossentropy": 2.6101608276367188, "loss/fcd": 1.14453125, "loss/logits": 0.2799260914325714, "step": 1235 }, { "epoch": 0.003924806300012702, "grad_norm": 0.322265625, "grad_norm_var": 0.00181884765625, "learning_rate": 0.01, "loss": 1.4678, "loss/crossentropy": 2.6857218742370605, "loss/fcd": 1.21484375, "loss/logits": 0.2783673256635666, "step": 1236 }, { "epoch": 0.003927981709640544, "grad_norm": 0.318359375, "grad_norm_var": 0.0018579483032226563, "learning_rate": 0.01, "loss": 1.3948, "loss/crossentropy": 2.5325331687927246, "loss/fcd": 1.14453125, "loss/logits": 0.25479215383529663, "step": 1237 }, { "epoch": 0.003931157119268385, "grad_norm": 0.3046875, "grad_norm_var": 0.0018579483032226563, "learning_rate": 0.01, "loss": 1.4789, "loss/crossentropy": 2.796862244606018, "loss/fcd": 1.21484375, "loss/logits": 0.27796201407909393, "step": 1238 }, { "epoch": 0.003934332528896228, "grad_norm": 0.326171875, "grad_norm_var": 0.0018129825592041015, "learning_rate": 0.01, "loss": 1.4561, "loss/crossentropy": 2.643467903137207, "loss/fcd": 1.2109375, "loss/logits": 0.2904894948005676, "step": 1239 }, { "epoch": 0.00393750793852407, "grad_norm": 0.359375, "grad_norm_var": 0.0018355687459309896, "learning_rate": 0.01, "loss": 1.4817, "loss/crossentropy": 2.603648066520691, "loss/fcd": 1.18359375, "loss/logits": 0.27167941629886627, "step": 1240 }, { "epoch": 0.003940683348151911, "grad_norm": 0.33984375, "grad_norm_var": 0.0018246968587239583, "learning_rate": 0.01, "loss": 1.5102, "loss/crossentropy": 2.719313621520996, "loss/fcd": 1.26171875, "loss/logits": 0.3047143220901489, "step": 1241 }, { "epoch": 0.003943858757779754, "grad_norm": 0.408203125, "grad_norm_var": 0.002101643880208333, "learning_rate": 0.01, "loss": 1.5765, "loss/crossentropy": 2.6771020889282227, "loss/fcd": 1.2265625, "loss/logits": 0.28070883452892303, "step": 1242 }, { "epoch": 0.003947034167407596, "grad_norm": 0.34375, "grad_norm_var": 0.0006198724110921224, "learning_rate": 0.01, "loss": 1.5044, "loss/crossentropy": 2.5166057348251343, "loss/fcd": 1.328125, "loss/logits": 0.33877988159656525, "step": 1243 }, { "epoch": 0.003950209577035437, "grad_norm": 0.34765625, "grad_norm_var": 0.0006087621053059896, "learning_rate": 0.01, "loss": 1.4523, "loss/crossentropy": 2.551271915435791, "loss/fcd": 1.1953125, "loss/logits": 0.29214996099472046, "step": 1244 }, { "epoch": 0.00395338498666328, "grad_norm": 0.369140625, "grad_norm_var": 0.0006767114003499349, "learning_rate": 0.01, "loss": 1.482, "loss/crossentropy": 2.4673980474472046, "loss/fcd": 1.14453125, "loss/logits": 0.26978321373462677, "step": 1245 }, { "epoch": 0.003956560396291122, "grad_norm": 0.361328125, "grad_norm_var": 0.0006734212239583333, "learning_rate": 0.01, "loss": 1.5001, "loss/crossentropy": 2.5517152547836304, "loss/fcd": 1.40234375, "loss/logits": 0.37660929560661316, "step": 1246 }, { "epoch": 0.003959735805918963, "grad_norm": 0.333984375, "grad_norm_var": 0.0006687800089518229, "learning_rate": 0.01, "loss": 1.4313, "loss/crossentropy": 2.690821647644043, "loss/fcd": 1.10546875, "loss/logits": 0.23918834328651428, "step": 1247 }, { "epoch": 0.003962911215546806, "grad_norm": 0.330078125, "grad_norm_var": 0.0006558736165364583, "learning_rate": 0.01, "loss": 1.412, "loss/crossentropy": 2.382512092590332, "loss/fcd": 1.09765625, "loss/logits": 0.23889730870723724, "step": 1248 }, { "epoch": 0.003966086625174648, "grad_norm": 0.35546875, "grad_norm_var": 0.0006616592407226562, "learning_rate": 0.01, "loss": 1.4725, "loss/crossentropy": 2.7455861568450928, "loss/fcd": 1.140625, "loss/logits": 0.2354552298784256, "step": 1249 }, { "epoch": 0.003969262034802489, "grad_norm": 0.30859375, "grad_norm_var": 0.000702667236328125, "learning_rate": 0.01, "loss": 1.4624, "loss/crossentropy": 2.7124756574630737, "loss/fcd": 1.1328125, "loss/logits": 0.26699987053871155, "step": 1250 }, { "epoch": 0.003972437444430332, "grad_norm": 0.298828125, "grad_norm_var": 0.0007780551910400391, "learning_rate": 0.01, "loss": 1.4404, "loss/crossentropy": 2.3296173810958862, "loss/fcd": 1.12109375, "loss/logits": 0.2619345039129257, "step": 1251 }, { "epoch": 0.003975612854058174, "grad_norm": 0.3671875, "grad_norm_var": 0.0008025487263997396, "learning_rate": 0.01, "loss": 1.3698, "loss/crossentropy": 2.1874274015426636, "loss/fcd": 1.12109375, "loss/logits": 0.255298376083374, "step": 1252 }, { "epoch": 0.003978788263686015, "grad_norm": 0.333984375, "grad_norm_var": 0.0007684707641601562, "learning_rate": 0.01, "loss": 1.4151, "loss/crossentropy": 2.5326846837997437, "loss/fcd": 1.125, "loss/logits": 0.26196645200252533, "step": 1253 }, { "epoch": 0.003981963673313858, "grad_norm": 0.3203125, "grad_norm_var": 0.000703875223795573, "learning_rate": 0.01, "loss": 1.438, "loss/crossentropy": 2.6355420351028442, "loss/fcd": 1.1484375, "loss/logits": 0.2732003480195999, "step": 1254 }, { "epoch": 0.0039851390829417, "grad_norm": 0.345703125, "grad_norm_var": 0.000681304931640625, "learning_rate": 0.01, "loss": 1.4393, "loss/crossentropy": 2.4961410760879517, "loss/fcd": 1.17578125, "loss/logits": 0.2549886703491211, "step": 1255 }, { "epoch": 0.003988314492569541, "grad_norm": 0.3828125, "grad_norm_var": 0.0007598876953125, "learning_rate": 0.01, "loss": 1.4283, "loss/crossentropy": 2.733352780342102, "loss/fcd": 1.1875, "loss/logits": 0.2680300623178482, "step": 1256 }, { "epoch": 0.003991489902197384, "grad_norm": 0.326171875, "grad_norm_var": 0.0007840315500895182, "learning_rate": 0.01, "loss": 1.4803, "loss/crossentropy": 2.68550705909729, "loss/fcd": 1.25390625, "loss/logits": 0.3202988803386688, "step": 1257 }, { "epoch": 0.003994665311825225, "grad_norm": 0.400390625, "grad_norm_var": 0.0007228692372639974, "learning_rate": 0.01, "loss": 1.5034, "loss/crossentropy": 2.600634813308716, "loss/fcd": 1.16015625, "loss/logits": 0.2479611560702324, "step": 1258 }, { "epoch": 0.003997840721453067, "grad_norm": 0.349609375, "grad_norm_var": 0.0007237752278645833, "learning_rate": 0.01, "loss": 1.4905, "loss/crossentropy": 2.75564181804657, "loss/fcd": 1.33203125, "loss/logits": 0.3216111809015274, "step": 1259 }, { "epoch": 0.00400101613108091, "grad_norm": 0.349609375, "grad_norm_var": 0.0007245222727457683, "learning_rate": 0.01, "loss": 1.4171, "loss/crossentropy": 2.529899835586548, "loss/fcd": 1.09765625, "loss/logits": 0.271771177649498, "step": 1260 }, { "epoch": 0.004004191540708751, "grad_norm": 0.302734375, "grad_norm_var": 0.0007936954498291016, "learning_rate": 0.01, "loss": 1.4237, "loss/crossentropy": 2.6720041036605835, "loss/fcd": 1.18359375, "loss/logits": 0.2849181890487671, "step": 1261 }, { "epoch": 0.004007366950336593, "grad_norm": 0.3671875, "grad_norm_var": 0.0008111953735351563, "learning_rate": 0.01, "loss": 1.5072, "loss/crossentropy": 2.863347291946411, "loss/fcd": 1.35546875, "loss/logits": 0.3383914530277252, "step": 1262 }, { "epoch": 0.004010542359964436, "grad_norm": 0.365234375, "grad_norm_var": 0.0008386611938476562, "learning_rate": 0.01, "loss": 1.4475, "loss/crossentropy": 2.468018651008606, "loss/fcd": 1.12109375, "loss/logits": 0.2506176680326462, "step": 1263 }, { "epoch": 0.004013717769592277, "grad_norm": 0.2890625, "grad_norm_var": 0.001019906997680664, "learning_rate": 0.01, "loss": 1.3741, "loss/crossentropy": 2.486486792564392, "loss/fcd": 1.12890625, "loss/logits": 0.25178809463977814, "step": 1264 }, { "epoch": 0.004016893179220119, "grad_norm": 0.306640625, "grad_norm_var": 0.0010775248209635416, "learning_rate": 0.01, "loss": 1.3985, "loss/crossentropy": 2.490716576576233, "loss/fcd": 1.087890625, "loss/logits": 0.2280128076672554, "step": 1265 }, { "epoch": 0.004020068588847962, "grad_norm": 0.31640625, "grad_norm_var": 0.0010503133138020833, "learning_rate": 0.01, "loss": 1.3872, "loss/crossentropy": 2.709531784057617, "loss/fcd": 1.15625, "loss/logits": 0.2688274085521698, "step": 1266 }, { "epoch": 0.004023243998475803, "grad_norm": 0.3046875, "grad_norm_var": 0.0010211785634358724, "learning_rate": 0.01, "loss": 1.4307, "loss/crossentropy": 2.5892921686172485, "loss/fcd": 1.1328125, "loss/logits": 0.24717576801776886, "step": 1267 }, { "epoch": 0.004026419408103645, "grad_norm": 0.294921875, "grad_norm_var": 0.0010782241821289062, "learning_rate": 0.01, "loss": 1.4256, "loss/crossentropy": 2.580000877380371, "loss/fcd": 1.12890625, "loss/logits": 0.2737142890691757, "step": 1268 }, { "epoch": 0.004029594817731488, "grad_norm": 0.31640625, "grad_norm_var": 0.0010992527008056641, "learning_rate": 0.01, "loss": 1.4418, "loss/crossentropy": 2.4925053119659424, "loss/fcd": 1.15234375, "loss/logits": 0.24544715136289597, "step": 1269 }, { "epoch": 0.004032770227359329, "grad_norm": 0.3125, "grad_norm_var": 0.00111692746480306, "learning_rate": 0.01, "loss": 1.3707, "loss/crossentropy": 2.387287139892578, "loss/fcd": 1.0703125, "loss/logits": 0.22482293844223022, "step": 1270 }, { "epoch": 0.004035945636987171, "grad_norm": 0.298828125, "grad_norm_var": 0.0011756738026936849, "learning_rate": 0.01, "loss": 1.4695, "loss/crossentropy": 2.4298460483551025, "loss/fcd": 1.1796875, "loss/logits": 0.2799989655613899, "step": 1271 }, { "epoch": 0.004039121046615014, "grad_norm": 0.318359375, "grad_norm_var": 0.0009831746419270834, "learning_rate": 0.01, "loss": 1.4648, "loss/crossentropy": 2.454038143157959, "loss/fcd": 1.1015625, "loss/logits": 0.24111363291740417, "step": 1272 }, { "epoch": 0.004042296456242855, "grad_norm": 0.451171875, "grad_norm_var": 0.0019597371419270834, "learning_rate": 0.01, "loss": 1.5774, "loss/crossentropy": 2.628638744354248, "loss/fcd": 1.15625, "loss/logits": 0.27326497435569763, "step": 1273 }, { "epoch": 0.004045471865870697, "grad_norm": 0.3046875, "grad_norm_var": 0.0016848087310791016, "learning_rate": 0.01, "loss": 1.3794, "loss/crossentropy": 2.4433882236480713, "loss/fcd": 1.06640625, "loss/logits": 0.23263784497976303, "step": 1274 }, { "epoch": 0.00404864727549854, "grad_norm": 0.36328125, "grad_norm_var": 0.0017358779907226563, "learning_rate": 0.01, "loss": 1.5001, "loss/crossentropy": 2.674124598503113, "loss/fcd": 1.31640625, "loss/logits": 0.28818684816360474, "step": 1275 }, { "epoch": 0.004051822685126381, "grad_norm": 0.3359375, "grad_norm_var": 0.0017097314198811849, "learning_rate": 0.01, "loss": 1.4661, "loss/crossentropy": 2.4235451221466064, "loss/fcd": 1.1875, "loss/logits": 0.28800009191036224, "step": 1276 }, { "epoch": 0.004054998094754223, "grad_norm": 0.29296875, "grad_norm_var": 0.0017485936482747396, "learning_rate": 0.01, "loss": 1.4205, "loss/crossentropy": 2.3037450313568115, "loss/fcd": 1.1171875, "loss/logits": 0.26140692085027695, "step": 1277 }, { "epoch": 0.004058173504382066, "grad_norm": 0.33203125, "grad_norm_var": 0.0016393025716145834, "learning_rate": 0.01, "loss": 1.4778, "loss/crossentropy": 2.667812466621399, "loss/fcd": 1.2109375, "loss/logits": 0.2852103263139725, "step": 1278 }, { "epoch": 0.004061348914009907, "grad_norm": 0.3046875, "grad_norm_var": 0.0015451908111572266, "learning_rate": 0.01, "loss": 1.4693, "loss/crossentropy": 2.4391956329345703, "loss/fcd": 1.1484375, "loss/logits": 0.26627443730831146, "step": 1279 }, { "epoch": 0.004064524323637749, "grad_norm": 0.34765625, "grad_norm_var": 0.0015070438385009766, "learning_rate": 0.01, "loss": 1.4732, "loss/crossentropy": 2.7025269269943237, "loss/fcd": 1.20703125, "loss/logits": 0.28260406851768494, "step": 1280 }, { "epoch": 0.004067699733265592, "grad_norm": 0.302734375, "grad_norm_var": 0.0015175978342692057, "learning_rate": 0.01, "loss": 1.4186, "loss/crossentropy": 2.6425893306732178, "loss/fcd": 1.2578125, "loss/logits": 0.2894894778728485, "step": 1281 }, { "epoch": 0.004070875142893433, "grad_norm": 0.326171875, "grad_norm_var": 0.0015125910441080729, "learning_rate": 0.01, "loss": 1.3827, "loss/crossentropy": 2.3887237310409546, "loss/fcd": 1.09765625, "loss/logits": 0.24146169424057007, "step": 1282 }, { "epoch": 0.004074050552521275, "grad_norm": 0.310546875, "grad_norm_var": 0.0014985243479410807, "learning_rate": 0.01, "loss": 1.4418, "loss/crossentropy": 2.52813184261322, "loss/fcd": 1.20703125, "loss/logits": 0.2914755642414093, "step": 1283 }, { "epoch": 0.004077225962149118, "grad_norm": 0.36328125, "grad_norm_var": 0.00150909423828125, "learning_rate": 0.01, "loss": 1.4782, "loss/crossentropy": 2.76860773563385, "loss/fcd": 1.140625, "loss/logits": 0.2452768310904503, "step": 1284 }, { "epoch": 0.004080401371776959, "grad_norm": 0.384765625, "grad_norm_var": 0.0016765435536702474, "learning_rate": 0.01, "loss": 1.498, "loss/crossentropy": 2.2723467350006104, "loss/fcd": 1.17578125, "loss/logits": 0.26358824223279953, "step": 1285 }, { "epoch": 0.004083576781404801, "grad_norm": 0.3125, "grad_norm_var": 0.0016765435536702474, "learning_rate": 0.01, "loss": 1.426, "loss/crossentropy": 2.2924740314483643, "loss/fcd": 1.11328125, "loss/logits": 0.24014249444007874, "step": 1286 }, { "epoch": 0.004086752191032644, "grad_norm": 0.3359375, "grad_norm_var": 0.0015868504842122396, "learning_rate": 0.01, "loss": 1.4378, "loss/crossentropy": 2.5800981521606445, "loss/fcd": 1.20703125, "loss/logits": 0.30553434789180756, "step": 1287 }, { "epoch": 0.004089927600660485, "grad_norm": 0.32421875, "grad_norm_var": 0.00157469113667806, "learning_rate": 0.01, "loss": 1.4565, "loss/crossentropy": 2.611924648284912, "loss/fcd": 1.1328125, "loss/logits": 0.2662142217159271, "step": 1288 }, { "epoch": 0.004093103010288327, "grad_norm": 0.302734375, "grad_norm_var": 0.0006928602854410807, "learning_rate": 0.01, "loss": 1.4026, "loss/crossentropy": 2.72455894947052, "loss/fcd": 1.140625, "loss/logits": 0.2604304105043411, "step": 1289 }, { "epoch": 0.00409627841991617, "grad_norm": 0.3359375, "grad_norm_var": 0.0006577650705973308, "learning_rate": 0.01, "loss": 1.409, "loss/crossentropy": 2.9217090606689453, "loss/fcd": 1.22265625, "loss/logits": 0.2671070992946625, "step": 1290 }, { "epoch": 0.004099453829544011, "grad_norm": 0.376953125, "grad_norm_var": 0.0007306416829427083, "learning_rate": 0.01, "loss": 1.4311, "loss/crossentropy": 2.7528865337371826, "loss/fcd": 1.16015625, "loss/logits": 0.2920081466436386, "step": 1291 }, { "epoch": 0.004102629239171853, "grad_norm": 0.345703125, "grad_norm_var": 0.0007435957590738933, "learning_rate": 0.01, "loss": 1.4886, "loss/crossentropy": 2.840074300765991, "loss/fcd": 1.18359375, "loss/logits": 0.2826600968837738, "step": 1292 }, { "epoch": 0.004105804648799695, "grad_norm": 0.337890625, "grad_norm_var": 0.000640869140625, "learning_rate": 0.01, "loss": 1.4797, "loss/crossentropy": 2.883775234222412, "loss/fcd": 1.2890625, "loss/logits": 0.3043637126684189, "step": 1293 }, { "epoch": 0.004108980058427537, "grad_norm": 0.37890625, "grad_norm_var": 0.0007659912109375, "learning_rate": 0.01, "loss": 1.4176, "loss/crossentropy": 2.302684187889099, "loss/fcd": 1.091796875, "loss/logits": 0.25298143923282623, "step": 1294 }, { "epoch": 0.004112155468055379, "grad_norm": 0.3359375, "grad_norm_var": 0.0006927490234375, "learning_rate": 0.01, "loss": 1.4535, "loss/crossentropy": 2.8366700410842896, "loss/fcd": 1.12890625, "loss/logits": 0.2685769349336624, "step": 1295 }, { "epoch": 0.004115330877683221, "grad_norm": 0.353515625, "grad_norm_var": 0.0007017612457275391, "learning_rate": 0.01, "loss": 1.3965, "loss/crossentropy": 2.310896575450897, "loss/fcd": 1.1015625, "loss/logits": 0.25244516134262085, "step": 1296 }, { "epoch": 0.004118506287311063, "grad_norm": 0.359375, "grad_norm_var": 0.0006266276041666667, "learning_rate": 0.01, "loss": 1.543, "loss/crossentropy": 2.343865990638733, "loss/fcd": 1.31640625, "loss/logits": 0.31359075009822845, "step": 1297 }, { "epoch": 0.004121681696938905, "grad_norm": 0.359375, "grad_norm_var": 0.0006220340728759766, "learning_rate": 0.01, "loss": 1.5247, "loss/crossentropy": 2.591224431991577, "loss/fcd": 1.15234375, "loss/logits": 0.2871733754873276, "step": 1298 }, { "epoch": 0.004124857106566747, "grad_norm": 0.330078125, "grad_norm_var": 0.0005565484364827474, "learning_rate": 0.01, "loss": 1.4755, "loss/crossentropy": 2.669159412384033, "loss/fcd": 1.25390625, "loss/logits": 0.3233867734670639, "step": 1299 }, { "epoch": 0.004128032516194589, "grad_norm": 0.306640625, "grad_norm_var": 0.0006270726521809896, "learning_rate": 0.01, "loss": 1.4264, "loss/crossentropy": 2.7138874530792236, "loss/fcd": 1.16796875, "loss/logits": 0.2686733603477478, "step": 1300 }, { "epoch": 0.004131207925822431, "grad_norm": 0.33984375, "grad_norm_var": 0.0005002180735270182, "learning_rate": 0.01, "loss": 1.4415, "loss/crossentropy": 2.566162109375, "loss/fcd": 1.125, "loss/logits": 0.26189157366752625, "step": 1301 }, { "epoch": 0.004134383335450273, "grad_norm": 0.359375, "grad_norm_var": 0.00046741167704264324, "learning_rate": 0.01, "loss": 1.4936, "loss/crossentropy": 2.4875816106796265, "loss/fcd": 1.265625, "loss/logits": 0.2859383076429367, "step": 1302 }, { "epoch": 0.004137558745078115, "grad_norm": 0.298828125, "grad_norm_var": 0.000586700439453125, "learning_rate": 0.01, "loss": 1.4441, "loss/crossentropy": 2.506484270095825, "loss/fcd": 1.1015625, "loss/logits": 0.26533307135105133, "step": 1303 }, { "epoch": 0.004140734154705957, "grad_norm": 0.31640625, "grad_norm_var": 0.0006072998046875, "learning_rate": 0.01, "loss": 1.4263, "loss/crossentropy": 2.7427884340286255, "loss/fcd": 1.1796875, "loss/logits": 0.26766709983348846, "step": 1304 }, { "epoch": 0.004143909564333799, "grad_norm": 0.330078125, "grad_norm_var": 0.0005187352498372395, "learning_rate": 0.01, "loss": 1.4142, "loss/crossentropy": 2.6650094985961914, "loss/fcd": 1.1328125, "loss/logits": 0.2549203485250473, "step": 1305 }, { "epoch": 0.004147084973961641, "grad_norm": 0.328125, "grad_norm_var": 0.000528399149576823, "learning_rate": 0.01, "loss": 1.4053, "loss/crossentropy": 2.556913137435913, "loss/fcd": 1.12109375, "loss/logits": 0.2307768240571022, "step": 1306 }, { "epoch": 0.004150260383589483, "grad_norm": 0.283203125, "grad_norm_var": 0.0006291071573893229, "learning_rate": 0.01, "loss": 1.4007, "loss/crossentropy": 2.460480213165283, "loss/fcd": 1.12890625, "loss/logits": 0.23545925319194794, "step": 1307 }, { "epoch": 0.004153435793217325, "grad_norm": 0.314453125, "grad_norm_var": 0.0006464004516601562, "learning_rate": 0.01, "loss": 1.3835, "loss/crossentropy": 2.8481805324554443, "loss/fcd": 1.21484375, "loss/logits": 0.285683736205101, "step": 1308 }, { "epoch": 0.004156611202845167, "grad_norm": 0.306640625, "grad_norm_var": 0.0006881078084309896, "learning_rate": 0.01, "loss": 1.3998, "loss/crossentropy": 2.3539048433303833, "loss/fcd": 1.029296875, "loss/logits": 0.22518763691186905, "step": 1309 }, { "epoch": 0.004159786612473009, "grad_norm": 0.31640625, "grad_norm_var": 0.0005355199178059896, "learning_rate": 0.01, "loss": 1.3795, "loss/crossentropy": 2.4719103574752808, "loss/fcd": 1.2265625, "loss/logits": 0.2904528081417084, "step": 1310 }, { "epoch": 0.004162962022100851, "grad_norm": 0.318359375, "grad_norm_var": 0.0005348046620686849, "learning_rate": 0.01, "loss": 1.4064, "loss/crossentropy": 2.564334273338318, "loss/fcd": 1.1015625, "loss/logits": 0.25157129764556885, "step": 1311 }, { "epoch": 0.004166137431728693, "grad_norm": 0.345703125, "grad_norm_var": 0.000510263442993164, "learning_rate": 0.01, "loss": 1.4539, "loss/crossentropy": 3.0837950706481934, "loss/fcd": 1.28515625, "loss/logits": 0.31429582834243774, "step": 1312 }, { "epoch": 0.004169312841356535, "grad_norm": 0.29296875, "grad_norm_var": 0.0004886468251546224, "learning_rate": 0.01, "loss": 1.3807, "loss/crossentropy": 2.3721253871917725, "loss/fcd": 1.10546875, "loss/logits": 0.2494240254163742, "step": 1313 }, { "epoch": 0.004172488250984377, "grad_norm": 0.330078125, "grad_norm_var": 0.0003949483235677083, "learning_rate": 0.01, "loss": 1.3953, "loss/crossentropy": 2.7933932542800903, "loss/fcd": 1.140625, "loss/logits": 0.24995651096105576, "step": 1314 }, { "epoch": 0.004175663660612219, "grad_norm": 0.302734375, "grad_norm_var": 0.0004042943318684896, "learning_rate": 0.01, "loss": 1.4616, "loss/crossentropy": 2.4732911586761475, "loss/fcd": 1.15234375, "loss/logits": 0.2736980766057968, "step": 1315 }, { "epoch": 0.004178839070240061, "grad_norm": 0.322265625, "grad_norm_var": 0.0003956476847330729, "learning_rate": 0.01, "loss": 1.4608, "loss/crossentropy": 2.701689600944519, "loss/fcd": 1.23828125, "loss/logits": 0.28087399899959564, "step": 1316 }, { "epoch": 0.004182014479867903, "grad_norm": 0.33203125, "grad_norm_var": 0.00037784576416015624, "learning_rate": 0.01, "loss": 1.4967, "loss/crossentropy": 2.4090824127197266, "loss/fcd": 1.18359375, "loss/logits": 0.2695801556110382, "step": 1317 }, { "epoch": 0.004185189889495745, "grad_norm": 0.27734375, "grad_norm_var": 0.00035247802734375, "learning_rate": 0.01, "loss": 1.4061, "loss/crossentropy": 2.7861157655715942, "loss/fcd": 1.25, "loss/logits": 0.2884829342365265, "step": 1318 }, { "epoch": 0.004188365299123587, "grad_norm": 0.35546875, "grad_norm_var": 0.0004423618316650391, "learning_rate": 0.01, "loss": 1.3883, "loss/crossentropy": 2.5148898363113403, "loss/fcd": 1.125, "loss/logits": 0.2602184861898422, "step": 1319 }, { "epoch": 0.004191540708751429, "grad_norm": 0.33203125, "grad_norm_var": 0.0004563490549723307, "learning_rate": 0.01, "loss": 1.4876, "loss/crossentropy": 2.6777840852737427, "loss/fcd": 1.2109375, "loss/logits": 0.28208500146865845, "step": 1320 }, { "epoch": 0.004194716118379271, "grad_norm": 0.3203125, "grad_norm_var": 0.00044657389322916665, "learning_rate": 0.01, "loss": 1.3932, "loss/crossentropy": 2.717270255088806, "loss/fcd": 1.1796875, "loss/logits": 0.2894679605960846, "step": 1321 }, { "epoch": 0.004197891528007113, "grad_norm": 0.328125, "grad_norm_var": 0.00044657389322916665, "learning_rate": 0.01, "loss": 1.397, "loss/crossentropy": 2.6327733993530273, "loss/fcd": 1.22265625, "loss/logits": 0.28402578830718994, "step": 1322 }, { "epoch": 0.004201066937634955, "grad_norm": 0.326171875, "grad_norm_var": 0.0003661473592122396, "learning_rate": 0.01, "loss": 1.4586, "loss/crossentropy": 2.6995646953582764, "loss/fcd": 1.2265625, "loss/logits": 0.2900291830301285, "step": 1323 }, { "epoch": 0.004204242347262797, "grad_norm": 0.32421875, "grad_norm_var": 0.0003647963205973307, "learning_rate": 0.01, "loss": 1.4176, "loss/crossentropy": 2.4139771461486816, "loss/fcd": 1.16796875, "loss/logits": 0.28089456260204315, "step": 1324 }, { "epoch": 0.0042074177568906385, "grad_norm": 0.310546875, "grad_norm_var": 0.0003584384918212891, "learning_rate": 0.01, "loss": 1.4113, "loss/crossentropy": 2.360004186630249, "loss/fcd": 1.0625, "loss/logits": 0.23580221086740494, "step": 1325 }, { "epoch": 0.004210593166518481, "grad_norm": 0.283203125, "grad_norm_var": 0.00044733683268229166, "learning_rate": 0.01, "loss": 1.4417, "loss/crossentropy": 2.6243635416030884, "loss/fcd": 1.3046875, "loss/logits": 0.32921820878982544, "step": 1326 }, { "epoch": 0.004213768576146323, "grad_norm": 0.447265625, "grad_norm_var": 0.001477495829264323, "learning_rate": 0.01, "loss": 1.5418, "loss/crossentropy": 2.670484185218811, "loss/fcd": 1.2890625, "loss/logits": 0.3003707230091095, "step": 1327 }, { "epoch": 0.0042169439857741645, "grad_norm": 0.34375, "grad_norm_var": 0.0014728387196858725, "learning_rate": 0.01, "loss": 1.4112, "loss/crossentropy": 2.547669768333435, "loss/fcd": 1.10546875, "loss/logits": 0.260916993021965, "step": 1328 }, { "epoch": 0.004220119395402007, "grad_norm": 0.302734375, "grad_norm_var": 0.001434771219889323, "learning_rate": 0.01, "loss": 1.4207, "loss/crossentropy": 2.291555881500244, "loss/fcd": 1.2421875, "loss/logits": 0.2930373251438141, "step": 1329 }, { "epoch": 0.004223294805029849, "grad_norm": 0.318359375, "grad_norm_var": 0.0014391581217447916, "learning_rate": 0.01, "loss": 1.4399, "loss/crossentropy": 2.6533578634262085, "loss/fcd": 1.2734375, "loss/logits": 0.3100448325276375, "step": 1330 }, { "epoch": 0.0042264702146576905, "grad_norm": 0.298828125, "grad_norm_var": 0.0014525731404622397, "learning_rate": 0.01, "loss": 1.3907, "loss/crossentropy": 2.209708571434021, "loss/fcd": 1.0546875, "loss/logits": 0.24851273745298386, "step": 1331 }, { "epoch": 0.004229645624285533, "grad_norm": 0.326171875, "grad_norm_var": 0.0014513651529947916, "learning_rate": 0.01, "loss": 1.4242, "loss/crossentropy": 2.673172950744629, "loss/fcd": 1.14453125, "loss/logits": 0.2506696879863739, "step": 1332 }, { "epoch": 0.004232821033913375, "grad_norm": 0.291015625, "grad_norm_var": 0.0015271345774332682, "learning_rate": 0.01, "loss": 1.3957, "loss/crossentropy": 2.383945345878601, "loss/fcd": 1.07421875, "loss/logits": 0.24351903796195984, "step": 1333 }, { "epoch": 0.0042359964435412165, "grad_norm": 0.306640625, "grad_norm_var": 0.0013981501261393229, "learning_rate": 0.01, "loss": 1.3973, "loss/crossentropy": 2.2057820558547974, "loss/fcd": 1.1171875, "loss/logits": 0.26141272485256195, "step": 1334 }, { "epoch": 0.004239171853169059, "grad_norm": 0.337890625, "grad_norm_var": 0.0013482252756754558, "learning_rate": 0.01, "loss": 1.3821, "loss/crossentropy": 2.5266321897506714, "loss/fcd": 1.1015625, "loss/logits": 0.23537910729646683, "step": 1335 }, { "epoch": 0.004242347262796901, "grad_norm": 0.33984375, "grad_norm_var": 0.0013595422108968098, "learning_rate": 0.01, "loss": 1.4677, "loss/crossentropy": 2.5469002723693848, "loss/fcd": 1.10546875, "loss/logits": 0.24853455275297165, "step": 1336 }, { "epoch": 0.0042455226724247425, "grad_norm": 0.306640625, "grad_norm_var": 0.0013803482055664063, "learning_rate": 0.01, "loss": 1.411, "loss/crossentropy": 2.5601704120635986, "loss/fcd": 1.15625, "loss/logits": 0.2457425892353058, "step": 1337 }, { "epoch": 0.004248698082052585, "grad_norm": 0.32421875, "grad_norm_var": 0.00137939453125, "learning_rate": 0.01, "loss": 1.4379, "loss/crossentropy": 2.795596122741699, "loss/fcd": 1.21484375, "loss/logits": 0.2757267951965332, "step": 1338 }, { "epoch": 0.004251873491680427, "grad_norm": 0.345703125, "grad_norm_var": 0.0014083226521809896, "learning_rate": 0.01, "loss": 1.4472, "loss/crossentropy": 2.7075644731521606, "loss/fcd": 1.2578125, "loss/logits": 0.2940382659435272, "step": 1339 }, { "epoch": 0.0042550489013082685, "grad_norm": 0.3828125, "grad_norm_var": 0.0016133626302083333, "learning_rate": 0.01, "loss": 1.4226, "loss/crossentropy": 2.6475530862808228, "loss/fcd": 1.19921875, "loss/logits": 0.2611531913280487, "step": 1340 }, { "epoch": 0.004258224310936111, "grad_norm": 0.345703125, "grad_norm_var": 0.0016036351521809896, "learning_rate": 0.01, "loss": 1.4678, "loss/crossentropy": 2.614858865737915, "loss/fcd": 1.23046875, "loss/logits": 0.27315984666347504, "step": 1341 }, { "epoch": 0.004261399720563953, "grad_norm": 0.314453125, "grad_norm_var": 0.0014642715454101563, "learning_rate": 0.01, "loss": 1.4435, "loss/crossentropy": 2.5878058671951294, "loss/fcd": 1.28125, "loss/logits": 0.2967488616704941, "step": 1342 }, { "epoch": 0.0042645751301917945, "grad_norm": 0.326171875, "grad_norm_var": 0.0005399068196614583, "learning_rate": 0.01, "loss": 1.4237, "loss/crossentropy": 2.7905893325805664, "loss/fcd": 1.20703125, "loss/logits": 0.2699965089559555, "step": 1343 }, { "epoch": 0.004267750539819637, "grad_norm": 0.32421875, "grad_norm_var": 0.0005167007446289062, "learning_rate": 0.01, "loss": 1.4277, "loss/crossentropy": 2.660565972328186, "loss/fcd": 1.1640625, "loss/logits": 0.26573804020881653, "step": 1344 }, { "epoch": 0.004270925949447479, "grad_norm": 0.32421875, "grad_norm_var": 0.0004833062489827474, "learning_rate": 0.01, "loss": 1.4046, "loss/crossentropy": 2.375802516937256, "loss/fcd": 1.125, "loss/logits": 0.24803253263235092, "step": 1345 }, { "epoch": 0.0042741013590753205, "grad_norm": 0.32421875, "grad_norm_var": 0.00047963460286458334, "learning_rate": 0.01, "loss": 1.4682, "loss/crossentropy": 2.3692342042922974, "loss/fcd": 1.1484375, "loss/logits": 0.24302593618631363, "step": 1346 }, { "epoch": 0.004277276768703163, "grad_norm": 0.396484375, "grad_norm_var": 0.0007196426391601563, "learning_rate": 0.01, "loss": 1.4683, "loss/crossentropy": 2.4507782459259033, "loss/fcd": 1.08984375, "loss/logits": 0.23529259115457535, "step": 1347 }, { "epoch": 0.004280452178331005, "grad_norm": 0.35546875, "grad_norm_var": 0.0007494449615478516, "learning_rate": 0.01, "loss": 1.5598, "loss/crossentropy": 2.4833039045333862, "loss/fcd": 1.15234375, "loss/logits": 0.26528795063495636, "step": 1348 }, { "epoch": 0.0042836275879588465, "grad_norm": 0.3359375, "grad_norm_var": 0.0006174723307291667, "learning_rate": 0.01, "loss": 1.4706, "loss/crossentropy": 2.707969903945923, "loss/fcd": 1.1796875, "loss/logits": 0.2763236314058304, "step": 1349 }, { "epoch": 0.004286802997586689, "grad_norm": 0.337890625, "grad_norm_var": 0.0005523681640625, "learning_rate": 0.01, "loss": 1.4903, "loss/crossentropy": 2.8595279455184937, "loss/fcd": 1.23828125, "loss/logits": 0.29072633385658264, "step": 1350 }, { "epoch": 0.004289978407214531, "grad_norm": 0.28515625, "grad_norm_var": 0.000733041763305664, "learning_rate": 0.01, "loss": 1.3918, "loss/crossentropy": 2.6782355308532715, "loss/fcd": 1.125, "loss/logits": 0.243242546916008, "step": 1351 }, { "epoch": 0.0042931538168423725, "grad_norm": 0.349609375, "grad_norm_var": 0.0007445653279622396, "learning_rate": 0.01, "loss": 1.455, "loss/crossentropy": 2.4614741802215576, "loss/fcd": 1.3125, "loss/logits": 0.34382209181785583, "step": 1352 }, { "epoch": 0.004296329226470215, "grad_norm": 0.30859375, "grad_norm_var": 0.0007371107737223307, "learning_rate": 0.01, "loss": 1.3819, "loss/crossentropy": 2.6094011068344116, "loss/fcd": 1.17578125, "loss/logits": 0.2638571858406067, "step": 1353 }, { "epoch": 0.004299504636098057, "grad_norm": 0.28125, "grad_norm_var": 0.0009217421213785807, "learning_rate": 0.01, "loss": 1.3978, "loss/crossentropy": 2.593880534172058, "loss/fcd": 1.15625, "loss/logits": 0.28091951459646225, "step": 1354 }, { "epoch": 0.0043026800457258985, "grad_norm": 0.3046875, "grad_norm_var": 0.0009607950846354167, "learning_rate": 0.01, "loss": 1.4065, "loss/crossentropy": 2.5567115545272827, "loss/fcd": 1.2109375, "loss/logits": 0.2634689658880234, "step": 1355 }, { "epoch": 0.004305855455353741, "grad_norm": 0.32421875, "grad_norm_var": 0.0007710138956705729, "learning_rate": 0.01, "loss": 1.4317, "loss/crossentropy": 2.5892242193222046, "loss/fcd": 1.0625, "loss/logits": 0.23277316987514496, "step": 1356 }, { "epoch": 0.004309030864981583, "grad_norm": 0.353515625, "grad_norm_var": 0.0007939020792643229, "learning_rate": 0.01, "loss": 1.4481, "loss/crossentropy": 2.5315154790878296, "loss/fcd": 1.140625, "loss/logits": 0.262657567858696, "step": 1357 }, { "epoch": 0.0043122062746094245, "grad_norm": 0.33984375, "grad_norm_var": 0.000788736343383789, "learning_rate": 0.01, "loss": 1.5394, "loss/crossentropy": 2.6920342445373535, "loss/fcd": 1.23828125, "loss/logits": 0.28466978669166565, "step": 1358 }, { "epoch": 0.004315381684237267, "grad_norm": 0.302734375, "grad_norm_var": 0.0008333683013916016, "learning_rate": 0.01, "loss": 1.4019, "loss/crossentropy": 2.4664814472198486, "loss/fcd": 1.15625, "loss/logits": 0.25851884484291077, "step": 1359 }, { "epoch": 0.004318557093865108, "grad_norm": 0.640625, "grad_norm_var": 0.006930780410766601, "learning_rate": 0.01, "loss": 1.4538, "loss/crossentropy": 2.533437728881836, "loss/fcd": 1.15625, "loss/logits": 0.26242512464523315, "step": 1360 }, { "epoch": 0.0043217325034929505, "grad_norm": 0.41015625, "grad_norm_var": 0.007122405370076497, "learning_rate": 0.01, "loss": 1.4333, "loss/crossentropy": 2.6225509643554688, "loss/fcd": 1.20703125, "loss/logits": 0.2772522568702698, "step": 1361 }, { "epoch": 0.004324907913120793, "grad_norm": 0.34765625, "grad_norm_var": 0.00706632932027181, "learning_rate": 0.01, "loss": 1.3644, "loss/crossentropy": 2.270558714866638, "loss/fcd": 1.09765625, "loss/logits": 0.22092069685459137, "step": 1362 }, { "epoch": 0.004328083322748634, "grad_norm": 0.345703125, "grad_norm_var": 0.006944004694620768, "learning_rate": 0.01, "loss": 1.4121, "loss/crossentropy": 2.3330254554748535, "loss/fcd": 1.171875, "loss/logits": 0.24612677097320557, "step": 1363 }, { "epoch": 0.0043312587323764765, "grad_norm": 0.34375, "grad_norm_var": 0.006946293512980143, "learning_rate": 0.01, "loss": 1.4905, "loss/crossentropy": 2.7725892066955566, "loss/fcd": 1.19140625, "loss/logits": 0.2769398167729378, "step": 1364 }, { "epoch": 0.004334434142004319, "grad_norm": 0.3671875, "grad_norm_var": 0.00694578488667806, "learning_rate": 0.01, "loss": 1.4953, "loss/crossentropy": 2.4044036865234375, "loss/fcd": 1.25390625, "loss/logits": 0.29712510108947754, "step": 1365 }, { "epoch": 0.00433760955163216, "grad_norm": 0.314453125, "grad_norm_var": 0.007026274998982747, "learning_rate": 0.01, "loss": 1.4154, "loss/crossentropy": 2.6577264070510864, "loss/fcd": 1.08203125, "loss/logits": 0.2481316328048706, "step": 1366 }, { "epoch": 0.0043407849612600025, "grad_norm": 0.328125, "grad_norm_var": 0.006763315200805664, "learning_rate": 0.01, "loss": 1.4093, "loss/crossentropy": 2.5810656547546387, "loss/fcd": 1.21875, "loss/logits": 0.26981040835380554, "step": 1367 }, { "epoch": 0.004343960370887845, "grad_norm": 0.40234375, "grad_norm_var": 0.006907081604003907, "learning_rate": 0.01, "loss": 1.4388, "loss/crossentropy": 2.6675479412078857, "loss/fcd": 1.125, "loss/logits": 0.2646193355321884, "step": 1368 }, { "epoch": 0.004347135780515686, "grad_norm": 0.27734375, "grad_norm_var": 0.007170550028483073, "learning_rate": 0.01, "loss": 1.3538, "loss/crossentropy": 2.202846109867096, "loss/fcd": 1.05078125, "loss/logits": 0.21698091179132462, "step": 1369 }, { "epoch": 0.0043503111901435285, "grad_norm": 0.296875, "grad_norm_var": 0.007031695048014323, "learning_rate": 0.01, "loss": 1.3553, "loss/crossentropy": 2.5293253660202026, "loss/fcd": 1.109375, "loss/logits": 0.24526619166135788, "step": 1370 }, { "epoch": 0.004353486599771371, "grad_norm": 0.353515625, "grad_norm_var": 0.006845331192016602, "learning_rate": 0.01, "loss": 1.512, "loss/crossentropy": 2.5916976928710938, "loss/fcd": 1.27734375, "loss/logits": 0.3228559195995331, "step": 1371 }, { "epoch": 0.004356662009399212, "grad_norm": 0.30859375, "grad_norm_var": 0.00693357785542806, "learning_rate": 0.01, "loss": 1.4113, "loss/crossentropy": 2.6348538398742676, "loss/fcd": 1.1875, "loss/logits": 0.2851094901561737, "step": 1372 }, { "epoch": 0.0043598374190270545, "grad_norm": 0.330078125, "grad_norm_var": 0.006982787450154623, "learning_rate": 0.01, "loss": 1.4867, "loss/crossentropy": 2.5594276189804077, "loss/fcd": 1.12890625, "loss/logits": 0.2585541307926178, "step": 1373 }, { "epoch": 0.004363012828654897, "grad_norm": 0.3203125, "grad_norm_var": 0.007050816218058268, "learning_rate": 0.01, "loss": 1.3667, "loss/crossentropy": 2.22147798538208, "loss/fcd": 1.09375, "loss/logits": 0.24597130715847015, "step": 1374 }, { "epoch": 0.004366188238282738, "grad_norm": 0.31640625, "grad_norm_var": 0.006966145833333334, "learning_rate": 0.01, "loss": 1.4669, "loss/crossentropy": 2.6159130334854126, "loss/fcd": 1.2421875, "loss/logits": 0.29816021025180817, "step": 1375 }, { "epoch": 0.0043693636479105805, "grad_norm": 0.3203125, "grad_norm_var": 0.0012418111165364584, "learning_rate": 0.01, "loss": 1.468, "loss/crossentropy": 2.779896855354309, "loss/fcd": 1.1953125, "loss/logits": 0.28255823254585266, "step": 1376 }, { "epoch": 0.004372539057538423, "grad_norm": 0.32421875, "grad_norm_var": 0.0008585611979166667, "learning_rate": 0.01, "loss": 1.4818, "loss/crossentropy": 2.450703978538513, "loss/fcd": 1.12890625, "loss/logits": 0.25012047588825226, "step": 1377 }, { "epoch": 0.004375714467166264, "grad_norm": 0.330078125, "grad_norm_var": 0.0008389631907145183, "learning_rate": 0.01, "loss": 1.4436, "loss/crossentropy": 2.53182852268219, "loss/fcd": 1.16015625, "loss/logits": 0.27198435366153717, "step": 1378 }, { "epoch": 0.0043788898767941065, "grad_norm": 0.3046875, "grad_norm_var": 0.0008579889933268229, "learning_rate": 0.01, "loss": 1.4192, "loss/crossentropy": 2.610583782196045, "loss/fcd": 1.08984375, "loss/logits": 0.25160080939531326, "step": 1379 }, { "epoch": 0.004382065286421949, "grad_norm": 0.3203125, "grad_norm_var": 0.000841204325358073, "learning_rate": 0.01, "loss": 1.4255, "loss/crossentropy": 2.4733269214630127, "loss/fcd": 1.1328125, "loss/logits": 0.25535134971141815, "step": 1380 }, { "epoch": 0.00438524069604979, "grad_norm": 0.36328125, "grad_norm_var": 0.0008206685384114583, "learning_rate": 0.01, "loss": 1.4665, "loss/crossentropy": 2.7692333459854126, "loss/fcd": 1.1875, "loss/logits": 0.27308203279972076, "step": 1381 }, { "epoch": 0.0043884161056776325, "grad_norm": 0.3203125, "grad_norm_var": 0.0008140405019124349, "learning_rate": 0.01, "loss": 1.439, "loss/crossentropy": 2.4590622186660767, "loss/fcd": 1.109375, "loss/logits": 0.2424609512090683, "step": 1382 }, { "epoch": 0.004391591515305475, "grad_norm": 0.326171875, "grad_norm_var": 0.0008137385050455729, "learning_rate": 0.01, "loss": 1.4147, "loss/crossentropy": 2.6498833894729614, "loss/fcd": 1.23046875, "loss/logits": 0.30393680930137634, "step": 1383 }, { "epoch": 0.004394766924933316, "grad_norm": 0.310546875, "grad_norm_var": 0.0004051049550374349, "learning_rate": 0.01, "loss": 1.4343, "loss/crossentropy": 2.7260329723358154, "loss/fcd": 1.16015625, "loss/logits": 0.26641108095645905, "step": 1384 }, { "epoch": 0.0043979423345611585, "grad_norm": 0.32421875, "grad_norm_var": 0.0002746423085530599, "learning_rate": 0.01, "loss": 1.4779, "loss/crossentropy": 2.493578314781189, "loss/fcd": 1.31640625, "loss/logits": 0.3210445046424866, "step": 1385 }, { "epoch": 0.004401117744189001, "grad_norm": 0.330078125, "grad_norm_var": 0.00022735595703125, "learning_rate": 0.01, "loss": 1.4553, "loss/crossentropy": 2.7193714380264282, "loss/fcd": 1.18359375, "loss/logits": 0.2692257910966873, "step": 1386 }, { "epoch": 0.004404293153816842, "grad_norm": 0.353515625, "grad_norm_var": 0.00022735595703125, "learning_rate": 0.01, "loss": 1.4225, "loss/crossentropy": 2.7353339195251465, "loss/fcd": 1.19140625, "loss/logits": 0.28134213387966156, "step": 1387 }, { "epoch": 0.0044074685634446845, "grad_norm": 0.380859375, "grad_norm_var": 0.00039378801981608075, "learning_rate": 0.01, "loss": 1.5391, "loss/crossentropy": 2.873054027557373, "loss/fcd": 1.3671875, "loss/logits": 0.3420102447271347, "step": 1388 }, { "epoch": 0.004410643973072527, "grad_norm": 0.3359375, "grad_norm_var": 0.00039621988932291664, "learning_rate": 0.01, "loss": 1.4628, "loss/crossentropy": 2.3841891288757324, "loss/fcd": 1.16796875, "loss/logits": 0.2926853895187378, "step": 1389 }, { "epoch": 0.004413819382700368, "grad_norm": 0.296875, "grad_norm_var": 0.00046106974283854164, "learning_rate": 0.01, "loss": 1.4136, "loss/crossentropy": 2.5285985469818115, "loss/fcd": 1.16796875, "loss/logits": 0.2564908117055893, "step": 1390 }, { "epoch": 0.0044169947923282105, "grad_norm": 0.296875, "grad_norm_var": 0.0005167007446289062, "learning_rate": 0.01, "loss": 1.3875, "loss/crossentropy": 2.654350519180298, "loss/fcd": 1.16796875, "loss/logits": 0.28525567054748535, "step": 1391 }, { "epoch": 0.004420170201956053, "grad_norm": 0.33203125, "grad_norm_var": 0.00051422119140625, "learning_rate": 0.01, "loss": 1.433, "loss/crossentropy": 2.443812131881714, "loss/fcd": 1.14453125, "loss/logits": 0.27953577041625977, "step": 1392 }, { "epoch": 0.004423345611583894, "grad_norm": 0.375, "grad_norm_var": 0.0006489435831705729, "learning_rate": 0.01, "loss": 1.4379, "loss/crossentropy": 2.625929594039917, "loss/fcd": 1.1796875, "loss/logits": 0.272493414580822, "step": 1393 }, { "epoch": 0.0044265210212117365, "grad_norm": 0.341796875, "grad_norm_var": 0.0006556193033854167, "learning_rate": 0.01, "loss": 1.4902, "loss/crossentropy": 2.3382182121276855, "loss/fcd": 1.265625, "loss/logits": 0.3017432689666748, "step": 1394 }, { "epoch": 0.004429696430839578, "grad_norm": 0.345703125, "grad_norm_var": 0.0006112257639567057, "learning_rate": 0.01, "loss": 1.4641, "loss/crossentropy": 2.6569149494171143, "loss/fcd": 1.10546875, "loss/logits": 0.2582094222307205, "step": 1395 }, { "epoch": 0.00443287184046742, "grad_norm": 0.298828125, "grad_norm_var": 0.000680987040201823, "learning_rate": 0.01, "loss": 1.4118, "loss/crossentropy": 2.581476926803589, "loss/fcd": 1.109375, "loss/logits": 0.23965629935264587, "step": 1396 }, { "epoch": 0.0044360472500952625, "grad_norm": 0.333984375, "grad_norm_var": 0.0006173292795817058, "learning_rate": 0.01, "loss": 1.437, "loss/crossentropy": 2.4225754737854004, "loss/fcd": 1.1328125, "loss/logits": 0.2506974786520004, "step": 1397 }, { "epoch": 0.004439222659723104, "grad_norm": 0.294921875, "grad_norm_var": 0.0006952285766601562, "learning_rate": 0.01, "loss": 1.362, "loss/crossentropy": 2.5169063806533813, "loss/fcd": 1.0625, "loss/logits": 0.23823533952236176, "step": 1398 }, { "epoch": 0.004442398069350946, "grad_norm": 0.298828125, "grad_norm_var": 0.00075531005859375, "learning_rate": 0.01, "loss": 1.3807, "loss/crossentropy": 2.5822519063949585, "loss/fcd": 1.171875, "loss/logits": 0.2566351592540741, "step": 1399 }, { "epoch": 0.0044455734789787885, "grad_norm": 0.291015625, "grad_norm_var": 0.0008249282836914062, "learning_rate": 0.01, "loss": 1.3772, "loss/crossentropy": 2.3679425716400146, "loss/fcd": 1.0859375, "loss/logits": 0.2193019688129425, "step": 1400 }, { "epoch": 0.00444874888860663, "grad_norm": 0.373046875, "grad_norm_var": 0.0009564558664957682, "learning_rate": 0.01, "loss": 1.4712, "loss/crossentropy": 2.6258989572525024, "loss/fcd": 1.1875, "loss/logits": 0.2775397300720215, "step": 1401 }, { "epoch": 0.004451924298234472, "grad_norm": 0.34375, "grad_norm_var": 0.0009683609008789062, "learning_rate": 0.01, "loss": 1.4625, "loss/crossentropy": 2.8201746940612793, "loss/fcd": 1.19921875, "loss/logits": 0.2869168519973755, "step": 1402 }, { "epoch": 0.0044550997078623145, "grad_norm": 0.337890625, "grad_norm_var": 0.0009363174438476562, "learning_rate": 0.01, "loss": 1.4581, "loss/crossentropy": 2.4342455863952637, "loss/fcd": 1.21484375, "loss/logits": 0.28532321751117706, "step": 1403 }, { "epoch": 0.004458275117490156, "grad_norm": 0.330078125, "grad_norm_var": 0.0007520039876302083, "learning_rate": 0.01, "loss": 1.459, "loss/crossentropy": 2.62434720993042, "loss/fcd": 1.2421875, "loss/logits": 0.28645069152116776, "step": 1404 }, { "epoch": 0.004461450527117998, "grad_norm": 0.31640625, "grad_norm_var": 0.0007516860961914063, "learning_rate": 0.01, "loss": 1.3789, "loss/crossentropy": 2.3656718730926514, "loss/fcd": 1.109375, "loss/logits": 0.2572406679391861, "step": 1405 }, { "epoch": 0.0044646259367458405, "grad_norm": 0.283203125, "grad_norm_var": 0.000815439224243164, "learning_rate": 0.01, "loss": 1.3927, "loss/crossentropy": 2.546653151512146, "loss/fcd": 1.1171875, "loss/logits": 0.2546772211790085, "step": 1406 }, { "epoch": 0.004467801346373682, "grad_norm": 0.38671875, "grad_norm_var": 0.000987990697224935, "learning_rate": 0.01, "loss": 1.5738, "loss/crossentropy": 2.8910043239593506, "loss/fcd": 1.27734375, "loss/logits": 0.3159196972846985, "step": 1407 }, { "epoch": 0.004470976756001524, "grad_norm": 0.333984375, "grad_norm_var": 0.0009887059529622396, "learning_rate": 0.01, "loss": 1.4442, "loss/crossentropy": 2.5851739645004272, "loss/fcd": 1.18359375, "loss/logits": 0.276899516582489, "step": 1408 }, { "epoch": 0.0044741521656293665, "grad_norm": 0.310546875, "grad_norm_var": 0.0008643945058186849, "learning_rate": 0.01, "loss": 1.4198, "loss/crossentropy": 2.614390015602112, "loss/fcd": 1.109375, "loss/logits": 0.2494056224822998, "step": 1409 }, { "epoch": 0.004477327575257208, "grad_norm": 0.337890625, "grad_norm_var": 0.0008572737375895182, "learning_rate": 0.01, "loss": 1.4363, "loss/crossentropy": 2.7930214405059814, "loss/fcd": 1.2578125, "loss/logits": 0.28455136716365814, "step": 1410 }, { "epoch": 0.00448050298488505, "grad_norm": 0.29296875, "grad_norm_var": 0.0008928934733072917, "learning_rate": 0.01, "loss": 1.3619, "loss/crossentropy": 2.5822407007217407, "loss/fcd": 1.1171875, "loss/logits": 0.24501096457242966, "step": 1411 }, { "epoch": 0.0044836783945128925, "grad_norm": 0.375, "grad_norm_var": 0.0010125319163004558, "learning_rate": 0.01, "loss": 1.4674, "loss/crossentropy": 2.4841359853744507, "loss/fcd": 1.21875, "loss/logits": 0.2835097014904022, "step": 1412 }, { "epoch": 0.004486853804140734, "grad_norm": 0.31640625, "grad_norm_var": 0.0010166803995768229, "learning_rate": 0.01, "loss": 1.4202, "loss/crossentropy": 2.599228262901306, "loss/fcd": 1.1015625, "loss/logits": 0.24908974021673203, "step": 1413 }, { "epoch": 0.004490029213768576, "grad_norm": 0.36328125, "grad_norm_var": 0.0010216871897379557, "learning_rate": 0.01, "loss": 1.4531, "loss/crossentropy": 2.589244842529297, "loss/fcd": 1.22265625, "loss/logits": 0.28631316125392914, "step": 1414 }, { "epoch": 0.0044932046233964185, "grad_norm": 0.302734375, "grad_norm_var": 0.0010060469309488933, "learning_rate": 0.01, "loss": 1.4148, "loss/crossentropy": 2.6378756761550903, "loss/fcd": 1.140625, "loss/logits": 0.2509348690509796, "step": 1415 }, { "epoch": 0.00449638003302426, "grad_norm": 0.361328125, "grad_norm_var": 0.0009408156077067057, "learning_rate": 0.01, "loss": 1.4567, "loss/crossentropy": 2.5789411067962646, "loss/fcd": 1.14453125, "loss/logits": 0.25096774846315384, "step": 1416 }, { "epoch": 0.004499555442652102, "grad_norm": 0.326171875, "grad_norm_var": 0.0008423964182535808, "learning_rate": 0.01, "loss": 1.4649, "loss/crossentropy": 2.657409906387329, "loss/fcd": 1.234375, "loss/logits": 0.2840787023305893, "step": 1417 }, { "epoch": 0.0045027308522799445, "grad_norm": 0.45703125, "grad_norm_var": 0.0018159071604410806, "learning_rate": 0.01, "loss": 1.5413, "loss/crossentropy": 2.854301929473877, "loss/fcd": 1.2578125, "loss/logits": 0.3911370262503624, "step": 1418 }, { "epoch": 0.004505906261907786, "grad_norm": 0.314453125, "grad_norm_var": 0.0018551985422770182, "learning_rate": 0.01, "loss": 1.4276, "loss/crossentropy": 2.61715030670166, "loss/fcd": 1.2265625, "loss/logits": 0.26226918399333954, "step": 1419 }, { "epoch": 0.004509081671535628, "grad_norm": 0.30078125, "grad_norm_var": 0.0019398371378580729, "learning_rate": 0.01, "loss": 1.3809, "loss/crossentropy": 2.4985090494155884, "loss/fcd": 1.11328125, "loss/logits": 0.2635648846626282, "step": 1420 }, { "epoch": 0.0045122570811634705, "grad_norm": 0.345703125, "grad_norm_var": 0.0019162336985270182, "learning_rate": 0.01, "loss": 1.4522, "loss/crossentropy": 2.4202325344085693, "loss/fcd": 1.2109375, "loss/logits": 0.2792631834745407, "step": 1421 }, { "epoch": 0.004515432490791312, "grad_norm": 0.3515625, "grad_norm_var": 0.0017087300618489583, "learning_rate": 0.01, "loss": 1.4617, "loss/crossentropy": 2.5949126482009888, "loss/fcd": 1.18359375, "loss/logits": 0.2782231420278549, "step": 1422 }, { "epoch": 0.004518607900419154, "grad_norm": 0.373046875, "grad_norm_var": 0.001639413833618164, "learning_rate": 0.01, "loss": 1.4716, "loss/crossentropy": 2.8144848346710205, "loss/fcd": 1.14453125, "loss/logits": 0.25988084077835083, "step": 1423 }, { "epoch": 0.0045217833100469965, "grad_norm": 0.32421875, "grad_norm_var": 0.0016550699869791667, "learning_rate": 0.01, "loss": 1.397, "loss/crossentropy": 2.709870934486389, "loss/fcd": 1.125, "loss/logits": 0.24850136041641235, "step": 1424 }, { "epoch": 0.004524958719674838, "grad_norm": 0.330078125, "grad_norm_var": 0.0016000747680664062, "learning_rate": 0.01, "loss": 1.4855, "loss/crossentropy": 2.3848791122436523, "loss/fcd": 1.234375, "loss/logits": 0.34815944731235504, "step": 1425 }, { "epoch": 0.00452813412930268, "grad_norm": 0.33984375, "grad_norm_var": 0.0015992323557535808, "learning_rate": 0.01, "loss": 1.4799, "loss/crossentropy": 2.318433403968811, "loss/fcd": 1.23828125, "loss/logits": 0.2952542304992676, "step": 1426 }, { "epoch": 0.0045313095389305225, "grad_norm": 0.333984375, "grad_norm_var": 0.0014353434244791666, "learning_rate": 0.01, "loss": 1.4724, "loss/crossentropy": 2.4044578075408936, "loss/fcd": 1.13671875, "loss/logits": 0.2652297839522362, "step": 1427 }, { "epoch": 0.004534484948558364, "grad_norm": 0.328125, "grad_norm_var": 0.0013834635416666667, "learning_rate": 0.01, "loss": 1.4367, "loss/crossentropy": 2.5571463108062744, "loss/fcd": 1.109375, "loss/logits": 0.2577649801969528, "step": 1428 }, { "epoch": 0.004537660358186206, "grad_norm": 0.34765625, "grad_norm_var": 0.0013387044270833333, "learning_rate": 0.01, "loss": 1.5425, "loss/crossentropy": 2.3130730390548706, "loss/fcd": 1.12109375, "loss/logits": 0.25981394946575165, "step": 1429 }, { "epoch": 0.004540835767814048, "grad_norm": 0.30859375, "grad_norm_var": 0.001383209228515625, "learning_rate": 0.01, "loss": 1.445, "loss/crossentropy": 2.9696460962295532, "loss/fcd": 1.234375, "loss/logits": 0.318489134311676, "step": 1430 }, { "epoch": 0.00454401117744189, "grad_norm": 0.328125, "grad_norm_var": 0.001296218236287435, "learning_rate": 0.01, "loss": 1.4356, "loss/crossentropy": 2.75521981716156, "loss/fcd": 1.1640625, "loss/logits": 0.2783754765987396, "step": 1431 }, { "epoch": 0.004547186587069732, "grad_norm": 0.462890625, "grad_norm_var": 0.0022037347157796224, "learning_rate": 0.01, "loss": 1.507, "loss/crossentropy": 2.52434504032135, "loss/fcd": 1.18359375, "loss/logits": 0.29505589604377747, "step": 1432 }, { "epoch": 0.004550361996697574, "grad_norm": 0.30859375, "grad_norm_var": 0.0022748311360677085, "learning_rate": 0.01, "loss": 1.4169, "loss/crossentropy": 2.723539113998413, "loss/fcd": 1.14453125, "loss/logits": 0.261081300675869, "step": 1433 }, { "epoch": 0.004553537406325416, "grad_norm": 0.33203125, "grad_norm_var": 0.0014203389485677083, "learning_rate": 0.01, "loss": 1.4513, "loss/crossentropy": 2.462752342224121, "loss/fcd": 1.1875, "loss/logits": 0.2554765045642853, "step": 1434 }, { "epoch": 0.004556712815953258, "grad_norm": 0.3359375, "grad_norm_var": 0.0013778527577718099, "learning_rate": 0.01, "loss": 1.4048, "loss/crossentropy": 2.488077998161316, "loss/fcd": 1.1953125, "loss/logits": 0.258402980864048, "step": 1435 }, { "epoch": 0.0045598882255811, "grad_norm": 0.357421875, "grad_norm_var": 0.0012769063313802084, "learning_rate": 0.01, "loss": 1.5116, "loss/crossentropy": 2.4708751440048218, "loss/fcd": 1.17578125, "loss/logits": 0.2679780423641205, "step": 1436 }, { "epoch": 0.004563063635208942, "grad_norm": 0.3125, "grad_norm_var": 0.0013393243153889975, "learning_rate": 0.01, "loss": 1.4379, "loss/crossentropy": 2.8989654779434204, "loss/fcd": 1.21484375, "loss/logits": 0.2771361693739891, "step": 1437 }, { "epoch": 0.004566239044836784, "grad_norm": 0.3828125, "grad_norm_var": 0.001439523696899414, "learning_rate": 0.01, "loss": 1.5189, "loss/crossentropy": 2.6069319248199463, "loss/fcd": 1.47265625, "loss/logits": 0.38492512702941895, "step": 1438 }, { "epoch": 0.004569414454464626, "grad_norm": 0.318359375, "grad_norm_var": 0.0014154911041259766, "learning_rate": 0.01, "loss": 1.4608, "loss/crossentropy": 2.561928153038025, "loss/fcd": 1.14453125, "loss/logits": 0.27600710093975067, "step": 1439 }, { "epoch": 0.004572589864092468, "grad_norm": 0.32421875, "grad_norm_var": 0.0014154911041259766, "learning_rate": 0.01, "loss": 1.4497, "loss/crossentropy": 2.6219643354415894, "loss/fcd": 1.2109375, "loss/logits": 0.2705913931131363, "step": 1440 }, { "epoch": 0.00457576527372031, "grad_norm": 0.318359375, "grad_norm_var": 0.0014406681060791016, "learning_rate": 0.01, "loss": 1.4391, "loss/crossentropy": 2.5818153619766235, "loss/fcd": 1.23046875, "loss/logits": 0.27455832064151764, "step": 1441 }, { "epoch": 0.004578940683348152, "grad_norm": 0.359375, "grad_norm_var": 0.0014641920725504557, "learning_rate": 0.01, "loss": 1.456, "loss/crossentropy": 2.533843994140625, "loss/fcd": 1.14453125, "loss/logits": 0.24855345487594604, "step": 1442 }, { "epoch": 0.004582116092975994, "grad_norm": 0.32421875, "grad_norm_var": 0.0014795303344726563, "learning_rate": 0.01, "loss": 1.396, "loss/crossentropy": 2.350162386894226, "loss/fcd": 1.06640625, "loss/logits": 0.23538944870233536, "step": 1443 }, { "epoch": 0.004585291502603836, "grad_norm": 0.29296875, "grad_norm_var": 0.001615142822265625, "learning_rate": 0.01, "loss": 1.371, "loss/crossentropy": 2.1772284507751465, "loss/fcd": 1.078125, "loss/logits": 0.2452804446220398, "step": 1444 }, { "epoch": 0.004588466912231678, "grad_norm": 0.333984375, "grad_norm_var": 0.0016099135080973306, "learning_rate": 0.01, "loss": 1.4081, "loss/crossentropy": 2.8431143760681152, "loss/fcd": 1.16796875, "loss/logits": 0.27079029381275177, "step": 1445 }, { "epoch": 0.00459164232185952, "grad_norm": 0.33203125, "grad_norm_var": 0.0015538374582926432, "learning_rate": 0.01, "loss": 1.461, "loss/crossentropy": 2.7595001459121704, "loss/fcd": 1.203125, "loss/logits": 0.27778828144073486, "step": 1446 }, { "epoch": 0.004594817731487362, "grad_norm": 0.326171875, "grad_norm_var": 0.0015569051106770833, "learning_rate": 0.01, "loss": 1.4389, "loss/crossentropy": 2.7326654195785522, "loss/fcd": 1.1953125, "loss/logits": 0.2887301743030548, "step": 1447 }, { "epoch": 0.004597993141115204, "grad_norm": 0.33203125, "grad_norm_var": 0.0004632155100504557, "learning_rate": 0.01, "loss": 1.3912, "loss/crossentropy": 2.4585416316986084, "loss/fcd": 1.138671875, "loss/logits": 0.24579910933971405, "step": 1448 }, { "epoch": 0.004601168550743046, "grad_norm": 0.34375, "grad_norm_var": 0.00043689409891764324, "learning_rate": 0.01, "loss": 1.5193, "loss/crossentropy": 2.5819398164749146, "loss/fcd": 1.2265625, "loss/logits": 0.2693522572517395, "step": 1449 }, { "epoch": 0.004604343960370888, "grad_norm": 0.310546875, "grad_norm_var": 0.00046819051106770834, "learning_rate": 0.01, "loss": 1.3841, "loss/crossentropy": 2.5817514657974243, "loss/fcd": 1.109375, "loss/logits": 0.26899273693561554, "step": 1450 }, { "epoch": 0.00460751936999873, "grad_norm": 0.3046875, "grad_norm_var": 0.0005109151204427083, "learning_rate": 0.01, "loss": 1.4726, "loss/crossentropy": 2.2607792615890503, "loss/fcd": 1.078125, "loss/logits": 0.24856027960777283, "step": 1451 }, { "epoch": 0.004610694779626572, "grad_norm": 0.31640625, "grad_norm_var": 0.0004638512929280599, "learning_rate": 0.01, "loss": 1.4191, "loss/crossentropy": 2.4776896238327026, "loss/fcd": 1.1796875, "loss/logits": 0.26761649549007416, "step": 1452 }, { "epoch": 0.004613870189254414, "grad_norm": 0.3125, "grad_norm_var": 0.0004638512929280599, "learning_rate": 0.01, "loss": 1.4086, "loss/crossentropy": 2.6187509298324585, "loss/fcd": 1.1796875, "loss/logits": 0.2660587728023529, "step": 1453 }, { "epoch": 0.004617045598882256, "grad_norm": 0.4140625, "grad_norm_var": 0.0007573286692301432, "learning_rate": 0.01, "loss": 1.4437, "loss/crossentropy": 2.6721800565719604, "loss/fcd": 1.18359375, "loss/logits": 0.29577332735061646, "step": 1454 }, { "epoch": 0.004620221008510098, "grad_norm": 0.353515625, "grad_norm_var": 0.0007847944895426432, "learning_rate": 0.01, "loss": 1.4404, "loss/crossentropy": 2.657042384147644, "loss/fcd": 1.046875, "loss/logits": 0.23840361833572388, "step": 1455 }, { "epoch": 0.00462339641813794, "grad_norm": 0.318359375, "grad_norm_var": 0.0007923762003580729, "learning_rate": 0.01, "loss": 1.4413, "loss/crossentropy": 2.4402010440826416, "loss/fcd": 1.15234375, "loss/logits": 0.24972544610500336, "step": 1456 }, { "epoch": 0.004626571827765782, "grad_norm": 0.34375, "grad_norm_var": 0.0007905165354410807, "learning_rate": 0.01, "loss": 1.4515, "loss/crossentropy": 2.4245442152023315, "loss/fcd": 1.17578125, "loss/logits": 0.27518928050994873, "step": 1457 }, { "epoch": 0.004629747237393624, "grad_norm": 0.38671875, "grad_norm_var": 0.0009356021881103515, "learning_rate": 0.01, "loss": 1.5317, "loss/crossentropy": 2.674047589302063, "loss/fcd": 1.1484375, "loss/logits": 0.2677346169948578, "step": 1458 }, { "epoch": 0.004632922647021466, "grad_norm": 0.416015625, "grad_norm_var": 0.00134124755859375, "learning_rate": 0.01, "loss": 1.5377, "loss/crossentropy": 2.48883855342865, "loss/fcd": 1.10546875, "loss/logits": 0.2620562016963959, "step": 1459 }, { "epoch": 0.004636098056649308, "grad_norm": 0.30078125, "grad_norm_var": 0.001296234130859375, "learning_rate": 0.01, "loss": 1.4127, "loss/crossentropy": 2.2384573817253113, "loss/fcd": 1.19921875, "loss/logits": 0.25907742977142334, "step": 1460 }, { "epoch": 0.00463927346627715, "grad_norm": 0.294921875, "grad_norm_var": 0.0014246622721354166, "learning_rate": 0.01, "loss": 1.4219, "loss/crossentropy": 2.645844578742981, "loss/fcd": 1.18359375, "loss/logits": 0.2650655210018158, "step": 1461 }, { "epoch": 0.004642448875904992, "grad_norm": 0.30078125, "grad_norm_var": 0.0015101114908854167, "learning_rate": 0.01, "loss": 1.4218, "loss/crossentropy": 2.468388557434082, "loss/fcd": 1.15625, "loss/logits": 0.2612268850207329, "step": 1462 }, { "epoch": 0.004645624285532834, "grad_norm": 0.34765625, "grad_norm_var": 0.0015109856923421225, "learning_rate": 0.01, "loss": 1.4591, "loss/crossentropy": 2.435564637184143, "loss/fcd": 1.1171875, "loss/logits": 0.23845092207193375, "step": 1463 }, { "epoch": 0.004648799695160676, "grad_norm": 0.33984375, "grad_norm_var": 0.0015093326568603516, "learning_rate": 0.01, "loss": 1.4519, "loss/crossentropy": 2.706642508506775, "loss/fcd": 1.17578125, "loss/logits": 0.26656900346279144, "step": 1464 }, { "epoch": 0.004651975104788517, "grad_norm": 0.328125, "grad_norm_var": 0.0015121301015218098, "learning_rate": 0.01, "loss": 1.4202, "loss/crossentropy": 2.3622360229492188, "loss/fcd": 1.09375, "loss/logits": 0.24302464723587036, "step": 1465 }, { "epoch": 0.00465515051441636, "grad_norm": 0.294921875, "grad_norm_var": 0.0015820662180582683, "learning_rate": 0.01, "loss": 1.3656, "loss/crossentropy": 2.6914572715759277, "loss/fcd": 1.09375, "loss/logits": 0.2565007954835892, "step": 1466 }, { "epoch": 0.004658325924044202, "grad_norm": 0.298828125, "grad_norm_var": 0.0016085306803385417, "learning_rate": 0.01, "loss": 1.4561, "loss/crossentropy": 2.1899214386940002, "loss/fcd": 1.3359375, "loss/logits": 0.2979612350463867, "step": 1467 }, { "epoch": 0.004661501333672043, "grad_norm": 0.37109375, "grad_norm_var": 0.0016565958658854166, "learning_rate": 0.01, "loss": 1.5721, "loss/crossentropy": 2.703672409057617, "loss/fcd": 1.3125, "loss/logits": 0.36619076132774353, "step": 1468 }, { "epoch": 0.004664676743299886, "grad_norm": 0.30078125, "grad_norm_var": 0.001706377665201823, "learning_rate": 0.01, "loss": 1.428, "loss/crossentropy": 2.4572495818138123, "loss/fcd": 1.2265625, "loss/logits": 0.3030671179294586, "step": 1469 }, { "epoch": 0.004667852152927728, "grad_norm": 0.306640625, "grad_norm_var": 0.0013400872548421225, "learning_rate": 0.01, "loss": 1.3494, "loss/crossentropy": 2.4028340578079224, "loss/fcd": 1.109375, "loss/logits": 0.2425854429602623, "step": 1470 }, { "epoch": 0.004671027562555569, "grad_norm": 0.306640625, "grad_norm_var": 0.0013393243153889975, "learning_rate": 0.01, "loss": 1.429, "loss/crossentropy": 2.159976601600647, "loss/fcd": 1.17578125, "loss/logits": 0.27698640525341034, "step": 1471 }, { "epoch": 0.004674202972183412, "grad_norm": 0.3203125, "grad_norm_var": 0.0013369242350260416, "learning_rate": 0.01, "loss": 1.4715, "loss/crossentropy": 2.5355581045150757, "loss/fcd": 1.30078125, "loss/logits": 0.3245139867067337, "step": 1472 }, { "epoch": 0.004677378381811254, "grad_norm": 0.333984375, "grad_norm_var": 0.0013231754302978516, "learning_rate": 0.01, "loss": 1.4285, "loss/crossentropy": 2.2726725935935974, "loss/fcd": 1.12890625, "loss/logits": 0.2623797655105591, "step": 1473 }, { "epoch": 0.004680553791439095, "grad_norm": 0.345703125, "grad_norm_var": 0.0011072158813476562, "learning_rate": 0.01, "loss": 1.4615, "loss/crossentropy": 2.577520251274109, "loss/fcd": 1.1640625, "loss/logits": 0.27035006880760193, "step": 1474 }, { "epoch": 0.004683729201066938, "grad_norm": 0.294921875, "grad_norm_var": 0.0005612691243489583, "learning_rate": 0.01, "loss": 1.3718, "loss/crossentropy": 2.470705032348633, "loss/fcd": 1.11328125, "loss/logits": 0.26156045496463776, "step": 1475 }, { "epoch": 0.00468690461069478, "grad_norm": 0.3125, "grad_norm_var": 0.0005431493123372396, "learning_rate": 0.01, "loss": 1.4853, "loss/crossentropy": 2.5294660329818726, "loss/fcd": 1.15625, "loss/logits": 0.25937750935554504, "step": 1476 }, { "epoch": 0.004690080020322621, "grad_norm": 0.28515625, "grad_norm_var": 0.0005799452463785807, "learning_rate": 0.01, "loss": 1.3443, "loss/crossentropy": 2.4381799697875977, "loss/fcd": 1.0703125, "loss/logits": 0.23996591567993164, "step": 1477 }, { "epoch": 0.004693255429950464, "grad_norm": 0.267578125, "grad_norm_var": 0.0007250467936197917, "learning_rate": 0.01, "loss": 1.3692, "loss/crossentropy": 2.50251841545105, "loss/fcd": 1.12109375, "loss/logits": 0.24646448343992233, "step": 1478 }, { "epoch": 0.004696430839578306, "grad_norm": 0.32421875, "grad_norm_var": 0.0006601969401041667, "learning_rate": 0.01, "loss": 1.3959, "loss/crossentropy": 2.5602506399154663, "loss/fcd": 1.15234375, "loss/logits": 0.27695298194885254, "step": 1479 }, { "epoch": 0.004699606249206147, "grad_norm": 0.306640625, "grad_norm_var": 0.0006166934967041015, "learning_rate": 0.01, "loss": 1.3935, "loss/crossentropy": 2.8536540269851685, "loss/fcd": 1.16796875, "loss/logits": 0.2823043614625931, "step": 1480 }, { "epoch": 0.00470278165883399, "grad_norm": 0.37109375, "grad_norm_var": 0.0008223056793212891, "learning_rate": 0.01, "loss": 1.4867, "loss/crossentropy": 2.4765796661376953, "loss/fcd": 1.109375, "loss/logits": 0.2375454530119896, "step": 1481 }, { "epoch": 0.004705957068461832, "grad_norm": 0.34375, "grad_norm_var": 0.0008401870727539062, "learning_rate": 0.01, "loss": 1.3889, "loss/crossentropy": 2.2186869382858276, "loss/fcd": 1.10546875, "loss/logits": 0.24289745092391968, "step": 1482 }, { "epoch": 0.004709132478089673, "grad_norm": 0.34765625, "grad_norm_var": 0.0008636315663655599, "learning_rate": 0.01, "loss": 1.5196, "loss/crossentropy": 2.344978928565979, "loss/fcd": 1.14453125, "loss/logits": 0.2798009216785431, "step": 1483 }, { "epoch": 0.004712307887717516, "grad_norm": 0.365234375, "grad_norm_var": 0.0008267720540364583, "learning_rate": 0.01, "loss": 1.4915, "loss/crossentropy": 2.281100869178772, "loss/fcd": 1.16796875, "loss/logits": 0.2632095664739609, "step": 1484 }, { "epoch": 0.004715483297345358, "grad_norm": 0.31640625, "grad_norm_var": 0.000800323486328125, "learning_rate": 0.01, "loss": 1.4342, "loss/crossentropy": 2.6387284994125366, "loss/fcd": 1.171875, "loss/logits": 0.25202758610248566, "step": 1485 }, { "epoch": 0.004718658706973199, "grad_norm": 0.322265625, "grad_norm_var": 0.0007840474446614583, "learning_rate": 0.01, "loss": 1.4566, "loss/crossentropy": 2.428976893424988, "loss/fcd": 1.15625, "loss/logits": 0.26124779880046844, "step": 1486 }, { "epoch": 0.004721834116601042, "grad_norm": 0.3203125, "grad_norm_var": 0.0007663567860921224, "learning_rate": 0.01, "loss": 1.4451, "loss/crossentropy": 2.684321403503418, "loss/fcd": 1.1484375, "loss/logits": 0.2603253871202469, "step": 1487 }, { "epoch": 0.004725009526228884, "grad_norm": 0.30859375, "grad_norm_var": 0.0007800896962483724, "learning_rate": 0.01, "loss": 1.4022, "loss/crossentropy": 2.563972234725952, "loss/fcd": 1.12890625, "loss/logits": 0.26450175046920776, "step": 1488 }, { "epoch": 0.004728184935856725, "grad_norm": 0.283203125, "grad_norm_var": 0.0008660475413004557, "learning_rate": 0.01, "loss": 1.4141, "loss/crossentropy": 2.623717188835144, "loss/fcd": 1.21484375, "loss/logits": 0.2996888905763626, "step": 1489 }, { "epoch": 0.004731360345484568, "grad_norm": 0.3125, "grad_norm_var": 0.0008198420206705729, "learning_rate": 0.01, "loss": 1.3851, "loss/crossentropy": 2.374686360359192, "loss/fcd": 1.1640625, "loss/logits": 0.2465464472770691, "step": 1490 }, { "epoch": 0.00473453575511241, "grad_norm": 0.330078125, "grad_norm_var": 0.0007906595865885417, "learning_rate": 0.01, "loss": 1.3876, "loss/crossentropy": 2.5791667699813843, "loss/fcd": 1.12109375, "loss/logits": 0.2403968870639801, "step": 1491 }, { "epoch": 0.004737711164740251, "grad_norm": 0.375, "grad_norm_var": 0.0009737650553385417, "learning_rate": 0.01, "loss": 1.4529, "loss/crossentropy": 2.51021945476532, "loss/fcd": 1.15625, "loss/logits": 0.27928346395492554, "step": 1492 }, { "epoch": 0.004740886574368094, "grad_norm": 0.294921875, "grad_norm_var": 0.0009294986724853515, "learning_rate": 0.01, "loss": 1.4286, "loss/crossentropy": 2.503031015396118, "loss/fcd": 1.12890625, "loss/logits": 0.26477159559726715, "step": 1493 }, { "epoch": 0.004744061983995936, "grad_norm": 0.3125, "grad_norm_var": 0.00071563720703125, "learning_rate": 0.01, "loss": 1.4152, "loss/crossentropy": 2.4783570766448975, "loss/fcd": 1.1015625, "loss/logits": 0.24885981529951096, "step": 1494 }, { "epoch": 0.004747237393623777, "grad_norm": 0.287109375, "grad_norm_var": 0.0008162021636962891, "learning_rate": 0.01, "loss": 1.3889, "loss/crossentropy": 2.29350209236145, "loss/fcd": 1.1171875, "loss/logits": 0.24100902676582336, "step": 1495 }, { "epoch": 0.00475041280325162, "grad_norm": 0.337890625, "grad_norm_var": 0.0008014520009358724, "learning_rate": 0.01, "loss": 1.3969, "loss/crossentropy": 2.5348265171051025, "loss/fcd": 1.06640625, "loss/logits": 0.2589345723390579, "step": 1496 }, { "epoch": 0.004753588212879461, "grad_norm": 0.31640625, "grad_norm_var": 0.0006652673085530599, "learning_rate": 0.01, "loss": 1.4543, "loss/crossentropy": 2.5758105516433716, "loss/fcd": 1.11328125, "loss/logits": 0.24231060594320297, "step": 1497 }, { "epoch": 0.004756763622507303, "grad_norm": 0.2890625, "grad_norm_var": 0.0007035414377848308, "learning_rate": 0.01, "loss": 1.3587, "loss/crossentropy": 2.4849116802215576, "loss/fcd": 1.04296875, "loss/logits": 0.23846541345119476, "step": 1498 }, { "epoch": 0.004759939032135146, "grad_norm": 0.3203125, "grad_norm_var": 0.0006492455800374349, "learning_rate": 0.01, "loss": 1.4687, "loss/crossentropy": 2.76836097240448, "loss/fcd": 1.14453125, "loss/logits": 0.27676399052143097, "step": 1499 }, { "epoch": 0.004763114441762987, "grad_norm": 0.31640625, "grad_norm_var": 0.0004922866821289063, "learning_rate": 0.01, "loss": 1.4106, "loss/crossentropy": 2.5547595024108887, "loss/fcd": 1.1015625, "loss/logits": 0.2308686003088951, "step": 1500 }, { "epoch": 0.004766289851390829, "grad_norm": 0.30859375, "grad_norm_var": 0.0004948298136393229, "learning_rate": 0.01, "loss": 1.4232, "loss/crossentropy": 2.4202499389648438, "loss/fcd": 1.25, "loss/logits": 0.30156514048576355, "step": 1501 }, { "epoch": 0.004769465261018672, "grad_norm": 0.30078125, "grad_norm_var": 0.0005019982655843099, "learning_rate": 0.01, "loss": 1.3922, "loss/crossentropy": 1.953066647052765, "loss/fcd": 1.046875, "loss/logits": 0.22579305619001389, "step": 1502 }, { "epoch": 0.004772640670646513, "grad_norm": 0.306640625, "grad_norm_var": 0.0005009969075520834, "learning_rate": 0.01, "loss": 1.4379, "loss/crossentropy": 2.3938366174697876, "loss/fcd": 1.0625, "loss/logits": 0.2403060346841812, "step": 1503 }, { "epoch": 0.004775816080274355, "grad_norm": 0.349609375, "grad_norm_var": 0.0005847771962483723, "learning_rate": 0.01, "loss": 1.3823, "loss/crossentropy": 2.611161947250366, "loss/fcd": 1.09765625, "loss/logits": 0.2437141090631485, "step": 1504 }, { "epoch": 0.004778991489902198, "grad_norm": 0.3359375, "grad_norm_var": 0.0005345662434895833, "learning_rate": 0.01, "loss": 1.4221, "loss/crossentropy": 2.611813545227051, "loss/fcd": 1.12109375, "loss/logits": 0.2602939158678055, "step": 1505 }, { "epoch": 0.004782166899530039, "grad_norm": 0.2734375, "grad_norm_var": 0.0006604512532552083, "learning_rate": 0.01, "loss": 1.3423, "loss/crossentropy": 2.3557610511779785, "loss/fcd": 1.0546875, "loss/logits": 0.2389651983976364, "step": 1506 }, { "epoch": 0.004785342309157881, "grad_norm": 0.3125, "grad_norm_var": 0.0006465752919514974, "learning_rate": 0.01, "loss": 1.5326, "loss/crossentropy": 2.592300295829773, "loss/fcd": 1.19921875, "loss/logits": 0.29368337988853455, "step": 1507 }, { "epoch": 0.004788517718785724, "grad_norm": 0.341796875, "grad_norm_var": 0.0004490534464518229, "learning_rate": 0.01, "loss": 1.3807, "loss/crossentropy": 2.411874294281006, "loss/fcd": 1.08984375, "loss/logits": 0.24404437839984894, "step": 1508 }, { "epoch": 0.004791693128413565, "grad_norm": 0.318359375, "grad_norm_var": 0.0004276911417643229, "learning_rate": 0.01, "loss": 1.4146, "loss/crossentropy": 2.4793671369552612, "loss/fcd": 1.171875, "loss/logits": 0.24446283280849457, "step": 1509 }, { "epoch": 0.004794868538041407, "grad_norm": 0.322265625, "grad_norm_var": 0.0004314263661702474, "learning_rate": 0.01, "loss": 1.4422, "loss/crossentropy": 2.4388411045074463, "loss/fcd": 1.125, "loss/logits": 0.26607470214366913, "step": 1510 }, { "epoch": 0.00479804394766925, "grad_norm": 0.328125, "grad_norm_var": 0.00038503011067708335, "learning_rate": 0.01, "loss": 1.4336, "loss/crossentropy": 2.586572289466858, "loss/fcd": 1.1875, "loss/logits": 0.26207631826400757, "step": 1511 }, { "epoch": 0.004801219357297091, "grad_norm": 0.353515625, "grad_norm_var": 0.00044301350911458336, "learning_rate": 0.01, "loss": 1.4604, "loss/crossentropy": 2.2664122581481934, "loss/fcd": 1.12109375, "loss/logits": 0.23589500039815903, "step": 1512 }, { "epoch": 0.004804394766924933, "grad_norm": 0.279296875, "grad_norm_var": 0.0005387465159098307, "learning_rate": 0.01, "loss": 1.35, "loss/crossentropy": 2.3235487937927246, "loss/fcd": 1.0703125, "loss/logits": 0.23969802260398865, "step": 1513 }, { "epoch": 0.004807570176552776, "grad_norm": 0.333984375, "grad_norm_var": 0.0005032857259114583, "learning_rate": 0.01, "loss": 1.4469, "loss/crossentropy": 2.8930797576904297, "loss/fcd": 1.11328125, "loss/logits": 0.23078703135252, "step": 1514 }, { "epoch": 0.004810745586180617, "grad_norm": 0.2890625, "grad_norm_var": 0.0005582173665364584, "learning_rate": 0.01, "loss": 1.4035, "loss/crossentropy": 2.541144371032715, "loss/fcd": 1.16796875, "loss/logits": 0.2521451562643051, "step": 1515 }, { "epoch": 0.004813920995808459, "grad_norm": 0.326171875, "grad_norm_var": 0.0005635420481363932, "learning_rate": 0.01, "loss": 1.44, "loss/crossentropy": 2.6072824001312256, "loss/fcd": 1.16015625, "loss/logits": 0.2629164755344391, "step": 1516 }, { "epoch": 0.004817096405436302, "grad_norm": 0.3125, "grad_norm_var": 0.000559854507446289, "learning_rate": 0.01, "loss": 1.392, "loss/crossentropy": 2.310948133468628, "loss/fcd": 1.109375, "loss/logits": 0.2276080846786499, "step": 1517 }, { "epoch": 0.004820271815064143, "grad_norm": 0.373046875, "grad_norm_var": 0.0007227579752604167, "learning_rate": 0.01, "loss": 1.5263, "loss/crossentropy": 2.6398669481277466, "loss/fcd": 1.24609375, "loss/logits": 0.2825860232114792, "step": 1518 }, { "epoch": 0.004823447224691985, "grad_norm": 0.30078125, "grad_norm_var": 0.0007371107737223307, "learning_rate": 0.01, "loss": 1.4172, "loss/crossentropy": 2.376294493675232, "loss/fcd": 1.234375, "loss/logits": 0.27047960460186005, "step": 1519 }, { "epoch": 0.004826622634319828, "grad_norm": 0.279296875, "grad_norm_var": 0.0007863203684488932, "learning_rate": 0.01, "loss": 1.3785, "loss/crossentropy": 2.496554970741272, "loss/fcd": 1.12109375, "loss/logits": 0.25385206937789917, "step": 1520 }, { "epoch": 0.004829798043947669, "grad_norm": 0.34375, "grad_norm_var": 0.000809335708618164, "learning_rate": 0.01, "loss": 1.4448, "loss/crossentropy": 2.4284698963165283, "loss/fcd": 1.1328125, "loss/logits": 0.23548240959644318, "step": 1521 }, { "epoch": 0.004832973453575511, "grad_norm": 0.375, "grad_norm_var": 0.0008506615956624349, "learning_rate": 0.01, "loss": 1.4282, "loss/crossentropy": 2.453286051750183, "loss/fcd": 1.09765625, "loss/logits": 0.257686011493206, "step": 1522 }, { "epoch": 0.004836148863203354, "grad_norm": 0.3515625, "grad_norm_var": 0.0008843580881754558, "learning_rate": 0.01, "loss": 1.5201, "loss/crossentropy": 2.728175640106201, "loss/fcd": 1.31640625, "loss/logits": 0.32748906314373016, "step": 1523 }, { "epoch": 0.004839324272831195, "grad_norm": 0.345703125, "grad_norm_var": 0.0008931318918863932, "learning_rate": 0.01, "loss": 1.5177, "loss/crossentropy": 2.581049919128418, "loss/fcd": 1.18359375, "loss/logits": 0.2878571003675461, "step": 1524 }, { "epoch": 0.004842499682459037, "grad_norm": 0.33203125, "grad_norm_var": 0.0008890151977539063, "learning_rate": 0.01, "loss": 1.3908, "loss/crossentropy": 2.604188084602356, "loss/fcd": 1.1171875, "loss/logits": 0.2576373443007469, "step": 1525 }, { "epoch": 0.00484567509208688, "grad_norm": 0.333984375, "grad_norm_var": 0.000888824462890625, "learning_rate": 0.01, "loss": 1.4643, "loss/crossentropy": 2.781618356704712, "loss/fcd": 1.16015625, "loss/logits": 0.26449331641197205, "step": 1526 }, { "epoch": 0.004848850501714721, "grad_norm": 0.31640625, "grad_norm_var": 0.0008981704711914062, "learning_rate": 0.01, "loss": 1.3744, "loss/crossentropy": 2.3268712759017944, "loss/fcd": 1.078125, "loss/logits": 0.2327672466635704, "step": 1527 }, { "epoch": 0.004852025911342563, "grad_norm": 0.29296875, "grad_norm_var": 0.0009203433990478515, "learning_rate": 0.01, "loss": 1.4046, "loss/crossentropy": 2.579440951347351, "loss/fcd": 1.14453125, "loss/logits": 0.25902700424194336, "step": 1528 }, { "epoch": 0.004855201320970406, "grad_norm": 0.33984375, "grad_norm_var": 0.000787798563639323, "learning_rate": 0.01, "loss": 1.4272, "loss/crossentropy": 2.6086740493774414, "loss/fcd": 1.19921875, "loss/logits": 0.28272490203380585, "step": 1529 }, { "epoch": 0.004858376730598247, "grad_norm": 0.322265625, "grad_norm_var": 0.0007868448893229167, "learning_rate": 0.01, "loss": 1.462, "loss/crossentropy": 2.5693472623825073, "loss/fcd": 1.21484375, "loss/logits": 0.27477097511291504, "step": 1530 }, { "epoch": 0.004861552140226089, "grad_norm": 0.328125, "grad_norm_var": 0.0006838480631510417, "learning_rate": 0.01, "loss": 1.4272, "loss/crossentropy": 2.5725077390670776, "loss/fcd": 1.11328125, "loss/logits": 0.23553214967250824, "step": 1531 }, { "epoch": 0.004864727549853931, "grad_norm": 0.294921875, "grad_norm_var": 0.000759124755859375, "learning_rate": 0.01, "loss": 1.4477, "loss/crossentropy": 2.206745207309723, "loss/fcd": 1.1953125, "loss/logits": 0.2950148209929466, "step": 1532 }, { "epoch": 0.004867902959481773, "grad_norm": 0.337890625, "grad_norm_var": 0.0007481733957926432, "learning_rate": 0.01, "loss": 1.4892, "loss/crossentropy": 2.3334946632385254, "loss/fcd": 1.125, "loss/logits": 0.26530270278453827, "step": 1533 }, { "epoch": 0.004871078369109615, "grad_norm": 0.3203125, "grad_norm_var": 0.0006138483683268229, "learning_rate": 0.01, "loss": 1.4274, "loss/crossentropy": 2.586618185043335, "loss/fcd": 1.1875, "loss/logits": 0.27727824449539185, "step": 1534 }, { "epoch": 0.004874253778737457, "grad_norm": 0.32421875, "grad_norm_var": 0.0005695978800455729, "learning_rate": 0.01, "loss": 1.4039, "loss/crossentropy": 2.5862772464752197, "loss/fcd": 1.19140625, "loss/logits": 0.2863280326128006, "step": 1535 }, { "epoch": 0.004877429188365299, "grad_norm": 0.318359375, "grad_norm_var": 0.00041446685791015627, "learning_rate": 0.01, "loss": 1.4778, "loss/crossentropy": 2.434378743171692, "loss/fcd": 1.203125, "loss/logits": 0.26611409336328506, "step": 1536 }, { "epoch": 0.004880604597993141, "grad_norm": 0.318359375, "grad_norm_var": 0.00040764808654785155, "learning_rate": 0.01, "loss": 1.4133, "loss/crossentropy": 2.5232841968536377, "loss/fcd": 1.10546875, "loss/logits": 0.24091371148824692, "step": 1537 }, { "epoch": 0.004883780007620983, "grad_norm": 0.30078125, "grad_norm_var": 0.0002892653147379557, "learning_rate": 0.01, "loss": 1.4364, "loss/crossentropy": 2.7323195934295654, "loss/fcd": 1.16796875, "loss/logits": 0.2876850664615631, "step": 1538 }, { "epoch": 0.004886955417248825, "grad_norm": 0.357421875, "grad_norm_var": 0.0003132502237955729, "learning_rate": 0.01, "loss": 1.4847, "loss/crossentropy": 2.7782392501831055, "loss/fcd": 1.35546875, "loss/logits": 0.336336225271225, "step": 1539 }, { "epoch": 0.004890130826876667, "grad_norm": 0.3203125, "grad_norm_var": 0.0002799828847249349, "learning_rate": 0.01, "loss": 1.466, "loss/crossentropy": 2.4189642667770386, "loss/fcd": 1.1796875, "loss/logits": 0.28972384333610535, "step": 1540 }, { "epoch": 0.004893306236504509, "grad_norm": 0.3125, "grad_norm_var": 0.00027871131896972656, "learning_rate": 0.01, "loss": 1.3715, "loss/crossentropy": 2.601462721824646, "loss/fcd": 1.15234375, "loss/logits": 0.2679958641529083, "step": 1541 }, { "epoch": 0.004896481646132351, "grad_norm": 0.35546875, "grad_norm_var": 0.00034427642822265625, "learning_rate": 0.01, "loss": 1.5242, "loss/crossentropy": 2.630943536758423, "loss/fcd": 1.18359375, "loss/logits": 0.29227517545223236, "step": 1542 }, { "epoch": 0.004899657055760193, "grad_norm": 0.326171875, "grad_norm_var": 0.00034228960673014325, "learning_rate": 0.01, "loss": 1.451, "loss/crossentropy": 2.4095311164855957, "loss/fcd": 1.16796875, "loss/logits": 0.2785845100879669, "step": 1543 }, { "epoch": 0.004902832465388035, "grad_norm": 0.306640625, "grad_norm_var": 0.0002990086873372396, "learning_rate": 0.01, "loss": 1.3672, "loss/crossentropy": 2.047276496887207, "loss/fcd": 1.08984375, "loss/logits": 0.24617646634578705, "step": 1544 }, { "epoch": 0.004906007875015877, "grad_norm": 0.328125, "grad_norm_var": 0.00028279622395833336, "learning_rate": 0.01, "loss": 1.4295, "loss/crossentropy": 2.574047088623047, "loss/fcd": 1.171875, "loss/logits": 0.27291473746299744, "step": 1545 }, { "epoch": 0.004909183284643719, "grad_norm": 0.3125, "grad_norm_var": 0.00029002825419108074, "learning_rate": 0.01, "loss": 1.4084, "loss/crossentropy": 2.7243759632110596, "loss/fcd": 1.24609375, "loss/logits": 0.2994740903377533, "step": 1546 }, { "epoch": 0.004912358694271561, "grad_norm": 0.322265625, "grad_norm_var": 0.0002878824869791667, "learning_rate": 0.01, "loss": 1.4298, "loss/crossentropy": 2.508150339126587, "loss/fcd": 1.10546875, "loss/logits": 0.24727857112884521, "step": 1547 }, { "epoch": 0.004915534103899403, "grad_norm": 0.302734375, "grad_norm_var": 0.000263214111328125, "learning_rate": 0.01, "loss": 1.372, "loss/crossentropy": 2.5723437070846558, "loss/fcd": 1.13671875, "loss/logits": 0.2604692429304123, "step": 1548 }, { "epoch": 0.004918709513527245, "grad_norm": 0.31640625, "grad_norm_var": 0.0002487023671468099, "learning_rate": 0.01, "loss": 1.4268, "loss/crossentropy": 2.540448546409607, "loss/fcd": 1.1953125, "loss/logits": 0.29859790205955505, "step": 1549 }, { "epoch": 0.004921884923155087, "grad_norm": 0.34765625, "grad_norm_var": 0.0002914269765218099, "learning_rate": 0.01, "loss": 1.5545, "loss/crossentropy": 2.5344340801239014, "loss/fcd": 1.2890625, "loss/logits": 0.40329815447330475, "step": 1550 }, { "epoch": 0.004925060332782929, "grad_norm": 0.3671875, "grad_norm_var": 0.0004131158192952474, "learning_rate": 0.01, "loss": 1.4861, "loss/crossentropy": 2.8062316179275513, "loss/fcd": 1.2734375, "loss/logits": 0.2931535243988037, "step": 1551 }, { "epoch": 0.004928235742410771, "grad_norm": 0.357421875, "grad_norm_var": 0.0004697004954020182, "learning_rate": 0.01, "loss": 1.4495, "loss/crossentropy": 2.502091646194458, "loss/fcd": 1.23828125, "loss/logits": 0.2693622559309006, "step": 1552 }, { "epoch": 0.004931411152038613, "grad_norm": 0.341796875, "grad_norm_var": 0.00047313372294108074, "learning_rate": 0.01, "loss": 1.4497, "loss/crossentropy": 2.736292004585266, "loss/fcd": 1.1484375, "loss/logits": 0.2732922434806824, "step": 1553 }, { "epoch": 0.004934586561666455, "grad_norm": 0.5390625, "grad_norm_var": 0.003102604548136393, "learning_rate": 0.01, "loss": 1.5266, "loss/crossentropy": 2.7628387212753296, "loss/fcd": 1.22265625, "loss/logits": 0.2831282913684845, "step": 1554 }, { "epoch": 0.004937761971294297, "grad_norm": 0.330078125, "grad_norm_var": 0.003102604548136393, "learning_rate": 0.01, "loss": 1.402, "loss/crossentropy": 2.5812575817108154, "loss/fcd": 1.11328125, "loss/logits": 0.24179952591657639, "step": 1555 }, { "epoch": 0.004940937380922139, "grad_norm": 0.298828125, "grad_norm_var": 0.0031961441040039063, "learning_rate": 0.01, "loss": 1.3652, "loss/crossentropy": 2.6486761569976807, "loss/fcd": 1.12109375, "loss/logits": 0.24043205380439758, "step": 1556 }, { "epoch": 0.004944112790549981, "grad_norm": 0.30859375, "grad_norm_var": 0.003212229410807292, "learning_rate": 0.01, "loss": 1.4, "loss/crossentropy": 2.260856509208679, "loss/fcd": 1.10546875, "loss/logits": 0.260642945766449, "step": 1557 }, { "epoch": 0.004947288200177823, "grad_norm": 0.310546875, "grad_norm_var": 0.0032535394032796225, "learning_rate": 0.01, "loss": 1.4264, "loss/crossentropy": 2.455396294593811, "loss/fcd": 1.17578125, "loss/logits": 0.29221296310424805, "step": 1558 }, { "epoch": 0.004950463609805665, "grad_norm": 0.296875, "grad_norm_var": 0.0033553441365559897, "learning_rate": 0.01, "loss": 1.3928, "loss/crossentropy": 2.6475363969802856, "loss/fcd": 1.11328125, "loss/logits": 0.25282319635152817, "step": 1559 }, { "epoch": 0.004953639019433507, "grad_norm": 0.40234375, "grad_norm_var": 0.00354460080464681, "learning_rate": 0.01, "loss": 1.4521, "loss/crossentropy": 2.473030924797058, "loss/fcd": 1.15625, "loss/logits": 0.2461780309677124, "step": 1560 }, { "epoch": 0.004956814429061349, "grad_norm": 0.3203125, "grad_norm_var": 0.003563547134399414, "learning_rate": 0.01, "loss": 1.4699, "loss/crossentropy": 2.9484708309173584, "loss/fcd": 1.2421875, "loss/logits": 0.3159886598587036, "step": 1561 }, { "epoch": 0.004959989838689191, "grad_norm": 0.310546875, "grad_norm_var": 0.0035715103149414062, "learning_rate": 0.01, "loss": 1.4463, "loss/crossentropy": 2.6670241355895996, "loss/fcd": 1.08203125, "loss/logits": 0.24266932159662247, "step": 1562 }, { "epoch": 0.004963165248317033, "grad_norm": 0.34375, "grad_norm_var": 0.003543710708618164, "learning_rate": 0.01, "loss": 1.4537, "loss/crossentropy": 2.619150757789612, "loss/fcd": 1.21484375, "loss/logits": 0.2604942321777344, "step": 1563 }, { "epoch": 0.004966340657944875, "grad_norm": 0.2890625, "grad_norm_var": 0.0036294937133789064, "learning_rate": 0.01, "loss": 1.3712, "loss/crossentropy": 2.3738853931427, "loss/fcd": 1.078125, "loss/logits": 0.23852763324975967, "step": 1564 }, { "epoch": 0.004969516067572717, "grad_norm": 0.328125, "grad_norm_var": 0.003597259521484375, "learning_rate": 0.01, "loss": 1.4252, "loss/crossentropy": 2.5465248823165894, "loss/fcd": 1.15625, "loss/logits": 0.266863688826561, "step": 1565 }, { "epoch": 0.004972691477200559, "grad_norm": 0.3125, "grad_norm_var": 0.0036539077758789063, "learning_rate": 0.01, "loss": 1.4382, "loss/crossentropy": 2.6143057346343994, "loss/fcd": 1.1484375, "loss/logits": 0.284178763628006, "step": 1566 }, { "epoch": 0.0049758668868284004, "grad_norm": 0.3359375, "grad_norm_var": 0.003606096903483073, "learning_rate": 0.01, "loss": 1.3715, "loss/crossentropy": 2.4656922817230225, "loss/fcd": 1.0859375, "loss/logits": 0.2420826107263565, "step": 1567 }, { "epoch": 0.004979042296456243, "grad_norm": 0.33203125, "grad_norm_var": 0.003584400812784831, "learning_rate": 0.01, "loss": 1.4305, "loss/crossentropy": 2.3664616346359253, "loss/fcd": 1.27734375, "loss/logits": 0.2890924662351608, "step": 1568 }, { "epoch": 0.004982217706084085, "grad_norm": 0.3125, "grad_norm_var": 0.003621355692545573, "learning_rate": 0.01, "loss": 1.435, "loss/crossentropy": 2.6937755346298218, "loss/fcd": 1.21875, "loss/logits": 0.27089807391166687, "step": 1569 }, { "epoch": 0.0049853931157119264, "grad_norm": 0.3203125, "grad_norm_var": 0.0006804784138997396, "learning_rate": 0.01, "loss": 1.4902, "loss/crossentropy": 2.3781652450561523, "loss/fcd": 1.12890625, "loss/logits": 0.24984151124954224, "step": 1570 }, { "epoch": 0.004988568525339769, "grad_norm": 0.30859375, "grad_norm_var": 0.0006862481435139974, "learning_rate": 0.01, "loss": 1.4014, "loss/crossentropy": 2.845761299133301, "loss/fcd": 1.18359375, "loss/logits": 0.2682710140943527, "step": 1571 }, { "epoch": 0.004991743934967611, "grad_norm": 0.306640625, "grad_norm_var": 0.0006673018137613933, "learning_rate": 0.01, "loss": 1.4183, "loss/crossentropy": 2.697476625442505, "loss/fcd": 1.171875, "loss/logits": 0.26140695810317993, "step": 1572 }, { "epoch": 0.0049949193445954524, "grad_norm": 0.296875, "grad_norm_var": 0.0006955305735270183, "learning_rate": 0.01, "loss": 1.3879, "loss/crossentropy": 2.7072709798812866, "loss/fcd": 1.19140625, "loss/logits": 0.28332991898059845, "step": 1573 }, { "epoch": 0.004998094754223295, "grad_norm": 0.326171875, "grad_norm_var": 0.0006901899973551433, "learning_rate": 0.01, "loss": 1.5068, "loss/crossentropy": 2.64555287361145, "loss/fcd": 1.24609375, "loss/logits": 0.29494979977607727, "step": 1574 }, { "epoch": 0.005001270163851137, "grad_norm": 0.3203125, "grad_norm_var": 0.0006478468577067058, "learning_rate": 0.01, "loss": 1.4206, "loss/crossentropy": 2.6008540391921997, "loss/fcd": 1.1640625, "loss/logits": 0.26508720219135284, "step": 1575 }, { "epoch": 0.0050044455734789784, "grad_norm": 0.328125, "grad_norm_var": 0.00020572344462076823, "learning_rate": 0.01, "loss": 1.4659, "loss/crossentropy": 2.724216103553772, "loss/fcd": 1.21875, "loss/logits": 0.27900072932243347, "step": 1576 }, { "epoch": 0.005007620983106821, "grad_norm": 0.4140625, "grad_norm_var": 0.0007809797922770182, "learning_rate": 0.01, "loss": 1.4719, "loss/crossentropy": 2.5958911180496216, "loss/fcd": 1.1484375, "loss/logits": 0.26939232647418976, "step": 1577 }, { "epoch": 0.005010796392734663, "grad_norm": 0.33984375, "grad_norm_var": 0.0007816950480143229, "learning_rate": 0.01, "loss": 1.3822, "loss/crossentropy": 2.4887611865997314, "loss/fcd": 1.1796875, "loss/logits": 0.29799503087997437, "step": 1578 }, { "epoch": 0.0050139718023625044, "grad_norm": 0.33203125, "grad_norm_var": 0.0007624308268229167, "learning_rate": 0.01, "loss": 1.4041, "loss/crossentropy": 2.6196101903915405, "loss/fcd": 1.1953125, "loss/logits": 0.28010424971580505, "step": 1579 }, { "epoch": 0.005017147211990347, "grad_norm": 0.384765625, "grad_norm_var": 0.0008738040924072266, "learning_rate": 0.01, "loss": 1.4624, "loss/crossentropy": 2.6625367403030396, "loss/fcd": 1.11328125, "loss/logits": 0.2537136748433113, "step": 1580 }, { "epoch": 0.005020322621618189, "grad_norm": 0.3828125, "grad_norm_var": 0.0010384718577067058, "learning_rate": 0.01, "loss": 1.3953, "loss/crossentropy": 2.7841016054153442, "loss/fcd": 1.16015625, "loss/logits": 0.26413919031620026, "step": 1581 }, { "epoch": 0.0050234980312460304, "grad_norm": 0.328125, "grad_norm_var": 0.0010076999664306641, "learning_rate": 0.01, "loss": 1.3949, "loss/crossentropy": 2.567612886428833, "loss/fcd": 1.125, "loss/logits": 0.24727674573659897, "step": 1582 }, { "epoch": 0.005026673440873873, "grad_norm": 0.3203125, "grad_norm_var": 0.001022195816040039, "learning_rate": 0.01, "loss": 1.4266, "loss/crossentropy": 2.555685043334961, "loss/fcd": 1.16796875, "loss/logits": 0.26328249275684357, "step": 1583 }, { "epoch": 0.005029848850501715, "grad_norm": 0.357421875, "grad_norm_var": 0.0010538101196289062, "learning_rate": 0.01, "loss": 1.5111, "loss/crossentropy": 2.50522243976593, "loss/fcd": 1.34765625, "loss/logits": 0.3141307085752487, "step": 1584 }, { "epoch": 0.0050330242601295564, "grad_norm": 0.34765625, "grad_norm_var": 0.001020050048828125, "learning_rate": 0.01, "loss": 1.4554, "loss/crossentropy": 2.6021941900253296, "loss/fcd": 1.15625, "loss/logits": 0.2649206072092056, "step": 1585 }, { "epoch": 0.005036199669757399, "grad_norm": 0.3359375, "grad_norm_var": 0.0009976704915364584, "learning_rate": 0.01, "loss": 1.407, "loss/crossentropy": 2.5295404195785522, "loss/fcd": 1.14453125, "loss/logits": 0.24248713999986649, "step": 1586 }, { "epoch": 0.005039375079385241, "grad_norm": 0.34375, "grad_norm_var": 0.0009307225545247395, "learning_rate": 0.01, "loss": 1.5209, "loss/crossentropy": 2.537645101547241, "loss/fcd": 1.1953125, "loss/logits": 0.27597369253635406, "step": 1587 }, { "epoch": 0.0050425504890130824, "grad_norm": 0.296875, "grad_norm_var": 0.0009821414947509765, "learning_rate": 0.01, "loss": 1.4695, "loss/crossentropy": 2.281378984451294, "loss/fcd": 1.25, "loss/logits": 0.30614790320396423, "step": 1588 }, { "epoch": 0.005045725898640925, "grad_norm": 0.3125, "grad_norm_var": 0.0009055932362874349, "learning_rate": 0.01, "loss": 1.4163, "loss/crossentropy": 2.402053475379944, "loss/fcd": 1.1015625, "loss/logits": 0.2491258978843689, "step": 1589 }, { "epoch": 0.005048901308268767, "grad_norm": 0.3203125, "grad_norm_var": 0.0009200414021809896, "learning_rate": 0.01, "loss": 1.416, "loss/crossentropy": 2.4380807876586914, "loss/fcd": 1.12109375, "loss/logits": 0.26058194041252136, "step": 1590 }, { "epoch": 0.0050520767178966084, "grad_norm": 0.337890625, "grad_norm_var": 0.0008895715077718099, "learning_rate": 0.01, "loss": 1.4518, "loss/crossentropy": 2.556450843811035, "loss/fcd": 1.07421875, "loss/logits": 0.23420121520757675, "step": 1591 }, { "epoch": 0.005055252127524451, "grad_norm": 0.326171875, "grad_norm_var": 0.0008935928344726562, "learning_rate": 0.01, "loss": 1.435, "loss/crossentropy": 2.7187869548797607, "loss/fcd": 1.22265625, "loss/logits": 0.2897214889526367, "step": 1592 }, { "epoch": 0.005058427537152293, "grad_norm": 0.36328125, "grad_norm_var": 0.0005704243977864583, "learning_rate": 0.01, "loss": 1.4809, "loss/crossentropy": 2.5373932123184204, "loss/fcd": 1.2109375, "loss/logits": 0.26034314930438995, "step": 1593 }, { "epoch": 0.0050616029467801344, "grad_norm": 0.33984375, "grad_norm_var": 0.0005704243977864583, "learning_rate": 0.01, "loss": 1.4472, "loss/crossentropy": 2.6626449823379517, "loss/fcd": 1.22265625, "loss/logits": 0.29285088181495667, "step": 1594 }, { "epoch": 0.005064778356407977, "grad_norm": 0.4296875, "grad_norm_var": 0.0010711034138997397, "learning_rate": 0.01, "loss": 1.4871, "loss/crossentropy": 2.6839174032211304, "loss/fcd": 1.140625, "loss/logits": 0.24843262135982513, "step": 1595 }, { "epoch": 0.005067953766035819, "grad_norm": 0.291015625, "grad_norm_var": 0.0011290868123372396, "learning_rate": 0.01, "loss": 1.3907, "loss/crossentropy": 2.219956398010254, "loss/fcd": 1.05859375, "loss/logits": 0.2493966817855835, "step": 1596 }, { "epoch": 0.0050711291756636604, "grad_norm": 0.341796875, "grad_norm_var": 0.00099790891011556, "learning_rate": 0.01, "loss": 1.4634, "loss/crossentropy": 2.593619465827942, "loss/fcd": 1.12890625, "loss/logits": 0.27522701025009155, "step": 1597 }, { "epoch": 0.005074304585291503, "grad_norm": 0.32421875, "grad_norm_var": 0.0010035037994384766, "learning_rate": 0.01, "loss": 1.4286, "loss/crossentropy": 2.53367018699646, "loss/fcd": 1.109375, "loss/logits": 0.2418091669678688, "step": 1598 }, { "epoch": 0.005077479994919345, "grad_norm": 0.35546875, "grad_norm_var": 0.0010035037994384766, "learning_rate": 0.01, "loss": 1.4683, "loss/crossentropy": 2.6616939306259155, "loss/fcd": 1.19140625, "loss/logits": 0.24544142186641693, "step": 1599 }, { "epoch": 0.0050806554045471864, "grad_norm": 0.35546875, "grad_norm_var": 0.0009989420572916667, "learning_rate": 0.01, "loss": 1.4165, "loss/crossentropy": 2.5486191511154175, "loss/fcd": 1.1953125, "loss/logits": 0.2725132256746292, "step": 1600 }, { "epoch": 0.005083830814175029, "grad_norm": 0.36328125, "grad_norm_var": 0.0010325113932291667, "learning_rate": 0.01, "loss": 1.5102, "loss/crossentropy": 2.4894551038742065, "loss/fcd": 1.19921875, "loss/logits": 0.281192421913147, "step": 1601 }, { "epoch": 0.00508700622380287, "grad_norm": 0.287109375, "grad_norm_var": 0.0012069543202718098, "learning_rate": 0.01, "loss": 1.3587, "loss/crossentropy": 2.4646495580673218, "loss/fcd": 1.09375, "loss/logits": 0.24686383455991745, "step": 1602 }, { "epoch": 0.0050901816334307124, "grad_norm": 0.310546875, "grad_norm_var": 0.0012450536092122396, "learning_rate": 0.01, "loss": 1.4872, "loss/crossentropy": 2.508363127708435, "loss/fcd": 1.21484375, "loss/logits": 0.28010785579681396, "step": 1603 }, { "epoch": 0.005093357043058555, "grad_norm": 0.298828125, "grad_norm_var": 0.0012354373931884766, "learning_rate": 0.01, "loss": 1.5163, "loss/crossentropy": 2.5229811668395996, "loss/fcd": 1.21484375, "loss/logits": 0.2806214243173599, "step": 1604 }, { "epoch": 0.005096532452686396, "grad_norm": 0.330078125, "grad_norm_var": 0.001202392578125, "learning_rate": 0.01, "loss": 1.4249, "loss/crossentropy": 2.801955819129944, "loss/fcd": 1.140625, "loss/logits": 0.2630513608455658, "step": 1605 }, { "epoch": 0.0050997078623142384, "grad_norm": 0.314453125, "grad_norm_var": 0.001216745376586914, "learning_rate": 0.01, "loss": 1.455, "loss/crossentropy": 2.8157224655151367, "loss/fcd": 1.15625, "loss/logits": 0.2617394030094147, "step": 1606 }, { "epoch": 0.005102883271942081, "grad_norm": 0.345703125, "grad_norm_var": 0.001222976048787435, "learning_rate": 0.01, "loss": 1.4364, "loss/crossentropy": 2.6650545597076416, "loss/fcd": 1.18359375, "loss/logits": 0.2804949879646301, "step": 1607 }, { "epoch": 0.005106058681569922, "grad_norm": 0.337890625, "grad_norm_var": 0.00121610959370931, "learning_rate": 0.01, "loss": 1.4457, "loss/crossentropy": 2.5726951360702515, "loss/fcd": 1.125, "loss/logits": 0.25282832980155945, "step": 1608 }, { "epoch": 0.0051092340911977644, "grad_norm": 0.32421875, "grad_norm_var": 0.0011735121409098307, "learning_rate": 0.01, "loss": 1.4482, "loss/crossentropy": 2.3713066577911377, "loss/fcd": 1.125, "loss/logits": 0.254827044904232, "step": 1609 }, { "epoch": 0.005112409500825607, "grad_norm": 0.3671875, "grad_norm_var": 0.0012402693430582682, "learning_rate": 0.01, "loss": 1.4616, "loss/crossentropy": 2.586561679840088, "loss/fcd": 1.18359375, "loss/logits": 0.2566823959350586, "step": 1610 }, { "epoch": 0.005115584910453448, "grad_norm": 0.3125, "grad_norm_var": 0.0006356398264567057, "learning_rate": 0.01, "loss": 1.4141, "loss/crossentropy": 2.4238003492355347, "loss/fcd": 1.19921875, "loss/logits": 0.2751360684633255, "step": 1611 }, { "epoch": 0.0051187603200812904, "grad_norm": 0.3359375, "grad_norm_var": 0.0005358378092447917, "learning_rate": 0.01, "loss": 1.4611, "loss/crossentropy": 2.575940251350403, "loss/fcd": 1.1328125, "loss/logits": 0.24371810257434845, "step": 1612 }, { "epoch": 0.005121935729709133, "grad_norm": 0.294921875, "grad_norm_var": 0.0006090799967447916, "learning_rate": 0.01, "loss": 1.409, "loss/crossentropy": 2.395525336265564, "loss/fcd": 1.1328125, "loss/logits": 0.2660643011331558, "step": 1613 }, { "epoch": 0.005125111139336974, "grad_norm": 0.3671875, "grad_norm_var": 0.0006992975870768229, "learning_rate": 0.01, "loss": 1.4397, "loss/crossentropy": 2.4640313386917114, "loss/fcd": 1.2265625, "loss/logits": 0.30458979308605194, "step": 1614 }, { "epoch": 0.0051282865489648164, "grad_norm": 0.322265625, "grad_norm_var": 0.0006611982981363933, "learning_rate": 0.01, "loss": 1.4155, "loss/crossentropy": 2.7751983404159546, "loss/fcd": 1.12890625, "loss/logits": 0.2527478486299515, "step": 1615 }, { "epoch": 0.005131461958592659, "grad_norm": 0.314453125, "grad_norm_var": 0.0006228129069010417, "learning_rate": 0.01, "loss": 1.4117, "loss/crossentropy": 2.588060975074768, "loss/fcd": 1.1171875, "loss/logits": 0.26295173168182373, "step": 1616 }, { "epoch": 0.0051346373682205, "grad_norm": 0.3359375, "grad_norm_var": 0.000536028544108073, "learning_rate": 0.01, "loss": 1.4671, "loss/crossentropy": 2.5637298822402954, "loss/fcd": 1.14453125, "loss/logits": 0.2538740038871765, "step": 1617 }, { "epoch": 0.0051378127778483424, "grad_norm": 0.421875, "grad_norm_var": 0.0009911696116129557, "learning_rate": 0.01, "loss": 1.5057, "loss/crossentropy": 2.5897717475891113, "loss/fcd": 1.36328125, "loss/logits": 0.3234192430973053, "step": 1618 }, { "epoch": 0.005140988187476185, "grad_norm": 0.359375, "grad_norm_var": 0.0009915669759114583, "learning_rate": 0.01, "loss": 1.4593, "loss/crossentropy": 2.5573883056640625, "loss/fcd": 1.1875, "loss/logits": 0.2734684646129608, "step": 1619 }, { "epoch": 0.005144163597104026, "grad_norm": 0.3359375, "grad_norm_var": 0.0008916060129801432, "learning_rate": 0.01, "loss": 1.524, "loss/crossentropy": 2.6284821033477783, "loss/fcd": 1.24609375, "loss/logits": 0.2787982374429703, "step": 1620 }, { "epoch": 0.0051473390067318684, "grad_norm": 0.322265625, "grad_norm_var": 0.0009044488271077474, "learning_rate": 0.01, "loss": 1.443, "loss/crossentropy": 2.5707788467407227, "loss/fcd": 1.0625, "loss/logits": 0.24151992797851562, "step": 1621 }, { "epoch": 0.005150514416359711, "grad_norm": 0.306640625, "grad_norm_var": 0.0009330590565999349, "learning_rate": 0.01, "loss": 1.433, "loss/crossentropy": 2.5492278337478638, "loss/fcd": 1.171875, "loss/logits": 0.2640794515609741, "step": 1622 }, { "epoch": 0.005153689825987552, "grad_norm": 0.302734375, "grad_norm_var": 0.0010029951731363933, "learning_rate": 0.01, "loss": 1.4126, "loss/crossentropy": 2.4551364183425903, "loss/fcd": 1.13671875, "loss/logits": 0.2520785331726074, "step": 1623 }, { "epoch": 0.0051568652356153944, "grad_norm": 0.3046875, "grad_norm_var": 0.0010594685872395834, "learning_rate": 0.01, "loss": 1.4083, "loss/crossentropy": 2.44328236579895, "loss/fcd": 1.17578125, "loss/logits": 0.28815487027168274, "step": 1624 }, { "epoch": 0.005160040645243237, "grad_norm": 0.29296875, "grad_norm_var": 0.0011571248372395833, "learning_rate": 0.01, "loss": 1.3877, "loss/crossentropy": 2.467069983482361, "loss/fcd": 1.1015625, "loss/logits": 0.24081403017044067, "step": 1625 }, { "epoch": 0.005163216054871078, "grad_norm": 0.275390625, "grad_norm_var": 0.0012415409088134765, "learning_rate": 0.01, "loss": 1.4011, "loss/crossentropy": 2.1879078149795532, "loss/fcd": 1.140625, "loss/logits": 0.262988343834877, "step": 1626 }, { "epoch": 0.0051663914644989204, "grad_norm": 0.283203125, "grad_norm_var": 0.0013452529907226562, "learning_rate": 0.01, "loss": 1.378, "loss/crossentropy": 2.5941628217697144, "loss/fcd": 1.24609375, "loss/logits": 0.28863975405693054, "step": 1627 }, { "epoch": 0.005169566874126763, "grad_norm": 0.322265625, "grad_norm_var": 0.001334238052368164, "learning_rate": 0.01, "loss": 1.4487, "loss/crossentropy": 2.6039857864379883, "loss/fcd": 1.12109375, "loss/logits": 0.25894466042518616, "step": 1628 }, { "epoch": 0.005172742283754604, "grad_norm": 0.326171875, "grad_norm_var": 0.0012798150380452474, "learning_rate": 0.01, "loss": 1.4144, "loss/crossentropy": 2.4459056854248047, "loss/fcd": 1.15625, "loss/logits": 0.28373852372169495, "step": 1629 }, { "epoch": 0.0051759176933824464, "grad_norm": 0.298828125, "grad_norm_var": 0.0011835734049479166, "learning_rate": 0.01, "loss": 1.4437, "loss/crossentropy": 2.5823564529418945, "loss/fcd": 1.171875, "loss/logits": 0.2654499486088753, "step": 1630 }, { "epoch": 0.005179093103010289, "grad_norm": 0.30859375, "grad_norm_var": 0.0011916955312093098, "learning_rate": 0.01, "loss": 1.4478, "loss/crossentropy": 2.788114309310913, "loss/fcd": 1.296875, "loss/logits": 0.2787179499864578, "step": 1631 }, { "epoch": 0.00518226851263813, "grad_norm": 0.357421875, "grad_norm_var": 0.0012784163157145182, "learning_rate": 0.01, "loss": 1.4727, "loss/crossentropy": 2.3637858629226685, "loss/fcd": 1.25, "loss/logits": 0.29309114813804626, "step": 1632 }, { "epoch": 0.0051854439222659724, "grad_norm": 0.32421875, "grad_norm_var": 0.0012654463450113933, "learning_rate": 0.01, "loss": 1.402, "loss/crossentropy": 2.7042442560195923, "loss/fcd": 1.171875, "loss/logits": 0.2595224231481552, "step": 1633 }, { "epoch": 0.005188619331893815, "grad_norm": 0.3671875, "grad_norm_var": 0.0007198174794514974, "learning_rate": 0.01, "loss": 1.552, "loss/crossentropy": 2.651233434677124, "loss/fcd": 1.2890625, "loss/logits": 0.31896190345287323, "step": 1634 }, { "epoch": 0.005191794741521656, "grad_norm": 0.349609375, "grad_norm_var": 0.0006718953450520833, "learning_rate": 0.01, "loss": 1.4523, "loss/crossentropy": 2.9430251121520996, "loss/fcd": 1.28125, "loss/logits": 0.27818892896175385, "step": 1635 }, { "epoch": 0.0051949701511494984, "grad_norm": 0.31640625, "grad_norm_var": 0.0006474177042643229, "learning_rate": 0.01, "loss": 1.4466, "loss/crossentropy": 2.5805490016937256, "loss/fcd": 1.21875, "loss/logits": 0.28252989053726196, "step": 1636 }, { "epoch": 0.00519814556077734, "grad_norm": 0.3046875, "grad_norm_var": 0.0006524244944254557, "learning_rate": 0.01, "loss": 1.429, "loss/crossentropy": 2.4969730377197266, "loss/fcd": 1.1640625, "loss/logits": 0.26965998113155365, "step": 1637 }, { "epoch": 0.005201320970405182, "grad_norm": 0.46875, "grad_norm_var": 0.0021128336588541667, "learning_rate": 0.01, "loss": 1.4562, "loss/crossentropy": 2.533958077430725, "loss/fcd": 1.30859375, "loss/logits": 0.365728959441185, "step": 1638 }, { "epoch": 0.0052044963800330244, "grad_norm": 0.296875, "grad_norm_var": 0.002132527033487956, "learning_rate": 0.01, "loss": 1.3903, "loss/crossentropy": 2.542503833770752, "loss/fcd": 1.14453125, "loss/logits": 0.2684211730957031, "step": 1639 }, { "epoch": 0.005207671789660866, "grad_norm": 0.294921875, "grad_norm_var": 0.0021647135416666668, "learning_rate": 0.01, "loss": 1.3302, "loss/crossentropy": 2.732142686843872, "loss/fcd": 1.09765625, "loss/logits": 0.2539718896150589, "step": 1640 }, { "epoch": 0.005210847199288708, "grad_norm": 0.349609375, "grad_norm_var": 0.002129220962524414, "learning_rate": 0.01, "loss": 1.4998, "loss/crossentropy": 2.6079468727111816, "loss/fcd": 1.27734375, "loss/logits": 0.31563499569892883, "step": 1641 }, { "epoch": 0.0052140226089165504, "grad_norm": 0.310546875, "grad_norm_var": 0.0019609928131103516, "learning_rate": 0.01, "loss": 1.4015, "loss/crossentropy": 2.795456290245056, "loss/fcd": 1.17578125, "loss/logits": 0.26376454532146454, "step": 1642 }, { "epoch": 0.005217198018544392, "grad_norm": 0.330078125, "grad_norm_var": 0.0018061161041259765, "learning_rate": 0.01, "loss": 1.454, "loss/crossentropy": 2.512035369873047, "loss/fcd": 1.20703125, "loss/logits": 0.2593121752142906, "step": 1643 }, { "epoch": 0.005220373428172234, "grad_norm": 0.32421875, "grad_norm_var": 0.0018035888671875, "learning_rate": 0.01, "loss": 1.3623, "loss/crossentropy": 2.5070972442626953, "loss/fcd": 1.0859375, "loss/logits": 0.2445976659655571, "step": 1644 }, { "epoch": 0.0052235488378000764, "grad_norm": 0.322265625, "grad_norm_var": 0.0018081029256184896, "learning_rate": 0.01, "loss": 1.4008, "loss/crossentropy": 2.428351640701294, "loss/fcd": 1.14453125, "loss/logits": 0.25949402153491974, "step": 1645 }, { "epoch": 0.005226724247427918, "grad_norm": 0.294921875, "grad_norm_var": 0.0018267313639322917, "learning_rate": 0.01, "loss": 1.3536, "loss/crossentropy": 2.552368402481079, "loss/fcd": 1.125, "loss/logits": 0.2582576125860214, "step": 1646 }, { "epoch": 0.00522989965705576, "grad_norm": 0.310546875, "grad_norm_var": 0.0018207391103108725, "learning_rate": 0.01, "loss": 1.4386, "loss/crossentropy": 2.7328609228134155, "loss/fcd": 1.171875, "loss/logits": 0.2663116306066513, "step": 1647 }, { "epoch": 0.0052330750666836024, "grad_norm": 0.32421875, "grad_norm_var": 0.001779937744140625, "learning_rate": 0.01, "loss": 1.4449, "loss/crossentropy": 2.545032501220703, "loss/fcd": 1.23046875, "loss/logits": 0.295625239610672, "step": 1648 }, { "epoch": 0.005236250476311444, "grad_norm": 0.310546875, "grad_norm_var": 0.0018031915028889974, "learning_rate": 0.01, "loss": 1.3839, "loss/crossentropy": 2.152552545070648, "loss/fcd": 1.0625, "loss/logits": 0.24405647814273834, "step": 1649 }, { "epoch": 0.005239425885939286, "grad_norm": 0.330078125, "grad_norm_var": 0.0017038345336914062, "learning_rate": 0.01, "loss": 1.5182, "loss/crossentropy": 2.611814498901367, "loss/fcd": 1.1640625, "loss/logits": 0.2545488253235817, "step": 1650 }, { "epoch": 0.0052426012955671284, "grad_norm": 0.30078125, "grad_norm_var": 0.001708205540974935, "learning_rate": 0.01, "loss": 1.4239, "loss/crossentropy": 2.5946648120880127, "loss/fcd": 1.27734375, "loss/logits": 0.27634476125240326, "step": 1651 }, { "epoch": 0.00524577670519497, "grad_norm": 0.30859375, "grad_norm_var": 0.001720285415649414, "learning_rate": 0.01, "loss": 1.399, "loss/crossentropy": 2.4689542055130005, "loss/fcd": 1.09765625, "loss/logits": 0.24491195380687714, "step": 1652 }, { "epoch": 0.005248952114822812, "grad_norm": 0.328125, "grad_norm_var": 0.0016947269439697265, "learning_rate": 0.01, "loss": 1.4485, "loss/crossentropy": 2.6689454317092896, "loss/fcd": 1.1953125, "loss/logits": 0.26346150040626526, "step": 1653 }, { "epoch": 0.0052521275244506544, "grad_norm": 0.4609375, "grad_norm_var": 0.0015491326649983725, "learning_rate": 0.01, "loss": 1.5614, "loss/crossentropy": 2.5725115537643433, "loss/fcd": 1.5234375, "loss/logits": 0.43554237484931946, "step": 1654 }, { "epoch": 0.005255302934078496, "grad_norm": 0.3046875, "grad_norm_var": 0.0015238285064697265, "learning_rate": 0.01, "loss": 1.407, "loss/crossentropy": 2.799704432487488, "loss/fcd": 1.19921875, "loss/logits": 0.27672891318798065, "step": 1655 }, { "epoch": 0.005258478343706338, "grad_norm": 0.38671875, "grad_norm_var": 0.001678466796875, "learning_rate": 0.01, "loss": 1.4787, "loss/crossentropy": 2.618114471435547, "loss/fcd": 1.16796875, "loss/logits": 0.2667092829942703, "step": 1656 }, { "epoch": 0.0052616537533341804, "grad_norm": 0.2890625, "grad_norm_var": 0.00175779660542806, "learning_rate": 0.01, "loss": 1.4107, "loss/crossentropy": 2.7824281454086304, "loss/fcd": 1.23828125, "loss/logits": 0.326113760471344, "step": 1657 }, { "epoch": 0.005264829162962022, "grad_norm": 0.349609375, "grad_norm_var": 0.001766061782836914, "learning_rate": 0.01, "loss": 1.47, "loss/crossentropy": 2.5898351669311523, "loss/fcd": 1.20703125, "loss/logits": 0.28307758271694183, "step": 1658 }, { "epoch": 0.005268004572589864, "grad_norm": 0.291015625, "grad_norm_var": 0.0018595218658447265, "learning_rate": 0.01, "loss": 1.3768, "loss/crossentropy": 2.4810056686401367, "loss/fcd": 1.16015625, "loss/logits": 0.2533889263868332, "step": 1659 }, { "epoch": 0.0052711799822177064, "grad_norm": 0.3125, "grad_norm_var": 0.001872873306274414, "learning_rate": 0.01, "loss": 1.4363, "loss/crossentropy": 2.6993894577026367, "loss/fcd": 1.30078125, "loss/logits": 0.30054205656051636, "step": 1660 }, { "epoch": 0.005274355391845548, "grad_norm": 0.326171875, "grad_norm_var": 0.0018716017405192057, "learning_rate": 0.01, "loss": 1.4838, "loss/crossentropy": 2.4411933422088623, "loss/fcd": 1.10546875, "loss/logits": 0.2575240433216095, "step": 1661 }, { "epoch": 0.00527753080147339, "grad_norm": 0.3125, "grad_norm_var": 0.001816240946451823, "learning_rate": 0.01, "loss": 1.4311, "loss/crossentropy": 2.3465611934661865, "loss/fcd": 1.0546875, "loss/logits": 0.23356787115335464, "step": 1662 }, { "epoch": 0.0052807062111012324, "grad_norm": 0.3203125, "grad_norm_var": 0.001799631118774414, "learning_rate": 0.01, "loss": 1.4469, "loss/crossentropy": 2.3459896445274353, "loss/fcd": 1.20703125, "loss/logits": 0.2473852038383484, "step": 1663 }, { "epoch": 0.005283881620729074, "grad_norm": 0.34765625, "grad_norm_var": 0.0018206119537353515, "learning_rate": 0.01, "loss": 1.4791, "loss/crossentropy": 2.4506105184555054, "loss/fcd": 1.1796875, "loss/logits": 0.2653851956129074, "step": 1664 }, { "epoch": 0.005287057030356916, "grad_norm": 0.32421875, "grad_norm_var": 0.0017969131469726563, "learning_rate": 0.01, "loss": 1.4706, "loss/crossentropy": 2.450723171234131, "loss/fcd": 1.12109375, "loss/logits": 0.2613823860883713, "step": 1665 }, { "epoch": 0.0052902324399847584, "grad_norm": 0.330078125, "grad_norm_var": 0.0017969131469726563, "learning_rate": 0.01, "loss": 1.3731, "loss/crossentropy": 2.5012307167053223, "loss/fcd": 1.0703125, "loss/logits": 0.22960881888866425, "step": 1666 }, { "epoch": 0.0052934078496126, "grad_norm": 0.298828125, "grad_norm_var": 0.0018049716949462891, "learning_rate": 0.01, "loss": 1.4616, "loss/crossentropy": 2.5758174657821655, "loss/fcd": 1.31640625, "loss/logits": 0.42653490602970123, "step": 1667 }, { "epoch": 0.005296583259240442, "grad_norm": 0.359375, "grad_norm_var": 0.0018165429433186849, "learning_rate": 0.01, "loss": 1.4214, "loss/crossentropy": 2.4187982082366943, "loss/fcd": 1.171875, "loss/logits": 0.2774343192577362, "step": 1668 }, { "epoch": 0.005299758668868284, "grad_norm": 0.333984375, "grad_norm_var": 0.0018142064412434895, "learning_rate": 0.01, "loss": 1.4558, "loss/crossentropy": 2.7708572149276733, "loss/fcd": 1.1875, "loss/logits": 0.27878226339817047, "step": 1669 }, { "epoch": 0.005302934078496126, "grad_norm": 0.328125, "grad_norm_var": 0.0006728490193684896, "learning_rate": 0.01, "loss": 1.4788, "loss/crossentropy": 2.7073339223861694, "loss/fcd": 1.31640625, "loss/logits": 0.3503541350364685, "step": 1670 }, { "epoch": 0.005306109488123968, "grad_norm": 0.34375, "grad_norm_var": 0.0006575902303059896, "learning_rate": 0.01, "loss": 1.4856, "loss/crossentropy": 2.67244553565979, "loss/fcd": 1.234375, "loss/logits": 0.3051234483718872, "step": 1671 }, { "epoch": 0.00530928489775181, "grad_norm": 0.53515625, "grad_norm_var": 0.003189531962076823, "learning_rate": 0.01, "loss": 1.4393, "loss/crossentropy": 2.2649386525154114, "loss/fcd": 1.40625, "loss/logits": 0.2690805494785309, "step": 1672 }, { "epoch": 0.005312460307379652, "grad_norm": 0.314453125, "grad_norm_var": 0.003065347671508789, "learning_rate": 0.01, "loss": 1.4452, "loss/crossentropy": 2.5465664863586426, "loss/fcd": 1.203125, "loss/logits": 0.29032717645168304, "step": 1673 }, { "epoch": 0.005315635717007494, "grad_norm": 0.3125, "grad_norm_var": 0.0031000773111979165, "learning_rate": 0.01, "loss": 1.4603, "loss/crossentropy": 2.3745211362838745, "loss/fcd": 1.1171875, "loss/logits": 0.2568344175815582, "step": 1674 }, { "epoch": 0.005318811126635336, "grad_norm": 0.35546875, "grad_norm_var": 0.002965275446573893, "learning_rate": 0.01, "loss": 1.4482, "loss/crossentropy": 3.004646897315979, "loss/fcd": 1.3125, "loss/logits": 0.32594217360019684, "step": 1675 }, { "epoch": 0.005321986536263178, "grad_norm": 0.34765625, "grad_norm_var": 0.002909199396769206, "learning_rate": 0.01, "loss": 1.4913, "loss/crossentropy": 2.687684178352356, "loss/fcd": 1.16015625, "loss/logits": 0.31805357336997986, "step": 1676 }, { "epoch": 0.00532516194589102, "grad_norm": 0.384765625, "grad_norm_var": 0.0029912153879801434, "learning_rate": 0.01, "loss": 1.4837, "loss/crossentropy": 2.8432259559631348, "loss/fcd": 1.2265625, "loss/logits": 0.2986297532916069, "step": 1677 }, { "epoch": 0.005328337355518862, "grad_norm": 0.455078125, "grad_norm_var": 0.0036096572875976562, "learning_rate": 0.01, "loss": 1.4049, "loss/crossentropy": 2.518496870994568, "loss/fcd": 1.10546875, "loss/logits": 0.24696458876132965, "step": 1678 }, { "epoch": 0.005331512765146704, "grad_norm": 0.330078125, "grad_norm_var": 0.003569523493448893, "learning_rate": 0.01, "loss": 1.4032, "loss/crossentropy": 2.3403525352478027, "loss/fcd": 1.08203125, "loss/logits": 0.25015346705913544, "step": 1679 }, { "epoch": 0.005334688174774546, "grad_norm": 0.2890625, "grad_norm_var": 0.003851811091105143, "learning_rate": 0.01, "loss": 1.3293, "loss/crossentropy": 2.7371069192886353, "loss/fcd": 1.125, "loss/logits": 0.26023803651332855, "step": 1680 }, { "epoch": 0.005337863584402388, "grad_norm": 0.357421875, "grad_norm_var": 0.0037947972615559895, "learning_rate": 0.01, "loss": 1.4967, "loss/crossentropy": 2.71319043636322, "loss/fcd": 1.19921875, "loss/logits": 0.2666834145784378, "step": 1681 }, { "epoch": 0.00534103899403023, "grad_norm": 0.3984375, "grad_norm_var": 0.0038621107737223308, "learning_rate": 0.01, "loss": 1.4708, "loss/crossentropy": 2.56120765209198, "loss/fcd": 1.1875, "loss/logits": 0.26970450580120087, "step": 1682 }, { "epoch": 0.005344214403658072, "grad_norm": 0.294921875, "grad_norm_var": 0.0038944085439046225, "learning_rate": 0.01, "loss": 1.4027, "loss/crossentropy": 2.4131473302841187, "loss/fcd": 1.1484375, "loss/logits": 0.2596333771944046, "step": 1683 }, { "epoch": 0.005347389813285914, "grad_norm": 0.3125, "grad_norm_var": 0.004027922948201497, "learning_rate": 0.01, "loss": 1.3924, "loss/crossentropy": 2.586424946784973, "loss/fcd": 1.1875, "loss/logits": 0.2779024466872215, "step": 1684 }, { "epoch": 0.005350565222913756, "grad_norm": 0.373046875, "grad_norm_var": 0.0040094852447509766, "learning_rate": 0.01, "loss": 1.4704, "loss/crossentropy": 2.0898656249046326, "loss/fcd": 1.2890625, "loss/logits": 0.30374136567115784, "step": 1685 }, { "epoch": 0.005353740632541598, "grad_norm": 0.310546875, "grad_norm_var": 0.004099464416503907, "learning_rate": 0.01, "loss": 1.3981, "loss/crossentropy": 2.5333709716796875, "loss/fcd": 1.1328125, "loss/logits": 0.25827865302562714, "step": 1686 }, { "epoch": 0.00535691604216944, "grad_norm": 0.302734375, "grad_norm_var": 0.004278039932250977, "learning_rate": 0.01, "loss": 1.3932, "loss/crossentropy": 2.3429975509643555, "loss/fcd": 1.1640625, "loss/logits": 0.2790260463953018, "step": 1687 }, { "epoch": 0.005360091451797282, "grad_norm": 0.3125, "grad_norm_var": 0.0020166873931884766, "learning_rate": 0.01, "loss": 1.4417, "loss/crossentropy": 2.6512221097946167, "loss/fcd": 1.29296875, "loss/logits": 0.32033616304397583, "step": 1688 }, { "epoch": 0.005363266861425124, "grad_norm": 0.349609375, "grad_norm_var": 0.0019709110260009766, "learning_rate": 0.01, "loss": 1.4066, "loss/crossentropy": 2.681312680244446, "loss/fcd": 1.17578125, "loss/logits": 0.24615877866744995, "step": 1689 }, { "epoch": 0.005366442271052966, "grad_norm": 0.27734375, "grad_norm_var": 0.0021906375885009767, "learning_rate": 0.01, "loss": 1.3822, "loss/crossentropy": 2.585557222366333, "loss/fcd": 1.2109375, "loss/logits": 0.2850920110940933, "step": 1690 }, { "epoch": 0.005369617680680808, "grad_norm": 0.287109375, "grad_norm_var": 0.0023480733235677082, "learning_rate": 0.01, "loss": 1.394, "loss/crossentropy": 2.431909918785095, "loss/fcd": 1.13671875, "loss/logits": 0.25125400722026825, "step": 1691 }, { "epoch": 0.00537279309030865, "grad_norm": 0.359375, "grad_norm_var": 0.0023742039998372396, "learning_rate": 0.01, "loss": 1.5235, "loss/crossentropy": 2.2795810103416443, "loss/fcd": 1.41796875, "loss/logits": 0.31847870349884033, "step": 1692 }, { "epoch": 0.005375968499936492, "grad_norm": 0.291015625, "grad_norm_var": 0.0023284276326497396, "learning_rate": 0.01, "loss": 1.4161, "loss/crossentropy": 2.4815754890441895, "loss/fcd": 1.06640625, "loss/logits": 0.2535252794623375, "step": 1693 }, { "epoch": 0.005379143909564334, "grad_norm": 0.310546875, "grad_norm_var": 0.0012486775716145833, "learning_rate": 0.01, "loss": 1.4761, "loss/crossentropy": 2.5222415924072266, "loss/fcd": 1.19140625, "loss/logits": 0.2586047351360321, "step": 1694 }, { "epoch": 0.005382319319192176, "grad_norm": 0.33203125, "grad_norm_var": 0.0012509504954020183, "learning_rate": 0.01, "loss": 1.4425, "loss/crossentropy": 2.4645133018493652, "loss/fcd": 1.0703125, "loss/logits": 0.24325621128082275, "step": 1695 }, { "epoch": 0.005385494728820018, "grad_norm": 0.3515625, "grad_norm_var": 0.0012173811594645182, "learning_rate": 0.01, "loss": 1.4809, "loss/crossentropy": 2.5719006061553955, "loss/fcd": 1.23046875, "loss/logits": 0.27312563359737396, "step": 1696 }, { "epoch": 0.00538867013844786, "grad_norm": 0.330078125, "grad_norm_var": 0.0011506239573160807, "learning_rate": 0.01, "loss": 1.3737, "loss/crossentropy": 2.653048872947693, "loss/fcd": 1.125, "loss/logits": 0.24208035320043564, "step": 1697 }, { "epoch": 0.005391845548075702, "grad_norm": 0.404296875, "grad_norm_var": 0.001210467020670573, "learning_rate": 0.01, "loss": 1.463, "loss/crossentropy": 2.6999279260635376, "loss/fcd": 1.20703125, "loss/logits": 0.27425508201122284, "step": 1698 }, { "epoch": 0.005395020957703544, "grad_norm": 0.32421875, "grad_norm_var": 0.0011468092600504557, "learning_rate": 0.01, "loss": 1.488, "loss/crossentropy": 2.7059956789016724, "loss/fcd": 1.30078125, "loss/logits": 0.3632088750600815, "step": 1699 }, { "epoch": 0.005398196367331386, "grad_norm": 0.33203125, "grad_norm_var": 0.0011334578196207682, "learning_rate": 0.01, "loss": 1.4423, "loss/crossentropy": 2.3963496685028076, "loss/fcd": 1.21484375, "loss/logits": 0.2830388844013214, "step": 1700 }, { "epoch": 0.005401371776959228, "grad_norm": 0.310546875, "grad_norm_var": 0.0010022322336832683, "learning_rate": 0.01, "loss": 1.3827, "loss/crossentropy": 2.047423481941223, "loss/fcd": 1.13671875, "loss/logits": 0.21973995119333267, "step": 1701 }, { "epoch": 0.00540454718658707, "grad_norm": 0.546875, "grad_norm_var": 0.004065958658854166, "learning_rate": 0.01, "loss": 1.5554, "loss/crossentropy": 2.1700942516326904, "loss/fcd": 1.45703125, "loss/logits": 0.2812206596136093, "step": 1702 }, { "epoch": 0.005407722596214912, "grad_norm": 0.28515625, "grad_norm_var": 0.004169956843058268, "learning_rate": 0.01, "loss": 1.3929, "loss/crossentropy": 2.501310110092163, "loss/fcd": 1.25, "loss/logits": 0.2961071729660034, "step": 1703 }, { "epoch": 0.005410898005842753, "grad_norm": 0.318359375, "grad_norm_var": 0.004152361551920573, "learning_rate": 0.01, "loss": 1.4114, "loss/crossentropy": 2.4649428129196167, "loss/fcd": 1.1640625, "loss/logits": 0.27528275549411774, "step": 1704 }, { "epoch": 0.005414073415470596, "grad_norm": 0.33203125, "grad_norm_var": 0.004144779841105143, "learning_rate": 0.01, "loss": 1.4381, "loss/crossentropy": 2.5406960248947144, "loss/fcd": 1.2109375, "loss/logits": 0.2837842106819153, "step": 1705 }, { "epoch": 0.005417248825098438, "grad_norm": 0.330078125, "grad_norm_var": 0.003898874918619792, "learning_rate": 0.01, "loss": 1.4158, "loss/crossentropy": 2.546883702278137, "loss/fcd": 1.0625, "loss/logits": 0.23201091587543488, "step": 1706 }, { "epoch": 0.005420424234726279, "grad_norm": 0.314453125, "grad_norm_var": 0.0037515640258789064, "learning_rate": 0.01, "loss": 1.4305, "loss/crossentropy": 2.424559712409973, "loss/fcd": 1.0625, "loss/logits": 0.2391505390405655, "step": 1707 }, { "epoch": 0.005423599644354122, "grad_norm": 0.306640625, "grad_norm_var": 0.0038034915924072266, "learning_rate": 0.01, "loss": 1.4521, "loss/crossentropy": 2.403588056564331, "loss/fcd": 1.15234375, "loss/logits": 0.27822640538215637, "step": 1708 }, { "epoch": 0.005426775053981964, "grad_norm": 0.3671875, "grad_norm_var": 0.003681373596191406, "learning_rate": 0.01, "loss": 1.4814, "loss/crossentropy": 2.458814263343811, "loss/fcd": 1.234375, "loss/logits": 0.31624844670295715, "step": 1709 }, { "epoch": 0.005429950463609805, "grad_norm": 0.291015625, "grad_norm_var": 0.003791046142578125, "learning_rate": 0.01, "loss": 1.3883, "loss/crossentropy": 2.6120320558547974, "loss/fcd": 1.15234375, "loss/logits": 0.2791610509157181, "step": 1710 }, { "epoch": 0.005433125873237648, "grad_norm": 0.34765625, "grad_norm_var": 0.003784942626953125, "learning_rate": 0.01, "loss": 1.4002, "loss/crossentropy": 2.7802462577819824, "loss/fcd": 1.1875, "loss/logits": 0.27945904433727264, "step": 1711 }, { "epoch": 0.00543630128286549, "grad_norm": 0.3046875, "grad_norm_var": 0.003870391845703125, "learning_rate": 0.01, "loss": 1.4076, "loss/crossentropy": 2.5719646215438843, "loss/fcd": 1.22265625, "loss/logits": 0.26753246784210205, "step": 1712 }, { "epoch": 0.005439476692493331, "grad_norm": 0.326171875, "grad_norm_var": 0.0038766860961914062, "learning_rate": 0.01, "loss": 1.4311, "loss/crossentropy": 2.529703378677368, "loss/fcd": 1.19921875, "loss/logits": 0.2710454761981964, "step": 1713 }, { "epoch": 0.005442652102121174, "grad_norm": 0.3203125, "grad_norm_var": 0.003598515192667643, "learning_rate": 0.01, "loss": 1.451, "loss/crossentropy": 2.42561936378479, "loss/fcd": 1.17578125, "loss/logits": 0.26852506399154663, "step": 1714 }, { "epoch": 0.005445827511749016, "grad_norm": 0.306640625, "grad_norm_var": 0.0036427179972330728, "learning_rate": 0.01, "loss": 1.4628, "loss/crossentropy": 2.636641263961792, "loss/fcd": 1.16796875, "loss/logits": 0.2778806686401367, "step": 1715 }, { "epoch": 0.005449002921376857, "grad_norm": 0.345703125, "grad_norm_var": 0.003651285171508789, "learning_rate": 0.01, "loss": 1.4925, "loss/crossentropy": 2.2933279275894165, "loss/fcd": 1.33984375, "loss/logits": 0.3443090617656708, "step": 1716 }, { "epoch": 0.0054521783310047, "grad_norm": 0.33203125, "grad_norm_var": 0.0036112467447916665, "learning_rate": 0.01, "loss": 1.456, "loss/crossentropy": 2.760439395904541, "loss/fcd": 1.2109375, "loss/logits": 0.3025214374065399, "step": 1717 }, { "epoch": 0.005455353740632542, "grad_norm": 0.310546875, "grad_norm_var": 0.0004552046457926432, "learning_rate": 0.01, "loss": 1.4041, "loss/crossentropy": 2.3678133487701416, "loss/fcd": 1.1171875, "loss/logits": 0.2342669665813446, "step": 1718 }, { "epoch": 0.005458529150260383, "grad_norm": 0.318359375, "grad_norm_var": 0.00036468505859375, "learning_rate": 0.01, "loss": 1.4218, "loss/crossentropy": 2.551934003829956, "loss/fcd": 1.2578125, "loss/logits": 0.2777518928050995, "step": 1719 }, { "epoch": 0.005461704559888226, "grad_norm": 0.32421875, "grad_norm_var": 0.00036301612854003904, "learning_rate": 0.01, "loss": 1.4544, "loss/crossentropy": 2.7763537168502808, "loss/fcd": 1.1796875, "loss/logits": 0.25872863829135895, "step": 1720 }, { "epoch": 0.005464879969516068, "grad_norm": 0.30859375, "grad_norm_var": 0.0003710269927978516, "learning_rate": 0.01, "loss": 1.4345, "loss/crossentropy": 2.6043232679367065, "loss/fcd": 1.15234375, "loss/logits": 0.2824757546186447, "step": 1721 }, { "epoch": 0.005468055379143909, "grad_norm": 0.337890625, "grad_norm_var": 0.0003831068674723307, "learning_rate": 0.01, "loss": 1.448, "loss/crossentropy": 2.5804646015167236, "loss/fcd": 1.109375, "loss/logits": 0.24043309688568115, "step": 1722 }, { "epoch": 0.005471230788771752, "grad_norm": 0.40234375, "grad_norm_var": 0.0007700602213541667, "learning_rate": 0.01, "loss": 1.5672, "loss/crossentropy": 2.6391921043395996, "loss/fcd": 1.171875, "loss/logits": 0.2845745086669922, "step": 1723 }, { "epoch": 0.005474406198399594, "grad_norm": 0.3046875, "grad_norm_var": 0.0007758935292561849, "learning_rate": 0.01, "loss": 1.4142, "loss/crossentropy": 2.565014958381653, "loss/fcd": 1.140625, "loss/logits": 0.2756178379058838, "step": 1724 }, { "epoch": 0.005477581608027435, "grad_norm": 0.365234375, "grad_norm_var": 0.0007659276326497396, "learning_rate": 0.01, "loss": 1.5238, "loss/crossentropy": 2.4041879177093506, "loss/fcd": 1.23828125, "loss/logits": 0.3049192577600479, "step": 1725 }, { "epoch": 0.005480757017655278, "grad_norm": 0.298828125, "grad_norm_var": 0.0007313410441080729, "learning_rate": 0.01, "loss": 1.4032, "loss/crossentropy": 2.5784353017807007, "loss/fcd": 1.109375, "loss/logits": 0.25584734976291656, "step": 1726 }, { "epoch": 0.00548393242728312, "grad_norm": 0.41015625, "grad_norm_var": 0.0011362075805664063, "learning_rate": 0.01, "loss": 1.4764, "loss/crossentropy": 2.5257431268692017, "loss/fcd": 1.19140625, "loss/logits": 0.283470556139946, "step": 1727 }, { "epoch": 0.005487107836910961, "grad_norm": 0.310546875, "grad_norm_var": 0.001116800308227539, "learning_rate": 0.01, "loss": 1.3884, "loss/crossentropy": 2.555391788482666, "loss/fcd": 1.08203125, "loss/logits": 0.24228078871965408, "step": 1728 }, { "epoch": 0.005490283246538804, "grad_norm": 0.322265625, "grad_norm_var": 0.0011211236317952473, "learning_rate": 0.01, "loss": 1.4546, "loss/crossentropy": 2.615530848503113, "loss/fcd": 1.19921875, "loss/logits": 0.2692243903875351, "step": 1729 }, { "epoch": 0.005493458656166646, "grad_norm": 0.396484375, "grad_norm_var": 0.0013610204060872397, "learning_rate": 0.01, "loss": 1.4323, "loss/crossentropy": 2.732365846633911, "loss/fcd": 1.17578125, "loss/logits": 0.26883548498153687, "step": 1730 }, { "epoch": 0.005496634065794487, "grad_norm": 0.328125, "grad_norm_var": 0.0013024489084879558, "learning_rate": 0.01, "loss": 1.4361, "loss/crossentropy": 2.6009198427200317, "loss/fcd": 1.19921875, "loss/logits": 0.27197110652923584, "step": 1731 }, { "epoch": 0.00549980947542233, "grad_norm": 0.31640625, "grad_norm_var": 0.001327959696451823, "learning_rate": 0.01, "loss": 1.4188, "loss/crossentropy": 2.5613378286361694, "loss/fcd": 1.1171875, "loss/logits": 0.26026083528995514, "step": 1732 }, { "epoch": 0.005502984885050172, "grad_norm": 0.35546875, "grad_norm_var": 0.001347796122233073, "learning_rate": 0.01, "loss": 1.4315, "loss/crossentropy": 2.5779001712799072, "loss/fcd": 1.1796875, "loss/logits": 0.26903483271598816, "step": 1733 }, { "epoch": 0.005506160294678013, "grad_norm": 0.30859375, "grad_norm_var": 0.0013552188873291015, "learning_rate": 0.01, "loss": 1.4371, "loss/crossentropy": 2.389364004135132, "loss/fcd": 1.24609375, "loss/logits": 0.29791559278964996, "step": 1734 }, { "epoch": 0.005509335704305856, "grad_norm": 0.3125, "grad_norm_var": 0.0013727188110351563, "learning_rate": 0.01, "loss": 1.4381, "loss/crossentropy": 2.7403628826141357, "loss/fcd": 1.19921875, "loss/logits": 0.2883763909339905, "step": 1735 }, { "epoch": 0.005512511113933698, "grad_norm": 0.296875, "grad_norm_var": 0.0014684041341145833, "learning_rate": 0.01, "loss": 1.3872, "loss/crossentropy": 2.144510507583618, "loss/fcd": 1.08203125, "loss/logits": 0.2272828221321106, "step": 1736 }, { "epoch": 0.005515686523561539, "grad_norm": 0.314453125, "grad_norm_var": 0.0014491875966389973, "learning_rate": 0.01, "loss": 1.4759, "loss/crossentropy": 2.6428866386413574, "loss/fcd": 1.171875, "loss/logits": 0.266600601375103, "step": 1737 }, { "epoch": 0.005518861933189382, "grad_norm": 0.322265625, "grad_norm_var": 0.0014611403147379556, "learning_rate": 0.01, "loss": 1.4116, "loss/crossentropy": 2.5943862199783325, "loss/fcd": 1.14453125, "loss/logits": 0.24540768563747406, "step": 1738 }, { "epoch": 0.005522037342817223, "grad_norm": 0.32421875, "grad_norm_var": 0.0011445204416910806, "learning_rate": 0.01, "loss": 1.4416, "loss/crossentropy": 2.456264615058899, "loss/fcd": 1.16796875, "loss/logits": 0.2981448918581009, "step": 1739 }, { "epoch": 0.005525212752445065, "grad_norm": 0.330078125, "grad_norm_var": 0.0010976155598958334, "learning_rate": 0.01, "loss": 1.4568, "loss/crossentropy": 2.601453423500061, "loss/fcd": 1.2109375, "loss/logits": 0.2525002658367157, "step": 1740 }, { "epoch": 0.005528388162072908, "grad_norm": 0.353515625, "grad_norm_var": 0.0010543187459309896, "learning_rate": 0.01, "loss": 1.5383, "loss/crossentropy": 2.8400700092315674, "loss/fcd": 1.29296875, "loss/logits": 0.3341517448425293, "step": 1741 }, { "epoch": 0.005531563571700749, "grad_norm": 0.322265625, "grad_norm_var": 0.0009871800740559897, "learning_rate": 0.01, "loss": 1.3829, "loss/crossentropy": 2.401926636695862, "loss/fcd": 1.06640625, "loss/logits": 0.25453901290893555, "step": 1742 }, { "epoch": 0.005534738981328591, "grad_norm": 0.322265625, "grad_norm_var": 0.0005630334218343099, "learning_rate": 0.01, "loss": 1.3835, "loss/crossentropy": 2.515135884284973, "loss/fcd": 1.1171875, "loss/logits": 0.24854443967342377, "step": 1743 }, { "epoch": 0.005537914390956434, "grad_norm": 0.279296875, "grad_norm_var": 0.0006937503814697266, "learning_rate": 0.01, "loss": 1.3946, "loss/crossentropy": 2.5817352533340454, "loss/fcd": 1.2578125, "loss/logits": 0.34575067460536957, "step": 1744 }, { "epoch": 0.005541089800584275, "grad_norm": 0.3828125, "grad_norm_var": 0.0008982340494791667, "learning_rate": 0.01, "loss": 1.6368, "loss/crossentropy": 2.6690667867660522, "loss/fcd": 1.27734375, "loss/logits": 0.30611903965473175, "step": 1745 }, { "epoch": 0.005544265210212117, "grad_norm": 0.345703125, "grad_norm_var": 0.0006031672159830729, "learning_rate": 0.01, "loss": 1.4601, "loss/crossentropy": 2.4227917194366455, "loss/fcd": 1.1953125, "loss/logits": 0.2583349347114563, "step": 1746 }, { "epoch": 0.00554744061983996, "grad_norm": 0.353515625, "grad_norm_var": 0.0006508986155192057, "learning_rate": 0.01, "loss": 1.5448, "loss/crossentropy": 2.677427053451538, "loss/fcd": 1.19140625, "loss/logits": 0.26189327239990234, "step": 1747 }, { "epoch": 0.005550616029467801, "grad_norm": 0.306640625, "grad_norm_var": 0.0006713231404622395, "learning_rate": 0.01, "loss": 1.4029, "loss/crossentropy": 2.6272119283676147, "loss/fcd": 1.203125, "loss/logits": 0.2734217494726181, "step": 1748 }, { "epoch": 0.005553791439095643, "grad_norm": 0.291015625, "grad_norm_var": 0.0006854852040608724, "learning_rate": 0.01, "loss": 1.3881, "loss/crossentropy": 2.4777952432632446, "loss/fcd": 1.1171875, "loss/logits": 0.2540733814239502, "step": 1749 }, { "epoch": 0.005556966848723486, "grad_norm": 0.337890625, "grad_norm_var": 0.0006833394368489583, "learning_rate": 0.01, "loss": 1.4571, "loss/crossentropy": 2.485927700996399, "loss/fcd": 1.21875, "loss/logits": 0.26738201081752777, "step": 1750 }, { "epoch": 0.005560142258351327, "grad_norm": 0.361328125, "grad_norm_var": 0.0007528781890869141, "learning_rate": 0.01, "loss": 1.4618, "loss/crossentropy": 2.528354287147522, "loss/fcd": 1.12109375, "loss/logits": 0.244489423930645, "step": 1751 }, { "epoch": 0.005563317667979169, "grad_norm": 0.322265625, "grad_norm_var": 0.000688616434733073, "learning_rate": 0.01, "loss": 1.4123, "loss/crossentropy": 2.6144312620162964, "loss/fcd": 1.234375, "loss/logits": 0.29583021998405457, "step": 1752 }, { "epoch": 0.005566493077607012, "grad_norm": 0.310546875, "grad_norm_var": 0.00069732666015625, "learning_rate": 0.01, "loss": 1.4294, "loss/crossentropy": 2.4732394218444824, "loss/fcd": 1.125, "loss/logits": 0.2591251879930496, "step": 1753 }, { "epoch": 0.005569668487234853, "grad_norm": 0.341796875, "grad_norm_var": 0.0007033665974934896, "learning_rate": 0.01, "loss": 1.4353, "loss/crossentropy": 2.5699442625045776, "loss/fcd": 1.171875, "loss/logits": 0.26918888092041016, "step": 1754 }, { "epoch": 0.005572843896862695, "grad_norm": 0.33203125, "grad_norm_var": 0.0007008234659830729, "learning_rate": 0.01, "loss": 1.4922, "loss/crossentropy": 2.6360515356063843, "loss/fcd": 1.2109375, "loss/logits": 0.299432635307312, "step": 1755 }, { "epoch": 0.005576019306490538, "grad_norm": 0.3046875, "grad_norm_var": 0.0007435957590738933, "learning_rate": 0.01, "loss": 1.4021, "loss/crossentropy": 2.593457579612732, "loss/fcd": 1.18359375, "loss/logits": 0.30770231783390045, "step": 1756 }, { "epoch": 0.005579194716118379, "grad_norm": 0.35546875, "grad_norm_var": 0.0007501602172851563, "learning_rate": 0.01, "loss": 1.4854, "loss/crossentropy": 2.7476800680160522, "loss/fcd": 1.16796875, "loss/logits": 0.28153106570243835, "step": 1757 }, { "epoch": 0.005582370125746221, "grad_norm": 0.29296875, "grad_norm_var": 0.0008314609527587891, "learning_rate": 0.01, "loss": 1.3644, "loss/crossentropy": 2.2167781591415405, "loss/fcd": 1.10546875, "loss/logits": 0.2506364434957504, "step": 1758 }, { "epoch": 0.005585545535374064, "grad_norm": 0.361328125, "grad_norm_var": 0.0008994897206624348, "learning_rate": 0.01, "loss": 1.5187, "loss/crossentropy": 2.6545130014419556, "loss/fcd": 1.21875, "loss/logits": 0.2899438142776489, "step": 1759 }, { "epoch": 0.005588720945001905, "grad_norm": 0.30078125, "grad_norm_var": 0.0007832209269205729, "learning_rate": 0.01, "loss": 1.3859, "loss/crossentropy": 2.4758235216140747, "loss/fcd": 1.0859375, "loss/logits": 0.2479354664683342, "step": 1760 }, { "epoch": 0.005591896354629747, "grad_norm": 0.32421875, "grad_norm_var": 0.0005953470865885417, "learning_rate": 0.01, "loss": 1.5242, "loss/crossentropy": 2.547168731689453, "loss/fcd": 1.2109375, "loss/logits": 0.29265573620796204, "step": 1761 }, { "epoch": 0.00559507176425759, "grad_norm": 0.31640625, "grad_norm_var": 0.0005784193674723307, "learning_rate": 0.01, "loss": 1.4448, "loss/crossentropy": 2.6404765844345093, "loss/fcd": 1.1796875, "loss/logits": 0.27990569174289703, "step": 1762 }, { "epoch": 0.005598247173885431, "grad_norm": 0.314453125, "grad_norm_var": 0.0005294640858968098, "learning_rate": 0.01, "loss": 1.4582, "loss/crossentropy": 2.79376220703125, "loss/fcd": 1.23828125, "loss/logits": 0.31613337993621826, "step": 1763 }, { "epoch": 0.005601422583513273, "grad_norm": 0.287109375, "grad_norm_var": 0.0005968570709228516, "learning_rate": 0.01, "loss": 1.3922, "loss/crossentropy": 2.4597020149230957, "loss/fcd": 1.12109375, "loss/logits": 0.266130268573761, "step": 1764 }, { "epoch": 0.005604597993141116, "grad_norm": 0.306640625, "grad_norm_var": 0.0005472660064697266, "learning_rate": 0.01, "loss": 1.455, "loss/crossentropy": 2.216887950897217, "loss/fcd": 1.033203125, "loss/logits": 0.23117846250534058, "step": 1765 }, { "epoch": 0.005607773402768957, "grad_norm": 0.287109375, "grad_norm_var": 0.0006084283192952474, "learning_rate": 0.01, "loss": 1.358, "loss/crossentropy": 2.5979079008102417, "loss/fcd": 1.140625, "loss/logits": 0.2564357668161392, "step": 1766 }, { "epoch": 0.005610948812396799, "grad_norm": 0.314453125, "grad_norm_var": 0.0004871209462483724, "learning_rate": 0.01, "loss": 1.4687, "loss/crossentropy": 2.5373899936676025, "loss/fcd": 1.2578125, "loss/logits": 0.3092518597841263, "step": 1767 }, { "epoch": 0.005614124222024642, "grad_norm": 0.3359375, "grad_norm_var": 0.0005083719889322917, "learning_rate": 0.01, "loss": 1.4402, "loss/crossentropy": 2.737550139427185, "loss/fcd": 1.25, "loss/logits": 0.3238609582185745, "step": 1768 }, { "epoch": 0.005617299631652483, "grad_norm": 0.404296875, "grad_norm_var": 0.0009661356608072917, "learning_rate": 0.01, "loss": 1.4656, "loss/crossentropy": 2.320305824279785, "loss/fcd": 1.10546875, "loss/logits": 0.2672227919101715, "step": 1769 }, { "epoch": 0.005620475041280325, "grad_norm": 0.30859375, "grad_norm_var": 0.0009550571441650391, "learning_rate": 0.01, "loss": 1.4055, "loss/crossentropy": 2.4491394758224487, "loss/fcd": 1.16796875, "loss/logits": 0.2427944391965866, "step": 1770 }, { "epoch": 0.005623650450908168, "grad_norm": 0.34765625, "grad_norm_var": 0.0009919325510660806, "learning_rate": 0.01, "loss": 1.3954, "loss/crossentropy": 2.568453550338745, "loss/fcd": 1.123046875, "loss/logits": 0.2631102353334427, "step": 1771 }, { "epoch": 0.005626825860536009, "grad_norm": 0.33984375, "grad_norm_var": 0.0009850660959879558, "learning_rate": 0.01, "loss": 1.4726, "loss/crossentropy": 2.622509002685547, "loss/fcd": 1.14453125, "loss/logits": 0.2704010158777237, "step": 1772 }, { "epoch": 0.005630001270163851, "grad_norm": 0.322265625, "grad_norm_var": 0.0009183247884114583, "learning_rate": 0.01, "loss": 1.4552, "loss/crossentropy": 2.6154658794403076, "loss/fcd": 1.171875, "loss/logits": 0.2713060677051544, "step": 1773 }, { "epoch": 0.005633176679791693, "grad_norm": 0.298828125, "grad_norm_var": 0.0008972009023030599, "learning_rate": 0.01, "loss": 1.3681, "loss/crossentropy": 2.4632482528686523, "loss/fcd": 1.109375, "loss/logits": 0.24348068237304688, "step": 1774 }, { "epoch": 0.005636352089419535, "grad_norm": 0.388671875, "grad_norm_var": 0.001083230972290039, "learning_rate": 0.01, "loss": 1.4567, "loss/crossentropy": 2.5745131969451904, "loss/fcd": 1.140625, "loss/logits": 0.25846195220947266, "step": 1775 }, { "epoch": 0.005639527499047377, "grad_norm": 0.310546875, "grad_norm_var": 0.001057879130045573, "learning_rate": 0.01, "loss": 1.3887, "loss/crossentropy": 2.48404324054718, "loss/fcd": 1.13671875, "loss/logits": 0.2579190582036972, "step": 1776 }, { "epoch": 0.005642702908675219, "grad_norm": 0.306640625, "grad_norm_var": 0.0010800520579020182, "learning_rate": 0.01, "loss": 1.4026, "loss/crossentropy": 2.72410786151886, "loss/fcd": 1.1640625, "loss/logits": 0.2851269245147705, "step": 1777 }, { "epoch": 0.005645878318303061, "grad_norm": 0.31640625, "grad_norm_var": 0.0010800520579020182, "learning_rate": 0.01, "loss": 1.4398, "loss/crossentropy": 2.3883779048919678, "loss/fcd": 1.11328125, "loss/logits": 0.24898535758256912, "step": 1778 }, { "epoch": 0.005649053727930903, "grad_norm": 0.3046875, "grad_norm_var": 0.0010988871256510417, "learning_rate": 0.01, "loss": 1.4443, "loss/crossentropy": 2.6218948364257812, "loss/fcd": 1.16796875, "loss/logits": 0.2713538631796837, "step": 1779 }, { "epoch": 0.005652229137558745, "grad_norm": 0.357421875, "grad_norm_var": 0.0010645548502604167, "learning_rate": 0.01, "loss": 1.4676, "loss/crossentropy": 2.4276747703552246, "loss/fcd": 1.22265625, "loss/logits": 0.2675796151161194, "step": 1780 }, { "epoch": 0.005655404547186587, "grad_norm": 0.333984375, "grad_norm_var": 0.0010329564412434897, "learning_rate": 0.01, "loss": 1.378, "loss/crossentropy": 2.608278751373291, "loss/fcd": 1.109375, "loss/logits": 0.26241108030080795, "step": 1781 }, { "epoch": 0.005658579956814429, "grad_norm": 0.31640625, "grad_norm_var": 0.0009197076161702474, "learning_rate": 0.01, "loss": 1.4577, "loss/crossentropy": 2.4784488677978516, "loss/fcd": 1.2265625, "loss/logits": 0.28565794229507446, "step": 1782 }, { "epoch": 0.005661755366442271, "grad_norm": 0.333984375, "grad_norm_var": 0.0008987267812093099, "learning_rate": 0.01, "loss": 1.4013, "loss/crossentropy": 2.6598095893859863, "loss/fcd": 1.10546875, "loss/logits": 0.2459757700562477, "step": 1783 }, { "epoch": 0.005664930776070113, "grad_norm": 0.30859375, "grad_norm_var": 0.0009343306223551433, "learning_rate": 0.01, "loss": 1.4271, "loss/crossentropy": 2.594589591026306, "loss/fcd": 1.1796875, "loss/logits": 0.28594980388879776, "step": 1784 }, { "epoch": 0.005668106185697955, "grad_norm": 0.33203125, "grad_norm_var": 0.000556182861328125, "learning_rate": 0.01, "loss": 1.4279, "loss/crossentropy": 2.770109534263611, "loss/fcd": 1.18359375, "loss/logits": 0.26183685660362244, "step": 1785 }, { "epoch": 0.005671281595325797, "grad_norm": 0.333984375, "grad_norm_var": 0.0005353132883707683, "learning_rate": 0.01, "loss": 1.415, "loss/crossentropy": 2.6016682386398315, "loss/fcd": 1.1796875, "loss/logits": 0.2504027262330055, "step": 1786 }, { "epoch": 0.005674457004953639, "grad_norm": 0.48828125, "grad_norm_var": 0.002135197321573893, "learning_rate": 0.01, "loss": 1.4434, "loss/crossentropy": 2.6368253231048584, "loss/fcd": 1.13671875, "loss/logits": 0.249956913292408, "step": 1787 }, { "epoch": 0.005677632414581481, "grad_norm": 0.328125, "grad_norm_var": 0.0021393934885660807, "learning_rate": 0.01, "loss": 1.373, "loss/crossentropy": 2.4316627979278564, "loss/fcd": 1.09375, "loss/logits": 0.23616923391819, "step": 1788 }, { "epoch": 0.005680807824209323, "grad_norm": 0.3984375, "grad_norm_var": 0.002359453837076823, "learning_rate": 0.01, "loss": 1.4882, "loss/crossentropy": 2.5976154804229736, "loss/fcd": 1.26171875, "loss/logits": 0.2963321655988693, "step": 1789 }, { "epoch": 0.005683983233837165, "grad_norm": 0.337890625, "grad_norm_var": 0.0022348403930664063, "learning_rate": 0.01, "loss": 1.4683, "loss/crossentropy": 2.79253613948822, "loss/fcd": 1.234375, "loss/logits": 0.29627932608127594, "step": 1790 }, { "epoch": 0.005687158643465007, "grad_norm": 0.318359375, "grad_norm_var": 0.0021203994750976563, "learning_rate": 0.01, "loss": 1.3775, "loss/crossentropy": 2.5644136667251587, "loss/fcd": 1.08203125, "loss/logits": 0.2524941936135292, "step": 1791 }, { "epoch": 0.005690334053092849, "grad_norm": 0.31640625, "grad_norm_var": 0.002100229263305664, "learning_rate": 0.01, "loss": 1.4609, "loss/crossentropy": 2.674256682395935, "loss/fcd": 1.2265625, "loss/logits": 0.28680163621902466, "step": 1792 }, { "epoch": 0.005693509462720691, "grad_norm": 0.38671875, "grad_norm_var": 0.002150408426920573, "learning_rate": 0.01, "loss": 1.4311, "loss/crossentropy": 2.670706033706665, "loss/fcd": 1.19921875, "loss/logits": 0.29848531633615494, "step": 1793 }, { "epoch": 0.005696684872348533, "grad_norm": 0.37109375, "grad_norm_var": 0.002132606506347656, "learning_rate": 0.01, "loss": 1.4451, "loss/crossentropy": 2.1028560996055603, "loss/fcd": 1.0078125, "loss/logits": 0.2205147072672844, "step": 1794 }, { "epoch": 0.005699860281976375, "grad_norm": 0.326171875, "grad_norm_var": 0.002037668228149414, "learning_rate": 0.01, "loss": 1.3913, "loss/crossentropy": 2.4641966819763184, "loss/fcd": 1.1484375, "loss/logits": 0.2497178539633751, "step": 1795 }, { "epoch": 0.005703035691604217, "grad_norm": 0.32421875, "grad_norm_var": 0.0020703633626302084, "learning_rate": 0.01, "loss": 1.442, "loss/crossentropy": 2.6268848180770874, "loss/fcd": 1.12890625, "loss/logits": 0.25104305148124695, "step": 1796 }, { "epoch": 0.005706211101232059, "grad_norm": 0.337890625, "grad_norm_var": 0.0020644505818684894, "learning_rate": 0.01, "loss": 1.4589, "loss/crossentropy": 2.411762833595276, "loss/fcd": 1.14453125, "loss/logits": 0.2601100206375122, "step": 1797 }, { "epoch": 0.005709386510859901, "grad_norm": 0.349609375, "grad_norm_var": 0.0019960880279541017, "learning_rate": 0.01, "loss": 1.4632, "loss/crossentropy": 2.233980894088745, "loss/fcd": 1.1484375, "loss/logits": 0.28234755992889404, "step": 1798 }, { "epoch": 0.005712561920487743, "grad_norm": 0.5, "grad_norm_var": 0.0033754984537760417, "learning_rate": 0.01, "loss": 1.5118, "loss/crossentropy": 2.444004774093628, "loss/fcd": 1.0859375, "loss/logits": 0.2583284080028534, "step": 1799 }, { "epoch": 0.005715737330115585, "grad_norm": 0.322265625, "grad_norm_var": 0.0032937208811442058, "learning_rate": 0.01, "loss": 1.4205, "loss/crossentropy": 2.7039201259613037, "loss/fcd": 1.1796875, "loss/logits": 0.25464266538619995, "step": 1800 }, { "epoch": 0.005718912739743427, "grad_norm": 0.34375, "grad_norm_var": 0.0032574812571207683, "learning_rate": 0.01, "loss": 1.459, "loss/crossentropy": 2.601548433303833, "loss/fcd": 1.2265625, "loss/logits": 0.2900586724281311, "step": 1801 }, { "epoch": 0.005722088149371269, "grad_norm": 0.318359375, "grad_norm_var": 0.0033299605051676433, "learning_rate": 0.01, "loss": 1.4418, "loss/crossentropy": 2.3790465593338013, "loss/fcd": 1.1171875, "loss/logits": 0.2653738558292389, "step": 1802 }, { "epoch": 0.005725263558999111, "grad_norm": 0.353515625, "grad_norm_var": 0.0021685282389322918, "learning_rate": 0.01, "loss": 1.4533, "loss/crossentropy": 2.5780253410339355, "loss/fcd": 1.20703125, "loss/logits": 0.28444628417491913, "step": 1803 }, { "epoch": 0.005728438968626953, "grad_norm": 0.33203125, "grad_norm_var": 0.0021570205688476564, "learning_rate": 0.01, "loss": 1.4383, "loss/crossentropy": 2.368231177330017, "loss/fcd": 1.2421875, "loss/logits": 0.27523526549339294, "step": 1804 }, { "epoch": 0.005731614378254795, "grad_norm": 0.37890625, "grad_norm_var": 0.002060699462890625, "learning_rate": 0.01, "loss": 1.4646, "loss/crossentropy": 2.751879096031189, "loss/fcd": 1.18359375, "loss/logits": 0.28096291422843933, "step": 1805 }, { "epoch": 0.005734789787882637, "grad_norm": 0.35546875, "grad_norm_var": 0.002049112319946289, "learning_rate": 0.01, "loss": 1.4802, "loss/crossentropy": 2.4835305213928223, "loss/fcd": 1.13671875, "loss/logits": 0.2648111432790756, "step": 1806 }, { "epoch": 0.005737965197510479, "grad_norm": 0.330078125, "grad_norm_var": 0.002004861831665039, "learning_rate": 0.01, "loss": 1.4494, "loss/crossentropy": 2.4140182733535767, "loss/fcd": 1.125, "loss/logits": 0.2506559416651726, "step": 1807 }, { "epoch": 0.005741140607138321, "grad_norm": 0.31640625, "grad_norm_var": 0.002004861831665039, "learning_rate": 0.01, "loss": 1.4553, "loss/crossentropy": 2.817805290222168, "loss/fcd": 1.17578125, "loss/logits": 0.2724747955799103, "step": 1808 }, { "epoch": 0.005744316016766162, "grad_norm": 0.88671875, "grad_norm_var": 0.019884093602498373, "learning_rate": 0.01, "loss": 1.4424, "loss/crossentropy": 2.4607781171798706, "loss/fcd": 1.1484375, "loss/logits": 0.2550867795944214, "step": 1809 }, { "epoch": 0.005747491426394005, "grad_norm": 0.984375, "grad_norm_var": 0.04232316017150879, "learning_rate": 0.01, "loss": 1.3961, "loss/crossentropy": 2.5198408365249634, "loss/fcd": 1.078125, "loss/logits": 0.25158926844596863, "step": 1810 }, { "epoch": 0.005750666836021847, "grad_norm": 0.3984375, "grad_norm_var": 0.041721534729003903, "learning_rate": 0.01, "loss": 1.5471, "loss/crossentropy": 2.7422348260879517, "loss/fcd": 1.18359375, "loss/logits": 0.27667514979839325, "step": 1811 }, { "epoch": 0.005753842245649688, "grad_norm": 0.3203125, "grad_norm_var": 0.041776021321614586, "learning_rate": 0.01, "loss": 1.3941, "loss/crossentropy": 2.614483952522278, "loss/fcd": 1.16015625, "loss/logits": 0.2719123512506485, "step": 1812 }, { "epoch": 0.005757017655277531, "grad_norm": 0.3671875, "grad_norm_var": 0.04148252805074056, "learning_rate": 0.01, "loss": 1.4039, "loss/crossentropy": 2.4615747928619385, "loss/fcd": 1.10546875, "loss/logits": 0.26244837045669556, "step": 1813 }, { "epoch": 0.005760193064905373, "grad_norm": 0.373046875, "grad_norm_var": 0.04127004941304525, "learning_rate": 0.01, "loss": 1.4828, "loss/crossentropy": 2.6630111932754517, "loss/fcd": 1.140625, "loss/logits": 0.27084478735923767, "step": 1814 }, { "epoch": 0.005763368474533214, "grad_norm": 0.310546875, "grad_norm_var": 0.04174645741780599, "learning_rate": 0.01, "loss": 1.3688, "loss/crossentropy": 2.4933866262435913, "loss/fcd": 1.16796875, "loss/logits": 0.27065129578113556, "step": 1815 }, { "epoch": 0.005766543884161057, "grad_norm": 0.345703125, "grad_norm_var": 0.04148095448811849, "learning_rate": 0.01, "loss": 1.4461, "loss/crossentropy": 2.1056578755378723, "loss/fcd": 1.17578125, "loss/logits": 0.2881133407354355, "step": 1816 }, { "epoch": 0.005769719293788899, "grad_norm": 0.31640625, "grad_norm_var": 0.04180450439453125, "learning_rate": 0.01, "loss": 1.446, "loss/crossentropy": 2.356621503829956, "loss/fcd": 1.09375, "loss/logits": 0.27213023602962494, "step": 1817 }, { "epoch": 0.00577289470341674, "grad_norm": 0.32421875, "grad_norm_var": 0.04172883033752441, "learning_rate": 0.01, "loss": 1.428, "loss/crossentropy": 2.42246675491333, "loss/fcd": 1.1328125, "loss/logits": 0.27144576609134674, "step": 1818 }, { "epoch": 0.005776070113044583, "grad_norm": 0.34765625, "grad_norm_var": 0.0417816162109375, "learning_rate": 0.01, "loss": 1.4377, "loss/crossentropy": 2.597900867462158, "loss/fcd": 1.17578125, "loss/logits": 0.26602932810783386, "step": 1819 }, { "epoch": 0.005779245522672425, "grad_norm": 0.35546875, "grad_norm_var": 0.041547393798828124, "learning_rate": 0.01, "loss": 1.4483, "loss/crossentropy": 2.522361993789673, "loss/fcd": 1.13671875, "loss/logits": 0.26169081032276154, "step": 1820 }, { "epoch": 0.005782420932300266, "grad_norm": 0.357421875, "grad_norm_var": 0.04169233640034994, "learning_rate": 0.01, "loss": 1.4504, "loss/crossentropy": 2.32544207572937, "loss/fcd": 1.1328125, "loss/logits": 0.27208544313907623, "step": 1821 }, { "epoch": 0.005785596341928109, "grad_norm": 1.0625, "grad_norm_var": 0.06703222592671712, "learning_rate": 0.01, "loss": 1.5066, "loss/crossentropy": 2.701873540878296, "loss/fcd": 1.15234375, "loss/logits": 0.25446292757987976, "step": 1822 }, { "epoch": 0.005788771751555951, "grad_norm": 0.380859375, "grad_norm_var": 0.06629827817281088, "learning_rate": 0.01, "loss": 1.4884, "loss/crossentropy": 2.4636173248291016, "loss/fcd": 1.1953125, "loss/logits": 0.2662360966205597, "step": 1823 }, { "epoch": 0.005791947161183792, "grad_norm": 0.361328125, "grad_norm_var": 0.06553166707356771, "learning_rate": 0.01, "loss": 1.4585, "loss/crossentropy": 2.589537501335144, "loss/fcd": 1.21875, "loss/logits": 0.28944674134254456, "step": 1824 }, { "epoch": 0.005795122570811635, "grad_norm": 0.388671875, "grad_norm_var": 0.0532466729482015, "learning_rate": 0.01, "loss": 1.4019, "loss/crossentropy": 2.7096924781799316, "loss/fcd": 1.2109375, "loss/logits": 0.2827007472515106, "step": 1825 }, { "epoch": 0.005798297980439477, "grad_norm": 0.32421875, "grad_norm_var": 0.032315937678019206, "learning_rate": 0.01, "loss": 1.3897, "loss/crossentropy": 2.5870803594589233, "loss/fcd": 1.140625, "loss/logits": 0.26677750051021576, "step": 1826 }, { "epoch": 0.005801473390067318, "grad_norm": 0.37109375, "grad_norm_var": 0.03235332171122233, "learning_rate": 0.01, "loss": 1.5168, "loss/crossentropy": 2.4771292209625244, "loss/fcd": 1.30859375, "loss/logits": 0.3120591640472412, "step": 1827 }, { "epoch": 0.005804648799695161, "grad_norm": 0.3984375, "grad_norm_var": 0.03196549415588379, "learning_rate": 0.01, "loss": 1.4903, "loss/crossentropy": 2.5910086631774902, "loss/fcd": 1.2109375, "loss/logits": 0.2610536962747574, "step": 1828 }, { "epoch": 0.005807824209323003, "grad_norm": 0.447265625, "grad_norm_var": 0.03202610015869141, "learning_rate": 0.01, "loss": 1.5621, "loss/crossentropy": 3.0214565992355347, "loss/fcd": 1.296875, "loss/logits": 0.30907364189624786, "step": 1829 }, { "epoch": 0.005810999618950844, "grad_norm": 0.34375, "grad_norm_var": 0.032200860977172854, "learning_rate": 0.01, "loss": 1.4841, "loss/crossentropy": 2.336669921875, "loss/fcd": 1.0703125, "loss/logits": 0.23565935343503952, "step": 1830 }, { "epoch": 0.005814175028578687, "grad_norm": 0.33203125, "grad_norm_var": 0.03196709950764974, "learning_rate": 0.01, "loss": 1.411, "loss/crossentropy": 2.587491512298584, "loss/fcd": 1.27734375, "loss/logits": 0.30614860355854034, "step": 1831 }, { "epoch": 0.005817350438206529, "grad_norm": 0.318359375, "grad_norm_var": 0.032224782307942706, "learning_rate": 0.01, "loss": 1.4849, "loss/crossentropy": 2.6688419580459595, "loss/fcd": 1.15625, "loss/logits": 0.27528001368045807, "step": 1832 }, { "epoch": 0.00582052584783437, "grad_norm": 0.32421875, "grad_norm_var": 0.03213958740234375, "learning_rate": 0.01, "loss": 1.4761, "loss/crossentropy": 2.569050908088684, "loss/fcd": 1.25, "loss/logits": 0.3245510160923004, "step": 1833 }, { "epoch": 0.005823701257462213, "grad_norm": 0.34375, "grad_norm_var": 0.03195997873942057, "learning_rate": 0.01, "loss": 1.4077, "loss/crossentropy": 2.261312961578369, "loss/fcd": 1.1015625, "loss/logits": 0.2338816076517105, "step": 1834 }, { "epoch": 0.005826876667090055, "grad_norm": 0.294921875, "grad_norm_var": 0.032526890436808266, "learning_rate": 0.01, "loss": 1.3958, "loss/crossentropy": 2.572139263153076, "loss/fcd": 1.18359375, "loss/logits": 0.27114491164684296, "step": 1835 }, { "epoch": 0.005830052076717896, "grad_norm": 0.34765625, "grad_norm_var": 0.03257737159729004, "learning_rate": 0.01, "loss": 1.412, "loss/crossentropy": 2.6905394792556763, "loss/fcd": 1.25390625, "loss/logits": 0.2628681883215904, "step": 1836 }, { "epoch": 0.005833227486345739, "grad_norm": 0.35546875, "grad_norm_var": 0.03258864084879557, "learning_rate": 0.01, "loss": 1.4596, "loss/crossentropy": 2.367745041847229, "loss/fcd": 1.22265625, "loss/logits": 0.2710768133401871, "step": 1837 }, { "epoch": 0.005836402895973581, "grad_norm": 0.328125, "grad_norm_var": 0.001392046610514323, "learning_rate": 0.01, "loss": 1.4579, "loss/crossentropy": 2.6826404333114624, "loss/fcd": 1.20703125, "loss/logits": 0.2809663861989975, "step": 1838 }, { "epoch": 0.005839578305601422, "grad_norm": 0.3359375, "grad_norm_var": 0.0013558546702067058, "learning_rate": 0.01, "loss": 1.4737, "loss/crossentropy": 2.6063791513442993, "loss/fcd": 1.1796875, "loss/logits": 0.298393577337265, "step": 1839 }, { "epoch": 0.005842753715229265, "grad_norm": 0.31640625, "grad_norm_var": 0.001419830322265625, "learning_rate": 0.01, "loss": 1.4624, "loss/crossentropy": 2.2988510131835938, "loss/fcd": 1.140625, "loss/logits": 0.28282950818538666, "step": 1840 }, { "epoch": 0.005845929124857106, "grad_norm": 0.29296875, "grad_norm_var": 0.0014751275380452474, "learning_rate": 0.01, "loss": 1.4281, "loss/crossentropy": 2.5407289266586304, "loss/fcd": 1.16796875, "loss/logits": 0.2784429341554642, "step": 1841 }, { "epoch": 0.005849104534484948, "grad_norm": 0.31640625, "grad_norm_var": 0.001497634251912435, "learning_rate": 0.01, "loss": 1.3594, "loss/crossentropy": 2.416848301887512, "loss/fcd": 1.1328125, "loss/logits": 0.2379903346300125, "step": 1842 }, { "epoch": 0.005852279944112791, "grad_norm": 0.3046875, "grad_norm_var": 0.001512765884399414, "learning_rate": 0.01, "loss": 1.4195, "loss/crossentropy": 1.9555911421775818, "loss/fcd": 1.056640625, "loss/logits": 0.2155657485127449, "step": 1843 }, { "epoch": 0.005855455353740632, "grad_norm": 0.357421875, "grad_norm_var": 0.0012847900390625, "learning_rate": 0.01, "loss": 1.4089, "loss/crossentropy": 2.5715596675872803, "loss/fcd": 1.1796875, "loss/logits": 0.29265162348747253, "step": 1844 }, { "epoch": 0.005858630763368474, "grad_norm": 0.310546875, "grad_norm_var": 0.0004058202107747396, "learning_rate": 0.01, "loss": 1.3942, "loss/crossentropy": 2.700554847717285, "loss/fcd": 1.16015625, "loss/logits": 0.25547152757644653, "step": 1845 }, { "epoch": 0.005861806172996317, "grad_norm": 0.30078125, "grad_norm_var": 0.000421905517578125, "learning_rate": 0.01, "loss": 1.4112, "loss/crossentropy": 2.4427586793899536, "loss/fcd": 1.16796875, "loss/logits": 0.2779506891965866, "step": 1846 }, { "epoch": 0.005864981582624158, "grad_norm": 0.28125, "grad_norm_var": 0.0005268732706705729, "learning_rate": 0.01, "loss": 1.3514, "loss/crossentropy": 2.4027719497680664, "loss/fcd": 1.1875, "loss/logits": 0.27206987142562866, "step": 1847 }, { "epoch": 0.005868156992252, "grad_norm": 0.318359375, "grad_norm_var": 0.0005268732706705729, "learning_rate": 0.01, "loss": 1.4571, "loss/crossentropy": 2.6010024547576904, "loss/fcd": 1.26953125, "loss/logits": 0.32210828363895416, "step": 1848 }, { "epoch": 0.005871332401879843, "grad_norm": 0.373046875, "grad_norm_var": 0.0006997267405192058, "learning_rate": 0.01, "loss": 1.4559, "loss/crossentropy": 2.7795016765594482, "loss/fcd": 1.26171875, "loss/logits": 0.28481362760066986, "step": 1849 }, { "epoch": 0.005874507811507684, "grad_norm": 0.318359375, "grad_norm_var": 0.0006718317667643229, "learning_rate": 0.01, "loss": 1.497, "loss/crossentropy": 2.7152591943740845, "loss/fcd": 1.32421875, "loss/logits": 0.3430063873529434, "step": 1850 }, { "epoch": 0.005877683221135526, "grad_norm": 0.3359375, "grad_norm_var": 0.0006287733713785808, "learning_rate": 0.01, "loss": 1.4277, "loss/crossentropy": 2.5750235319137573, "loss/fcd": 1.140625, "loss/logits": 0.2604522407054901, "step": 1851 }, { "epoch": 0.005880858630763369, "grad_norm": 0.33203125, "grad_norm_var": 0.0005959669748942057, "learning_rate": 0.01, "loss": 1.527, "loss/crossentropy": 2.862812638282776, "loss/fcd": 1.2421875, "loss/logits": 0.3002137839794159, "step": 1852 }, { "epoch": 0.00588403404039121, "grad_norm": 0.314453125, "grad_norm_var": 0.0005268732706705729, "learning_rate": 0.01, "loss": 1.432, "loss/crossentropy": 2.5052497386932373, "loss/fcd": 1.1015625, "loss/logits": 0.23887013643980026, "step": 1853 }, { "epoch": 0.005887209450019052, "grad_norm": 0.318359375, "grad_norm_var": 0.0005236148834228515, "learning_rate": 0.01, "loss": 1.3589, "loss/crossentropy": 2.3230772018432617, "loss/fcd": 1.0859375, "loss/logits": 0.2177288979291916, "step": 1854 }, { "epoch": 0.005890384859646895, "grad_norm": 0.337890625, "grad_norm_var": 0.0005278905232747396, "learning_rate": 0.01, "loss": 1.4564, "loss/crossentropy": 2.514824151992798, "loss/fcd": 1.15625, "loss/logits": 0.2819279134273529, "step": 1855 }, { "epoch": 0.005893560269274736, "grad_norm": 0.322265625, "grad_norm_var": 0.0005267937978108724, "learning_rate": 0.01, "loss": 1.4068, "loss/crossentropy": 2.3991973400115967, "loss/fcd": 1.1171875, "loss/logits": 0.25893211364746094, "step": 1856 }, { "epoch": 0.005896735678902578, "grad_norm": 0.3046875, "grad_norm_var": 0.0004916985829671224, "learning_rate": 0.01, "loss": 1.3967, "loss/crossentropy": 2.294077157974243, "loss/fcd": 1.11328125, "loss/logits": 0.28248852491378784, "step": 1857 }, { "epoch": 0.005899911088530421, "grad_norm": 0.337890625, "grad_norm_var": 0.000505510965983073, "learning_rate": 0.01, "loss": 1.4586, "loss/crossentropy": 2.709365725517273, "loss/fcd": 1.16015625, "loss/logits": 0.27952753007411957, "step": 1858 }, { "epoch": 0.005903086498158262, "grad_norm": 0.291015625, "grad_norm_var": 0.0005505720774332683, "learning_rate": 0.01, "loss": 1.3841, "loss/crossentropy": 2.7052277326583862, "loss/fcd": 1.140625, "loss/logits": 0.2379734367132187, "step": 1859 }, { "epoch": 0.005906261907786104, "grad_norm": 0.318359375, "grad_norm_var": 0.00046219825744628904, "learning_rate": 0.01, "loss": 1.432, "loss/crossentropy": 2.697511672973633, "loss/fcd": 1.1328125, "loss/logits": 0.2685427665710449, "step": 1860 }, { "epoch": 0.005909437317413947, "grad_norm": 0.47265625, "grad_norm_var": 0.0019067764282226563, "learning_rate": 0.01, "loss": 1.5341, "loss/crossentropy": 2.4182311296463013, "loss/fcd": 1.06640625, "loss/logits": 0.23573100566864014, "step": 1861 }, { "epoch": 0.005912612727041788, "grad_norm": 0.3046875, "grad_norm_var": 0.0018925984700520833, "learning_rate": 0.01, "loss": 1.4227, "loss/crossentropy": 2.5025492906570435, "loss/fcd": 1.18359375, "loss/logits": 0.2611831873655319, "step": 1862 }, { "epoch": 0.00591578813666963, "grad_norm": 0.353515625, "grad_norm_var": 0.001748514175415039, "learning_rate": 0.01, "loss": 1.4201, "loss/crossentropy": 2.6676464080810547, "loss/fcd": 1.16015625, "loss/logits": 0.2614209055900574, "step": 1863 }, { "epoch": 0.005918963546297473, "grad_norm": 0.34765625, "grad_norm_var": 0.001738739013671875, "learning_rate": 0.01, "loss": 1.4977, "loss/crossentropy": 2.6879594326019287, "loss/fcd": 1.20703125, "loss/logits": 0.268084853887558, "step": 1864 }, { "epoch": 0.005922138955925314, "grad_norm": 0.3203125, "grad_norm_var": 0.0016550540924072266, "learning_rate": 0.01, "loss": 1.4527, "loss/crossentropy": 2.6298590898513794, "loss/fcd": 1.26953125, "loss/logits": 0.30050230026245117, "step": 1865 }, { "epoch": 0.005925314365553156, "grad_norm": 0.326171875, "grad_norm_var": 0.0016434828440348307, "learning_rate": 0.01, "loss": 1.4382, "loss/crossentropy": 2.9001080989837646, "loss/fcd": 1.265625, "loss/logits": 0.30418314039707184, "step": 1866 }, { "epoch": 0.005928489775180999, "grad_norm": 0.34375, "grad_norm_var": 0.0016497135162353515, "learning_rate": 0.01, "loss": 1.4696, "loss/crossentropy": 2.5840306282043457, "loss/fcd": 1.3359375, "loss/logits": 0.37617193162441254, "step": 1867 }, { "epoch": 0.00593166518480884, "grad_norm": 0.314453125, "grad_norm_var": 0.00167388916015625, "learning_rate": 0.01, "loss": 1.4665, "loss/crossentropy": 2.515114188194275, "loss/fcd": 1.23046875, "loss/logits": 0.27558770775794983, "step": 1868 }, { "epoch": 0.005934840594436682, "grad_norm": 0.34765625, "grad_norm_var": 0.0016606489817301432, "learning_rate": 0.01, "loss": 1.4066, "loss/crossentropy": 2.1652050018310547, "loss/fcd": 1.0625, "loss/logits": 0.23196804523468018, "step": 1869 }, { "epoch": 0.005938016004064525, "grad_norm": 0.3515625, "grad_norm_var": 0.0016555150349934896, "learning_rate": 0.01, "loss": 1.457, "loss/crossentropy": 2.607521414756775, "loss/fcd": 1.21484375, "loss/logits": 0.2988707721233368, "step": 1870 }, { "epoch": 0.005941191413692366, "grad_norm": 0.291015625, "grad_norm_var": 0.0017882664998372397, "learning_rate": 0.01, "loss": 1.4215, "loss/crossentropy": 2.3602222204208374, "loss/fcd": 1.3125, "loss/logits": 0.3658839762210846, "step": 1871 }, { "epoch": 0.005944366823320208, "grad_norm": 0.353515625, "grad_norm_var": 0.0017994562784830728, "learning_rate": 0.01, "loss": 1.507, "loss/crossentropy": 2.6523748636245728, "loss/fcd": 1.2578125, "loss/logits": 0.28327734768390656, "step": 1872 }, { "epoch": 0.005947542232948051, "grad_norm": 0.3046875, "grad_norm_var": 0.0017994562784830728, "learning_rate": 0.01, "loss": 1.4189, "loss/crossentropy": 2.347494602203369, "loss/fcd": 1.14453125, "loss/logits": 0.26535485684871674, "step": 1873 }, { "epoch": 0.005950717642575892, "grad_norm": 0.435546875, "grad_norm_var": 0.002417755126953125, "learning_rate": 0.01, "loss": 1.4732, "loss/crossentropy": 2.4662632942199707, "loss/fcd": 1.1015625, "loss/logits": 0.25581879168748856, "step": 1874 }, { "epoch": 0.005953893052203734, "grad_norm": 0.400390625, "grad_norm_var": 0.002417755126953125, "learning_rate": 0.01, "loss": 1.4569, "loss/crossentropy": 2.596408486366272, "loss/fcd": 1.125, "loss/logits": 0.24510548263788223, "step": 1875 }, { "epoch": 0.005957068461831576, "grad_norm": 0.4140625, "grad_norm_var": 0.002597665786743164, "learning_rate": 0.01, "loss": 1.4824, "loss/crossentropy": 2.3459614515304565, "loss/fcd": 1.1640625, "loss/logits": 0.2552497982978821, "step": 1876 }, { "epoch": 0.005960243871459418, "grad_norm": 0.322265625, "grad_norm_var": 0.001654052734375, "learning_rate": 0.01, "loss": 1.425, "loss/crossentropy": 2.4428869485855103, "loss/fcd": 1.24609375, "loss/logits": 0.2922828197479248, "step": 1877 }, { "epoch": 0.00596341928108726, "grad_norm": 0.279296875, "grad_norm_var": 0.001833200454711914, "learning_rate": 0.01, "loss": 1.3607, "loss/crossentropy": 2.3811720609664917, "loss/fcd": 1.041015625, "loss/logits": 0.23003825545310974, "step": 1878 }, { "epoch": 0.005966594690715102, "grad_norm": 0.353515625, "grad_norm_var": 0.001833200454711914, "learning_rate": 0.01, "loss": 1.4948, "loss/crossentropy": 2.6003490686416626, "loss/fcd": 1.15234375, "loss/logits": 0.266837477684021, "step": 1879 }, { "epoch": 0.005969770100342944, "grad_norm": 0.357421875, "grad_norm_var": 0.0018437703450520833, "learning_rate": 0.01, "loss": 1.4893, "loss/crossentropy": 2.415708541870117, "loss/fcd": 1.25, "loss/logits": 0.31984907388687134, "step": 1880 }, { "epoch": 0.005972945509970786, "grad_norm": 0.333984375, "grad_norm_var": 0.0018109480539957683, "learning_rate": 0.01, "loss": 1.4243, "loss/crossentropy": 2.4196611642837524, "loss/fcd": 1.1640625, "loss/logits": 0.25314097106456757, "step": 1881 }, { "epoch": 0.005976120919598628, "grad_norm": 0.5, "grad_norm_var": 0.0032496134440104167, "learning_rate": 0.01, "loss": 1.3579, "loss/crossentropy": 2.3237491846084595, "loss/fcd": 1.10546875, "loss/logits": 0.2403154969215393, "step": 1882 }, { "epoch": 0.00597929632922647, "grad_norm": 0.341796875, "grad_norm_var": 0.00325315793355306, "learning_rate": 0.01, "loss": 1.3849, "loss/crossentropy": 2.2412317991256714, "loss/fcd": 1.0390625, "loss/logits": 0.23579832166433334, "step": 1883 }, { "epoch": 0.005982471738854312, "grad_norm": 0.31640625, "grad_norm_var": 0.00324249267578125, "learning_rate": 0.01, "loss": 1.352, "loss/crossentropy": 2.613751769065857, "loss/fcd": 1.11328125, "loss/logits": 0.25216375291347504, "step": 1884 }, { "epoch": 0.005985647148482154, "grad_norm": 0.33984375, "grad_norm_var": 0.003255462646484375, "learning_rate": 0.01, "loss": 1.4743, "loss/crossentropy": 2.6577444076538086, "loss/fcd": 1.22265625, "loss/logits": 0.29674088954925537, "step": 1885 }, { "epoch": 0.005988822558109996, "grad_norm": 0.314453125, "grad_norm_var": 0.0033632755279541016, "learning_rate": 0.01, "loss": 1.4963, "loss/crossentropy": 2.787277102470398, "loss/fcd": 1.2578125, "loss/logits": 0.28245826065540314, "step": 1886 }, { "epoch": 0.005991997967737838, "grad_norm": 0.294921875, "grad_norm_var": 0.003331613540649414, "learning_rate": 0.01, "loss": 1.4124, "loss/crossentropy": 2.5122971534729004, "loss/fcd": 1.171875, "loss/logits": 0.27991411089897156, "step": 1887 }, { "epoch": 0.00599517337736568, "grad_norm": 0.349609375, "grad_norm_var": 0.0033327579498291016, "learning_rate": 0.01, "loss": 1.4593, "loss/crossentropy": 2.756064295768738, "loss/fcd": 1.10546875, "loss/logits": 0.24170689284801483, "step": 1888 }, { "epoch": 0.005998348786993522, "grad_norm": 0.3203125, "grad_norm_var": 0.0032460371653238933, "learning_rate": 0.01, "loss": 1.4512, "loss/crossentropy": 2.234619915485382, "loss/fcd": 1.234375, "loss/logits": 0.2831808850169182, "step": 1889 }, { "epoch": 0.006001524196621364, "grad_norm": 0.33984375, "grad_norm_var": 0.0027857462565104166, "learning_rate": 0.01, "loss": 1.4338, "loss/crossentropy": 2.7402619123458862, "loss/fcd": 1.1484375, "loss/logits": 0.2676101177930832, "step": 1890 }, { "epoch": 0.006004699606249206, "grad_norm": 0.359375, "grad_norm_var": 0.002607838312784831, "learning_rate": 0.01, "loss": 1.4676, "loss/crossentropy": 2.6213642358779907, "loss/fcd": 1.15234375, "loss/logits": 0.26413851231336594, "step": 1891 }, { "epoch": 0.006007875015877048, "grad_norm": 0.3046875, "grad_norm_var": 0.0023639520009358724, "learning_rate": 0.01, "loss": 1.4289, "loss/crossentropy": 2.6477421522140503, "loss/fcd": 1.16015625, "loss/logits": 0.26153797656297684, "step": 1892 }, { "epoch": 0.00601105042550489, "grad_norm": 0.341796875, "grad_norm_var": 0.002343606948852539, "learning_rate": 0.01, "loss": 1.4655, "loss/crossentropy": 2.5919302701950073, "loss/fcd": 1.1875, "loss/logits": 0.2756563723087311, "step": 1893 }, { "epoch": 0.006014225835132732, "grad_norm": 0.326171875, "grad_norm_var": 0.002098703384399414, "learning_rate": 0.01, "loss": 1.4089, "loss/crossentropy": 2.5592551231384277, "loss/fcd": 1.10546875, "loss/logits": 0.24341261386871338, "step": 1894 }, { "epoch": 0.006017401244760574, "grad_norm": 0.318359375, "grad_norm_var": 0.002128458023071289, "learning_rate": 0.01, "loss": 1.4767, "loss/crossentropy": 2.787070393562317, "loss/fcd": 1.32421875, "loss/logits": 0.33041954040527344, "step": 1895 }, { "epoch": 0.006020576654388416, "grad_norm": 0.322265625, "grad_norm_var": 0.0021296024322509767, "learning_rate": 0.01, "loss": 1.4796, "loss/crossentropy": 2.5095754861831665, "loss/fcd": 1.26171875, "loss/logits": 0.32972003519535065, "step": 1896 }, { "epoch": 0.006023752064016258, "grad_norm": 0.298828125, "grad_norm_var": 0.0022303104400634766, "learning_rate": 0.01, "loss": 1.3797, "loss/crossentropy": 2.4206162691116333, "loss/fcd": 1.1015625, "loss/logits": 0.25241421163082123, "step": 1897 }, { "epoch": 0.0060269274736441, "grad_norm": 0.28515625, "grad_norm_var": 0.0004399458567301432, "learning_rate": 0.01, "loss": 1.4143, "loss/crossentropy": 2.8175556659698486, "loss/fcd": 1.14453125, "loss/logits": 0.2618308514356613, "step": 1898 }, { "epoch": 0.006030102883271942, "grad_norm": 0.3203125, "grad_norm_var": 0.00041599273681640624, "learning_rate": 0.01, "loss": 1.3752, "loss/crossentropy": 2.591304302215576, "loss/fcd": 1.17578125, "loss/logits": 0.2785727083683014, "step": 1899 }, { "epoch": 0.006033278292899784, "grad_norm": 0.47265625, "grad_norm_var": 0.0018248875935872396, "learning_rate": 0.01, "loss": 1.4393, "loss/crossentropy": 2.5867522954940796, "loss/fcd": 1.234375, "loss/logits": 0.2795933783054352, "step": 1900 }, { "epoch": 0.006036453702527626, "grad_norm": 0.35546875, "grad_norm_var": 0.0018569310506184897, "learning_rate": 0.01, "loss": 1.4602, "loss/crossentropy": 2.3891687393188477, "loss/fcd": 1.3125, "loss/logits": 0.30100224912166595, "step": 1901 }, { "epoch": 0.006039629112155468, "grad_norm": 0.349609375, "grad_norm_var": 0.0018483479817708334, "learning_rate": 0.01, "loss": 1.4317, "loss/crossentropy": 2.5520068407058716, "loss/fcd": 1.1328125, "loss/logits": 0.24551530182361603, "step": 1902 }, { "epoch": 0.00604280452178331, "grad_norm": 0.306640625, "grad_norm_var": 0.0017943700154622396, "learning_rate": 0.01, "loss": 1.4446, "loss/crossentropy": 2.6315845251083374, "loss/fcd": 1.25390625, "loss/logits": 0.3102858364582062, "step": 1903 }, { "epoch": 0.006045979931411152, "grad_norm": 0.333984375, "grad_norm_var": 0.0017806371053059897, "learning_rate": 0.01, "loss": 1.4332, "loss/crossentropy": 2.435181975364685, "loss/fcd": 1.15625, "loss/logits": 0.27669376134872437, "step": 1904 }, { "epoch": 0.006049155341038994, "grad_norm": 0.330078125, "grad_norm_var": 0.0017678419748942058, "learning_rate": 0.01, "loss": 1.4165, "loss/crossentropy": 2.3994520902633667, "loss/fcd": 1.08203125, "loss/logits": 0.2351747453212738, "step": 1905 }, { "epoch": 0.006052330750666836, "grad_norm": 0.32421875, "grad_norm_var": 0.001773691177368164, "learning_rate": 0.01, "loss": 1.441, "loss/crossentropy": 2.6251537799835205, "loss/fcd": 1.12890625, "loss/logits": 0.2680409252643585, "step": 1906 }, { "epoch": 0.006055506160294678, "grad_norm": 0.328125, "grad_norm_var": 0.0017304579416910807, "learning_rate": 0.01, "loss": 1.4219, "loss/crossentropy": 2.423299193382263, "loss/fcd": 1.125, "loss/logits": 0.24200047552585602, "step": 1907 }, { "epoch": 0.00605868156992252, "grad_norm": 0.380859375, "grad_norm_var": 0.001811663309733073, "learning_rate": 0.01, "loss": 1.539, "loss/crossentropy": 2.488312840461731, "loss/fcd": 1.16796875, "loss/logits": 0.28715966641902924, "step": 1908 }, { "epoch": 0.006061856979550362, "grad_norm": 0.34765625, "grad_norm_var": 0.0018174330393473308, "learning_rate": 0.01, "loss": 1.4786, "loss/crossentropy": 2.57857882976532, "loss/fcd": 1.2421875, "loss/logits": 0.27766841650009155, "step": 1909 }, { "epoch": 0.006065032389178204, "grad_norm": 0.322265625, "grad_norm_var": 0.0018242994944254558, "learning_rate": 0.01, "loss": 1.4502, "loss/crossentropy": 2.561535358428955, "loss/fcd": 1.203125, "loss/logits": 0.27918165922164917, "step": 1910 }, { "epoch": 0.0060682077988060456, "grad_norm": 0.357421875, "grad_norm_var": 0.0018211205800374348, "learning_rate": 0.01, "loss": 1.4854, "loss/crossentropy": 2.596104383468628, "loss/fcd": 1.08984375, "loss/logits": 0.24248411506414413, "step": 1911 }, { "epoch": 0.006071383208433888, "grad_norm": 0.326171875, "grad_norm_var": 0.0018129825592041015, "learning_rate": 0.01, "loss": 1.3725, "loss/crossentropy": 2.2332987785339355, "loss/fcd": 1.0859375, "loss/logits": 0.236942321062088, "step": 1912 }, { "epoch": 0.00607455861806173, "grad_norm": 0.318359375, "grad_norm_var": 0.0017296950022379558, "learning_rate": 0.01, "loss": 1.4661, "loss/crossentropy": 2.331605553627014, "loss/fcd": 1.15625, "loss/logits": 0.2665382847189903, "step": 1913 }, { "epoch": 0.0060777340276895716, "grad_norm": 0.30078125, "grad_norm_var": 0.0016282240549723307, "learning_rate": 0.01, "loss": 1.4063, "loss/crossentropy": 2.736373543739319, "loss/fcd": 1.1171875, "loss/logits": 0.27155517041683197, "step": 1914 }, { "epoch": 0.006080909437317414, "grad_norm": 0.40625, "grad_norm_var": 0.0018394311269124349, "learning_rate": 0.01, "loss": 1.4799, "loss/crossentropy": 1.9569057822227478, "loss/fcd": 1.34765625, "loss/logits": 0.19951950758695602, "step": 1915 }, { "epoch": 0.006084084846945256, "grad_norm": 0.291015625, "grad_norm_var": 0.0008712132771809896, "learning_rate": 0.01, "loss": 1.4195, "loss/crossentropy": 2.217739522457123, "loss/fcd": 1.13671875, "loss/logits": 0.24931149184703827, "step": 1916 }, { "epoch": 0.0060872602565730976, "grad_norm": 0.349609375, "grad_norm_var": 0.0008582909901936849, "learning_rate": 0.01, "loss": 1.3966, "loss/crossentropy": 2.562654495239258, "loss/fcd": 1.1328125, "loss/logits": 0.2767978310585022, "step": 1917 }, { "epoch": 0.00609043566620094, "grad_norm": 0.310546875, "grad_norm_var": 0.000881814956665039, "learning_rate": 0.01, "loss": 1.3834, "loss/crossentropy": 2.6745625734329224, "loss/fcd": 1.09765625, "loss/logits": 0.24418464303016663, "step": 1918 }, { "epoch": 0.006093611075828782, "grad_norm": 0.287109375, "grad_norm_var": 0.0009752750396728515, "learning_rate": 0.01, "loss": 1.4092, "loss/crossentropy": 2.6264731884002686, "loss/fcd": 1.17578125, "loss/logits": 0.26368894428014755, "step": 1919 }, { "epoch": 0.0060967864854566236, "grad_norm": 0.28515625, "grad_norm_var": 0.00111236572265625, "learning_rate": 0.01, "loss": 1.3921, "loss/crossentropy": 2.5804463624954224, "loss/fcd": 1.109375, "loss/logits": 0.2548646628856659, "step": 1920 }, { "epoch": 0.006099961895084466, "grad_norm": 0.291015625, "grad_norm_var": 0.0012026468912760416, "learning_rate": 0.01, "loss": 1.4171, "loss/crossentropy": 2.854316473007202, "loss/fcd": 1.16015625, "loss/logits": 0.267570361495018, "step": 1921 }, { "epoch": 0.006103137304712308, "grad_norm": 0.30859375, "grad_norm_var": 0.001222991943359375, "learning_rate": 0.01, "loss": 1.4062, "loss/crossentropy": 2.774227261543274, "loss/fcd": 1.15234375, "loss/logits": 0.2783433794975281, "step": 1922 }, { "epoch": 0.0061063127143401496, "grad_norm": 0.478515625, "grad_norm_var": 0.0026855309804280597, "learning_rate": 0.01, "loss": 1.5032, "loss/crossentropy": 2.663804769515991, "loss/fcd": 1.09765625, "loss/logits": 0.2561497837305069, "step": 1923 }, { "epoch": 0.006109488123967992, "grad_norm": 0.384765625, "grad_norm_var": 0.002710326512654622, "learning_rate": 0.01, "loss": 1.4737, "loss/crossentropy": 2.491609215736389, "loss/fcd": 1.16796875, "loss/logits": 0.27085503935813904, "step": 1924 }, { "epoch": 0.006112663533595834, "grad_norm": 0.296875, "grad_norm_var": 0.0027880191802978514, "learning_rate": 0.01, "loss": 1.3769, "loss/crossentropy": 2.5151140689849854, "loss/fcd": 1.08984375, "loss/logits": 0.239692322909832, "step": 1925 }, { "epoch": 0.0061158389432236756, "grad_norm": 0.361328125, "grad_norm_var": 0.002831888198852539, "learning_rate": 0.01, "loss": 1.4381, "loss/crossentropy": 2.2443747520446777, "loss/fcd": 1.18359375, "loss/logits": 0.28273946046829224, "step": 1926 }, { "epoch": 0.006119014352851518, "grad_norm": 0.32421875, "grad_norm_var": 0.002799733479817708, "learning_rate": 0.01, "loss": 1.4436, "loss/crossentropy": 2.4971163272857666, "loss/fcd": 1.3046875, "loss/logits": 0.3023562729358673, "step": 1927 }, { "epoch": 0.00612218976247936, "grad_norm": 0.326171875, "grad_norm_var": 0.002799733479817708, "learning_rate": 0.01, "loss": 1.4259, "loss/crossentropy": 2.3254695534706116, "loss/fcd": 1.2734375, "loss/logits": 0.25089506804943085, "step": 1928 }, { "epoch": 0.0061253651721072016, "grad_norm": 0.37109375, "grad_norm_var": 0.002873977025349935, "learning_rate": 0.01, "loss": 1.4962, "loss/crossentropy": 2.4598041772842407, "loss/fcd": 1.1953125, "loss/logits": 0.27266551554203033, "step": 1929 }, { "epoch": 0.006128540581735044, "grad_norm": 0.31640625, "grad_norm_var": 0.0028162479400634767, "learning_rate": 0.01, "loss": 1.4336, "loss/crossentropy": 2.5038135051727295, "loss/fcd": 1.078125, "loss/logits": 0.2508462518453598, "step": 1930 }, { "epoch": 0.006131715991362886, "grad_norm": 0.33984375, "grad_norm_var": 0.0024768670399983725, "learning_rate": 0.01, "loss": 1.4539, "loss/crossentropy": 2.6750608682632446, "loss/fcd": 1.21484375, "loss/logits": 0.2759644687175751, "step": 1931 }, { "epoch": 0.0061348914009907276, "grad_norm": 0.30078125, "grad_norm_var": 0.002428627014160156, "learning_rate": 0.01, "loss": 1.3811, "loss/crossentropy": 2.5435627698898315, "loss/fcd": 1.1171875, "loss/logits": 0.24698469042778015, "step": 1932 }, { "epoch": 0.00613806681061857, "grad_norm": 0.3515625, "grad_norm_var": 0.0024331251780192057, "learning_rate": 0.01, "loss": 1.4462, "loss/crossentropy": 2.3992018699645996, "loss/fcd": 1.1953125, "loss/logits": 0.2618037462234497, "step": 1933 }, { "epoch": 0.006141242220246412, "grad_norm": 0.3203125, "grad_norm_var": 0.00240936279296875, "learning_rate": 0.01, "loss": 1.4231, "loss/crossentropy": 2.518640637397766, "loss/fcd": 1.1484375, "loss/logits": 0.2731615900993347, "step": 1934 }, { "epoch": 0.0061444176298742536, "grad_norm": 0.32421875, "grad_norm_var": 0.002263498306274414, "learning_rate": 0.01, "loss": 1.42, "loss/crossentropy": 2.4104002714157104, "loss/fcd": 1.13671875, "loss/logits": 0.24719002842903137, "step": 1935 }, { "epoch": 0.006147593039502096, "grad_norm": 0.361328125, "grad_norm_var": 0.0021066665649414062, "learning_rate": 0.01, "loss": 1.4937, "loss/crossentropy": 2.6079261302948, "loss/fcd": 1.19921875, "loss/logits": 0.2759229838848114, "step": 1936 }, { "epoch": 0.006150768449129938, "grad_norm": 0.35546875, "grad_norm_var": 0.001936197280883789, "learning_rate": 0.01, "loss": 1.4451, "loss/crossentropy": 2.45172655582428, "loss/fcd": 1.19140625, "loss/logits": 0.23908894509077072, "step": 1937 }, { "epoch": 0.0061539438587577796, "grad_norm": 0.341796875, "grad_norm_var": 0.0018435160319010416, "learning_rate": 0.01, "loss": 1.5316, "loss/crossentropy": 2.366478443145752, "loss/fcd": 1.21875, "loss/logits": 0.300193190574646, "step": 1938 }, { "epoch": 0.006157119268385622, "grad_norm": 0.298828125, "grad_norm_var": 0.0007146199544270834, "learning_rate": 0.01, "loss": 1.4102, "loss/crossentropy": 2.3451504707336426, "loss/fcd": 1.078125, "loss/logits": 0.24426712095737457, "step": 1939 }, { "epoch": 0.006160294678013464, "grad_norm": 0.33984375, "grad_norm_var": 0.0005482832590738932, "learning_rate": 0.01, "loss": 1.4581, "loss/crossentropy": 2.746036171913147, "loss/fcd": 1.21875, "loss/logits": 0.29555387794971466, "step": 1940 }, { "epoch": 0.0061634700876413056, "grad_norm": 0.34765625, "grad_norm_var": 0.0004639784495035807, "learning_rate": 0.01, "loss": 1.4334, "loss/crossentropy": 2.387279748916626, "loss/fcd": 1.12890625, "loss/logits": 0.2566295564174652, "step": 1941 }, { "epoch": 0.006166645497269148, "grad_norm": 0.279296875, "grad_norm_var": 0.0006108442942301432, "learning_rate": 0.01, "loss": 1.3967, "loss/crossentropy": 2.447601556777954, "loss/fcd": 1.2109375, "loss/logits": 0.30189771950244904, "step": 1942 }, { "epoch": 0.00616982090689699, "grad_norm": 0.345703125, "grad_norm_var": 0.0006197611490885417, "learning_rate": 0.01, "loss": 1.4755, "loss/crossentropy": 2.5454204082489014, "loss/fcd": 1.150390625, "loss/logits": 0.27315346896648407, "step": 1943 }, { "epoch": 0.0061729963165248316, "grad_norm": 0.296875, "grad_norm_var": 0.0006982008616129557, "learning_rate": 0.01, "loss": 1.3916, "loss/crossentropy": 2.5439870357513428, "loss/fcd": 1.06640625, "loss/logits": 0.23871399462223053, "step": 1944 }, { "epoch": 0.006176171726152674, "grad_norm": 0.93359375, "grad_norm_var": 0.023503986994425456, "learning_rate": 0.01, "loss": 1.4115, "loss/crossentropy": 2.756371259689331, "loss/fcd": 1.16796875, "loss/logits": 0.26144419610500336, "step": 1945 }, { "epoch": 0.006179347135780515, "grad_norm": 0.357421875, "grad_norm_var": 0.023338762919108073, "learning_rate": 0.01, "loss": 1.572, "loss/crossentropy": 2.552555561065674, "loss/fcd": 1.19921875, "loss/logits": 0.26503366231918335, "step": 1946 }, { "epoch": 0.0061825225454083576, "grad_norm": 0.322265625, "grad_norm_var": 0.02342502276102702, "learning_rate": 0.01, "loss": 1.4625, "loss/crossentropy": 2.504533529281616, "loss/fcd": 1.1484375, "loss/logits": 0.26010390371084213, "step": 1947 }, { "epoch": 0.0061856979550362, "grad_norm": 0.36328125, "grad_norm_var": 0.023114760716756184, "learning_rate": 0.01, "loss": 1.3766, "loss/crossentropy": 2.3585166931152344, "loss/fcd": 1.17578125, "loss/logits": 0.2726312652230263, "step": 1948 }, { "epoch": 0.006188873364664041, "grad_norm": 0.30859375, "grad_norm_var": 0.02334275245666504, "learning_rate": 0.01, "loss": 1.4295, "loss/crossentropy": 2.385395050048828, "loss/fcd": 1.1484375, "loss/logits": 0.27893996238708496, "step": 1949 }, { "epoch": 0.0061920487742918836, "grad_norm": 0.333984375, "grad_norm_var": 0.02326653798421224, "learning_rate": 0.01, "loss": 1.462, "loss/crossentropy": 2.5103747844696045, "loss/fcd": 1.09765625, "loss/logits": 0.24489781260490417, "step": 1950 }, { "epoch": 0.006195224183919726, "grad_norm": 0.3515625, "grad_norm_var": 0.023148600260416666, "learning_rate": 0.01, "loss": 1.4484, "loss/crossentropy": 2.620427966117859, "loss/fcd": 1.20703125, "loss/logits": 0.28006382286548615, "step": 1951 }, { "epoch": 0.006198399593547567, "grad_norm": 0.30859375, "grad_norm_var": 0.023391071955362955, "learning_rate": 0.01, "loss": 1.4356, "loss/crossentropy": 2.5099040269851685, "loss/fcd": 1.140625, "loss/logits": 0.2706640511751175, "step": 1952 }, { "epoch": 0.0062015750031754096, "grad_norm": 0.326171875, "grad_norm_var": 0.023492876688639322, "learning_rate": 0.01, "loss": 1.4623, "loss/crossentropy": 2.6843101978302, "loss/fcd": 1.16796875, "loss/logits": 0.2643709182739258, "step": 1953 }, { "epoch": 0.006204750412803252, "grad_norm": 0.3671875, "grad_norm_var": 0.02345134417215983, "learning_rate": 0.01, "loss": 1.4151, "loss/crossentropy": 2.6874197721481323, "loss/fcd": 1.14453125, "loss/logits": 0.2743026167154312, "step": 1954 }, { "epoch": 0.006207925822431093, "grad_norm": 0.314453125, "grad_norm_var": 0.023323424657185874, "learning_rate": 0.01, "loss": 1.4128, "loss/crossentropy": 2.384095072746277, "loss/fcd": 1.203125, "loss/logits": 0.261336587369442, "step": 1955 }, { "epoch": 0.0062111012320589356, "grad_norm": 0.279296875, "grad_norm_var": 0.023784128824869792, "learning_rate": 0.01, "loss": 1.3385, "loss/crossentropy": 2.1870433688163757, "loss/fcd": 1.08203125, "loss/logits": 0.24676741659641266, "step": 1956 }, { "epoch": 0.006214276641686778, "grad_norm": 0.31640625, "grad_norm_var": 0.02391637166341146, "learning_rate": 0.01, "loss": 1.4068, "loss/crossentropy": 2.5574594736099243, "loss/fcd": 1.16015625, "loss/logits": 0.27141590416431427, "step": 1957 }, { "epoch": 0.006217452051314619, "grad_norm": 0.373046875, "grad_norm_var": 0.02342198689778646, "learning_rate": 0.01, "loss": 1.5103, "loss/crossentropy": 2.493239641189575, "loss/fcd": 1.140625, "loss/logits": 0.27469296753406525, "step": 1958 }, { "epoch": 0.0062206274609424616, "grad_norm": 0.384765625, "grad_norm_var": 0.0233978271484375, "learning_rate": 0.01, "loss": 1.4944, "loss/crossentropy": 2.5249621868133545, "loss/fcd": 1.140625, "loss/logits": 0.2768844813108444, "step": 1959 }, { "epoch": 0.006223802870570304, "grad_norm": 0.326171875, "grad_norm_var": 0.023161554336547853, "learning_rate": 0.01, "loss": 1.4819, "loss/crossentropy": 2.673667550086975, "loss/fcd": 1.19921875, "loss/logits": 0.27972328662872314, "step": 1960 }, { "epoch": 0.006226978280198145, "grad_norm": 0.359375, "grad_norm_var": 0.0008432865142822266, "learning_rate": 0.01, "loss": 1.4624, "loss/crossentropy": 2.654227375984192, "loss/fcd": 1.16015625, "loss/logits": 0.2697824239730835, "step": 1961 }, { "epoch": 0.0062301536898259876, "grad_norm": 0.31640625, "grad_norm_var": 0.000836944580078125, "learning_rate": 0.01, "loss": 1.4317, "loss/crossentropy": 2.605968475341797, "loss/fcd": 1.1796875, "loss/logits": 0.2711298167705536, "step": 1962 }, { "epoch": 0.00623332909945383, "grad_norm": 0.345703125, "grad_norm_var": 0.0008331298828125, "learning_rate": 0.01, "loss": 1.4771, "loss/crossentropy": 2.6340770721435547, "loss/fcd": 1.1171875, "loss/logits": 0.255732998251915, "step": 1963 }, { "epoch": 0.006236504509081671, "grad_norm": 0.34765625, "grad_norm_var": 0.0007914225260416667, "learning_rate": 0.01, "loss": 1.4296, "loss/crossentropy": 2.021034896373749, "loss/fcd": 1.05078125, "loss/logits": 0.24352626502513885, "step": 1964 }, { "epoch": 0.0062396799187095136, "grad_norm": 0.31640625, "grad_norm_var": 0.0007677714029947917, "learning_rate": 0.01, "loss": 1.3978, "loss/crossentropy": 2.4960880279541016, "loss/fcd": 1.0546875, "loss/logits": 0.2392047792673111, "step": 1965 }, { "epoch": 0.006242855328337356, "grad_norm": 0.326171875, "grad_norm_var": 0.0007731119791666666, "learning_rate": 0.01, "loss": 1.4262, "loss/crossentropy": 2.1355353593826294, "loss/fcd": 1.13671875, "loss/logits": 0.2516033500432968, "step": 1966 }, { "epoch": 0.006246030737965197, "grad_norm": 0.36328125, "grad_norm_var": 0.0008076349894205729, "learning_rate": 0.01, "loss": 1.4465, "loss/crossentropy": 2.5303810834884644, "loss/fcd": 1.14453125, "loss/logits": 0.272224098443985, "step": 1967 }, { "epoch": 0.0062492061475930396, "grad_norm": 0.310546875, "grad_norm_var": 0.0008008162180582683, "learning_rate": 0.01, "loss": 1.392, "loss/crossentropy": 2.526457667350769, "loss/fcd": 1.109375, "loss/logits": 0.24292102456092834, "step": 1968 }, { "epoch": 0.006252381557220882, "grad_norm": 0.3125, "grad_norm_var": 0.000830078125, "learning_rate": 0.01, "loss": 1.4252, "loss/crossentropy": 2.580817699432373, "loss/fcd": 1.234375, "loss/logits": 0.2924085855484009, "step": 1969 }, { "epoch": 0.006255556966848723, "grad_norm": 0.33984375, "grad_norm_var": 0.0007593154907226563, "learning_rate": 0.01, "loss": 1.4173, "loss/crossentropy": 2.6674575805664062, "loss/fcd": 1.17578125, "loss/logits": 0.2583131045103073, "step": 1970 }, { "epoch": 0.0062587323764765656, "grad_norm": 0.3125, "grad_norm_var": 0.0007644494374593099, "learning_rate": 0.01, "loss": 1.3746, "loss/crossentropy": 2.457293152809143, "loss/fcd": 1.078125, "loss/logits": 0.24036839604377747, "step": 1971 }, { "epoch": 0.006261907786104408, "grad_norm": 0.287109375, "grad_norm_var": 0.0007121880849202474, "learning_rate": 0.01, "loss": 1.3634, "loss/crossentropy": 2.5010671615600586, "loss/fcd": 1.1015625, "loss/logits": 0.2475980520248413, "step": 1972 }, { "epoch": 0.006265083195732249, "grad_norm": 0.287109375, "grad_norm_var": 0.0008330663045247396, "learning_rate": 0.01, "loss": 1.3713, "loss/crossentropy": 2.603585720062256, "loss/fcd": 1.125, "loss/logits": 0.25688372552394867, "step": 1973 }, { "epoch": 0.0062682586053600916, "grad_norm": 0.291015625, "grad_norm_var": 0.0008023579915364584, "learning_rate": 0.01, "loss": 1.4372, "loss/crossentropy": 2.538019299507141, "loss/fcd": 1.16015625, "loss/logits": 0.2781108468770981, "step": 1974 }, { "epoch": 0.006271434014987934, "grad_norm": 0.55859375, "grad_norm_var": 0.00403758684794108, "learning_rate": 0.01, "loss": 1.4616, "loss/crossentropy": 2.7016539573669434, "loss/fcd": 1.1953125, "loss/logits": 0.27060626447200775, "step": 1975 }, { "epoch": 0.006274609424615775, "grad_norm": 0.330078125, "grad_norm_var": 0.004032627741495768, "learning_rate": 0.01, "loss": 1.4577, "loss/crossentropy": 2.678841233253479, "loss/fcd": 1.21484375, "loss/logits": 0.28598055243492126, "step": 1976 }, { "epoch": 0.0062777848342436176, "grad_norm": 0.314453125, "grad_norm_var": 0.004029337565104167, "learning_rate": 0.01, "loss": 1.4703, "loss/crossentropy": 2.62246572971344, "loss/fcd": 1.21484375, "loss/logits": 0.2851247563958168, "step": 1977 }, { "epoch": 0.00628096024387146, "grad_norm": 0.3125, "grad_norm_var": 0.004039955139160156, "learning_rate": 0.01, "loss": 1.3746, "loss/crossentropy": 2.5885363817214966, "loss/fcd": 1.1875, "loss/logits": 0.26573337614536285, "step": 1978 }, { "epoch": 0.006284135653499301, "grad_norm": 0.298828125, "grad_norm_var": 0.004108619689941406, "learning_rate": 0.01, "loss": 1.4199, "loss/crossentropy": 2.4439785480499268, "loss/fcd": 1.11328125, "loss/logits": 0.24972957372665405, "step": 1979 }, { "epoch": 0.0062873110631271436, "grad_norm": 0.30859375, "grad_norm_var": 0.00412133534749349, "learning_rate": 0.01, "loss": 1.3887, "loss/crossentropy": 2.408186197280884, "loss/fcd": 1.203125, "loss/logits": 0.2733600437641144, "step": 1980 }, { "epoch": 0.006290486472754985, "grad_norm": 0.345703125, "grad_norm_var": 0.00412443478902181, "learning_rate": 0.01, "loss": 1.4563, "loss/crossentropy": 2.8014578819274902, "loss/fcd": 1.28125, "loss/logits": 0.3223569989204407, "step": 1981 }, { "epoch": 0.006293661882382827, "grad_norm": 0.3203125, "grad_norm_var": 0.0041304906209309895, "learning_rate": 0.01, "loss": 1.3827, "loss/crossentropy": 2.510051727294922, "loss/fcd": 1.140625, "loss/logits": 0.25688496232032776, "step": 1982 }, { "epoch": 0.0062968372920106696, "grad_norm": 0.3359375, "grad_norm_var": 0.004058837890625, "learning_rate": 0.01, "loss": 1.5104, "loss/crossentropy": 2.482144355773926, "loss/fcd": 1.23046875, "loss/logits": 0.28780052810907364, "step": 1983 }, { "epoch": 0.006300012701638511, "grad_norm": 0.318359375, "grad_norm_var": 0.0040433247884114586, "learning_rate": 0.01, "loss": 1.4911, "loss/crossentropy": 2.572335124015808, "loss/fcd": 1.23828125, "loss/logits": 0.27077189087867737, "step": 1984 }, { "epoch": 0.006303188111266353, "grad_norm": 0.34375, "grad_norm_var": 0.004033152262369792, "learning_rate": 0.01, "loss": 1.4474, "loss/crossentropy": 2.5712337493896484, "loss/fcd": 1.21875, "loss/logits": 0.3196438401937485, "step": 1985 }, { "epoch": 0.0063063635208941956, "grad_norm": 0.412109375, "grad_norm_var": 0.004439528783162435, "learning_rate": 0.01, "loss": 1.5921, "loss/crossentropy": 2.2747050523757935, "loss/fcd": 1.3984375, "loss/logits": 0.30464503169059753, "step": 1986 }, { "epoch": 0.006309538930522037, "grad_norm": 0.408203125, "grad_norm_var": 0.004711341857910156, "learning_rate": 0.01, "loss": 1.5379, "loss/crossentropy": 3.0389047861099243, "loss/fcd": 1.25, "loss/logits": 0.2932169735431671, "step": 1987 }, { "epoch": 0.006312714340149879, "grad_norm": 0.318359375, "grad_norm_var": 0.004543495178222656, "learning_rate": 0.01, "loss": 1.44, "loss/crossentropy": 2.7297651767730713, "loss/fcd": 1.28515625, "loss/logits": 0.3167975842952728, "step": 1988 }, { "epoch": 0.0063158897497777216, "grad_norm": 0.31640625, "grad_norm_var": 0.004374933242797851, "learning_rate": 0.01, "loss": 1.4322, "loss/crossentropy": 2.326604723930359, "loss/fcd": 1.1484375, "loss/logits": 0.2706734836101532, "step": 1989 }, { "epoch": 0.006319065159405563, "grad_norm": 0.30859375, "grad_norm_var": 0.004265785217285156, "learning_rate": 0.01, "loss": 1.3573, "loss/crossentropy": 2.55885910987854, "loss/fcd": 1.09375, "loss/logits": 0.2506129592657089, "step": 1990 }, { "epoch": 0.006322240569033405, "grad_norm": 0.357421875, "grad_norm_var": 0.001117563247680664, "learning_rate": 0.01, "loss": 1.4136, "loss/crossentropy": 2.1541104316711426, "loss/fcd": 1.12890625, "loss/logits": 0.27240368723869324, "step": 1991 }, { "epoch": 0.0063254159786612476, "grad_norm": 0.416015625, "grad_norm_var": 0.0015301863352457682, "learning_rate": 0.01, "loss": 1.4898, "loss/crossentropy": 2.564090371131897, "loss/fcd": 1.18359375, "loss/logits": 0.28203777968883514, "step": 1992 }, { "epoch": 0.006328591388289089, "grad_norm": 0.3359375, "grad_norm_var": 0.0014866511027018228, "learning_rate": 0.01, "loss": 1.5128, "loss/crossentropy": 2.6486846208572388, "loss/fcd": 1.26953125, "loss/logits": 0.2989698797464371, "step": 1993 }, { "epoch": 0.006331766797916931, "grad_norm": 0.3359375, "grad_norm_var": 0.001431719462076823, "learning_rate": 0.01, "loss": 1.4558, "loss/crossentropy": 2.579405903816223, "loss/fcd": 1.1875, "loss/logits": 0.25192397087812424, "step": 1994 }, { "epoch": 0.0063349422075447736, "grad_norm": 0.298828125, "grad_norm_var": 0.001431719462076823, "learning_rate": 0.01, "loss": 1.4107, "loss/crossentropy": 2.70742928981781, "loss/fcd": 1.1640625, "loss/logits": 0.2780551314353943, "step": 1995 }, { "epoch": 0.006338117617172615, "grad_norm": 0.7734375, "grad_norm_var": 0.012833404541015624, "learning_rate": 0.01, "loss": 1.4842, "loss/crossentropy": 2.533802032470703, "loss/fcd": 1.171875, "loss/logits": 0.26965461671352386, "step": 1996 }, { "epoch": 0.006341293026800457, "grad_norm": 0.3125, "grad_norm_var": 0.013016875584920247, "learning_rate": 0.01, "loss": 1.3973, "loss/crossentropy": 2.4546743631362915, "loss/fcd": 1.14453125, "loss/logits": 0.26644743978977203, "step": 1997 }, { "epoch": 0.0063444684364282996, "grad_norm": 0.294921875, "grad_norm_var": 0.013223711649576824, "learning_rate": 0.01, "loss": 1.3876, "loss/crossentropy": 2.487960457801819, "loss/fcd": 1.19140625, "loss/logits": 0.26286984980106354, "step": 1998 }, { "epoch": 0.006347643846056141, "grad_norm": 0.33203125, "grad_norm_var": 0.013241322835286458, "learning_rate": 0.01, "loss": 1.3915, "loss/crossentropy": 2.637014865875244, "loss/fcd": 1.12890625, "loss/logits": 0.2634287104010582, "step": 1999 }, { "epoch": 0.006350819255683983, "grad_norm": 0.369140625, "grad_norm_var": 0.013068580627441406, "learning_rate": 0.01, "loss": 1.3879, "loss/crossentropy": 2.4557149410247803, "loss/fcd": 1.09375, "loss/logits": 0.23890340328216553, "step": 2000 } ], "logging_steps": 1, "max_steps": 300000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.940080885235712e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }